{ "best_metric": 0.8419870138168335, "best_model_checkpoint": "./model_fine-tune/glot/xlm-r/amh-Ethi/checkpoint-94500", "epoch": 121.46529562982005, "eval_steps": 500, "global_step": 94500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.6426735218508998, "grad_norm": 4.40508508682251, "learning_rate": 9.95e-05, "loss": 1.6805, "step": 500 }, { "epoch": 0.6426735218508998, "eval_accuracy": 0.6904924931069853, "eval_loss": 1.4787547588348389, "eval_runtime": 34.6674, "eval_samples_per_second": 179.506, "eval_steps_per_second": 5.625, "step": 500 }, { "epoch": 1.2853470437017995, "grad_norm": 3.9189295768737793, "learning_rate": 9.900000000000001e-05, "loss": 1.5356, "step": 1000 }, { "epoch": 1.2853470437017995, "eval_accuracy": 0.7092839911070465, "eval_loss": 1.3684136867523193, "eval_runtime": 35.1037, "eval_samples_per_second": 177.275, "eval_steps_per_second": 5.555, "step": 1000 }, { "epoch": 1.9280205655526992, "grad_norm": 4.308630466461182, "learning_rate": 9.850000000000001e-05, "loss": 1.4583, "step": 1500 }, { "epoch": 1.9280205655526992, "eval_accuracy": 0.7174822425497339, "eval_loss": 1.3316432237625122, "eval_runtime": 32.7687, "eval_samples_per_second": 189.907, "eval_steps_per_second": 5.951, "step": 1500 }, { "epoch": 2.570694087403599, "grad_norm": 3.4791815280914307, "learning_rate": 9.8e-05, "loss": 1.4029, "step": 2000 }, { "epoch": 2.570694087403599, "eval_accuracy": 0.7272827658735376, "eval_loss": 1.280930519104004, "eval_runtime": 35.7018, "eval_samples_per_second": 174.305, "eval_steps_per_second": 5.462, "step": 2000 }, { "epoch": 3.2133676092544987, "grad_norm": 3.705988645553589, "learning_rate": 9.75e-05, "loss": 1.3485, "step": 2500 }, { "epoch": 3.2133676092544987, "eval_accuracy": 0.7312191932269085, "eval_loss": 1.2635687589645386, "eval_runtime": 34.9687, "eval_samples_per_second": 177.959, "eval_steps_per_second": 5.576, "step": 2500 }, { "epoch": 3.8560411311053984, "grad_norm": 3.63777232170105, "learning_rate": 9.7e-05, "loss": 1.318, "step": 3000 }, { "epoch": 3.8560411311053984, "eval_accuracy": 0.7356960949650779, "eval_loss": 1.2287126779556274, "eval_runtime": 35.1013, "eval_samples_per_second": 177.287, "eval_steps_per_second": 5.555, "step": 3000 }, { "epoch": 4.4987146529562985, "grad_norm": 3.065805673599243, "learning_rate": 9.65e-05, "loss": 1.2853, "step": 3500 }, { "epoch": 4.4987146529562985, "eval_accuracy": 0.7411218708879311, "eval_loss": 1.2096679210662842, "eval_runtime": 35.3152, "eval_samples_per_second": 176.213, "eval_steps_per_second": 5.522, "step": 3500 }, { "epoch": 5.141388174807198, "grad_norm": 3.083160161972046, "learning_rate": 9.6e-05, "loss": 1.2579, "step": 4000 }, { "epoch": 5.141388174807198, "eval_accuracy": 0.743514192750392, "eval_loss": 1.1946351528167725, "eval_runtime": 35.3519, "eval_samples_per_second": 176.03, "eval_steps_per_second": 5.516, "step": 4000 }, { "epoch": 5.784061696658098, "grad_norm": 3.1521048545837402, "learning_rate": 9.55e-05, "loss": 1.225, "step": 4500 }, { "epoch": 5.784061696658098, "eval_accuracy": 0.7478054754473986, "eval_loss": 1.1656205654144287, "eval_runtime": 35.2967, "eval_samples_per_second": 176.305, "eval_steps_per_second": 5.525, "step": 4500 }, { "epoch": 6.426735218508997, "grad_norm": 2.947038412094116, "learning_rate": 9.5e-05, "loss": 1.2156, "step": 5000 }, { "epoch": 6.426735218508997, "eval_accuracy": 0.7494688799077729, "eval_loss": 1.1547260284423828, "eval_runtime": 35.244, "eval_samples_per_second": 176.569, "eval_steps_per_second": 5.533, "step": 5000 }, { "epoch": 7.069408740359897, "grad_norm": 3.245880603790283, "learning_rate": 9.449999999999999e-05, "loss": 1.1878, "step": 5500 }, { "epoch": 7.069408740359897, "eval_accuracy": 0.7533389855345517, "eval_loss": 1.1394718885421753, "eval_runtime": 34.1955, "eval_samples_per_second": 181.983, "eval_steps_per_second": 5.703, "step": 5500 }, { "epoch": 7.712082262210797, "grad_norm": 3.02457857131958, "learning_rate": 9.4e-05, "loss": 1.1713, "step": 6000 }, { "epoch": 7.712082262210797, "eval_accuracy": 0.7548033682748899, "eval_loss": 1.136972188949585, "eval_runtime": 33.9827, "eval_samples_per_second": 183.123, "eval_steps_per_second": 5.738, "step": 6000 }, { "epoch": 8.354755784061696, "grad_norm": 2.8089816570281982, "learning_rate": 9.350000000000001e-05, "loss": 1.1587, "step": 6500 }, { "epoch": 8.354755784061696, "eval_accuracy": 0.7558409318256807, "eval_loss": 1.1281076669692993, "eval_runtime": 34.9074, "eval_samples_per_second": 178.272, "eval_steps_per_second": 5.586, "step": 6500 }, { "epoch": 8.997429305912597, "grad_norm": 2.808964729309082, "learning_rate": 9.300000000000001e-05, "loss": 1.1416, "step": 7000 }, { "epoch": 8.997429305912597, "eval_accuracy": 0.7600179626094105, "eval_loss": 1.107399821281433, "eval_runtime": 34.0824, "eval_samples_per_second": 182.587, "eval_steps_per_second": 5.721, "step": 7000 }, { "epoch": 9.640102827763496, "grad_norm": 2.8558380603790283, "learning_rate": 9.250000000000001e-05, "loss": 1.128, "step": 7500 }, { "epoch": 9.640102827763496, "eval_accuracy": 0.7628805020537848, "eval_loss": 1.0930116176605225, "eval_runtime": 33.4368, "eval_samples_per_second": 186.112, "eval_steps_per_second": 5.832, "step": 7500 }, { "epoch": 10.282776349614396, "grad_norm": 2.9800262451171875, "learning_rate": 9.200000000000001e-05, "loss": 1.1213, "step": 8000 }, { "epoch": 10.282776349614396, "eval_accuracy": 0.7627036601926251, "eval_loss": 1.0944606065750122, "eval_runtime": 34.4461, "eval_samples_per_second": 180.659, "eval_steps_per_second": 5.661, "step": 8000 }, { "epoch": 10.925449871465295, "grad_norm": 3.040515422821045, "learning_rate": 9.15e-05, "loss": 1.1075, "step": 8500 }, { "epoch": 10.925449871465295, "eval_accuracy": 0.764068771058977, "eval_loss": 1.0873337984085083, "eval_runtime": 32.7071, "eval_samples_per_second": 190.265, "eval_steps_per_second": 5.962, "step": 8500 }, { "epoch": 11.568123393316196, "grad_norm": 2.973078966140747, "learning_rate": 9.1e-05, "loss": 1.0912, "step": 9000 }, { "epoch": 11.568123393316196, "eval_accuracy": 0.7658058887449325, "eval_loss": 1.0768815279006958, "eval_runtime": 32.6454, "eval_samples_per_second": 190.624, "eval_steps_per_second": 5.973, "step": 9000 }, { "epoch": 12.210796915167094, "grad_norm": 3.5373220443725586, "learning_rate": 9.05e-05, "loss": 1.085, "step": 9500 }, { "epoch": 12.210796915167094, "eval_accuracy": 0.7658247272593731, "eval_loss": 1.086098074913025, "eval_runtime": 32.662, "eval_samples_per_second": 190.527, "eval_steps_per_second": 5.97, "step": 9500 }, { "epoch": 12.853470437017995, "grad_norm": 5.5106964111328125, "learning_rate": 9e-05, "loss": 1.0746, "step": 10000 }, { "epoch": 12.853470437017995, "eval_accuracy": 0.7682209938962093, "eval_loss": 1.0635614395141602, "eval_runtime": 32.6593, "eval_samples_per_second": 190.543, "eval_steps_per_second": 5.971, "step": 10000 }, { "epoch": 13.496143958868895, "grad_norm": 3.238605499267578, "learning_rate": 8.950000000000001e-05, "loss": 1.0531, "step": 10500 }, { "epoch": 13.496143958868895, "eval_accuracy": 0.7686876543500863, "eval_loss": 1.0645390748977661, "eval_runtime": 32.6258, "eval_samples_per_second": 190.739, "eval_steps_per_second": 5.977, "step": 10500 }, { "epoch": 14.138817480719794, "grad_norm": 2.7707929611206055, "learning_rate": 8.900000000000001e-05, "loss": 1.0545, "step": 11000 }, { "epoch": 14.138817480719794, "eval_accuracy": 0.7713913141190644, "eval_loss": 1.0523449182510376, "eval_runtime": 33.0121, "eval_samples_per_second": 188.507, "eval_steps_per_second": 5.907, "step": 11000 }, { "epoch": 14.781491002570695, "grad_norm": 3.1346287727355957, "learning_rate": 8.850000000000001e-05, "loss": 1.042, "step": 11500 }, { "epoch": 14.781491002570695, "eval_accuracy": 0.7711833853943258, "eval_loss": 1.0591264963150024, "eval_runtime": 32.5788, "eval_samples_per_second": 191.014, "eval_steps_per_second": 5.985, "step": 11500 }, { "epoch": 15.424164524421593, "grad_norm": 2.98738169670105, "learning_rate": 8.800000000000001e-05, "loss": 1.0327, "step": 12000 }, { "epoch": 15.424164524421593, "eval_accuracy": 0.7727139790786457, "eval_loss": 1.0433602333068848, "eval_runtime": 32.6912, "eval_samples_per_second": 190.357, "eval_steps_per_second": 5.965, "step": 12000 }, { "epoch": 16.066838046272494, "grad_norm": 2.7987964153289795, "learning_rate": 8.75e-05, "loss": 1.0226, "step": 12500 }, { "epoch": 16.066838046272494, "eval_accuracy": 0.7744916530689092, "eval_loss": 1.0427190065383911, "eval_runtime": 32.7105, "eval_samples_per_second": 190.245, "eval_steps_per_second": 5.961, "step": 12500 }, { "epoch": 16.709511568123393, "grad_norm": 2.927727699279785, "learning_rate": 8.7e-05, "loss": 1.0164, "step": 13000 }, { "epoch": 16.709511568123393, "eval_accuracy": 0.7755782307589745, "eval_loss": 1.033629059791565, "eval_runtime": 32.6997, "eval_samples_per_second": 190.307, "eval_steps_per_second": 5.963, "step": 13000 }, { "epoch": 17.35218508997429, "grad_norm": 2.902130603790283, "learning_rate": 8.65e-05, "loss": 1.012, "step": 13500 }, { "epoch": 17.35218508997429, "eval_accuracy": 0.7768737795368466, "eval_loss": 1.024613380432129, "eval_runtime": 32.6543, "eval_samples_per_second": 190.572, "eval_steps_per_second": 5.972, "step": 13500 }, { "epoch": 17.994858611825194, "grad_norm": 2.6647133827209473, "learning_rate": 8.6e-05, "loss": 0.9992, "step": 14000 }, { "epoch": 17.994858611825194, "eval_accuracy": 0.7779743325654593, "eval_loss": 1.0130186080932617, "eval_runtime": 32.7375, "eval_samples_per_second": 190.088, "eval_steps_per_second": 5.956, "step": 14000 }, { "epoch": 18.637532133676093, "grad_norm": 2.590914249420166, "learning_rate": 8.55e-05, "loss": 0.9928, "step": 14500 }, { "epoch": 18.637532133676093, "eval_accuracy": 0.7779691811484551, "eval_loss": 1.0204987525939941, "eval_runtime": 33.4597, "eval_samples_per_second": 185.985, "eval_steps_per_second": 5.828, "step": 14500 }, { "epoch": 19.28020565552699, "grad_norm": 2.6548004150390625, "learning_rate": 8.5e-05, "loss": 0.985, "step": 15000 }, { "epoch": 19.28020565552699, "eval_accuracy": 0.7788039344262295, "eval_loss": 1.0153322219848633, "eval_runtime": 34.4688, "eval_samples_per_second": 180.54, "eval_steps_per_second": 5.657, "step": 15000 }, { "epoch": 19.922879177377894, "grad_norm": 2.6247718334198, "learning_rate": 8.450000000000001e-05, "loss": 0.9822, "step": 15500 }, { "epoch": 19.922879177377894, "eval_accuracy": 0.7795580110497238, "eval_loss": 1.0121690034866333, "eval_runtime": 34.3855, "eval_samples_per_second": 180.977, "eval_steps_per_second": 5.671, "step": 15500 }, { "epoch": 20.565552699228792, "grad_norm": 2.7304604053497314, "learning_rate": 8.4e-05, "loss": 0.9735, "step": 16000 }, { "epoch": 20.565552699228792, "eval_accuracy": 0.7793682688302468, "eval_loss": 1.018441081047058, "eval_runtime": 34.5499, "eval_samples_per_second": 180.116, "eval_steps_per_second": 5.644, "step": 16000 }, { "epoch": 21.20822622107969, "grad_norm": 2.7982735633850098, "learning_rate": 8.35e-05, "loss": 0.971, "step": 16500 }, { "epoch": 21.20822622107969, "eval_accuracy": 0.7816012690588251, "eval_loss": 1.0020060539245605, "eval_runtime": 34.6648, "eval_samples_per_second": 179.519, "eval_steps_per_second": 5.625, "step": 16500 }, { "epoch": 21.85089974293059, "grad_norm": 2.6124517917633057, "learning_rate": 8.3e-05, "loss": 0.965, "step": 17000 }, { "epoch": 21.85089974293059, "eval_accuracy": 0.7828390025055424, "eval_loss": 0.9919770359992981, "eval_runtime": 34.2625, "eval_samples_per_second": 181.627, "eval_steps_per_second": 5.691, "step": 17000 }, { "epoch": 22.493573264781492, "grad_norm": 2.7260468006134033, "learning_rate": 8.25e-05, "loss": 0.9536, "step": 17500 }, { "epoch": 22.493573264781492, "eval_accuracy": 0.7826561078737454, "eval_loss": 0.9964642524719238, "eval_runtime": 35.1121, "eval_samples_per_second": 177.232, "eval_steps_per_second": 5.554, "step": 17500 }, { "epoch": 23.13624678663239, "grad_norm": 2.5959136486053467, "learning_rate": 8.2e-05, "loss": 0.9515, "step": 18000 }, { "epoch": 23.13624678663239, "eval_accuracy": 0.7842808178702281, "eval_loss": 0.9843628406524658, "eval_runtime": 35.1683, "eval_samples_per_second": 176.949, "eval_steps_per_second": 5.545, "step": 18000 }, { "epoch": 23.77892030848329, "grad_norm": 2.8720030784606934, "learning_rate": 8.15e-05, "loss": 0.9459, "step": 18500 }, { "epoch": 23.77892030848329, "eval_accuracy": 0.7822331893892659, "eval_loss": 0.9950876235961914, "eval_runtime": 35.4899, "eval_samples_per_second": 175.345, "eval_steps_per_second": 5.495, "step": 18500 }, { "epoch": 24.42159383033419, "grad_norm": 3.0184552669525146, "learning_rate": 8.1e-05, "loss": 0.9374, "step": 19000 }, { "epoch": 24.42159383033419, "eval_accuracy": 0.7852158669501615, "eval_loss": 0.9859277009963989, "eval_runtime": 34.116, "eval_samples_per_second": 182.407, "eval_steps_per_second": 5.716, "step": 19000 }, { "epoch": 25.06426735218509, "grad_norm": 2.516874074935913, "learning_rate": 8.05e-05, "loss": 0.9318, "step": 19500 }, { "epoch": 25.06426735218509, "eval_accuracy": 0.7829394214826243, "eval_loss": 0.9957559108734131, "eval_runtime": 35.0753, "eval_samples_per_second": 177.419, "eval_steps_per_second": 5.559, "step": 19500 }, { "epoch": 25.70694087403599, "grad_norm": 2.5600101947784424, "learning_rate": 8e-05, "loss": 0.9311, "step": 20000 }, { "epoch": 25.70694087403599, "eval_accuracy": 0.786506985609304, "eval_loss": 0.9760250449180603, "eval_runtime": 34.6952, "eval_samples_per_second": 179.362, "eval_steps_per_second": 5.62, "step": 20000 }, { "epoch": 26.34961439588689, "grad_norm": 2.7094664573669434, "learning_rate": 7.950000000000001e-05, "loss": 0.913, "step": 20500 }, { "epoch": 26.34961439588689, "eval_accuracy": 0.78658190904128, "eval_loss": 0.9806557297706604, "eval_runtime": 34.3785, "eval_samples_per_second": 181.014, "eval_steps_per_second": 5.672, "step": 20500 }, { "epoch": 26.99228791773779, "grad_norm": 2.3693931102752686, "learning_rate": 7.900000000000001e-05, "loss": 0.9171, "step": 21000 }, { "epoch": 26.99228791773779, "eval_accuracy": 0.7854132867260752, "eval_loss": 0.9825689196586609, "eval_runtime": 34.4341, "eval_samples_per_second": 180.722, "eval_steps_per_second": 5.663, "step": 21000 }, { "epoch": 27.63496143958869, "grad_norm": 3.0254580974578857, "learning_rate": 7.850000000000001e-05, "loss": 0.9062, "step": 21500 }, { "epoch": 27.63496143958869, "eval_accuracy": 0.7879520726883571, "eval_loss": 0.9732692241668701, "eval_runtime": 34.5783, "eval_samples_per_second": 179.968, "eval_steps_per_second": 5.639, "step": 21500 }, { "epoch": 28.27763496143959, "grad_norm": 2.6540822982788086, "learning_rate": 7.800000000000001e-05, "loss": 0.9095, "step": 22000 }, { "epoch": 28.27763496143959, "eval_accuracy": 0.7879547877788341, "eval_loss": 0.9751275777816772, "eval_runtime": 33.0181, "eval_samples_per_second": 188.473, "eval_steps_per_second": 5.906, "step": 22000 }, { "epoch": 28.920308483290487, "grad_norm": 2.7978150844573975, "learning_rate": 7.75e-05, "loss": 0.9003, "step": 22500 }, { "epoch": 28.920308483290487, "eval_accuracy": 0.789157797413295, "eval_loss": 0.9676902890205383, "eval_runtime": 34.176, "eval_samples_per_second": 182.087, "eval_steps_per_second": 5.706, "step": 22500 }, { "epoch": 29.56298200514139, "grad_norm": 2.6367812156677246, "learning_rate": 7.7e-05, "loss": 0.8994, "step": 23000 }, { "epoch": 29.56298200514139, "eval_accuracy": 0.78875261302136, "eval_loss": 0.9720832109451294, "eval_runtime": 34.6435, "eval_samples_per_second": 179.63, "eval_steps_per_second": 5.629, "step": 23000 }, { "epoch": 30.205655526992288, "grad_norm": 2.8410837650299072, "learning_rate": 7.65e-05, "loss": 0.8937, "step": 23500 }, { "epoch": 30.205655526992288, "eval_accuracy": 0.7898726325583352, "eval_loss": 0.9666265249252319, "eval_runtime": 34.382, "eval_samples_per_second": 180.996, "eval_steps_per_second": 5.672, "step": 23500 }, { "epoch": 30.848329048843187, "grad_norm": 2.6249313354492188, "learning_rate": 7.6e-05, "loss": 0.8853, "step": 24000 }, { "epoch": 30.848329048843187, "eval_accuracy": 0.7914164731021732, "eval_loss": 0.9592417478561401, "eval_runtime": 34.7355, "eval_samples_per_second": 179.154, "eval_steps_per_second": 5.614, "step": 24000 }, { "epoch": 31.491002570694086, "grad_norm": 2.563669443130493, "learning_rate": 7.55e-05, "loss": 0.8838, "step": 24500 }, { "epoch": 31.491002570694086, "eval_accuracy": 0.7892845518211694, "eval_loss": 0.969780683517456, "eval_runtime": 34.7542, "eval_samples_per_second": 179.057, "eval_steps_per_second": 5.611, "step": 24500 }, { "epoch": 32.13367609254499, "grad_norm": 2.7462027072906494, "learning_rate": 7.500000000000001e-05, "loss": 0.8809, "step": 25000 }, { "epoch": 32.13367609254499, "eval_accuracy": 0.7920457676713335, "eval_loss": 0.9529361128807068, "eval_runtime": 34.3098, "eval_samples_per_second": 181.377, "eval_steps_per_second": 5.684, "step": 25000 }, { "epoch": 32.77634961439589, "grad_norm": 2.533267021179199, "learning_rate": 7.450000000000001e-05, "loss": 0.874, "step": 25500 }, { "epoch": 32.77634961439589, "eval_accuracy": 0.7919883512638669, "eval_loss": 0.9449800252914429, "eval_runtime": 34.7988, "eval_samples_per_second": 178.828, "eval_steps_per_second": 5.604, "step": 25500 }, { "epoch": 33.419023136246786, "grad_norm": 2.718332529067993, "learning_rate": 7.4e-05, "loss": 0.8647, "step": 26000 }, { "epoch": 33.419023136246786, "eval_accuracy": 0.7927094725330193, "eval_loss": 0.9489922523498535, "eval_runtime": 34.773, "eval_samples_per_second": 178.961, "eval_steps_per_second": 5.608, "step": 26000 }, { "epoch": 34.06169665809769, "grad_norm": 2.664407253265381, "learning_rate": 7.35e-05, "loss": 0.8638, "step": 26500 }, { "epoch": 34.06169665809769, "eval_accuracy": 0.7909423101119826, "eval_loss": 0.9638357758522034, "eval_runtime": 34.4764, "eval_samples_per_second": 180.501, "eval_steps_per_second": 5.656, "step": 26500 }, { "epoch": 34.70437017994858, "grad_norm": 2.9981689453125, "learning_rate": 7.3e-05, "loss": 0.8604, "step": 27000 }, { "epoch": 34.70437017994858, "eval_accuracy": 0.792258094990671, "eval_loss": 0.9567773938179016, "eval_runtime": 34.4124, "eval_samples_per_second": 180.836, "eval_steps_per_second": 5.667, "step": 27000 }, { "epoch": 35.347043701799485, "grad_norm": 2.6397457122802734, "learning_rate": 7.25e-05, "loss": 0.8516, "step": 27500 }, { "epoch": 35.347043701799485, "eval_accuracy": 0.7944613334662306, "eval_loss": 0.9480540156364441, "eval_runtime": 34.3734, "eval_samples_per_second": 181.041, "eval_steps_per_second": 5.673, "step": 27500 }, { "epoch": 35.98971722365039, "grad_norm": 2.9263813495635986, "learning_rate": 7.2e-05, "loss": 0.8529, "step": 28000 }, { "epoch": 35.98971722365039, "eval_accuracy": 0.7949814058148135, "eval_loss": 0.9380202293395996, "eval_runtime": 34.1807, "eval_samples_per_second": 182.062, "eval_steps_per_second": 5.705, "step": 28000 }, { "epoch": 36.63239074550128, "grad_norm": 2.446913719177246, "learning_rate": 7.15e-05, "loss": 0.8454, "step": 28500 }, { "epoch": 36.63239074550128, "eval_accuracy": 0.7940900415285876, "eval_loss": 0.9452695250511169, "eval_runtime": 33.1944, "eval_samples_per_second": 187.472, "eval_steps_per_second": 5.874, "step": 28500 }, { "epoch": 37.275064267352185, "grad_norm": 2.429306983947754, "learning_rate": 7.1e-05, "loss": 0.8432, "step": 29000 }, { "epoch": 37.275064267352185, "eval_accuracy": 0.7944350218702906, "eval_loss": 0.9466774463653564, "eval_runtime": 33.291, "eval_samples_per_second": 186.927, "eval_steps_per_second": 5.857, "step": 29000 }, { "epoch": 37.91773778920309, "grad_norm": 2.648399591445923, "learning_rate": 7.05e-05, "loss": 0.8468, "step": 29500 }, { "epoch": 37.91773778920309, "eval_accuracy": 0.7957481551099058, "eval_loss": 0.9400731325149536, "eval_runtime": 33.2405, "eval_samples_per_second": 187.211, "eval_steps_per_second": 5.866, "step": 29500 }, { "epoch": 38.56041131105398, "grad_norm": 2.644608974456787, "learning_rate": 7e-05, "loss": 0.8328, "step": 30000 }, { "epoch": 38.56041131105398, "eval_accuracy": 0.795926071569013, "eval_loss": 0.9422577619552612, "eval_runtime": 33.3493, "eval_samples_per_second": 186.601, "eval_steps_per_second": 5.847, "step": 30000 }, { "epoch": 39.203084832904885, "grad_norm": 2.6563777923583984, "learning_rate": 6.95e-05, "loss": 0.8323, "step": 30500 }, { "epoch": 39.203084832904885, "eval_accuracy": 0.7951947535355016, "eval_loss": 0.9368035793304443, "eval_runtime": 32.9628, "eval_samples_per_second": 188.788, "eval_steps_per_second": 5.916, "step": 30500 }, { "epoch": 39.84575835475579, "grad_norm": 2.5932886600494385, "learning_rate": 6.9e-05, "loss": 0.8268, "step": 31000 }, { "epoch": 39.84575835475579, "eval_accuracy": 0.7967557476448579, "eval_loss": 0.9303238391876221, "eval_runtime": 32.4851, "eval_samples_per_second": 191.565, "eval_steps_per_second": 6.003, "step": 31000 }, { "epoch": 40.48843187660668, "grad_norm": 2.7462244033813477, "learning_rate": 6.850000000000001e-05, "loss": 0.8246, "step": 31500 }, { "epoch": 40.48843187660668, "eval_accuracy": 0.7966305277950054, "eval_loss": 0.9323325157165527, "eval_runtime": 33.1165, "eval_samples_per_second": 187.912, "eval_steps_per_second": 5.888, "step": 31500 }, { "epoch": 41.131105398457585, "grad_norm": 2.647944211959839, "learning_rate": 6.800000000000001e-05, "loss": 0.8216, "step": 32000 }, { "epoch": 41.131105398457585, "eval_accuracy": 0.7962699125415105, "eval_loss": 0.9350172281265259, "eval_runtime": 32.873, "eval_samples_per_second": 189.305, "eval_steps_per_second": 5.932, "step": 32000 }, { "epoch": 41.77377892030848, "grad_norm": 2.661036729812622, "learning_rate": 6.750000000000001e-05, "loss": 0.8218, "step": 32500 }, { "epoch": 41.77377892030848, "eval_accuracy": 0.7972857023271388, "eval_loss": 0.9303329586982727, "eval_runtime": 32.6506, "eval_samples_per_second": 190.594, "eval_steps_per_second": 5.972, "step": 32500 }, { "epoch": 42.41645244215938, "grad_norm": 2.7694153785705566, "learning_rate": 6.7e-05, "loss": 0.8132, "step": 33000 }, { "epoch": 42.41645244215938, "eval_accuracy": 0.7977203787112787, "eval_loss": 0.9363131523132324, "eval_runtime": 33.0889, "eval_samples_per_second": 188.069, "eval_steps_per_second": 5.893, "step": 33000 }, { "epoch": 43.059125964010285, "grad_norm": 2.8761026859283447, "learning_rate": 6.65e-05, "loss": 0.8082, "step": 33500 }, { "epoch": 43.059125964010285, "eval_accuracy": 0.7981091852326369, "eval_loss": 0.9364966154098511, "eval_runtime": 32.3923, "eval_samples_per_second": 192.113, "eval_steps_per_second": 6.02, "step": 33500 }, { "epoch": 43.70179948586118, "grad_norm": 2.6908044815063477, "learning_rate": 6.6e-05, "loss": 0.806, "step": 34000 }, { "epoch": 43.70179948586118, "eval_accuracy": 0.7986442847736496, "eval_loss": 0.9224198460578918, "eval_runtime": 33.388, "eval_samples_per_second": 186.384, "eval_steps_per_second": 5.84, "step": 34000 }, { "epoch": 44.34447300771208, "grad_norm": 2.6623470783233643, "learning_rate": 6.55e-05, "loss": 0.8024, "step": 34500 }, { "epoch": 44.34447300771208, "eval_accuracy": 0.7992224783044672, "eval_loss": 0.9192689061164856, "eval_runtime": 33.3722, "eval_samples_per_second": 186.473, "eval_steps_per_second": 5.843, "step": 34500 }, { "epoch": 44.987146529562985, "grad_norm": 2.6698567867279053, "learning_rate": 6.500000000000001e-05, "loss": 0.8016, "step": 35000 }, { "epoch": 44.987146529562985, "eval_accuracy": 0.7984851112126992, "eval_loss": 0.9178963303565979, "eval_runtime": 33.2663, "eval_samples_per_second": 187.066, "eval_steps_per_second": 5.862, "step": 35000 }, { "epoch": 45.62982005141388, "grad_norm": 2.980325222015381, "learning_rate": 6.450000000000001e-05, "loss": 0.796, "step": 35500 }, { "epoch": 45.62982005141388, "eval_accuracy": 0.7979037090039865, "eval_loss": 0.9273726344108582, "eval_runtime": 33.0665, "eval_samples_per_second": 188.196, "eval_steps_per_second": 5.897, "step": 35500 }, { "epoch": 46.27249357326478, "grad_norm": 2.536480188369751, "learning_rate": 6.400000000000001e-05, "loss": 0.7976, "step": 36000 }, { "epoch": 46.27249357326478, "eval_accuracy": 0.8009755918651776, "eval_loss": 0.9136722087860107, "eval_runtime": 33.7816, "eval_samples_per_second": 184.213, "eval_steps_per_second": 5.772, "step": 36000 }, { "epoch": 46.915167095115685, "grad_norm": 2.369147777557373, "learning_rate": 6.35e-05, "loss": 0.7888, "step": 36500 }, { "epoch": 46.915167095115685, "eval_accuracy": 0.8006721188810271, "eval_loss": 0.9139747619628906, "eval_runtime": 33.0443, "eval_samples_per_second": 188.323, "eval_steps_per_second": 5.901, "step": 36500 }, { "epoch": 47.55784061696658, "grad_norm": 2.606424570083618, "learning_rate": 6.3e-05, "loss": 0.7826, "step": 37000 }, { "epoch": 47.55784061696658, "eval_accuracy": 0.8006546893234877, "eval_loss": 0.9165197610855103, "eval_runtime": 32.9373, "eval_samples_per_second": 188.935, "eval_steps_per_second": 5.92, "step": 37000 }, { "epoch": 48.20051413881748, "grad_norm": 2.869199514389038, "learning_rate": 6.25e-05, "loss": 0.789, "step": 37500 }, { "epoch": 48.20051413881748, "eval_accuracy": 0.8009030033534118, "eval_loss": 0.9167375564575195, "eval_runtime": 33.2315, "eval_samples_per_second": 187.262, "eval_steps_per_second": 5.868, "step": 37500 }, { "epoch": 48.84318766066838, "grad_norm": 2.5320894718170166, "learning_rate": 6.2e-05, "loss": 0.7827, "step": 38000 }, { "epoch": 48.84318766066838, "eval_accuracy": 0.7997841239160237, "eval_loss": 0.9282008409500122, "eval_runtime": 33.3582, "eval_samples_per_second": 186.551, "eval_steps_per_second": 5.846, "step": 38000 }, { "epoch": 49.48586118251928, "grad_norm": 2.741992712020874, "learning_rate": 6.15e-05, "loss": 0.7739, "step": 38500 }, { "epoch": 49.48586118251928, "eval_accuracy": 0.8020669565628894, "eval_loss": 0.9078426361083984, "eval_runtime": 33.2299, "eval_samples_per_second": 187.271, "eval_steps_per_second": 5.868, "step": 38500 }, { "epoch": 50.12853470437018, "grad_norm": 2.563089370727539, "learning_rate": 6.1e-05, "loss": 0.7766, "step": 39000 }, { "epoch": 50.12853470437018, "eval_accuracy": 0.8028247073903633, "eval_loss": 0.9107823967933655, "eval_runtime": 33.5047, "eval_samples_per_second": 185.735, "eval_steps_per_second": 5.82, "step": 39000 }, { "epoch": 50.77120822622108, "grad_norm": 2.5732595920562744, "learning_rate": 6.05e-05, "loss": 0.7728, "step": 39500 }, { "epoch": 50.77120822622108, "eval_accuracy": 0.8025567474818919, "eval_loss": 0.9065195918083191, "eval_runtime": 33.1402, "eval_samples_per_second": 187.778, "eval_steps_per_second": 5.884, "step": 39500 }, { "epoch": 51.41388174807198, "grad_norm": 2.4717209339141846, "learning_rate": 6e-05, "loss": 0.7676, "step": 40000 }, { "epoch": 51.41388174807198, "eval_accuracy": 0.8028714592601541, "eval_loss": 0.9060749411582947, "eval_runtime": 32.3856, "eval_samples_per_second": 192.153, "eval_steps_per_second": 6.021, "step": 40000 }, { "epoch": 52.05655526992288, "grad_norm": 2.5366320610046387, "learning_rate": 5.95e-05, "loss": 0.7653, "step": 40500 }, { "epoch": 52.05655526992288, "eval_accuracy": 0.8030310980056771, "eval_loss": 0.9148956537246704, "eval_runtime": 33.3546, "eval_samples_per_second": 186.571, "eval_steps_per_second": 5.846, "step": 40500 }, { "epoch": 52.69922879177378, "grad_norm": 2.9074325561523438, "learning_rate": 5.9e-05, "loss": 0.7607, "step": 41000 }, { "epoch": 52.69922879177378, "eval_accuracy": 0.8039304797381431, "eval_loss": 0.8974832892417908, "eval_runtime": 33.4483, "eval_samples_per_second": 186.049, "eval_steps_per_second": 5.83, "step": 41000 }, { "epoch": 53.34190231362468, "grad_norm": 3.1211259365081787, "learning_rate": 5.85e-05, "loss": 0.7604, "step": 41500 }, { "epoch": 53.34190231362468, "eval_accuracy": 0.8036355728594897, "eval_loss": 0.90870201587677, "eval_runtime": 33.4646, "eval_samples_per_second": 185.957, "eval_steps_per_second": 5.827, "step": 41500 }, { "epoch": 53.98457583547558, "grad_norm": 2.762848377227783, "learning_rate": 5.8e-05, "loss": 0.7593, "step": 42000 }, { "epoch": 53.98457583547558, "eval_accuracy": 0.802461854621613, "eval_loss": 0.9101512432098389, "eval_runtime": 32.3612, "eval_samples_per_second": 192.298, "eval_steps_per_second": 6.026, "step": 42000 }, { "epoch": 54.62724935732648, "grad_norm": 2.6548171043395996, "learning_rate": 5.7499999999999995e-05, "loss": 0.7564, "step": 42500 }, { "epoch": 54.62724935732648, "eval_accuracy": 0.804102518442478, "eval_loss": 0.9050089120864868, "eval_runtime": 33.1101, "eval_samples_per_second": 187.949, "eval_steps_per_second": 5.889, "step": 42500 }, { "epoch": 55.26992287917738, "grad_norm": 2.6228950023651123, "learning_rate": 5.6999999999999996e-05, "loss": 0.7452, "step": 43000 }, { "epoch": 55.26992287917738, "eval_accuracy": 0.8040075271997299, "eval_loss": 0.9055400490760803, "eval_runtime": 33.014, "eval_samples_per_second": 188.496, "eval_steps_per_second": 5.907, "step": 43000 }, { "epoch": 55.912596401028274, "grad_norm": 2.6490426063537598, "learning_rate": 5.65e-05, "loss": 0.7488, "step": 43500 }, { "epoch": 55.912596401028274, "eval_accuracy": 0.8048906062428912, "eval_loss": 0.9053667187690735, "eval_runtime": 32.3856, "eval_samples_per_second": 192.153, "eval_steps_per_second": 6.021, "step": 43500 }, { "epoch": 56.55526992287918, "grad_norm": 2.494063377380371, "learning_rate": 5.6000000000000006e-05, "loss": 0.7472, "step": 44000 }, { "epoch": 56.55526992287918, "eval_accuracy": 0.8050882383051106, "eval_loss": 0.8990674614906311, "eval_runtime": 34.0904, "eval_samples_per_second": 182.544, "eval_steps_per_second": 5.72, "step": 44000 }, { "epoch": 57.19794344473008, "grad_norm": 2.8385369777679443, "learning_rate": 5.550000000000001e-05, "loss": 0.7469, "step": 44500 }, { "epoch": 57.19794344473008, "eval_accuracy": 0.8047514040498369, "eval_loss": 0.9029610753059387, "eval_runtime": 32.596, "eval_samples_per_second": 190.913, "eval_steps_per_second": 5.982, "step": 44500 }, { "epoch": 57.840616966580974, "grad_norm": 2.4786102771759033, "learning_rate": 5.500000000000001e-05, "loss": 0.7386, "step": 45000 }, { "epoch": 57.840616966580974, "eval_accuracy": 0.8045387474256496, "eval_loss": 0.8992837071418762, "eval_runtime": 33.7728, "eval_samples_per_second": 184.261, "eval_steps_per_second": 5.774, "step": 45000 }, { "epoch": 58.48329048843188, "grad_norm": 2.57100248336792, "learning_rate": 5.45e-05, "loss": 0.7383, "step": 45500 }, { "epoch": 58.48329048843188, "eval_accuracy": 0.8045938243522368, "eval_loss": 0.9126896858215332, "eval_runtime": 33.0001, "eval_samples_per_second": 188.575, "eval_steps_per_second": 5.909, "step": 45500 }, { "epoch": 59.12596401028278, "grad_norm": 2.6267364025115967, "learning_rate": 5.4000000000000005e-05, "loss": 0.7372, "step": 46000 }, { "epoch": 59.12596401028278, "eval_accuracy": 0.8047429982319032, "eval_loss": 0.9037800431251526, "eval_runtime": 33.4956, "eval_samples_per_second": 185.786, "eval_steps_per_second": 5.822, "step": 46000 }, { "epoch": 59.768637532133674, "grad_norm": 2.6067042350769043, "learning_rate": 5.3500000000000006e-05, "loss": 0.7293, "step": 46500 }, { "epoch": 59.768637532133674, "eval_accuracy": 0.8063738822187712, "eval_loss": 0.891742467880249, "eval_runtime": 33.3465, "eval_samples_per_second": 186.616, "eval_steps_per_second": 5.848, "step": 46500 }, { "epoch": 60.411311053984576, "grad_norm": 2.748093843460083, "learning_rate": 5.300000000000001e-05, "loss": 0.7266, "step": 47000 }, { "epoch": 60.411311053984576, "eval_accuracy": 0.8060851226391891, "eval_loss": 0.8950145840644836, "eval_runtime": 33.417, "eval_samples_per_second": 186.223, "eval_steps_per_second": 5.835, "step": 47000 }, { "epoch": 61.05398457583548, "grad_norm": 2.7752370834350586, "learning_rate": 5.25e-05, "loss": 0.7278, "step": 47500 }, { "epoch": 61.05398457583548, "eval_accuracy": 0.8052570807938081, "eval_loss": 0.9001266956329346, "eval_runtime": 33.2575, "eval_samples_per_second": 187.116, "eval_steps_per_second": 5.863, "step": 47500 }, { "epoch": 61.696658097686374, "grad_norm": 2.4993159770965576, "learning_rate": 5.2000000000000004e-05, "loss": 0.7224, "step": 48000 }, { "epoch": 61.696658097686374, "eval_accuracy": 0.8063545774342619, "eval_loss": 0.9000985622406006, "eval_runtime": 32.5061, "eval_samples_per_second": 191.441, "eval_steps_per_second": 5.999, "step": 48000 }, { "epoch": 62.339331619537276, "grad_norm": 2.795344829559326, "learning_rate": 5.1500000000000005e-05, "loss": 0.7156, "step": 48500 }, { "epoch": 62.339331619537276, "eval_accuracy": 0.8078125860121053, "eval_loss": 0.8850185871124268, "eval_runtime": 33.3065, "eval_samples_per_second": 186.84, "eval_steps_per_second": 5.855, "step": 48500 }, { "epoch": 62.98200514138817, "grad_norm": 2.4736196994781494, "learning_rate": 5.1000000000000006e-05, "loss": 0.7179, "step": 49000 }, { "epoch": 62.98200514138817, "eval_accuracy": 0.8069766099061457, "eval_loss": 0.8914857506752014, "eval_runtime": 33.2699, "eval_samples_per_second": 187.046, "eval_steps_per_second": 5.861, "step": 49000 }, { "epoch": 63.624678663239074, "grad_norm": 2.7739675045013428, "learning_rate": 5.05e-05, "loss": 0.7158, "step": 49500 }, { "epoch": 63.624678663239074, "eval_accuracy": 0.806712037699331, "eval_loss": 0.8928523063659668, "eval_runtime": 33.5382, "eval_samples_per_second": 185.55, "eval_steps_per_second": 5.814, "step": 49500 }, { "epoch": 64.26735218508998, "grad_norm": 2.6932947635650635, "learning_rate": 5e-05, "loss": 0.711, "step": 50000 }, { "epoch": 64.26735218508998, "eval_accuracy": 0.8060159735552406, "eval_loss": 0.9013872742652893, "eval_runtime": 33.9301, "eval_samples_per_second": 183.407, "eval_steps_per_second": 5.747, "step": 50000 }, { "epoch": 64.91002570694087, "grad_norm": 2.7735486030578613, "learning_rate": 4.9500000000000004e-05, "loss": 0.7097, "step": 50500 }, { "epoch": 64.91002570694087, "eval_accuracy": 0.8067475980060699, "eval_loss": 0.8957981467247009, "eval_runtime": 32.2026, "eval_samples_per_second": 193.245, "eval_steps_per_second": 6.055, "step": 50500 }, { "epoch": 65.55269922879178, "grad_norm": 2.9156525135040283, "learning_rate": 4.9e-05, "loss": 0.7079, "step": 51000 }, { "epoch": 65.55269922879178, "eval_accuracy": 0.8081258363288117, "eval_loss": 0.894716739654541, "eval_runtime": 32.7636, "eval_samples_per_second": 189.936, "eval_steps_per_second": 5.952, "step": 51000 }, { "epoch": 66.19537275064268, "grad_norm": 2.6359803676605225, "learning_rate": 4.85e-05, "loss": 0.7091, "step": 51500 }, { "epoch": 66.19537275064268, "eval_accuracy": 0.8079719372684523, "eval_loss": 0.8968275189399719, "eval_runtime": 32.9482, "eval_samples_per_second": 188.872, "eval_steps_per_second": 5.918, "step": 51500 }, { "epoch": 66.83804627249357, "grad_norm": 2.481441020965576, "learning_rate": 4.8e-05, "loss": 0.7023, "step": 52000 }, { "epoch": 66.83804627249357, "eval_accuracy": 0.8089100382369007, "eval_loss": 0.8855522274971008, "eval_runtime": 33.2751, "eval_samples_per_second": 187.017, "eval_steps_per_second": 5.86, "step": 52000 }, { "epoch": 67.48071979434447, "grad_norm": 2.7268731594085693, "learning_rate": 4.75e-05, "loss": 0.701, "step": 52500 }, { "epoch": 67.48071979434447, "eval_accuracy": 0.8084944735097863, "eval_loss": 0.8864369988441467, "eval_runtime": 33.1906, "eval_samples_per_second": 187.493, "eval_steps_per_second": 5.875, "step": 52500 }, { "epoch": 68.12339331619538, "grad_norm": 2.5931150913238525, "learning_rate": 4.7e-05, "loss": 0.6988, "step": 53000 }, { "epoch": 68.12339331619538, "eval_accuracy": 0.8089927462608983, "eval_loss": 0.8866479992866516, "eval_runtime": 33.2058, "eval_samples_per_second": 187.407, "eval_steps_per_second": 5.872, "step": 53000 }, { "epoch": 68.76606683804627, "grad_norm": 2.736328601837158, "learning_rate": 4.6500000000000005e-05, "loss": 0.6994, "step": 53500 }, { "epoch": 68.76606683804627, "eval_accuracy": 0.8084927024019897, "eval_loss": 0.8820142149925232, "eval_runtime": 32.9508, "eval_samples_per_second": 188.857, "eval_steps_per_second": 5.918, "step": 53500 }, { "epoch": 69.40874035989717, "grad_norm": 2.8003087043762207, "learning_rate": 4.600000000000001e-05, "loss": 0.6831, "step": 54000 }, { "epoch": 69.40874035989717, "eval_accuracy": 0.8091836520937638, "eval_loss": 0.8856648206710815, "eval_runtime": 33.3733, "eval_samples_per_second": 186.466, "eval_steps_per_second": 5.843, "step": 54000 }, { "epoch": 70.05141388174808, "grad_norm": 2.7809290885925293, "learning_rate": 4.55e-05, "loss": 0.6948, "step": 54500 }, { "epoch": 70.05141388174808, "eval_accuracy": 0.8089959519265054, "eval_loss": 0.89335036277771, "eval_runtime": 33.324, "eval_samples_per_second": 186.743, "eval_steps_per_second": 5.852, "step": 54500 }, { "epoch": 70.69408740359897, "grad_norm": 2.5319223403930664, "learning_rate": 4.5e-05, "loss": 0.688, "step": 55000 }, { "epoch": 70.69408740359897, "eval_accuracy": 0.8102897851773414, "eval_loss": 0.8859269022941589, "eval_runtime": 33.0875, "eval_samples_per_second": 188.077, "eval_steps_per_second": 5.893, "step": 55000 }, { "epoch": 71.33676092544987, "grad_norm": 3.0281503200531006, "learning_rate": 4.4500000000000004e-05, "loss": 0.6894, "step": 55500 }, { "epoch": 71.33676092544987, "eval_accuracy": 0.8097594530479655, "eval_loss": 0.8875166773796082, "eval_runtime": 32.7248, "eval_samples_per_second": 190.161, "eval_steps_per_second": 5.959, "step": 55500 }, { "epoch": 71.97943444730078, "grad_norm": 2.476452589035034, "learning_rate": 4.4000000000000006e-05, "loss": 0.6855, "step": 56000 }, { "epoch": 71.97943444730078, "eval_accuracy": 0.8097316597458161, "eval_loss": 0.8884576559066772, "eval_runtime": 33.1154, "eval_samples_per_second": 187.919, "eval_steps_per_second": 5.889, "step": 56000 }, { "epoch": 72.62210796915167, "grad_norm": 2.662523031234741, "learning_rate": 4.35e-05, "loss": 0.6811, "step": 56500 }, { "epoch": 72.62210796915167, "eval_accuracy": 0.809297297864581, "eval_loss": 0.8833766579627991, "eval_runtime": 33.1424, "eval_samples_per_second": 187.766, "eval_steps_per_second": 5.884, "step": 56500 }, { "epoch": 73.26478149100257, "grad_norm": 2.4902071952819824, "learning_rate": 4.3e-05, "loss": 0.6782, "step": 57000 }, { "epoch": 73.26478149100257, "eval_accuracy": 0.8089903844893186, "eval_loss": 0.8910095691680908, "eval_runtime": 33.3324, "eval_samples_per_second": 186.695, "eval_steps_per_second": 5.85, "step": 57000 }, { "epoch": 73.90745501285348, "grad_norm": 2.897806167602539, "learning_rate": 4.25e-05, "loss": 0.6761, "step": 57500 }, { "epoch": 73.90745501285348, "eval_accuracy": 0.8115941649425137, "eval_loss": 0.8758607506752014, "eval_runtime": 33.305, "eval_samples_per_second": 186.849, "eval_steps_per_second": 5.855, "step": 57500 }, { "epoch": 74.55012853470437, "grad_norm": 2.6833012104034424, "learning_rate": 4.2e-05, "loss": 0.6713, "step": 58000 }, { "epoch": 74.55012853470437, "eval_accuracy": 0.8105924384594244, "eval_loss": 0.8842668533325195, "eval_runtime": 32.104, "eval_samples_per_second": 193.839, "eval_steps_per_second": 6.074, "step": 58000 }, { "epoch": 75.19280205655527, "grad_norm": 3.058183193206787, "learning_rate": 4.15e-05, "loss": 0.6751, "step": 58500 }, { "epoch": 75.19280205655527, "eval_accuracy": 0.8100959331379869, "eval_loss": 0.8922275304794312, "eval_runtime": 32.0158, "eval_samples_per_second": 194.373, "eval_steps_per_second": 6.091, "step": 58500 }, { "epoch": 75.83547557840618, "grad_norm": 2.5572476387023926, "learning_rate": 4.1e-05, "loss": 0.667, "step": 59000 }, { "epoch": 75.83547557840618, "eval_accuracy": 0.8115087833220307, "eval_loss": 0.8827089071273804, "eval_runtime": 32.068, "eval_samples_per_second": 194.056, "eval_steps_per_second": 6.081, "step": 59000 }, { "epoch": 76.47814910025707, "grad_norm": 2.6060643196105957, "learning_rate": 4.05e-05, "loss": 0.6628, "step": 59500 }, { "epoch": 76.47814910025707, "eval_accuracy": 0.8114595604343708, "eval_loss": 0.8741211891174316, "eval_runtime": 32.0679, "eval_samples_per_second": 194.057, "eval_steps_per_second": 6.081, "step": 59500 }, { "epoch": 77.12082262210797, "grad_norm": 2.643742322921753, "learning_rate": 4e-05, "loss": 0.6685, "step": 60000 }, { "epoch": 77.12082262210797, "eval_accuracy": 0.8120777359560294, "eval_loss": 0.8767244219779968, "eval_runtime": 32.012, "eval_samples_per_second": 194.396, "eval_steps_per_second": 6.091, "step": 60000 }, { "epoch": 77.76349614395887, "grad_norm": 3.0320351123809814, "learning_rate": 3.9500000000000005e-05, "loss": 0.6618, "step": 60500 }, { "epoch": 77.76349614395887, "eval_accuracy": 0.8108977044235925, "eval_loss": 0.882882833480835, "eval_runtime": 32.0023, "eval_samples_per_second": 194.455, "eval_steps_per_second": 6.093, "step": 60500 }, { "epoch": 78.40616966580977, "grad_norm": 2.645836114883423, "learning_rate": 3.9000000000000006e-05, "loss": 0.6626, "step": 61000 }, { "epoch": 78.40616966580977, "eval_accuracy": 0.8128617042506485, "eval_loss": 0.8741999268531799, "eval_runtime": 32.0443, "eval_samples_per_second": 194.2, "eval_steps_per_second": 6.085, "step": 61000 }, { "epoch": 79.04884318766067, "grad_norm": 2.858215570449829, "learning_rate": 3.85e-05, "loss": 0.6617, "step": 61500 }, { "epoch": 79.04884318766067, "eval_accuracy": 0.8136581954693509, "eval_loss": 0.8712067604064941, "eval_runtime": 32.061, "eval_samples_per_second": 194.099, "eval_steps_per_second": 6.082, "step": 61500 }, { "epoch": 79.69151670951157, "grad_norm": 2.5988807678222656, "learning_rate": 3.8e-05, "loss": 0.655, "step": 62000 }, { "epoch": 79.69151670951157, "eval_accuracy": 0.8128823572620334, "eval_loss": 0.8742080926895142, "eval_runtime": 32.0565, "eval_samples_per_second": 194.126, "eval_steps_per_second": 6.083, "step": 62000 }, { "epoch": 80.33419023136247, "grad_norm": 2.6401941776275635, "learning_rate": 3.7500000000000003e-05, "loss": 0.6545, "step": 62500 }, { "epoch": 80.33419023136247, "eval_accuracy": 0.812843042566724, "eval_loss": 0.8718605041503906, "eval_runtime": 31.9943, "eval_samples_per_second": 194.503, "eval_steps_per_second": 6.095, "step": 62500 }, { "epoch": 80.97686375321337, "grad_norm": 2.8062636852264404, "learning_rate": 3.7e-05, "loss": 0.655, "step": 63000 }, { "epoch": 80.97686375321337, "eval_accuracy": 0.811865863240631, "eval_loss": 0.8775948882102966, "eval_runtime": 32.0581, "eval_samples_per_second": 194.116, "eval_steps_per_second": 6.083, "step": 63000 }, { "epoch": 81.61953727506426, "grad_norm": 2.548220157623291, "learning_rate": 3.65e-05, "loss": 0.6468, "step": 63500 }, { "epoch": 81.61953727506426, "eval_accuracy": 0.8119920345795418, "eval_loss": 0.883579432964325, "eval_runtime": 32.0753, "eval_samples_per_second": 194.012, "eval_steps_per_second": 6.079, "step": 63500 }, { "epoch": 82.26221079691517, "grad_norm": 2.5861518383026123, "learning_rate": 3.6e-05, "loss": 0.6487, "step": 64000 }, { "epoch": 82.26221079691517, "eval_accuracy": 0.8131239363417762, "eval_loss": 0.8730902075767517, "eval_runtime": 32.1045, "eval_samples_per_second": 193.835, "eval_steps_per_second": 6.074, "step": 64000 }, { "epoch": 82.90488431876607, "grad_norm": 2.780217409133911, "learning_rate": 3.55e-05, "loss": 0.6495, "step": 64500 }, { "epoch": 82.90488431876607, "eval_accuracy": 0.8124547391400174, "eval_loss": 0.8819155097007751, "eval_runtime": 32.0183, "eval_samples_per_second": 194.357, "eval_steps_per_second": 6.09, "step": 64500 }, { "epoch": 83.54755784061696, "grad_norm": 2.5522782802581787, "learning_rate": 3.5e-05, "loss": 0.6425, "step": 65000 }, { "epoch": 83.54755784061696, "eval_accuracy": 0.8124381288595807, "eval_loss": 0.885295569896698, "eval_runtime": 32.0293, "eval_samples_per_second": 194.291, "eval_steps_per_second": 6.088, "step": 65000 }, { "epoch": 84.19023136246787, "grad_norm": 2.6785833835601807, "learning_rate": 3.45e-05, "loss": 0.6423, "step": 65500 }, { "epoch": 84.19023136246787, "eval_accuracy": 0.8132641437128263, "eval_loss": 0.8791692852973938, "eval_runtime": 32.0427, "eval_samples_per_second": 194.209, "eval_steps_per_second": 6.086, "step": 65500 }, { "epoch": 84.83290488431876, "grad_norm": 2.6489408016204834, "learning_rate": 3.4000000000000007e-05, "loss": 0.6377, "step": 66000 }, { "epoch": 84.83290488431876, "eval_accuracy": 0.8154822401853656, "eval_loss": 0.8657127618789673, "eval_runtime": 32.102, "eval_samples_per_second": 193.851, "eval_steps_per_second": 6.074, "step": 66000 }, { "epoch": 85.47557840616966, "grad_norm": 2.556199312210083, "learning_rate": 3.35e-05, "loss": 0.6334, "step": 66500 }, { "epoch": 85.47557840616966, "eval_accuracy": 0.8141076970581924, "eval_loss": 0.8747490644454956, "eval_runtime": 32.0636, "eval_samples_per_second": 194.083, "eval_steps_per_second": 6.082, "step": 66500 }, { "epoch": 86.11825192802057, "grad_norm": 2.477529764175415, "learning_rate": 3.3e-05, "loss": 0.6421, "step": 67000 }, { "epoch": 86.11825192802057, "eval_accuracy": 0.8136067662347515, "eval_loss": 0.8816725015640259, "eval_runtime": 32.0966, "eval_samples_per_second": 193.883, "eval_steps_per_second": 6.075, "step": 67000 }, { "epoch": 86.76092544987146, "grad_norm": 2.6774518489837646, "learning_rate": 3.2500000000000004e-05, "loss": 0.6366, "step": 67500 }, { "epoch": 86.76092544987146, "eval_accuracy": 0.814412672148727, "eval_loss": 0.8695808053016663, "eval_runtime": 32.0613, "eval_samples_per_second": 194.097, "eval_steps_per_second": 6.082, "step": 67500 }, { "epoch": 87.40359897172236, "grad_norm": 2.6455330848693848, "learning_rate": 3.2000000000000005e-05, "loss": 0.6305, "step": 68000 }, { "epoch": 87.40359897172236, "eval_accuracy": 0.8148675345064433, "eval_loss": 0.8716031312942505, "eval_runtime": 32.0969, "eval_samples_per_second": 193.882, "eval_steps_per_second": 6.075, "step": 68000 }, { "epoch": 88.04627249357327, "grad_norm": 2.7725086212158203, "learning_rate": 3.15e-05, "loss": 0.6325, "step": 68500 }, { "epoch": 88.04627249357327, "eval_accuracy": 0.813131776644422, "eval_loss": 0.8786645531654358, "eval_runtime": 32.024, "eval_samples_per_second": 194.323, "eval_steps_per_second": 6.089, "step": 68500 }, { "epoch": 88.68894601542416, "grad_norm": 2.6008856296539307, "learning_rate": 3.1e-05, "loss": 0.6282, "step": 69000 }, { "epoch": 88.68894601542416, "eval_accuracy": 0.8146181108967369, "eval_loss": 0.8678516149520874, "eval_runtime": 32.0037, "eval_samples_per_second": 194.447, "eval_steps_per_second": 6.093, "step": 69000 }, { "epoch": 89.33161953727506, "grad_norm": 2.7128331661224365, "learning_rate": 3.05e-05, "loss": 0.6261, "step": 69500 }, { "epoch": 89.33161953727506, "eval_accuracy": 0.8144260367762641, "eval_loss": 0.871059000492096, "eval_runtime": 32.0173, "eval_samples_per_second": 194.364, "eval_steps_per_second": 6.09, "step": 69500 }, { "epoch": 89.97429305912597, "grad_norm": 2.324878692626953, "learning_rate": 3e-05, "loss": 0.6263, "step": 70000 }, { "epoch": 89.97429305912597, "eval_accuracy": 0.8156161510225485, "eval_loss": 0.8654680252075195, "eval_runtime": 32.0295, "eval_samples_per_second": 194.29, "eval_steps_per_second": 6.088, "step": 70000 }, { "epoch": 90.61696658097686, "grad_norm": 2.7399654388427734, "learning_rate": 2.95e-05, "loss": 0.6233, "step": 70500 }, { "epoch": 90.61696658097686, "eval_accuracy": 0.8162172670337523, "eval_loss": 0.8661888241767883, "eval_runtime": 32.0203, "eval_samples_per_second": 194.346, "eval_steps_per_second": 6.09, "step": 70500 }, { "epoch": 91.25964010282776, "grad_norm": 2.8856687545776367, "learning_rate": 2.9e-05, "loss": 0.6176, "step": 71000 }, { "epoch": 91.25964010282776, "eval_accuracy": 0.8161184520753214, "eval_loss": 0.8683423399925232, "eval_runtime": 32.0579, "eval_samples_per_second": 194.117, "eval_steps_per_second": 6.083, "step": 71000 }, { "epoch": 91.90231362467867, "grad_norm": 2.4138548374176025, "learning_rate": 2.8499999999999998e-05, "loss": 0.6239, "step": 71500 }, { "epoch": 91.90231362467867, "eval_accuracy": 0.8153887598602306, "eval_loss": 0.8599680066108704, "eval_runtime": 32.0297, "eval_samples_per_second": 194.289, "eval_steps_per_second": 6.088, "step": 71500 }, { "epoch": 92.54498714652956, "grad_norm": 2.6343512535095215, "learning_rate": 2.8000000000000003e-05, "loss": 0.6189, "step": 72000 }, { "epoch": 92.54498714652956, "eval_accuracy": 0.8154344802246543, "eval_loss": 0.8673732876777649, "eval_runtime": 32.0317, "eval_samples_per_second": 194.276, "eval_steps_per_second": 6.088, "step": 72000 }, { "epoch": 93.18766066838046, "grad_norm": 2.6886157989501953, "learning_rate": 2.7500000000000004e-05, "loss": 0.6117, "step": 72500 }, { "epoch": 93.18766066838046, "eval_accuracy": 0.8162594874166894, "eval_loss": 0.8670706748962402, "eval_runtime": 32.0796, "eval_samples_per_second": 193.986, "eval_steps_per_second": 6.079, "step": 72500 }, { "epoch": 93.83033419023137, "grad_norm": 2.6037755012512207, "learning_rate": 2.7000000000000002e-05, "loss": 0.615, "step": 73000 }, { "epoch": 93.83033419023137, "eval_accuracy": 0.8150874471307802, "eval_loss": 0.8663885593414307, "eval_runtime": 32.0301, "eval_samples_per_second": 194.286, "eval_steps_per_second": 6.088, "step": 73000 }, { "epoch": 94.47300771208226, "grad_norm": 2.5215036869049072, "learning_rate": 2.6500000000000004e-05, "loss": 0.6138, "step": 73500 }, { "epoch": 94.47300771208226, "eval_accuracy": 0.8168731185953577, "eval_loss": 0.8630892634391785, "eval_runtime": 32.066, "eval_samples_per_second": 194.069, "eval_steps_per_second": 6.081, "step": 73500 }, { "epoch": 95.11568123393316, "grad_norm": 2.797325611114502, "learning_rate": 2.6000000000000002e-05, "loss": 0.6107, "step": 74000 }, { "epoch": 95.11568123393316, "eval_accuracy": 0.8156241335474724, "eval_loss": 0.8718482851982117, "eval_runtime": 32.0559, "eval_samples_per_second": 194.13, "eval_steps_per_second": 6.083, "step": 74000 }, { "epoch": 95.75835475578407, "grad_norm": 3.0883982181549072, "learning_rate": 2.5500000000000003e-05, "loss": 0.6099, "step": 74500 }, { "epoch": 95.75835475578407, "eval_accuracy": 0.8160560285491784, "eval_loss": 0.8676818013191223, "eval_runtime": 32.0504, "eval_samples_per_second": 194.163, "eval_steps_per_second": 6.084, "step": 74500 }, { "epoch": 96.40102827763496, "grad_norm": 2.922588348388672, "learning_rate": 2.5e-05, "loss": 0.6026, "step": 75000 }, { "epoch": 96.40102827763496, "eval_accuracy": 0.8162867029734353, "eval_loss": 0.8721068501472473, "eval_runtime": 32.019, "eval_samples_per_second": 194.353, "eval_steps_per_second": 6.09, "step": 75000 }, { "epoch": 97.04370179948586, "grad_norm": 2.663724660873413, "learning_rate": 2.45e-05, "loss": 0.6084, "step": 75500 }, { "epoch": 97.04370179948586, "eval_accuracy": 0.8182687960636296, "eval_loss": 0.858277440071106, "eval_runtime": 32.0344, "eval_samples_per_second": 194.26, "eval_steps_per_second": 6.087, "step": 75500 }, { "epoch": 97.68637532133675, "grad_norm": 2.8198440074920654, "learning_rate": 2.4e-05, "loss": 0.6033, "step": 76000 }, { "epoch": 97.68637532133675, "eval_accuracy": 0.8163839511643823, "eval_loss": 0.8688974380493164, "eval_runtime": 32.0629, "eval_samples_per_second": 194.087, "eval_steps_per_second": 6.082, "step": 76000 }, { "epoch": 98.32904884318766, "grad_norm": 2.3979711532592773, "learning_rate": 2.35e-05, "loss": 0.603, "step": 76500 }, { "epoch": 98.32904884318766, "eval_accuracy": 0.8169446192573946, "eval_loss": 0.8678939342498779, "eval_runtime": 32.0422, "eval_samples_per_second": 194.213, "eval_steps_per_second": 6.086, "step": 76500 }, { "epoch": 98.97172236503856, "grad_norm": 2.643324613571167, "learning_rate": 2.3000000000000003e-05, "loss": 0.6009, "step": 77000 }, { "epoch": 98.97172236503856, "eval_accuracy": 0.8176424081803114, "eval_loss": 0.8609100580215454, "eval_runtime": 32.0681, "eval_samples_per_second": 194.056, "eval_steps_per_second": 6.081, "step": 77000 }, { "epoch": 99.61439588688945, "grad_norm": 2.791355848312378, "learning_rate": 2.25e-05, "loss": 0.601, "step": 77500 }, { "epoch": 99.61439588688945, "eval_accuracy": 0.816467387038539, "eval_loss": 0.867489755153656, "eval_runtime": 32.1192, "eval_samples_per_second": 193.747, "eval_steps_per_second": 6.071, "step": 77500 }, { "epoch": 100.25706940874036, "grad_norm": 2.787290096282959, "learning_rate": 2.2000000000000003e-05, "loss": 0.5966, "step": 78000 }, { "epoch": 100.25706940874036, "eval_accuracy": 0.8171603328039044, "eval_loss": 0.8652510046958923, "eval_runtime": 32.0539, "eval_samples_per_second": 194.142, "eval_steps_per_second": 6.083, "step": 78000 }, { "epoch": 100.89974293059126, "grad_norm": 2.791646957397461, "learning_rate": 2.15e-05, "loss": 0.5935, "step": 78500 }, { "epoch": 100.89974293059126, "eval_accuracy": 0.8178409455191369, "eval_loss": 0.8600234985351562, "eval_runtime": 32.0149, "eval_samples_per_second": 194.378, "eval_steps_per_second": 6.091, "step": 78500 }, { "epoch": 101.54241645244215, "grad_norm": 2.4990010261535645, "learning_rate": 2.1e-05, "loss": 0.5912, "step": 79000 }, { "epoch": 101.54241645244215, "eval_accuracy": 0.8169466975666281, "eval_loss": 0.8653005957603455, "eval_runtime": 31.9972, "eval_samples_per_second": 194.486, "eval_steps_per_second": 6.094, "step": 79000 }, { "epoch": 102.18508997429306, "grad_norm": 2.3002467155456543, "learning_rate": 2.05e-05, "loss": 0.5873, "step": 79500 }, { "epoch": 102.18508997429306, "eval_accuracy": 0.81857837810817, "eval_loss": 0.8576123118400574, "eval_runtime": 31.9904, "eval_samples_per_second": 194.527, "eval_steps_per_second": 6.096, "step": 79500 }, { "epoch": 102.82776349614396, "grad_norm": 2.712336540222168, "learning_rate": 2e-05, "loss": 0.5913, "step": 80000 }, { "epoch": 102.82776349614396, "eval_accuracy": 0.8185828450767534, "eval_loss": 0.855664074420929, "eval_runtime": 32.0488, "eval_samples_per_second": 194.173, "eval_steps_per_second": 6.084, "step": 80000 }, { "epoch": 103.47043701799485, "grad_norm": 2.769562005996704, "learning_rate": 1.9500000000000003e-05, "loss": 0.5908, "step": 80500 }, { "epoch": 103.47043701799485, "eval_accuracy": 0.8184711461830915, "eval_loss": 0.8605988621711731, "eval_runtime": 32.0286, "eval_samples_per_second": 194.295, "eval_steps_per_second": 6.088, "step": 80500 }, { "epoch": 104.11311053984576, "grad_norm": 2.6929941177368164, "learning_rate": 1.9e-05, "loss": 0.5898, "step": 81000 }, { "epoch": 104.11311053984576, "eval_accuracy": 0.8183153166169729, "eval_loss": 0.8647379875183105, "eval_runtime": 32.0673, "eval_samples_per_second": 194.06, "eval_steps_per_second": 6.081, "step": 81000 }, { "epoch": 104.75578406169666, "grad_norm": 2.8258962631225586, "learning_rate": 1.85e-05, "loss": 0.5841, "step": 81500 }, { "epoch": 104.75578406169666, "eval_accuracy": 0.817774798787225, "eval_loss": 0.8651922941207886, "eval_runtime": 32.011, "eval_samples_per_second": 194.402, "eval_steps_per_second": 6.092, "step": 81500 }, { "epoch": 105.39845758354755, "grad_norm": 2.46081805229187, "learning_rate": 1.8e-05, "loss": 0.582, "step": 82000 }, { "epoch": 105.39845758354755, "eval_accuracy": 0.817711216701606, "eval_loss": 0.8619120717048645, "eval_runtime": 32.0215, "eval_samples_per_second": 194.338, "eval_steps_per_second": 6.09, "step": 82000 }, { "epoch": 106.04113110539846, "grad_norm": 2.960406541824341, "learning_rate": 1.75e-05, "loss": 0.5833, "step": 82500 }, { "epoch": 106.04113110539846, "eval_accuracy": 0.8198030559915176, "eval_loss": 0.8566803932189941, "eval_runtime": 31.9884, "eval_samples_per_second": 194.539, "eval_steps_per_second": 6.096, "step": 82500 }, { "epoch": 106.68380462724936, "grad_norm": 2.3195433616638184, "learning_rate": 1.7000000000000003e-05, "loss": 0.5804, "step": 83000 }, { "epoch": 106.68380462724936, "eval_accuracy": 0.8193473285144606, "eval_loss": 0.8546783328056335, "eval_runtime": 32.0522, "eval_samples_per_second": 194.152, "eval_steps_per_second": 6.084, "step": 83000 }, { "epoch": 107.32647814910025, "grad_norm": 2.7828962802886963, "learning_rate": 1.65e-05, "loss": 0.5839, "step": 83500 }, { "epoch": 107.32647814910025, "eval_accuracy": 0.817869892847695, "eval_loss": 0.859340488910675, "eval_runtime": 32.2415, "eval_samples_per_second": 193.012, "eval_steps_per_second": 6.048, "step": 83500 }, { "epoch": 107.96915167095116, "grad_norm": 2.8258416652679443, "learning_rate": 1.6000000000000003e-05, "loss": 0.5774, "step": 84000 }, { "epoch": 107.96915167095116, "eval_accuracy": 0.8181858662056681, "eval_loss": 0.8593936562538147, "eval_runtime": 32.0552, "eval_samples_per_second": 194.134, "eval_steps_per_second": 6.083, "step": 84000 }, { "epoch": 108.61182519280206, "grad_norm": 2.5473225116729736, "learning_rate": 1.55e-05, "loss": 0.5791, "step": 84500 }, { "epoch": 108.61182519280206, "eval_accuracy": 0.8201209689038143, "eval_loss": 0.8578335046768188, "eval_runtime": 32.0789, "eval_samples_per_second": 193.99, "eval_steps_per_second": 6.079, "step": 84500 }, { "epoch": 109.25449871465295, "grad_norm": 2.6014022827148438, "learning_rate": 1.5e-05, "loss": 0.5724, "step": 85000 }, { "epoch": 109.25449871465295, "eval_accuracy": 0.8179463406457561, "eval_loss": 0.8615684509277344, "eval_runtime": 32.0619, "eval_samples_per_second": 194.093, "eval_steps_per_second": 6.082, "step": 85000 }, { "epoch": 109.89717223650386, "grad_norm": 2.6044790744781494, "learning_rate": 1.45e-05, "loss": 0.5782, "step": 85500 }, { "epoch": 109.89717223650386, "eval_accuracy": 0.818407341792689, "eval_loss": 0.8595022559165955, "eval_runtime": 32.0145, "eval_samples_per_second": 194.381, "eval_steps_per_second": 6.091, "step": 85500 }, { "epoch": 110.53984575835476, "grad_norm": 2.570466995239258, "learning_rate": 1.4000000000000001e-05, "loss": 0.5735, "step": 86000 }, { "epoch": 110.53984575835476, "eval_accuracy": 0.8203102481461101, "eval_loss": 0.8511986136436462, "eval_runtime": 32.0544, "eval_samples_per_second": 194.139, "eval_steps_per_second": 6.083, "step": 86000 }, { "epoch": 111.18251928020565, "grad_norm": 2.4701082706451416, "learning_rate": 1.3500000000000001e-05, "loss": 0.5664, "step": 86500 }, { "epoch": 111.18251928020565, "eval_accuracy": 0.8206346877390821, "eval_loss": 0.8540939688682556, "eval_runtime": 32.046, "eval_samples_per_second": 194.19, "eval_steps_per_second": 6.085, "step": 86500 }, { "epoch": 111.82519280205655, "grad_norm": 2.9233691692352295, "learning_rate": 1.3000000000000001e-05, "loss": 0.5719, "step": 87000 }, { "epoch": 111.82519280205655, "eval_accuracy": 0.8200771187996747, "eval_loss": 0.8584414124488831, "eval_runtime": 32.0359, "eval_samples_per_second": 194.251, "eval_steps_per_second": 6.087, "step": 87000 }, { "epoch": 112.46786632390746, "grad_norm": 2.3986382484436035, "learning_rate": 1.25e-05, "loss": 0.5682, "step": 87500 }, { "epoch": 112.46786632390746, "eval_accuracy": 0.8193907912255869, "eval_loss": 0.8565849661827087, "eval_runtime": 32.0525, "eval_samples_per_second": 194.15, "eval_steps_per_second": 6.084, "step": 87500 }, { "epoch": 113.11053984575835, "grad_norm": 2.5226054191589355, "learning_rate": 1.2e-05, "loss": 0.5719, "step": 88000 }, { "epoch": 113.11053984575835, "eval_accuracy": 0.8199272051177008, "eval_loss": 0.8500083684921265, "eval_runtime": 32.0889, "eval_samples_per_second": 193.93, "eval_steps_per_second": 6.077, "step": 88000 }, { "epoch": 113.75321336760925, "grad_norm": 2.609858274459839, "learning_rate": 1.1500000000000002e-05, "loss": 0.5674, "step": 88500 }, { "epoch": 113.75321336760925, "eval_accuracy": 0.8205847363486674, "eval_loss": 0.8524363040924072, "eval_runtime": 32.0335, "eval_samples_per_second": 194.265, "eval_steps_per_second": 6.087, "step": 88500 }, { "epoch": 114.39588688946016, "grad_norm": 2.2834887504577637, "learning_rate": 1.1000000000000001e-05, "loss": 0.5643, "step": 89000 }, { "epoch": 114.39588688946016, "eval_accuracy": 0.821479995279122, "eval_loss": 0.8454416990280151, "eval_runtime": 32.0112, "eval_samples_per_second": 194.401, "eval_steps_per_second": 6.092, "step": 89000 }, { "epoch": 115.03856041131105, "grad_norm": 2.776373863220215, "learning_rate": 1.05e-05, "loss": 0.5681, "step": 89500 }, { "epoch": 115.03856041131105, "eval_accuracy": 0.8196546091505466, "eval_loss": 0.8588145971298218, "eval_runtime": 31.2312, "eval_samples_per_second": 199.256, "eval_steps_per_second": 6.244, "step": 89500 }, { "epoch": 115.68123393316195, "grad_norm": 2.7792084217071533, "learning_rate": 1e-05, "loss": 0.563, "step": 90000 }, { "epoch": 115.68123393316195, "eval_accuracy": 0.8215367161011942, "eval_loss": 0.8472273945808411, "eval_runtime": 32.0865, "eval_samples_per_second": 193.944, "eval_steps_per_second": 6.077, "step": 90000 }, { "epoch": 116.32390745501286, "grad_norm": 2.5021698474884033, "learning_rate": 9.5e-06, "loss": 0.5563, "step": 90500 }, { "epoch": 116.32390745501286, "eval_accuracy": 0.8198419059293134, "eval_loss": 0.8576837778091431, "eval_runtime": 32.0421, "eval_samples_per_second": 194.214, "eval_steps_per_second": 6.086, "step": 90500 }, { "epoch": 116.96658097686375, "grad_norm": 2.7690136432647705, "learning_rate": 9e-06, "loss": 0.5667, "step": 91000 }, { "epoch": 116.96658097686375, "eval_accuracy": 0.8201571685631533, "eval_loss": 0.8592654466629028, "eval_runtime": 32.0402, "eval_samples_per_second": 194.225, "eval_steps_per_second": 6.086, "step": 91000 }, { "epoch": 117.60925449871465, "grad_norm": 2.7082719802856445, "learning_rate": 8.500000000000002e-06, "loss": 0.5598, "step": 91500 }, { "epoch": 117.60925449871465, "eval_accuracy": 0.8194785991809854, "eval_loss": 0.8624646067619324, "eval_runtime": 31.9751, "eval_samples_per_second": 194.62, "eval_steps_per_second": 6.098, "step": 91500 }, { "epoch": 118.25192802056556, "grad_norm": 2.6081254482269287, "learning_rate": 8.000000000000001e-06, "loss": 0.5588, "step": 92000 }, { "epoch": 118.25192802056556, "eval_accuracy": 0.8200990472992823, "eval_loss": 0.8558657169342041, "eval_runtime": 32.0709, "eval_samples_per_second": 194.039, "eval_steps_per_second": 6.08, "step": 92000 }, { "epoch": 118.89460154241645, "grad_norm": 2.39003849029541, "learning_rate": 7.5e-06, "loss": 0.5573, "step": 92500 }, { "epoch": 118.89460154241645, "eval_accuracy": 0.821544695178437, "eval_loss": 0.8529016375541687, "eval_runtime": 32.0688, "eval_samples_per_second": 194.052, "eval_steps_per_second": 6.081, "step": 92500 }, { "epoch": 119.53727506426735, "grad_norm": 2.671393871307373, "learning_rate": 7.000000000000001e-06, "loss": 0.5583, "step": 93000 }, { "epoch": 119.53727506426735, "eval_accuracy": 0.8202954907050746, "eval_loss": 0.8513576984405518, "eval_runtime": 32.0535, "eval_samples_per_second": 194.144, "eval_steps_per_second": 6.084, "step": 93000 }, { "epoch": 120.17994858611826, "grad_norm": 2.8136119842529297, "learning_rate": 6.5000000000000004e-06, "loss": 0.5599, "step": 93500 }, { "epoch": 120.17994858611826, "eval_accuracy": 0.8210522728285633, "eval_loss": 0.8533715605735779, "eval_runtime": 32.0108, "eval_samples_per_second": 194.403, "eval_steps_per_second": 6.092, "step": 93500 }, { "epoch": 120.82262210796915, "grad_norm": 2.5622072219848633, "learning_rate": 6e-06, "loss": 0.5538, "step": 94000 }, { "epoch": 120.82262210796915, "eval_accuracy": 0.8210836727544613, "eval_loss": 0.8567976951599121, "eval_runtime": 32.0891, "eval_samples_per_second": 193.929, "eval_steps_per_second": 6.077, "step": 94000 }, { "epoch": 121.46529562982005, "grad_norm": 2.870851516723633, "learning_rate": 5.500000000000001e-06, "loss": 0.5544, "step": 94500 }, { "epoch": 121.46529562982005, "eval_accuracy": 0.8227272131920523, "eval_loss": 0.8419870138168335, "eval_runtime": 32.0445, "eval_samples_per_second": 194.199, "eval_steps_per_second": 6.085, "step": 94500 } ], "logging_steps": 500, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 129, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.979699566574305e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }