diff --git "a/checkpoint-99500/trainer_state.json" "b/checkpoint-99500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-99500/trainer_state.json" @@ -0,0 +1,13964 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.99, + "eval_steps": 500, + "global_step": 99500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001, + "grad_norm": 2.5878946781158447, + "learning_rate": 4.99755e-05, + "loss": 9.5648, + "step": 50 + }, + { + "epoch": 0.002, + "grad_norm": 2.326439619064331, + "learning_rate": 4.99505e-05, + "loss": 8.7873, + "step": 100 + }, + { + "epoch": 0.003, + "grad_norm": 2.3391146659851074, + "learning_rate": 4.99255e-05, + "loss": 8.1288, + "step": 150 + }, + { + "epoch": 0.004, + "grad_norm": 2.398390531539917, + "learning_rate": 4.99005e-05, + "loss": 7.5247, + "step": 200 + }, + { + "epoch": 0.005, + "grad_norm": 2.4866814613342285, + "learning_rate": 4.98755e-05, + "loss": 7.0484, + "step": 250 + }, + { + "epoch": 0.006, + "grad_norm": 2.6657838821411133, + "learning_rate": 4.98505e-05, + "loss": 6.7936, + "step": 300 + }, + { + "epoch": 0.007, + "grad_norm": 2.2808291912078857, + "learning_rate": 4.9825500000000006e-05, + "loss": 6.5912, + "step": 350 + }, + { + "epoch": 0.008, + "grad_norm": 2.881570816040039, + "learning_rate": 4.98005e-05, + "loss": 6.5094, + "step": 400 + }, + { + "epoch": 0.009, + "grad_norm": 2.216550588607788, + "learning_rate": 4.9775500000000005e-05, + "loss": 6.4908, + "step": 450 + }, + { + "epoch": 0.01, + "grad_norm": 2.7933621406555176, + "learning_rate": 4.97505e-05, + "loss": 6.3012, + "step": 500 + }, + { + "epoch": 0.011, + "grad_norm": 2.3578381538391113, + "learning_rate": 4.9725500000000004e-05, + "loss": 6.2162, + "step": 550 + }, + { + "epoch": 0.012, + "grad_norm": 2.922057867050171, + "learning_rate": 4.97005e-05, + "loss": 6.1724, + "step": 600 + }, + { + "epoch": 0.013, + "grad_norm": 3.2069950103759766, + "learning_rate": 4.96755e-05, + "loss": 6.0117, + "step": 650 + }, + { + "epoch": 0.014, + "grad_norm": 2.830580234527588, + "learning_rate": 4.96505e-05, + "loss": 6.2267, + "step": 700 + }, + { + "epoch": 0.015, + "grad_norm": 3.050321102142334, + "learning_rate": 4.96255e-05, + "loss": 6.0236, + "step": 750 + }, + { + "epoch": 0.016, + "grad_norm": 3.1756136417388916, + "learning_rate": 4.96005e-05, + "loss": 6.0656, + "step": 800 + }, + { + "epoch": 0.017, + "grad_norm": 2.805255889892578, + "learning_rate": 4.95755e-05, + "loss": 6.0215, + "step": 850 + }, + { + "epoch": 0.018, + "grad_norm": 3.590785026550293, + "learning_rate": 4.95505e-05, + "loss": 6.0075, + "step": 900 + }, + { + "epoch": 0.019, + "grad_norm": 3.177901268005371, + "learning_rate": 4.9525500000000005e-05, + "loss": 5.8187, + "step": 950 + }, + { + "epoch": 0.02, + "grad_norm": 2.7860662937164307, + "learning_rate": 4.95005e-05, + "loss": 5.9114, + "step": 1000 + }, + { + "epoch": 0.021, + "grad_norm": 2.8016605377197266, + "learning_rate": 4.9475500000000004e-05, + "loss": 5.8214, + "step": 1050 + }, + { + "epoch": 0.022, + "grad_norm": 3.046123743057251, + "learning_rate": 4.94505e-05, + "loss": 5.9951, + "step": 1100 + }, + { + "epoch": 0.023, + "grad_norm": 3.1473944187164307, + "learning_rate": 4.94255e-05, + "loss": 5.8583, + "step": 1150 + }, + { + "epoch": 0.024, + "grad_norm": 3.6134166717529297, + "learning_rate": 4.94005e-05, + "loss": 5.8007, + "step": 1200 + }, + { + "epoch": 0.025, + "grad_norm": 3.606462001800537, + "learning_rate": 4.93755e-05, + "loss": 5.668, + "step": 1250 + }, + { + "epoch": 0.026, + "grad_norm": 3.616032838821411, + "learning_rate": 4.935050000000001e-05, + "loss": 5.7571, + "step": 1300 + }, + { + "epoch": 0.027, + "grad_norm": 5.09005069732666, + "learning_rate": 4.93255e-05, + "loss": 5.6571, + "step": 1350 + }, + { + "epoch": 0.028, + "grad_norm": 3.366626501083374, + "learning_rate": 4.930050000000001e-05, + "loss": 5.7309, + "step": 1400 + }, + { + "epoch": 0.029, + "grad_norm": 3.0361785888671875, + "learning_rate": 4.92755e-05, + "loss": 5.7198, + "step": 1450 + }, + { + "epoch": 0.03, + "grad_norm": 3.8341948986053467, + "learning_rate": 4.9250500000000006e-05, + "loss": 5.6426, + "step": 1500 + }, + { + "epoch": 0.031, + "grad_norm": 2.939432382583618, + "learning_rate": 4.9225500000000005e-05, + "loss": 5.5296, + "step": 1550 + }, + { + "epoch": 0.032, + "grad_norm": 5.241086483001709, + "learning_rate": 4.9200500000000004e-05, + "loss": 5.5357, + "step": 1600 + }, + { + "epoch": 0.033, + "grad_norm": 4.518623352050781, + "learning_rate": 4.9175500000000004e-05, + "loss": 5.676, + "step": 1650 + }, + { + "epoch": 0.034, + "grad_norm": 2.8882062435150146, + "learning_rate": 4.91505e-05, + "loss": 5.6926, + "step": 1700 + }, + { + "epoch": 0.035, + "grad_norm": 3.0241808891296387, + "learning_rate": 4.91255e-05, + "loss": 5.5071, + "step": 1750 + }, + { + "epoch": 0.036, + "grad_norm": 3.798578977584839, + "learning_rate": 4.91005e-05, + "loss": 5.3687, + "step": 1800 + }, + { + "epoch": 0.037, + "grad_norm": 3.716325044631958, + "learning_rate": 4.90755e-05, + "loss": 5.5141, + "step": 1850 + }, + { + "epoch": 0.038, + "grad_norm": 3.375710964202881, + "learning_rate": 4.905050000000001e-05, + "loss": 5.5613, + "step": 1900 + }, + { + "epoch": 0.039, + "grad_norm": 3.86002516746521, + "learning_rate": 4.90255e-05, + "loss": 5.4685, + "step": 1950 + }, + { + "epoch": 0.04, + "grad_norm": 4.336085796356201, + "learning_rate": 4.9000500000000006e-05, + "loss": 5.4872, + "step": 2000 + }, + { + "epoch": 0.041, + "grad_norm": 3.3040659427642822, + "learning_rate": 4.89755e-05, + "loss": 5.55, + "step": 2050 + }, + { + "epoch": 0.042, + "grad_norm": 4.6140313148498535, + "learning_rate": 4.8950500000000005e-05, + "loss": 5.4541, + "step": 2100 + }, + { + "epoch": 0.043, + "grad_norm": 3.5962140560150146, + "learning_rate": 4.8926e-05, + "loss": 5.503, + "step": 2150 + }, + { + "epoch": 0.044, + "grad_norm": 3.8882644176483154, + "learning_rate": 4.8901000000000005e-05, + "loss": 5.4016, + "step": 2200 + }, + { + "epoch": 0.045, + "grad_norm": 3.2127366065979004, + "learning_rate": 4.8876e-05, + "loss": 5.3505, + "step": 2250 + }, + { + "epoch": 0.046, + "grad_norm": 3.812075614929199, + "learning_rate": 4.8851000000000004e-05, + "loss": 5.308, + "step": 2300 + }, + { + "epoch": 0.047, + "grad_norm": 4.519869804382324, + "learning_rate": 4.8826e-05, + "loss": 5.426, + "step": 2350 + }, + { + "epoch": 0.048, + "grad_norm": 3.7040762901306152, + "learning_rate": 4.8801e-05, + "loss": 5.3255, + "step": 2400 + }, + { + "epoch": 0.049, + "grad_norm": 3.347390651702881, + "learning_rate": 4.8776e-05, + "loss": 5.3165, + "step": 2450 + }, + { + "epoch": 0.05, + "grad_norm": 3.786517858505249, + "learning_rate": 4.8751e-05, + "loss": 5.2374, + "step": 2500 + }, + { + "epoch": 0.051, + "grad_norm": 4.848645210266113, + "learning_rate": 4.872600000000001e-05, + "loss": 5.2179, + "step": 2550 + }, + { + "epoch": 0.052, + "grad_norm": 3.5423243045806885, + "learning_rate": 4.8701e-05, + "loss": 5.2099, + "step": 2600 + }, + { + "epoch": 0.053, + "grad_norm": 3.162454128265381, + "learning_rate": 4.8676000000000006e-05, + "loss": 5.2194, + "step": 2650 + }, + { + "epoch": 0.054, + "grad_norm": 4.749024868011475, + "learning_rate": 4.8651e-05, + "loss": 5.3041, + "step": 2700 + }, + { + "epoch": 0.055, + "grad_norm": 4.379371643066406, + "learning_rate": 4.8626000000000005e-05, + "loss": 5.3414, + "step": 2750 + }, + { + "epoch": 0.056, + "grad_norm": 4.969504356384277, + "learning_rate": 4.8601000000000004e-05, + "loss": 5.3592, + "step": 2800 + }, + { + "epoch": 0.057, + "grad_norm": 3.9636573791503906, + "learning_rate": 4.8576000000000004e-05, + "loss": 5.0791, + "step": 2850 + }, + { + "epoch": 0.058, + "grad_norm": 4.6978631019592285, + "learning_rate": 4.8551e-05, + "loss": 5.2218, + "step": 2900 + }, + { + "epoch": 0.059, + "grad_norm": 3.546217203140259, + "learning_rate": 4.8526e-05, + "loss": 5.1096, + "step": 2950 + }, + { + "epoch": 0.06, + "grad_norm": 3.556810140609741, + "learning_rate": 4.8501e-05, + "loss": 5.2164, + "step": 3000 + }, + { + "epoch": 0.061, + "grad_norm": 4.784888744354248, + "learning_rate": 4.8476e-05, + "loss": 5.1909, + "step": 3050 + }, + { + "epoch": 0.062, + "grad_norm": 3.073183059692383, + "learning_rate": 4.8451e-05, + "loss": 5.0549, + "step": 3100 + }, + { + "epoch": 0.063, + "grad_norm": 4.279170513153076, + "learning_rate": 4.842600000000001e-05, + "loss": 5.2979, + "step": 3150 + }, + { + "epoch": 0.064, + "grad_norm": 3.607466220855713, + "learning_rate": 4.8401e-05, + "loss": 4.9936, + "step": 3200 + }, + { + "epoch": 0.065, + "grad_norm": 4.333967685699463, + "learning_rate": 4.8376000000000006e-05, + "loss": 5.1944, + "step": 3250 + }, + { + "epoch": 0.066, + "grad_norm": 4.101257801055908, + "learning_rate": 4.8351000000000005e-05, + "loss": 5.1698, + "step": 3300 + }, + { + "epoch": 0.067, + "grad_norm": 5.113340377807617, + "learning_rate": 4.8326000000000005e-05, + "loss": 4.9471, + "step": 3350 + }, + { + "epoch": 0.068, + "grad_norm": 4.2900614738464355, + "learning_rate": 4.8301000000000004e-05, + "loss": 5.0118, + "step": 3400 + }, + { + "epoch": 0.069, + "grad_norm": 4.718325138092041, + "learning_rate": 4.8276e-05, + "loss": 4.983, + "step": 3450 + }, + { + "epoch": 0.07, + "grad_norm": 4.2561187744140625, + "learning_rate": 4.8251e-05, + "loss": 5.0226, + "step": 3500 + }, + { + "epoch": 0.071, + "grad_norm": 4.311369895935059, + "learning_rate": 4.8226e-05, + "loss": 4.7971, + "step": 3550 + }, + { + "epoch": 0.072, + "grad_norm": 3.74359130859375, + "learning_rate": 4.8201e-05, + "loss": 4.8312, + "step": 3600 + }, + { + "epoch": 0.073, + "grad_norm": 4.252142429351807, + "learning_rate": 4.817600000000001e-05, + "loss": 4.9795, + "step": 3650 + }, + { + "epoch": 0.074, + "grad_norm": 6.2820024490356445, + "learning_rate": 4.8151e-05, + "loss": 5.2044, + "step": 3700 + }, + { + "epoch": 0.075, + "grad_norm": 4.392597198486328, + "learning_rate": 4.8126000000000006e-05, + "loss": 5.0104, + "step": 3750 + }, + { + "epoch": 0.076, + "grad_norm": 3.9632086753845215, + "learning_rate": 4.8101e-05, + "loss": 4.9215, + "step": 3800 + }, + { + "epoch": 0.077, + "grad_norm": 3.8012332916259766, + "learning_rate": 4.8076000000000005e-05, + "loss": 5.1204, + "step": 3850 + }, + { + "epoch": 0.078, + "grad_norm": 3.9559926986694336, + "learning_rate": 4.8051000000000005e-05, + "loss": 4.9711, + "step": 3900 + }, + { + "epoch": 0.079, + "grad_norm": 4.333497047424316, + "learning_rate": 4.8026000000000004e-05, + "loss": 4.9412, + "step": 3950 + }, + { + "epoch": 0.08, + "grad_norm": 4.0923027992248535, + "learning_rate": 4.8001000000000003e-05, + "loss": 5.0554, + "step": 4000 + }, + { + "epoch": 0.081, + "grad_norm": 5.284310340881348, + "learning_rate": 4.7976e-05, + "loss": 5.008, + "step": 4050 + }, + { + "epoch": 0.082, + "grad_norm": 4.0718793869018555, + "learning_rate": 4.7951e-05, + "loss": 4.8831, + "step": 4100 + }, + { + "epoch": 0.083, + "grad_norm": 4.595408916473389, + "learning_rate": 4.7926e-05, + "loss": 4.8901, + "step": 4150 + }, + { + "epoch": 0.084, + "grad_norm": 4.26055383682251, + "learning_rate": 4.79015e-05, + "loss": 4.9658, + "step": 4200 + }, + { + "epoch": 0.085, + "grad_norm": 5.186002731323242, + "learning_rate": 4.78765e-05, + "loss": 4.8566, + "step": 4250 + }, + { + "epoch": 0.086, + "grad_norm": 4.462579250335693, + "learning_rate": 4.785150000000001e-05, + "loss": 4.7892, + "step": 4300 + }, + { + "epoch": 0.087, + "grad_norm": 5.473343849182129, + "learning_rate": 4.78265e-05, + "loss": 5.0057, + "step": 4350 + }, + { + "epoch": 0.088, + "grad_norm": 6.329896450042725, + "learning_rate": 4.7801500000000006e-05, + "loss": 4.841, + "step": 4400 + }, + { + "epoch": 0.089, + "grad_norm": 4.852380752563477, + "learning_rate": 4.77765e-05, + "loss": 4.7658, + "step": 4450 + }, + { + "epoch": 0.09, + "grad_norm": 4.577455520629883, + "learning_rate": 4.7751500000000005e-05, + "loss": 4.9862, + "step": 4500 + }, + { + "epoch": 0.091, + "grad_norm": 4.643036365509033, + "learning_rate": 4.7726500000000005e-05, + "loss": 4.8218, + "step": 4550 + }, + { + "epoch": 0.092, + "grad_norm": 4.697981834411621, + "learning_rate": 4.7701500000000004e-05, + "loss": 4.8586, + "step": 4600 + }, + { + "epoch": 0.093, + "grad_norm": 4.388003826141357, + "learning_rate": 4.76765e-05, + "loss": 4.9039, + "step": 4650 + }, + { + "epoch": 0.094, + "grad_norm": 4.59213399887085, + "learning_rate": 4.76515e-05, + "loss": 4.8719, + "step": 4700 + }, + { + "epoch": 0.095, + "grad_norm": 3.707645893096924, + "learning_rate": 4.76265e-05, + "loss": 4.7647, + "step": 4750 + }, + { + "epoch": 0.096, + "grad_norm": 4.055088520050049, + "learning_rate": 4.76015e-05, + "loss": 4.9086, + "step": 4800 + }, + { + "epoch": 0.097, + "grad_norm": 3.311262845993042, + "learning_rate": 4.75765e-05, + "loss": 4.8342, + "step": 4850 + }, + { + "epoch": 0.098, + "grad_norm": 4.882483005523682, + "learning_rate": 4.755150000000001e-05, + "loss": 4.8363, + "step": 4900 + }, + { + "epoch": 0.099, + "grad_norm": 4.083950042724609, + "learning_rate": 4.75265e-05, + "loss": 4.8207, + "step": 4950 + }, + { + "epoch": 0.1, + "grad_norm": 4.274576187133789, + "learning_rate": 4.7501500000000006e-05, + "loss": 4.7845, + "step": 5000 + }, + { + "epoch": 0.101, + "grad_norm": 4.489608287811279, + "learning_rate": 4.74765e-05, + "loss": 4.705, + "step": 5050 + }, + { + "epoch": 0.102, + "grad_norm": 4.044312953948975, + "learning_rate": 4.7451500000000005e-05, + "loss": 4.6666, + "step": 5100 + }, + { + "epoch": 0.103, + "grad_norm": 4.765976905822754, + "learning_rate": 4.7426500000000004e-05, + "loss": 4.9063, + "step": 5150 + }, + { + "epoch": 0.104, + "grad_norm": 3.8264522552490234, + "learning_rate": 4.7401500000000003e-05, + "loss": 4.6931, + "step": 5200 + }, + { + "epoch": 0.105, + "grad_norm": 4.612431049346924, + "learning_rate": 4.73765e-05, + "loss": 4.7997, + "step": 5250 + }, + { + "epoch": 0.106, + "grad_norm": 3.871309280395508, + "learning_rate": 4.73515e-05, + "loss": 4.8181, + "step": 5300 + }, + { + "epoch": 0.107, + "grad_norm": 5.149219512939453, + "learning_rate": 4.73265e-05, + "loss": 4.7243, + "step": 5350 + }, + { + "epoch": 0.108, + "grad_norm": 5.467377185821533, + "learning_rate": 4.73015e-05, + "loss": 4.743, + "step": 5400 + }, + { + "epoch": 0.109, + "grad_norm": 4.169463157653809, + "learning_rate": 4.72765e-05, + "loss": 4.6821, + "step": 5450 + }, + { + "epoch": 0.11, + "grad_norm": 4.368409633636475, + "learning_rate": 4.7251500000000007e-05, + "loss": 4.6557, + "step": 5500 + }, + { + "epoch": 0.111, + "grad_norm": 4.478771209716797, + "learning_rate": 4.72265e-05, + "loss": 4.7211, + "step": 5550 + }, + { + "epoch": 0.112, + "grad_norm": 4.097254753112793, + "learning_rate": 4.7201500000000005e-05, + "loss": 4.5969, + "step": 5600 + }, + { + "epoch": 0.113, + "grad_norm": 5.199203014373779, + "learning_rate": 4.71765e-05, + "loss": 4.6524, + "step": 5650 + }, + { + "epoch": 0.114, + "grad_norm": 4.438806056976318, + "learning_rate": 4.7151500000000004e-05, + "loss": 4.8195, + "step": 5700 + }, + { + "epoch": 0.115, + "grad_norm": 3.9345571994781494, + "learning_rate": 4.7126500000000004e-05, + "loss": 4.727, + "step": 5750 + }, + { + "epoch": 0.116, + "grad_norm": 4.215692520141602, + "learning_rate": 4.71015e-05, + "loss": 4.6921, + "step": 5800 + }, + { + "epoch": 0.117, + "grad_norm": 4.601874828338623, + "learning_rate": 4.70765e-05, + "loss": 4.6495, + "step": 5850 + }, + { + "epoch": 0.118, + "grad_norm": 4.633566856384277, + "learning_rate": 4.70515e-05, + "loss": 4.698, + "step": 5900 + }, + { + "epoch": 0.119, + "grad_norm": 4.730185031890869, + "learning_rate": 4.70265e-05, + "loss": 4.4599, + "step": 5950 + }, + { + "epoch": 0.12, + "grad_norm": 4.261397838592529, + "learning_rate": 4.70015e-05, + "loss": 4.6781, + "step": 6000 + }, + { + "epoch": 0.121, + "grad_norm": 4.527892112731934, + "learning_rate": 4.69765e-05, + "loss": 4.6323, + "step": 6050 + }, + { + "epoch": 0.122, + "grad_norm": 4.5726237297058105, + "learning_rate": 4.6951500000000006e-05, + "loss": 4.6572, + "step": 6100 + }, + { + "epoch": 0.123, + "grad_norm": 6.083981037139893, + "learning_rate": 4.69265e-05, + "loss": 4.709, + "step": 6150 + }, + { + "epoch": 0.124, + "grad_norm": 6.871278762817383, + "learning_rate": 4.6901500000000005e-05, + "loss": 4.5572, + "step": 6200 + }, + { + "epoch": 0.125, + "grad_norm": 5.033217430114746, + "learning_rate": 4.6876500000000004e-05, + "loss": 4.8283, + "step": 6250 + }, + { + "epoch": 0.126, + "grad_norm": 5.237866401672363, + "learning_rate": 4.6851500000000004e-05, + "loss": 4.6801, + "step": 6300 + }, + { + "epoch": 0.127, + "grad_norm": 4.246436595916748, + "learning_rate": 4.68265e-05, + "loss": 4.3712, + "step": 6350 + }, + { + "epoch": 0.128, + "grad_norm": 4.287390232086182, + "learning_rate": 4.68015e-05, + "loss": 4.549, + "step": 6400 + }, + { + "epoch": 0.129, + "grad_norm": 5.145524978637695, + "learning_rate": 4.67765e-05, + "loss": 4.6266, + "step": 6450 + }, + { + "epoch": 0.13, + "grad_norm": 5.119638919830322, + "learning_rate": 4.67515e-05, + "loss": 4.5634, + "step": 6500 + }, + { + "epoch": 0.131, + "grad_norm": 5.78672981262207, + "learning_rate": 4.67265e-05, + "loss": 4.402, + "step": 6550 + }, + { + "epoch": 0.132, + "grad_norm": 4.446723461151123, + "learning_rate": 4.670150000000001e-05, + "loss": 4.425, + "step": 6600 + }, + { + "epoch": 0.133, + "grad_norm": 6.311605453491211, + "learning_rate": 4.667700000000001e-05, + "loss": 4.6741, + "step": 6650 + }, + { + "epoch": 0.134, + "grad_norm": 4.3736772537231445, + "learning_rate": 4.6652e-05, + "loss": 4.532, + "step": 6700 + }, + { + "epoch": 0.135, + "grad_norm": 4.863807678222656, + "learning_rate": 4.6627000000000006e-05, + "loss": 4.3248, + "step": 6750 + }, + { + "epoch": 0.136, + "grad_norm": 4.457269668579102, + "learning_rate": 4.6602e-05, + "loss": 4.498, + "step": 6800 + }, + { + "epoch": 0.137, + "grad_norm": 5.396666526794434, + "learning_rate": 4.6577000000000005e-05, + "loss": 4.4462, + "step": 6850 + }, + { + "epoch": 0.138, + "grad_norm": 4.7764153480529785, + "learning_rate": 4.6552000000000004e-05, + "loss": 4.6494, + "step": 6900 + }, + { + "epoch": 0.139, + "grad_norm": 3.6046454906463623, + "learning_rate": 4.6527000000000004e-05, + "loss": 4.5615, + "step": 6950 + }, + { + "epoch": 0.14, + "grad_norm": 5.655783653259277, + "learning_rate": 4.6502e-05, + "loss": 4.6513, + "step": 7000 + }, + { + "epoch": 0.141, + "grad_norm": 4.3254218101501465, + "learning_rate": 4.6477e-05, + "loss": 4.66, + "step": 7050 + }, + { + "epoch": 0.142, + "grad_norm": 4.6337785720825195, + "learning_rate": 4.6452e-05, + "loss": 4.6969, + "step": 7100 + }, + { + "epoch": 0.143, + "grad_norm": 4.757678985595703, + "learning_rate": 4.6427e-05, + "loss": 4.5222, + "step": 7150 + }, + { + "epoch": 0.144, + "grad_norm": 5.13063907623291, + "learning_rate": 4.6402e-05, + "loss": 4.6098, + "step": 7200 + }, + { + "epoch": 0.145, + "grad_norm": 4.38551664352417, + "learning_rate": 4.637700000000001e-05, + "loss": 4.3308, + "step": 7250 + }, + { + "epoch": 0.146, + "grad_norm": 5.627738952636719, + "learning_rate": 4.6352e-05, + "loss": 4.3211, + "step": 7300 + }, + { + "epoch": 0.147, + "grad_norm": 5.648825168609619, + "learning_rate": 4.6327000000000006e-05, + "loss": 4.5699, + "step": 7350 + }, + { + "epoch": 0.148, + "grad_norm": 4.3480072021484375, + "learning_rate": 4.6302e-05, + "loss": 4.4976, + "step": 7400 + }, + { + "epoch": 0.149, + "grad_norm": 5.155121803283691, + "learning_rate": 4.6277000000000004e-05, + "loss": 4.5547, + "step": 7450 + }, + { + "epoch": 0.15, + "grad_norm": 7.101211071014404, + "learning_rate": 4.6252000000000004e-05, + "loss": 4.5195, + "step": 7500 + }, + { + "epoch": 0.151, + "grad_norm": 4.691388130187988, + "learning_rate": 4.6227e-05, + "loss": 4.3268, + "step": 7550 + }, + { + "epoch": 0.152, + "grad_norm": 4.447563648223877, + "learning_rate": 4.6202e-05, + "loss": 4.4686, + "step": 7600 + }, + { + "epoch": 0.153, + "grad_norm": 5.302483081817627, + "learning_rate": 4.6177e-05, + "loss": 4.4724, + "step": 7650 + }, + { + "epoch": 0.154, + "grad_norm": 5.505932807922363, + "learning_rate": 4.6152e-05, + "loss": 4.5188, + "step": 7700 + }, + { + "epoch": 0.155, + "grad_norm": 4.236738681793213, + "learning_rate": 4.6127e-05, + "loss": 4.3862, + "step": 7750 + }, + { + "epoch": 0.156, + "grad_norm": 4.4891839027404785, + "learning_rate": 4.6102e-05, + "loss": 4.3585, + "step": 7800 + }, + { + "epoch": 0.157, + "grad_norm": 5.605619430541992, + "learning_rate": 4.6077000000000006e-05, + "loss": 4.4001, + "step": 7850 + }, + { + "epoch": 0.158, + "grad_norm": 5.551383018493652, + "learning_rate": 4.6052e-05, + "loss": 4.6582, + "step": 7900 + }, + { + "epoch": 0.159, + "grad_norm": 6.318504810333252, + "learning_rate": 4.6027000000000005e-05, + "loss": 4.4838, + "step": 7950 + }, + { + "epoch": 0.16, + "grad_norm": 4.358963489532471, + "learning_rate": 4.6002e-05, + "loss": 4.4965, + "step": 8000 + }, + { + "epoch": 0.161, + "grad_norm": 4.824229717254639, + "learning_rate": 4.5977000000000004e-05, + "loss": 4.4741, + "step": 8050 + }, + { + "epoch": 0.162, + "grad_norm": 4.499610424041748, + "learning_rate": 4.5952e-05, + "loss": 4.4009, + "step": 8100 + }, + { + "epoch": 0.163, + "grad_norm": 4.207203388214111, + "learning_rate": 4.5927e-05, + "loss": 4.5216, + "step": 8150 + }, + { + "epoch": 0.164, + "grad_norm": 5.642760276794434, + "learning_rate": 4.5902e-05, + "loss": 4.3556, + "step": 8200 + }, + { + "epoch": 0.165, + "grad_norm": 4.778339385986328, + "learning_rate": 4.5877e-05, + "loss": 4.4773, + "step": 8250 + }, + { + "epoch": 0.166, + "grad_norm": 5.704320907592773, + "learning_rate": 4.5852e-05, + "loss": 4.2456, + "step": 8300 + }, + { + "epoch": 0.167, + "grad_norm": 4.367884635925293, + "learning_rate": 4.5827e-05, + "loss": 4.4405, + "step": 8350 + }, + { + "epoch": 0.168, + "grad_norm": 4.443906784057617, + "learning_rate": 4.5802e-05, + "loss": 4.4384, + "step": 8400 + }, + { + "epoch": 0.169, + "grad_norm": 4.177926540374756, + "learning_rate": 4.5777000000000006e-05, + "loss": 4.4162, + "step": 8450 + }, + { + "epoch": 0.17, + "grad_norm": 5.305122375488281, + "learning_rate": 4.5752e-05, + "loss": 4.4163, + "step": 8500 + }, + { + "epoch": 0.171, + "grad_norm": 4.545613765716553, + "learning_rate": 4.5727000000000005e-05, + "loss": 4.4573, + "step": 8550 + }, + { + "epoch": 0.172, + "grad_norm": 5.365312099456787, + "learning_rate": 4.5702e-05, + "loss": 4.4307, + "step": 8600 + }, + { + "epoch": 0.173, + "grad_norm": 5.722323417663574, + "learning_rate": 4.5677e-05, + "loss": 4.5031, + "step": 8650 + }, + { + "epoch": 0.174, + "grad_norm": 4.399505615234375, + "learning_rate": 4.5652e-05, + "loss": 4.0916, + "step": 8700 + }, + { + "epoch": 0.175, + "grad_norm": 3.8715851306915283, + "learning_rate": 4.5627e-05, + "loss": 4.1975, + "step": 8750 + }, + { + "epoch": 0.176, + "grad_norm": 4.734847068786621, + "learning_rate": 4.5602e-05, + "loss": 4.3628, + "step": 8800 + }, + { + "epoch": 0.177, + "grad_norm": 5.130662441253662, + "learning_rate": 4.55775e-05, + "loss": 4.3021, + "step": 8850 + }, + { + "epoch": 0.178, + "grad_norm": 7.081720352172852, + "learning_rate": 4.55525e-05, + "loss": 4.5149, + "step": 8900 + }, + { + "epoch": 0.179, + "grad_norm": 4.4592108726501465, + "learning_rate": 4.55275e-05, + "loss": 4.3799, + "step": 8950 + }, + { + "epoch": 0.18, + "grad_norm": 4.49852991104126, + "learning_rate": 4.55025e-05, + "loss": 4.3414, + "step": 9000 + }, + { + "epoch": 0.181, + "grad_norm": 4.531373023986816, + "learning_rate": 4.54775e-05, + "loss": 4.6448, + "step": 9050 + }, + { + "epoch": 0.182, + "grad_norm": 5.069887638092041, + "learning_rate": 4.5452500000000006e-05, + "loss": 4.4109, + "step": 9100 + }, + { + "epoch": 0.183, + "grad_norm": 6.900569438934326, + "learning_rate": 4.54275e-05, + "loss": 4.1422, + "step": 9150 + }, + { + "epoch": 0.184, + "grad_norm": 5.383849620819092, + "learning_rate": 4.5402500000000004e-05, + "loss": 4.1877, + "step": 9200 + }, + { + "epoch": 0.185, + "grad_norm": 4.181257247924805, + "learning_rate": 4.5377500000000004e-05, + "loss": 4.2277, + "step": 9250 + }, + { + "epoch": 0.186, + "grad_norm": 5.585279941558838, + "learning_rate": 4.53525e-05, + "loss": 4.3192, + "step": 9300 + }, + { + "epoch": 0.187, + "grad_norm": 4.499333381652832, + "learning_rate": 4.53275e-05, + "loss": 4.4794, + "step": 9350 + }, + { + "epoch": 0.188, + "grad_norm": 4.638943672180176, + "learning_rate": 4.53025e-05, + "loss": 4.342, + "step": 9400 + }, + { + "epoch": 0.189, + "grad_norm": 4.7845330238342285, + "learning_rate": 4.52775e-05, + "loss": 4.2032, + "step": 9450 + }, + { + "epoch": 0.19, + "grad_norm": 6.759675979614258, + "learning_rate": 4.52525e-05, + "loss": 4.2937, + "step": 9500 + }, + { + "epoch": 0.191, + "grad_norm": 5.508966445922852, + "learning_rate": 4.52275e-05, + "loss": 4.2401, + "step": 9550 + }, + { + "epoch": 0.192, + "grad_norm": 4.584382057189941, + "learning_rate": 4.5202500000000006e-05, + "loss": 4.3531, + "step": 9600 + }, + { + "epoch": 0.193, + "grad_norm": 6.405841827392578, + "learning_rate": 4.51775e-05, + "loss": 4.3412, + "step": 9650 + }, + { + "epoch": 0.194, + "grad_norm": 4.769068241119385, + "learning_rate": 4.5152500000000005e-05, + "loss": 4.2218, + "step": 9700 + }, + { + "epoch": 0.195, + "grad_norm": 5.489443778991699, + "learning_rate": 4.51275e-05, + "loss": 4.1926, + "step": 9750 + }, + { + "epoch": 0.196, + "grad_norm": 4.364252090454102, + "learning_rate": 4.5102500000000004e-05, + "loss": 4.2831, + "step": 9800 + }, + { + "epoch": 0.197, + "grad_norm": 5.687471389770508, + "learning_rate": 4.50775e-05, + "loss": 4.1691, + "step": 9850 + }, + { + "epoch": 0.198, + "grad_norm": 4.656460762023926, + "learning_rate": 4.50525e-05, + "loss": 4.376, + "step": 9900 + }, + { + "epoch": 0.199, + "grad_norm": 6.34237813949585, + "learning_rate": 4.50275e-05, + "loss": 4.2811, + "step": 9950 + }, + { + "epoch": 0.2, + "grad_norm": 4.788524150848389, + "learning_rate": 4.50025e-05, + "loss": 4.3582, + "step": 10000 + }, + { + "epoch": 0.201, + "grad_norm": 4.443533420562744, + "learning_rate": 4.49775e-05, + "loss": 4.201, + "step": 10050 + }, + { + "epoch": 0.202, + "grad_norm": 5.163551330566406, + "learning_rate": 4.49525e-05, + "loss": 4.3056, + "step": 10100 + }, + { + "epoch": 0.203, + "grad_norm": 4.789438247680664, + "learning_rate": 4.49275e-05, + "loss": 4.1726, + "step": 10150 + }, + { + "epoch": 0.204, + "grad_norm": 4.787554740905762, + "learning_rate": 4.4902500000000006e-05, + "loss": 4.2856, + "step": 10200 + }, + { + "epoch": 0.205, + "grad_norm": 5.560729503631592, + "learning_rate": 4.48775e-05, + "loss": 4.1345, + "step": 10250 + }, + { + "epoch": 0.206, + "grad_norm": 4.729394435882568, + "learning_rate": 4.4852500000000005e-05, + "loss": 4.1033, + "step": 10300 + }, + { + "epoch": 0.207, + "grad_norm": 6.309693336486816, + "learning_rate": 4.48275e-05, + "loss": 4.2375, + "step": 10350 + }, + { + "epoch": 0.208, + "grad_norm": 5.436013698577881, + "learning_rate": 4.4802500000000003e-05, + "loss": 4.0883, + "step": 10400 + }, + { + "epoch": 0.209, + "grad_norm": 5.470222473144531, + "learning_rate": 4.47775e-05, + "loss": 4.1247, + "step": 10450 + }, + { + "epoch": 0.21, + "grad_norm": 5.284532070159912, + "learning_rate": 4.47525e-05, + "loss": 4.2951, + "step": 10500 + }, + { + "epoch": 0.211, + "grad_norm": 5.210220813751221, + "learning_rate": 4.47275e-05, + "loss": 4.1732, + "step": 10550 + }, + { + "epoch": 0.212, + "grad_norm": 4.622988224029541, + "learning_rate": 4.47025e-05, + "loss": 4.3512, + "step": 10600 + }, + { + "epoch": 0.213, + "grad_norm": 4.809802055358887, + "learning_rate": 4.46775e-05, + "loss": 4.206, + "step": 10650 + }, + { + "epoch": 0.214, + "grad_norm": 5.238707542419434, + "learning_rate": 4.46525e-05, + "loss": 4.146, + "step": 10700 + }, + { + "epoch": 0.215, + "grad_norm": 5.506088733673096, + "learning_rate": 4.46275e-05, + "loss": 4.1784, + "step": 10750 + }, + { + "epoch": 0.216, + "grad_norm": 5.379574775695801, + "learning_rate": 4.4602500000000005e-05, + "loss": 4.1822, + "step": 10800 + }, + { + "epoch": 0.217, + "grad_norm": 6.162681579589844, + "learning_rate": 4.4578000000000006e-05, + "loss": 4.221, + "step": 10850 + }, + { + "epoch": 0.218, + "grad_norm": 4.8035759925842285, + "learning_rate": 4.4553e-05, + "loss": 4.1525, + "step": 10900 + }, + { + "epoch": 0.219, + "grad_norm": 4.488569736480713, + "learning_rate": 4.4528000000000005e-05, + "loss": 4.092, + "step": 10950 + }, + { + "epoch": 0.22, + "grad_norm": 5.557890892028809, + "learning_rate": 4.4503e-05, + "loss": 4.2091, + "step": 11000 + }, + { + "epoch": 0.221, + "grad_norm": 5.737027168273926, + "learning_rate": 4.4478000000000003e-05, + "loss": 4.2103, + "step": 11050 + }, + { + "epoch": 0.222, + "grad_norm": 5.317434787750244, + "learning_rate": 4.4453e-05, + "loss": 4.1134, + "step": 11100 + }, + { + "epoch": 0.223, + "grad_norm": 5.508534908294678, + "learning_rate": 4.4428e-05, + "loss": 4.2158, + "step": 11150 + }, + { + "epoch": 0.224, + "grad_norm": 5.144506931304932, + "learning_rate": 4.4403e-05, + "loss": 4.0672, + "step": 11200 + }, + { + "epoch": 0.225, + "grad_norm": 5.047402858734131, + "learning_rate": 4.4378e-05, + "loss": 4.1603, + "step": 11250 + }, + { + "epoch": 0.226, + "grad_norm": 6.11688232421875, + "learning_rate": 4.4353e-05, + "loss": 4.1523, + "step": 11300 + }, + { + "epoch": 0.227, + "grad_norm": 4.963644981384277, + "learning_rate": 4.4328e-05, + "loss": 4.1287, + "step": 11350 + }, + { + "epoch": 0.228, + "grad_norm": 6.210315704345703, + "learning_rate": 4.4303e-05, + "loss": 4.0335, + "step": 11400 + }, + { + "epoch": 0.229, + "grad_norm": 4.604816913604736, + "learning_rate": 4.4278000000000005e-05, + "loss": 4.151, + "step": 11450 + }, + { + "epoch": 0.23, + "grad_norm": 4.679665565490723, + "learning_rate": 4.4253e-05, + "loss": 4.0733, + "step": 11500 + }, + { + "epoch": 0.231, + "grad_norm": 5.566308975219727, + "learning_rate": 4.4228000000000004e-05, + "loss": 4.2364, + "step": 11550 + }, + { + "epoch": 0.232, + "grad_norm": 6.232782363891602, + "learning_rate": 4.4203e-05, + "loss": 4.1428, + "step": 11600 + }, + { + "epoch": 0.233, + "grad_norm": 5.726539611816406, + "learning_rate": 4.4178e-05, + "loss": 4.243, + "step": 11650 + }, + { + "epoch": 0.234, + "grad_norm": 7.133997440338135, + "learning_rate": 4.4153e-05, + "loss": 4.2425, + "step": 11700 + }, + { + "epoch": 0.235, + "grad_norm": 5.223475933074951, + "learning_rate": 4.4128e-05, + "loss": 3.8897, + "step": 11750 + }, + { + "epoch": 0.236, + "grad_norm": 5.071702480316162, + "learning_rate": 4.4103e-05, + "loss": 4.1501, + "step": 11800 + }, + { + "epoch": 0.237, + "grad_norm": 6.633401870727539, + "learning_rate": 4.4078e-05, + "loss": 4.2129, + "step": 11850 + }, + { + "epoch": 0.238, + "grad_norm": 5.5083465576171875, + "learning_rate": 4.4053e-05, + "loss": 4.0511, + "step": 11900 + }, + { + "epoch": 0.239, + "grad_norm": 5.211723327636719, + "learning_rate": 4.4028e-05, + "loss": 4.2103, + "step": 11950 + }, + { + "epoch": 0.24, + "grad_norm": 6.270011901855469, + "learning_rate": 4.4003e-05, + "loss": 3.9425, + "step": 12000 + }, + { + "epoch": 0.241, + "grad_norm": 6.336957931518555, + "learning_rate": 4.3978000000000005e-05, + "loss": 4.1376, + "step": 12050 + }, + { + "epoch": 0.242, + "grad_norm": 6.871969223022461, + "learning_rate": 4.3953e-05, + "loss": 3.9205, + "step": 12100 + }, + { + "epoch": 0.243, + "grad_norm": 6.551147937774658, + "learning_rate": 4.3928000000000004e-05, + "loss": 4.0799, + "step": 12150 + }, + { + "epoch": 0.244, + "grad_norm": 5.746366500854492, + "learning_rate": 4.3903e-05, + "loss": 4.0542, + "step": 12200 + }, + { + "epoch": 0.245, + "grad_norm": 5.873757362365723, + "learning_rate": 4.3878e-05, + "loss": 4.1353, + "step": 12250 + }, + { + "epoch": 0.246, + "grad_norm": 5.338042736053467, + "learning_rate": 4.3853e-05, + "loss": 3.8595, + "step": 12300 + }, + { + "epoch": 0.247, + "grad_norm": 4.573357105255127, + "learning_rate": 4.3828e-05, + "loss": 4.2164, + "step": 12350 + }, + { + "epoch": 0.248, + "grad_norm": 6.519650459289551, + "learning_rate": 4.3803e-05, + "loss": 4.0502, + "step": 12400 + }, + { + "epoch": 0.249, + "grad_norm": 5.267230987548828, + "learning_rate": 4.3778e-05, + "loss": 4.1534, + "step": 12450 + }, + { + "epoch": 0.25, + "grad_norm": 6.070836544036865, + "learning_rate": 4.3753e-05, + "loss": 3.7346, + "step": 12500 + }, + { + "epoch": 0.251, + "grad_norm": 4.947108745574951, + "learning_rate": 4.3728000000000006e-05, + "loss": 3.976, + "step": 12550 + }, + { + "epoch": 0.252, + "grad_norm": 5.57907247543335, + "learning_rate": 4.3703e-05, + "loss": 3.925, + "step": 12600 + }, + { + "epoch": 0.253, + "grad_norm": 5.960052490234375, + "learning_rate": 4.3678000000000004e-05, + "loss": 4.1336, + "step": 12650 + }, + { + "epoch": 0.254, + "grad_norm": 5.7773756980896, + "learning_rate": 4.3653e-05, + "loss": 4.055, + "step": 12700 + }, + { + "epoch": 0.255, + "grad_norm": 5.2892584800720215, + "learning_rate": 4.3628e-05, + "loss": 4.0579, + "step": 12750 + }, + { + "epoch": 0.256, + "grad_norm": 5.7825798988342285, + "learning_rate": 4.3603e-05, + "loss": 4.0154, + "step": 12800 + }, + { + "epoch": 0.257, + "grad_norm": 4.2211432456970215, + "learning_rate": 4.35785e-05, + "loss": 4.1892, + "step": 12850 + }, + { + "epoch": 0.258, + "grad_norm": 5.150039196014404, + "learning_rate": 4.35535e-05, + "loss": 4.1491, + "step": 12900 + }, + { + "epoch": 0.259, + "grad_norm": 4.940394401550293, + "learning_rate": 4.35285e-05, + "loss": 4.1802, + "step": 12950 + }, + { + "epoch": 0.26, + "grad_norm": 5.550903797149658, + "learning_rate": 4.35035e-05, + "loss": 4.1659, + "step": 13000 + }, + { + "epoch": 0.261, + "grad_norm": 5.111509799957275, + "learning_rate": 4.34785e-05, + "loss": 3.9146, + "step": 13050 + }, + { + "epoch": 0.262, + "grad_norm": 5.713197231292725, + "learning_rate": 4.34535e-05, + "loss": 4.0872, + "step": 13100 + }, + { + "epoch": 0.263, + "grad_norm": 4.943962574005127, + "learning_rate": 4.34285e-05, + "loss": 3.8049, + "step": 13150 + }, + { + "epoch": 0.264, + "grad_norm": 5.962152481079102, + "learning_rate": 4.3403500000000005e-05, + "loss": 3.8484, + "step": 13200 + }, + { + "epoch": 0.265, + "grad_norm": 5.06650972366333, + "learning_rate": 4.33785e-05, + "loss": 3.9027, + "step": 13250 + }, + { + "epoch": 0.266, + "grad_norm": 5.804838180541992, + "learning_rate": 4.3353500000000004e-05, + "loss": 3.8695, + "step": 13300 + }, + { + "epoch": 0.267, + "grad_norm": 4.750165939331055, + "learning_rate": 4.33285e-05, + "loss": 3.9193, + "step": 13350 + }, + { + "epoch": 0.268, + "grad_norm": 5.648144721984863, + "learning_rate": 4.33035e-05, + "loss": 4.1735, + "step": 13400 + }, + { + "epoch": 0.269, + "grad_norm": 5.493870735168457, + "learning_rate": 4.32785e-05, + "loss": 3.9941, + "step": 13450 + }, + { + "epoch": 0.27, + "grad_norm": 5.866032123565674, + "learning_rate": 4.32535e-05, + "loss": 3.8839, + "step": 13500 + }, + { + "epoch": 0.271, + "grad_norm": 5.975167274475098, + "learning_rate": 4.32285e-05, + "loss": 4.1467, + "step": 13550 + }, + { + "epoch": 0.272, + "grad_norm": 4.195262432098389, + "learning_rate": 4.32035e-05, + "loss": 3.8118, + "step": 13600 + }, + { + "epoch": 0.273, + "grad_norm": 5.3393378257751465, + "learning_rate": 4.31785e-05, + "loss": 3.9583, + "step": 13650 + }, + { + "epoch": 0.274, + "grad_norm": 5.666850566864014, + "learning_rate": 4.31535e-05, + "loss": 3.8644, + "step": 13700 + }, + { + "epoch": 0.275, + "grad_norm": 5.188211917877197, + "learning_rate": 4.31285e-05, + "loss": 3.9014, + "step": 13750 + }, + { + "epoch": 0.276, + "grad_norm": 8.021430969238281, + "learning_rate": 4.3103500000000005e-05, + "loss": 3.9939, + "step": 13800 + }, + { + "epoch": 0.277, + "grad_norm": 6.547386169433594, + "learning_rate": 4.30785e-05, + "loss": 3.9064, + "step": 13850 + }, + { + "epoch": 0.278, + "grad_norm": 6.097850799560547, + "learning_rate": 4.3053500000000004e-05, + "loss": 3.9909, + "step": 13900 + }, + { + "epoch": 0.279, + "grad_norm": 5.55470085144043, + "learning_rate": 4.3028499999999996e-05, + "loss": 4.0055, + "step": 13950 + }, + { + "epoch": 0.28, + "grad_norm": 5.464843273162842, + "learning_rate": 4.30035e-05, + "loss": 3.9649, + "step": 14000 + }, + { + "epoch": 0.281, + "grad_norm": 5.489349365234375, + "learning_rate": 4.29785e-05, + "loss": 4.1088, + "step": 14050 + }, + { + "epoch": 0.282, + "grad_norm": 5.301501274108887, + "learning_rate": 4.29535e-05, + "loss": 3.9677, + "step": 14100 + }, + { + "epoch": 0.283, + "grad_norm": 6.240916728973389, + "learning_rate": 4.292850000000001e-05, + "loss": 3.8114, + "step": 14150 + }, + { + "epoch": 0.284, + "grad_norm": 5.443836212158203, + "learning_rate": 4.29035e-05, + "loss": 4.0816, + "step": 14200 + }, + { + "epoch": 0.285, + "grad_norm": 5.28709602355957, + "learning_rate": 4.2878500000000006e-05, + "loss": 3.829, + "step": 14250 + }, + { + "epoch": 0.286, + "grad_norm": 4.40752649307251, + "learning_rate": 4.28535e-05, + "loss": 3.8467, + "step": 14300 + }, + { + "epoch": 0.287, + "grad_norm": 6.202967643737793, + "learning_rate": 4.2828500000000005e-05, + "loss": 3.9803, + "step": 14350 + }, + { + "epoch": 0.288, + "grad_norm": 5.116330623626709, + "learning_rate": 4.2803500000000004e-05, + "loss": 3.8783, + "step": 14400 + }, + { + "epoch": 0.289, + "grad_norm": 5.7756781578063965, + "learning_rate": 4.2778500000000004e-05, + "loss": 3.6101, + "step": 14450 + }, + { + "epoch": 0.29, + "grad_norm": 5.229415416717529, + "learning_rate": 4.27535e-05, + "loss": 4.0489, + "step": 14500 + }, + { + "epoch": 0.291, + "grad_norm": 6.183515548706055, + "learning_rate": 4.27285e-05, + "loss": 3.8464, + "step": 14550 + }, + { + "epoch": 0.292, + "grad_norm": 5.619703769683838, + "learning_rate": 4.27035e-05, + "loss": 4.0055, + "step": 14600 + }, + { + "epoch": 0.293, + "grad_norm": 5.139617919921875, + "learning_rate": 4.26785e-05, + "loss": 4.0818, + "step": 14650 + }, + { + "epoch": 0.294, + "grad_norm": 5.552958011627197, + "learning_rate": 4.26535e-05, + "loss": 3.9061, + "step": 14700 + }, + { + "epoch": 0.295, + "grad_norm": 6.033257007598877, + "learning_rate": 4.262850000000001e-05, + "loss": 3.9266, + "step": 14750 + }, + { + "epoch": 0.296, + "grad_norm": 5.819438934326172, + "learning_rate": 4.26035e-05, + "loss": 3.9946, + "step": 14800 + }, + { + "epoch": 0.297, + "grad_norm": 5.418524742126465, + "learning_rate": 4.2578500000000006e-05, + "loss": 3.6531, + "step": 14850 + }, + { + "epoch": 0.298, + "grad_norm": 6.2832560539245605, + "learning_rate": 4.2554e-05, + "loss": 3.9278, + "step": 14900 + }, + { + "epoch": 0.299, + "grad_norm": 7.28920841217041, + "learning_rate": 4.2529e-05, + "loss": 3.8046, + "step": 14950 + }, + { + "epoch": 0.3, + "grad_norm": 7.926489353179932, + "learning_rate": 4.2504e-05, + "loss": 3.8063, + "step": 15000 + }, + { + "epoch": 0.301, + "grad_norm": 5.459051609039307, + "learning_rate": 4.2479000000000004e-05, + "loss": 3.8941, + "step": 15050 + }, + { + "epoch": 0.302, + "grad_norm": 5.072142124176025, + "learning_rate": 4.2454e-05, + "loss": 3.8465, + "step": 15100 + }, + { + "epoch": 0.303, + "grad_norm": 5.166111469268799, + "learning_rate": 4.2429e-05, + "loss": 4.0484, + "step": 15150 + }, + { + "epoch": 0.304, + "grad_norm": 5.6650471687316895, + "learning_rate": 4.2404e-05, + "loss": 3.7839, + "step": 15200 + }, + { + "epoch": 0.305, + "grad_norm": 6.4663801193237305, + "learning_rate": 4.2379e-05, + "loss": 3.7774, + "step": 15250 + }, + { + "epoch": 0.306, + "grad_norm": 5.610959529876709, + "learning_rate": 4.2354e-05, + "loss": 3.9454, + "step": 15300 + }, + { + "epoch": 0.307, + "grad_norm": 4.7916259765625, + "learning_rate": 4.2329e-05, + "loss": 3.8588, + "step": 15350 + }, + { + "epoch": 0.308, + "grad_norm": 4.65864372253418, + "learning_rate": 4.230400000000001e-05, + "loss": 4.0363, + "step": 15400 + }, + { + "epoch": 0.309, + "grad_norm": 5.171529293060303, + "learning_rate": 4.2279e-05, + "loss": 3.7952, + "step": 15450 + }, + { + "epoch": 0.31, + "grad_norm": 6.2722578048706055, + "learning_rate": 4.2254000000000006e-05, + "loss": 3.8943, + "step": 15500 + }, + { + "epoch": 0.311, + "grad_norm": 6.563569068908691, + "learning_rate": 4.2229000000000005e-05, + "loss": 4.1681, + "step": 15550 + }, + { + "epoch": 0.312, + "grad_norm": 4.982414245605469, + "learning_rate": 4.2204000000000005e-05, + "loss": 3.648, + "step": 15600 + }, + { + "epoch": 0.313, + "grad_norm": 5.668543338775635, + "learning_rate": 4.2179000000000004e-05, + "loss": 3.9033, + "step": 15650 + }, + { + "epoch": 0.314, + "grad_norm": 5.404988765716553, + "learning_rate": 4.2154e-05, + "loss": 3.6414, + "step": 15700 + }, + { + "epoch": 0.315, + "grad_norm": 5.084871292114258, + "learning_rate": 4.2129e-05, + "loss": 3.88, + "step": 15750 + }, + { + "epoch": 0.316, + "grad_norm": 5.059499263763428, + "learning_rate": 4.2104e-05, + "loss": 3.8755, + "step": 15800 + }, + { + "epoch": 0.317, + "grad_norm": 5.1128010749816895, + "learning_rate": 4.2079e-05, + "loss": 3.946, + "step": 15850 + }, + { + "epoch": 0.318, + "grad_norm": 5.611748218536377, + "learning_rate": 4.205400000000001e-05, + "loss": 3.8578, + "step": 15900 + }, + { + "epoch": 0.319, + "grad_norm": 6.221893787384033, + "learning_rate": 4.2029e-05, + "loss": 4.0508, + "step": 15950 + }, + { + "epoch": 0.32, + "grad_norm": 7.107609272003174, + "learning_rate": 4.2004000000000006e-05, + "loss": 3.8917, + "step": 16000 + }, + { + "epoch": 0.321, + "grad_norm": 5.290852069854736, + "learning_rate": 4.1979e-05, + "loss": 3.8707, + "step": 16050 + }, + { + "epoch": 0.322, + "grad_norm": 6.895267486572266, + "learning_rate": 4.1954000000000005e-05, + "loss": 4.0213, + "step": 16100 + }, + { + "epoch": 0.323, + "grad_norm": 4.469918727874756, + "learning_rate": 4.1929000000000005e-05, + "loss": 3.7341, + "step": 16150 + }, + { + "epoch": 0.324, + "grad_norm": 8.303849220275879, + "learning_rate": 4.1904000000000004e-05, + "loss": 3.6815, + "step": 16200 + }, + { + "epoch": 0.325, + "grad_norm": 5.290466785430908, + "learning_rate": 4.1879000000000003e-05, + "loss": 3.8443, + "step": 16250 + }, + { + "epoch": 0.326, + "grad_norm": 6.372802734375, + "learning_rate": 4.1854e-05, + "loss": 3.6625, + "step": 16300 + }, + { + "epoch": 0.327, + "grad_norm": 5.735015392303467, + "learning_rate": 4.1829e-05, + "loss": 3.9276, + "step": 16350 + }, + { + "epoch": 0.328, + "grad_norm": 5.2200093269348145, + "learning_rate": 4.1804e-05, + "loss": 3.9428, + "step": 16400 + }, + { + "epoch": 0.329, + "grad_norm": 6.257447719573975, + "learning_rate": 4.1779e-05, + "loss": 3.9017, + "step": 16450 + }, + { + "epoch": 0.33, + "grad_norm": 6.63900899887085, + "learning_rate": 4.175400000000001e-05, + "loss": 3.8419, + "step": 16500 + }, + { + "epoch": 0.331, + "grad_norm": 4.8323469161987305, + "learning_rate": 4.1729e-05, + "loss": 3.7655, + "step": 16550 + }, + { + "epoch": 0.332, + "grad_norm": 4.395730972290039, + "learning_rate": 4.1704000000000006e-05, + "loss": 3.8917, + "step": 16600 + }, + { + "epoch": 0.333, + "grad_norm": 5.6126484870910645, + "learning_rate": 4.1679e-05, + "loss": 3.9089, + "step": 16650 + }, + { + "epoch": 0.334, + "grad_norm": 7.912436485290527, + "learning_rate": 4.1654000000000005e-05, + "loss": 3.8822, + "step": 16700 + }, + { + "epoch": 0.335, + "grad_norm": 6.455771446228027, + "learning_rate": 4.1629000000000004e-05, + "loss": 3.8588, + "step": 16750 + }, + { + "epoch": 0.336, + "grad_norm": 5.0589447021484375, + "learning_rate": 4.1604000000000004e-05, + "loss": 3.8178, + "step": 16800 + }, + { + "epoch": 0.337, + "grad_norm": 5.345367431640625, + "learning_rate": 4.1579e-05, + "loss": 3.7995, + "step": 16850 + }, + { + "epoch": 0.338, + "grad_norm": 5.080057621002197, + "learning_rate": 4.1554e-05, + "loss": 3.8185, + "step": 16900 + }, + { + "epoch": 0.339, + "grad_norm": 5.360045433044434, + "learning_rate": 4.15295e-05, + "loss": 3.8565, + "step": 16950 + }, + { + "epoch": 0.34, + "grad_norm": 6.614706039428711, + "learning_rate": 4.15045e-05, + "loss": 3.9084, + "step": 17000 + }, + { + "epoch": 0.341, + "grad_norm": 6.911114692687988, + "learning_rate": 4.14795e-05, + "loss": 3.9076, + "step": 17050 + }, + { + "epoch": 0.342, + "grad_norm": 6.049564361572266, + "learning_rate": 4.14545e-05, + "loss": 3.8488, + "step": 17100 + }, + { + "epoch": 0.343, + "grad_norm": 5.199458599090576, + "learning_rate": 4.142950000000001e-05, + "loss": 3.7354, + "step": 17150 + }, + { + "epoch": 0.344, + "grad_norm": 5.313460826873779, + "learning_rate": 4.14045e-05, + "loss": 3.7896, + "step": 17200 + }, + { + "epoch": 0.345, + "grad_norm": 6.561459064483643, + "learning_rate": 4.1379500000000006e-05, + "loss": 3.9219, + "step": 17250 + }, + { + "epoch": 0.346, + "grad_norm": 6.4212565422058105, + "learning_rate": 4.13545e-05, + "loss": 3.7611, + "step": 17300 + }, + { + "epoch": 0.347, + "grad_norm": 5.5424275398254395, + "learning_rate": 4.1329500000000005e-05, + "loss": 3.7, + "step": 17350 + }, + { + "epoch": 0.348, + "grad_norm": 5.535832405090332, + "learning_rate": 4.1304500000000004e-05, + "loss": 3.8373, + "step": 17400 + }, + { + "epoch": 0.349, + "grad_norm": 6.890970230102539, + "learning_rate": 4.1279500000000003e-05, + "loss": 3.8647, + "step": 17450 + }, + { + "epoch": 0.35, + "grad_norm": 6.52531623840332, + "learning_rate": 4.12545e-05, + "loss": 3.9338, + "step": 17500 + }, + { + "epoch": 0.351, + "grad_norm": 5.829474925994873, + "learning_rate": 4.12295e-05, + "loss": 3.7462, + "step": 17550 + }, + { + "epoch": 0.352, + "grad_norm": 5.539413928985596, + "learning_rate": 4.12045e-05, + "loss": 3.9142, + "step": 17600 + }, + { + "epoch": 0.353, + "grad_norm": 5.760298252105713, + "learning_rate": 4.11795e-05, + "loss": 3.7838, + "step": 17650 + }, + { + "epoch": 0.354, + "grad_norm": 5.714521408081055, + "learning_rate": 4.11545e-05, + "loss": 3.5264, + "step": 17700 + }, + { + "epoch": 0.355, + "grad_norm": 7.143244743347168, + "learning_rate": 4.1129500000000007e-05, + "loss": 3.5885, + "step": 17750 + }, + { + "epoch": 0.356, + "grad_norm": 5.311776161193848, + "learning_rate": 4.11045e-05, + "loss": 3.8021, + "step": 17800 + }, + { + "epoch": 0.357, + "grad_norm": 5.300187587738037, + "learning_rate": 4.1079500000000005e-05, + "loss": 3.7987, + "step": 17850 + }, + { + "epoch": 0.358, + "grad_norm": 4.707153797149658, + "learning_rate": 4.10545e-05, + "loss": 3.7739, + "step": 17900 + }, + { + "epoch": 0.359, + "grad_norm": 6.6488261222839355, + "learning_rate": 4.1029500000000004e-05, + "loss": 3.6998, + "step": 17950 + }, + { + "epoch": 0.36, + "grad_norm": 6.836310386657715, + "learning_rate": 4.1004500000000004e-05, + "loss": 3.8338, + "step": 18000 + }, + { + "epoch": 0.361, + "grad_norm": 4.986599922180176, + "learning_rate": 4.09795e-05, + "loss": 3.8181, + "step": 18050 + }, + { + "epoch": 0.362, + "grad_norm": 6.216142654418945, + "learning_rate": 4.09545e-05, + "loss": 3.7621, + "step": 18100 + }, + { + "epoch": 0.363, + "grad_norm": 5.541645050048828, + "learning_rate": 4.09295e-05, + "loss": 3.6238, + "step": 18150 + }, + { + "epoch": 0.364, + "grad_norm": 5.436352252960205, + "learning_rate": 4.09045e-05, + "loss": 3.5689, + "step": 18200 + }, + { + "epoch": 0.365, + "grad_norm": 5.6476335525512695, + "learning_rate": 4.08795e-05, + "loss": 3.884, + "step": 18250 + }, + { + "epoch": 0.366, + "grad_norm": 5.718410968780518, + "learning_rate": 4.08545e-05, + "loss": 3.6267, + "step": 18300 + }, + { + "epoch": 0.367, + "grad_norm": 5.955104827880859, + "learning_rate": 4.0829500000000006e-05, + "loss": 3.7001, + "step": 18350 + }, + { + "epoch": 0.368, + "grad_norm": 7.096001148223877, + "learning_rate": 4.08045e-05, + "loss": 3.7513, + "step": 18400 + }, + { + "epoch": 0.369, + "grad_norm": 5.519197463989258, + "learning_rate": 4.0779500000000005e-05, + "loss": 3.6899, + "step": 18450 + }, + { + "epoch": 0.37, + "grad_norm": 5.956851482391357, + "learning_rate": 4.0754500000000004e-05, + "loss": 3.5223, + "step": 18500 + }, + { + "epoch": 0.371, + "grad_norm": 7.059772491455078, + "learning_rate": 4.0729500000000004e-05, + "loss": 3.6623, + "step": 18550 + }, + { + "epoch": 0.372, + "grad_norm": 5.00886344909668, + "learning_rate": 4.07045e-05, + "loss": 3.6135, + "step": 18600 + }, + { + "epoch": 0.373, + "grad_norm": 9.62269115447998, + "learning_rate": 4.06795e-05, + "loss": 3.5963, + "step": 18650 + }, + { + "epoch": 0.374, + "grad_norm": 5.832387924194336, + "learning_rate": 4.06545e-05, + "loss": 3.8358, + "step": 18700 + }, + { + "epoch": 0.375, + "grad_norm": 5.544480323791504, + "learning_rate": 4.06295e-05, + "loss": 3.7966, + "step": 18750 + }, + { + "epoch": 0.376, + "grad_norm": 7.148738384246826, + "learning_rate": 4.06045e-05, + "loss": 3.6962, + "step": 18800 + }, + { + "epoch": 0.377, + "grad_norm": 6.446701526641846, + "learning_rate": 4.057950000000001e-05, + "loss": 3.6023, + "step": 18850 + }, + { + "epoch": 0.378, + "grad_norm": 5.0687713623046875, + "learning_rate": 4.05545e-05, + "loss": 3.6317, + "step": 18900 + }, + { + "epoch": 0.379, + "grad_norm": 6.391605854034424, + "learning_rate": 4.0529500000000006e-05, + "loss": 3.7784, + "step": 18950 + }, + { + "epoch": 0.38, + "grad_norm": 5.7233076095581055, + "learning_rate": 4.05045e-05, + "loss": 3.8169, + "step": 19000 + }, + { + "epoch": 0.381, + "grad_norm": 5.0900678634643555, + "learning_rate": 4.048e-05, + "loss": 3.9733, + "step": 19050 + }, + { + "epoch": 0.382, + "grad_norm": 7.224969387054443, + "learning_rate": 4.0455000000000005e-05, + "loss": 3.9576, + "step": 19100 + }, + { + "epoch": 0.383, + "grad_norm": 5.281608581542969, + "learning_rate": 4.0430000000000004e-05, + "loss": 3.8107, + "step": 19150 + }, + { + "epoch": 0.384, + "grad_norm": 5.875776290893555, + "learning_rate": 4.0405000000000004e-05, + "loss": 3.8561, + "step": 19200 + }, + { + "epoch": 0.385, + "grad_norm": 5.377148628234863, + "learning_rate": 4.038e-05, + "loss": 3.8033, + "step": 19250 + }, + { + "epoch": 0.386, + "grad_norm": 6.931129455566406, + "learning_rate": 4.0355e-05, + "loss": 3.6839, + "step": 19300 + }, + { + "epoch": 0.387, + "grad_norm": 6.245096206665039, + "learning_rate": 4.033e-05, + "loss": 3.6428, + "step": 19350 + }, + { + "epoch": 0.388, + "grad_norm": 5.333400249481201, + "learning_rate": 4.0305e-05, + "loss": 3.5137, + "step": 19400 + }, + { + "epoch": 0.389, + "grad_norm": 5.533875942230225, + "learning_rate": 4.028e-05, + "loss": 3.8904, + "step": 19450 + }, + { + "epoch": 0.39, + "grad_norm": 6.073616981506348, + "learning_rate": 4.025500000000001e-05, + "loss": 3.7149, + "step": 19500 + }, + { + "epoch": 0.391, + "grad_norm": 5.462928771972656, + "learning_rate": 4.023e-05, + "loss": 3.7531, + "step": 19550 + }, + { + "epoch": 0.392, + "grad_norm": 5.310408115386963, + "learning_rate": 4.0205000000000006e-05, + "loss": 3.6716, + "step": 19600 + }, + { + "epoch": 0.393, + "grad_norm": 9.851359367370605, + "learning_rate": 4.018e-05, + "loss": 3.7123, + "step": 19650 + }, + { + "epoch": 0.394, + "grad_norm": 6.514659881591797, + "learning_rate": 4.0155000000000004e-05, + "loss": 3.8305, + "step": 19700 + }, + { + "epoch": 0.395, + "grad_norm": 6.6378374099731445, + "learning_rate": 4.0130000000000004e-05, + "loss": 3.6212, + "step": 19750 + }, + { + "epoch": 0.396, + "grad_norm": 7.522112846374512, + "learning_rate": 4.0105e-05, + "loss": 3.7809, + "step": 19800 + }, + { + "epoch": 0.397, + "grad_norm": 6.330406665802002, + "learning_rate": 4.008e-05, + "loss": 3.6223, + "step": 19850 + }, + { + "epoch": 0.398, + "grad_norm": 5.559330463409424, + "learning_rate": 4.0055e-05, + "loss": 3.6856, + "step": 19900 + }, + { + "epoch": 0.399, + "grad_norm": 5.872786045074463, + "learning_rate": 4.003e-05, + "loss": 3.6335, + "step": 19950 + }, + { + "epoch": 0.4, + "grad_norm": 5.806800842285156, + "learning_rate": 4.0005e-05, + "loss": 3.6443, + "step": 20000 + }, + { + "epoch": 0.401, + "grad_norm": 5.577113151550293, + "learning_rate": 3.998e-05, + "loss": 3.7707, + "step": 20050 + }, + { + "epoch": 0.402, + "grad_norm": 6.950751304626465, + "learning_rate": 3.9955000000000006e-05, + "loss": 3.6968, + "step": 20100 + }, + { + "epoch": 0.403, + "grad_norm": 6.79304838180542, + "learning_rate": 3.993e-05, + "loss": 3.676, + "step": 20150 + }, + { + "epoch": 0.404, + "grad_norm": 5.125970840454102, + "learning_rate": 3.9905000000000005e-05, + "loss": 3.6477, + "step": 20200 + }, + { + "epoch": 0.405, + "grad_norm": 5.478579998016357, + "learning_rate": 3.988e-05, + "loss": 3.7483, + "step": 20250 + }, + { + "epoch": 0.406, + "grad_norm": 10.339896202087402, + "learning_rate": 3.9855000000000004e-05, + "loss": 3.6422, + "step": 20300 + }, + { + "epoch": 0.407, + "grad_norm": 5.875314712524414, + "learning_rate": 3.983e-05, + "loss": 3.6645, + "step": 20350 + }, + { + "epoch": 0.408, + "grad_norm": 7.035477638244629, + "learning_rate": 3.9805e-05, + "loss": 3.9642, + "step": 20400 + }, + { + "epoch": 0.409, + "grad_norm": 5.8045125007629395, + "learning_rate": 3.978e-05, + "loss": 3.6628, + "step": 20450 + }, + { + "epoch": 0.41, + "grad_norm": 7.441490173339844, + "learning_rate": 3.9755e-05, + "loss": 3.6702, + "step": 20500 + }, + { + "epoch": 0.411, + "grad_norm": 9.806413650512695, + "learning_rate": 3.973e-05, + "loss": 3.8382, + "step": 20550 + }, + { + "epoch": 0.412, + "grad_norm": 5.094761848449707, + "learning_rate": 3.9705e-05, + "loss": 3.4913, + "step": 20600 + }, + { + "epoch": 0.413, + "grad_norm": 5.1229248046875, + "learning_rate": 3.968e-05, + "loss": 3.6758, + "step": 20650 + }, + { + "epoch": 0.414, + "grad_norm": 5.905995845794678, + "learning_rate": 3.9655000000000006e-05, + "loss": 3.5157, + "step": 20700 + }, + { + "epoch": 0.415, + "grad_norm": 5.98081636428833, + "learning_rate": 3.963e-05, + "loss": 3.7878, + "step": 20750 + }, + { + "epoch": 0.416, + "grad_norm": 5.739871501922607, + "learning_rate": 3.9605000000000005e-05, + "loss": 3.6435, + "step": 20800 + }, + { + "epoch": 0.417, + "grad_norm": 7.51310396194458, + "learning_rate": 3.958e-05, + "loss": 3.6459, + "step": 20850 + }, + { + "epoch": 0.418, + "grad_norm": 5.59712553024292, + "learning_rate": 3.9555e-05, + "loss": 3.5044, + "step": 20900 + }, + { + "epoch": 0.419, + "grad_norm": 6.285709381103516, + "learning_rate": 3.953e-05, + "loss": 3.5614, + "step": 20950 + }, + { + "epoch": 0.42, + "grad_norm": 5.389233589172363, + "learning_rate": 3.9505e-05, + "loss": 3.6717, + "step": 21000 + }, + { + "epoch": 0.421, + "grad_norm": 5.242571830749512, + "learning_rate": 3.948e-05, + "loss": 3.7844, + "step": 21050 + }, + { + "epoch": 0.422, + "grad_norm": 5.075847148895264, + "learning_rate": 3.94555e-05, + "loss": 3.7471, + "step": 21100 + }, + { + "epoch": 0.423, + "grad_norm": 7.122387409210205, + "learning_rate": 3.94305e-05, + "loss": 3.6374, + "step": 21150 + }, + { + "epoch": 0.424, + "grad_norm": 6.306674003601074, + "learning_rate": 3.94055e-05, + "loss": 3.6265, + "step": 21200 + }, + { + "epoch": 0.425, + "grad_norm": 5.332902908325195, + "learning_rate": 3.93805e-05, + "loss": 3.4958, + "step": 21250 + }, + { + "epoch": 0.426, + "grad_norm": 5.518804550170898, + "learning_rate": 3.93555e-05, + "loss": 3.7738, + "step": 21300 + }, + { + "epoch": 0.427, + "grad_norm": 6.586755275726318, + "learning_rate": 3.9330500000000006e-05, + "loss": 3.777, + "step": 21350 + }, + { + "epoch": 0.428, + "grad_norm": 6.083924293518066, + "learning_rate": 3.93055e-05, + "loss": 3.4348, + "step": 21400 + }, + { + "epoch": 0.429, + "grad_norm": 6.382724761962891, + "learning_rate": 3.9280500000000004e-05, + "loss": 3.4485, + "step": 21450 + }, + { + "epoch": 0.43, + "grad_norm": 4.780561923980713, + "learning_rate": 3.9255500000000004e-05, + "loss": 3.7721, + "step": 21500 + }, + { + "epoch": 0.431, + "grad_norm": 7.1107683181762695, + "learning_rate": 3.92305e-05, + "loss": 3.7841, + "step": 21550 + }, + { + "epoch": 0.432, + "grad_norm": 5.783836364746094, + "learning_rate": 3.92055e-05, + "loss": 3.8264, + "step": 21600 + }, + { + "epoch": 0.433, + "grad_norm": 6.417442321777344, + "learning_rate": 3.91805e-05, + "loss": 3.5419, + "step": 21650 + }, + { + "epoch": 0.434, + "grad_norm": 6.642063140869141, + "learning_rate": 3.91555e-05, + "loss": 3.6673, + "step": 21700 + }, + { + "epoch": 0.435, + "grad_norm": 6.779355525970459, + "learning_rate": 3.91305e-05, + "loss": 3.3835, + "step": 21750 + }, + { + "epoch": 0.436, + "grad_norm": 6.05160665512085, + "learning_rate": 3.91055e-05, + "loss": 3.3663, + "step": 21800 + }, + { + "epoch": 0.437, + "grad_norm": 6.938116073608398, + "learning_rate": 3.9080500000000006e-05, + "loss": 3.5852, + "step": 21850 + }, + { + "epoch": 0.438, + "grad_norm": 5.694869041442871, + "learning_rate": 3.90555e-05, + "loss": 3.4862, + "step": 21900 + }, + { + "epoch": 0.439, + "grad_norm": 6.541887283325195, + "learning_rate": 3.9030500000000005e-05, + "loss": 3.452, + "step": 21950 + }, + { + "epoch": 0.44, + "grad_norm": 5.991393566131592, + "learning_rate": 3.90055e-05, + "loss": 3.4754, + "step": 22000 + }, + { + "epoch": 0.441, + "grad_norm": 5.7733025550842285, + "learning_rate": 3.8980500000000004e-05, + "loss": 3.6772, + "step": 22050 + }, + { + "epoch": 0.442, + "grad_norm": 5.654860496520996, + "learning_rate": 3.89555e-05, + "loss": 3.5211, + "step": 22100 + }, + { + "epoch": 0.443, + "grad_norm": 7.81252908706665, + "learning_rate": 3.89305e-05, + "loss": 3.5404, + "step": 22150 + }, + { + "epoch": 0.444, + "grad_norm": 5.495180606842041, + "learning_rate": 3.89055e-05, + "loss": 3.6398, + "step": 22200 + }, + { + "epoch": 0.445, + "grad_norm": 7.39019775390625, + "learning_rate": 3.88805e-05, + "loss": 3.6327, + "step": 22250 + }, + { + "epoch": 0.446, + "grad_norm": 6.242217063903809, + "learning_rate": 3.88555e-05, + "loss": 3.7173, + "step": 22300 + }, + { + "epoch": 0.447, + "grad_norm": 5.690585136413574, + "learning_rate": 3.88305e-05, + "loss": 3.3506, + "step": 22350 + }, + { + "epoch": 0.448, + "grad_norm": 5.454275131225586, + "learning_rate": 3.88055e-05, + "loss": 3.6386, + "step": 22400 + }, + { + "epoch": 0.449, + "grad_norm": 10.499303817749023, + "learning_rate": 3.8780500000000006e-05, + "loss": 3.6395, + "step": 22450 + }, + { + "epoch": 0.45, + "grad_norm": 6.501960277557373, + "learning_rate": 3.87555e-05, + "loss": 3.6292, + "step": 22500 + }, + { + "epoch": 0.451, + "grad_norm": 6.991559028625488, + "learning_rate": 3.8730500000000005e-05, + "loss": 3.6887, + "step": 22550 + }, + { + "epoch": 0.452, + "grad_norm": 6.130929946899414, + "learning_rate": 3.87055e-05, + "loss": 3.7495, + "step": 22600 + }, + { + "epoch": 0.453, + "grad_norm": 6.396117210388184, + "learning_rate": 3.8680500000000003e-05, + "loss": 3.77, + "step": 22650 + }, + { + "epoch": 0.454, + "grad_norm": 5.2830939292907715, + "learning_rate": 3.86555e-05, + "loss": 3.2854, + "step": 22700 + }, + { + "epoch": 0.455, + "grad_norm": 5.548758506774902, + "learning_rate": 3.86305e-05, + "loss": 3.4142, + "step": 22750 + }, + { + "epoch": 0.456, + "grad_norm": 8.338722229003906, + "learning_rate": 3.86055e-05, + "loss": 3.5935, + "step": 22800 + }, + { + "epoch": 0.457, + "grad_norm": 5.559829235076904, + "learning_rate": 3.85805e-05, + "loss": 3.539, + "step": 22850 + }, + { + "epoch": 0.458, + "grad_norm": 6.5197601318359375, + "learning_rate": 3.85555e-05, + "loss": 3.5664, + "step": 22900 + }, + { + "epoch": 0.459, + "grad_norm": 6.871826648712158, + "learning_rate": 3.85305e-05, + "loss": 3.4904, + "step": 22950 + }, + { + "epoch": 0.46, + "grad_norm": 6.459062099456787, + "learning_rate": 3.85055e-05, + "loss": 3.5906, + "step": 23000 + }, + { + "epoch": 0.461, + "grad_norm": 7.952236652374268, + "learning_rate": 3.8480500000000005e-05, + "loss": 3.4198, + "step": 23050 + }, + { + "epoch": 0.462, + "grad_norm": 5.091039180755615, + "learning_rate": 3.8456000000000006e-05, + "loss": 3.6821, + "step": 23100 + }, + { + "epoch": 0.463, + "grad_norm": 5.670896530151367, + "learning_rate": 3.8431e-05, + "loss": 3.4976, + "step": 23150 + }, + { + "epoch": 0.464, + "grad_norm": 5.132435321807861, + "learning_rate": 3.8406000000000005e-05, + "loss": 3.7298, + "step": 23200 + }, + { + "epoch": 0.465, + "grad_norm": 7.798178195953369, + "learning_rate": 3.8381e-05, + "loss": 3.6912, + "step": 23250 + }, + { + "epoch": 0.466, + "grad_norm": 7.2740983963012695, + "learning_rate": 3.8356000000000003e-05, + "loss": 3.4549, + "step": 23300 + }, + { + "epoch": 0.467, + "grad_norm": 5.059585094451904, + "learning_rate": 3.8331e-05, + "loss": 3.4792, + "step": 23350 + }, + { + "epoch": 0.468, + "grad_norm": 7.0619893074035645, + "learning_rate": 3.8306e-05, + "loss": 3.7425, + "step": 23400 + }, + { + "epoch": 0.469, + "grad_norm": 6.161375522613525, + "learning_rate": 3.8281e-05, + "loss": 3.5272, + "step": 23450 + }, + { + "epoch": 0.47, + "grad_norm": 7.16375207901001, + "learning_rate": 3.8256e-05, + "loss": 3.669, + "step": 23500 + }, + { + "epoch": 0.471, + "grad_norm": 5.802226543426514, + "learning_rate": 3.8231e-05, + "loss": 3.6014, + "step": 23550 + }, + { + "epoch": 0.472, + "grad_norm": 4.789316177368164, + "learning_rate": 3.8206e-05, + "loss": 3.4649, + "step": 23600 + }, + { + "epoch": 0.473, + "grad_norm": 6.274521827697754, + "learning_rate": 3.8181e-05, + "loss": 3.4292, + "step": 23650 + }, + { + "epoch": 0.474, + "grad_norm": 5.716911792755127, + "learning_rate": 3.8156000000000005e-05, + "loss": 3.3089, + "step": 23700 + }, + { + "epoch": 0.475, + "grad_norm": 6.505302429199219, + "learning_rate": 3.8131e-05, + "loss": 3.6164, + "step": 23750 + }, + { + "epoch": 0.476, + "grad_norm": 6.222828388214111, + "learning_rate": 3.8106000000000004e-05, + "loss": 3.4026, + "step": 23800 + }, + { + "epoch": 0.477, + "grad_norm": 6.24766206741333, + "learning_rate": 3.8081e-05, + "loss": 3.443, + "step": 23850 + }, + { + "epoch": 0.478, + "grad_norm": 7.223185062408447, + "learning_rate": 3.8056e-05, + "loss": 3.6329, + "step": 23900 + }, + { + "epoch": 0.479, + "grad_norm": 6.246890544891357, + "learning_rate": 3.8031e-05, + "loss": 3.3726, + "step": 23950 + }, + { + "epoch": 0.48, + "grad_norm": 5.942480564117432, + "learning_rate": 3.8006e-05, + "loss": 3.5571, + "step": 24000 + }, + { + "epoch": 0.481, + "grad_norm": 5.436431407928467, + "learning_rate": 3.7981e-05, + "loss": 3.4648, + "step": 24050 + }, + { + "epoch": 0.482, + "grad_norm": 6.082589626312256, + "learning_rate": 3.7956e-05, + "loss": 3.5571, + "step": 24100 + }, + { + "epoch": 0.483, + "grad_norm": 7.166791915893555, + "learning_rate": 3.7931e-05, + "loss": 3.5147, + "step": 24150 + }, + { + "epoch": 0.484, + "grad_norm": 6.249309539794922, + "learning_rate": 3.7906e-05, + "loss": 3.6697, + "step": 24200 + }, + { + "epoch": 0.485, + "grad_norm": 7.935781478881836, + "learning_rate": 3.7881e-05, + "loss": 3.4738, + "step": 24250 + }, + { + "epoch": 0.486, + "grad_norm": 5.446045875549316, + "learning_rate": 3.7856000000000005e-05, + "loss": 3.3919, + "step": 24300 + }, + { + "epoch": 0.487, + "grad_norm": 6.253237724304199, + "learning_rate": 3.7831e-05, + "loss": 3.6019, + "step": 24350 + }, + { + "epoch": 0.488, + "grad_norm": 13.48686695098877, + "learning_rate": 3.7806000000000004e-05, + "loss": 3.5479, + "step": 24400 + }, + { + "epoch": 0.489, + "grad_norm": 7.377110481262207, + "learning_rate": 3.7781e-05, + "loss": 3.3436, + "step": 24450 + }, + { + "epoch": 0.49, + "grad_norm": 7.141451835632324, + "learning_rate": 3.7756e-05, + "loss": 3.4679, + "step": 24500 + }, + { + "epoch": 0.491, + "grad_norm": 5.041921615600586, + "learning_rate": 3.7731e-05, + "loss": 3.5736, + "step": 24550 + }, + { + "epoch": 0.492, + "grad_norm": 6.365973949432373, + "learning_rate": 3.7706e-05, + "loss": 3.5077, + "step": 24600 + }, + { + "epoch": 0.493, + "grad_norm": 5.532159805297852, + "learning_rate": 3.7681e-05, + "loss": 3.4038, + "step": 24650 + }, + { + "epoch": 0.494, + "grad_norm": 6.18168830871582, + "learning_rate": 3.7656e-05, + "loss": 3.36, + "step": 24700 + }, + { + "epoch": 0.495, + "grad_norm": 7.991237163543701, + "learning_rate": 3.7631e-05, + "loss": 3.4926, + "step": 24750 + }, + { + "epoch": 0.496, + "grad_norm": 7.126778602600098, + "learning_rate": 3.7606000000000006e-05, + "loss": 3.6006, + "step": 24800 + }, + { + "epoch": 0.497, + "grad_norm": 8.774191856384277, + "learning_rate": 3.7581e-05, + "loss": 3.551, + "step": 24850 + }, + { + "epoch": 0.498, + "grad_norm": 7.611205101013184, + "learning_rate": 3.7556000000000004e-05, + "loss": 3.3755, + "step": 24900 + }, + { + "epoch": 0.499, + "grad_norm": 8.540602684020996, + "learning_rate": 3.7531e-05, + "loss": 3.3131, + "step": 24950 + }, + { + "epoch": 0.5, + "grad_norm": 7.313963890075684, + "learning_rate": 3.7506e-05, + "loss": 3.4656, + "step": 25000 + }, + { + "epoch": 0.501, + "grad_norm": 5.498029708862305, + "learning_rate": 3.7481e-05, + "loss": 3.4094, + "step": 25050 + }, + { + "epoch": 0.502, + "grad_norm": 6.962924480438232, + "learning_rate": 3.7456e-05, + "loss": 3.6499, + "step": 25100 + }, + { + "epoch": 0.503, + "grad_norm": 7.007199764251709, + "learning_rate": 3.7431e-05, + "loss": 3.4367, + "step": 25150 + }, + { + "epoch": 0.504, + "grad_norm": 7.297113418579102, + "learning_rate": 3.74065e-05, + "loss": 3.4985, + "step": 25200 + }, + { + "epoch": 0.505, + "grad_norm": 6.30457878112793, + "learning_rate": 3.73815e-05, + "loss": 3.4324, + "step": 25250 + }, + { + "epoch": 0.506, + "grad_norm": 6.83005952835083, + "learning_rate": 3.73565e-05, + "loss": 3.3904, + "step": 25300 + }, + { + "epoch": 0.507, + "grad_norm": 7.69933557510376, + "learning_rate": 3.73315e-05, + "loss": 3.6869, + "step": 25350 + }, + { + "epoch": 0.508, + "grad_norm": 6.051611423492432, + "learning_rate": 3.73065e-05, + "loss": 3.5676, + "step": 25400 + }, + { + "epoch": 0.509, + "grad_norm": 6.487821578979492, + "learning_rate": 3.7281500000000005e-05, + "loss": 3.5218, + "step": 25450 + }, + { + "epoch": 0.51, + "grad_norm": 7.36344575881958, + "learning_rate": 3.72565e-05, + "loss": 3.4862, + "step": 25500 + }, + { + "epoch": 0.511, + "grad_norm": 5.60974645614624, + "learning_rate": 3.7231500000000004e-05, + "loss": 3.304, + "step": 25550 + }, + { + "epoch": 0.512, + "grad_norm": 5.354685306549072, + "learning_rate": 3.72065e-05, + "loss": 3.4184, + "step": 25600 + }, + { + "epoch": 0.513, + "grad_norm": 6.2862420082092285, + "learning_rate": 3.71815e-05, + "loss": 3.6906, + "step": 25650 + }, + { + "epoch": 0.514, + "grad_norm": 7.004101753234863, + "learning_rate": 3.71565e-05, + "loss": 3.4951, + "step": 25700 + }, + { + "epoch": 0.515, + "grad_norm": 11.132220268249512, + "learning_rate": 3.71315e-05, + "loss": 3.4476, + "step": 25750 + }, + { + "epoch": 0.516, + "grad_norm": 10.14500617980957, + "learning_rate": 3.71065e-05, + "loss": 3.5392, + "step": 25800 + }, + { + "epoch": 0.517, + "grad_norm": 5.003336429595947, + "learning_rate": 3.70815e-05, + "loss": 3.5125, + "step": 25850 + }, + { + "epoch": 0.518, + "grad_norm": 5.623690605163574, + "learning_rate": 3.70565e-05, + "loss": 3.4813, + "step": 25900 + }, + { + "epoch": 0.519, + "grad_norm": 6.5926432609558105, + "learning_rate": 3.70315e-05, + "loss": 3.5556, + "step": 25950 + }, + { + "epoch": 0.52, + "grad_norm": 6.16590690612793, + "learning_rate": 3.70065e-05, + "loss": 3.4139, + "step": 26000 + }, + { + "epoch": 0.521, + "grad_norm": 5.917094707489014, + "learning_rate": 3.6981500000000005e-05, + "loss": 3.5458, + "step": 26050 + }, + { + "epoch": 0.522, + "grad_norm": 6.361861705780029, + "learning_rate": 3.69565e-05, + "loss": 3.4147, + "step": 26100 + }, + { + "epoch": 0.523, + "grad_norm": 6.118766784667969, + "learning_rate": 3.6931500000000004e-05, + "loss": 3.668, + "step": 26150 + }, + { + "epoch": 0.524, + "grad_norm": 7.089566230773926, + "learning_rate": 3.6906499999999996e-05, + "loss": 3.3806, + "step": 26200 + }, + { + "epoch": 0.525, + "grad_norm": 5.34354829788208, + "learning_rate": 3.68815e-05, + "loss": 3.4482, + "step": 26250 + }, + { + "epoch": 0.526, + "grad_norm": 8.42415714263916, + "learning_rate": 3.68565e-05, + "loss": 3.5294, + "step": 26300 + }, + { + "epoch": 0.527, + "grad_norm": 6.866034984588623, + "learning_rate": 3.68315e-05, + "loss": 3.5308, + "step": 26350 + }, + { + "epoch": 0.528, + "grad_norm": 6.633973121643066, + "learning_rate": 3.68065e-05, + "loss": 3.598, + "step": 26400 + }, + { + "epoch": 0.529, + "grad_norm": 5.965973854064941, + "learning_rate": 3.67815e-05, + "loss": 3.6788, + "step": 26450 + }, + { + "epoch": 0.53, + "grad_norm": 8.220425605773926, + "learning_rate": 3.67565e-05, + "loss": 3.2695, + "step": 26500 + }, + { + "epoch": 0.531, + "grad_norm": 5.58171272277832, + "learning_rate": 3.67315e-05, + "loss": 3.4882, + "step": 26550 + }, + { + "epoch": 0.532, + "grad_norm": 6.113706111907959, + "learning_rate": 3.67065e-05, + "loss": 3.5617, + "step": 26600 + }, + { + "epoch": 0.533, + "grad_norm": 6.026816368103027, + "learning_rate": 3.6681500000000004e-05, + "loss": 3.5393, + "step": 26650 + }, + { + "epoch": 0.534, + "grad_norm": 7.218907356262207, + "learning_rate": 3.66565e-05, + "loss": 3.5193, + "step": 26700 + }, + { + "epoch": 0.535, + "grad_norm": 6.00888729095459, + "learning_rate": 3.66315e-05, + "loss": 3.4065, + "step": 26750 + }, + { + "epoch": 0.536, + "grad_norm": 7.0676679611206055, + "learning_rate": 3.6606499999999996e-05, + "loss": 3.2652, + "step": 26800 + }, + { + "epoch": 0.537, + "grad_norm": 5.561299800872803, + "learning_rate": 3.65815e-05, + "loss": 3.6312, + "step": 26850 + }, + { + "epoch": 0.538, + "grad_norm": 6.713068008422852, + "learning_rate": 3.65565e-05, + "loss": 3.6427, + "step": 26900 + }, + { + "epoch": 0.539, + "grad_norm": 5.804120063781738, + "learning_rate": 3.65315e-05, + "loss": 3.1819, + "step": 26950 + }, + { + "epoch": 0.54, + "grad_norm": 6.488461017608643, + "learning_rate": 3.65065e-05, + "loss": 3.4673, + "step": 27000 + }, + { + "epoch": 0.541, + "grad_norm": 6.780620098114014, + "learning_rate": 3.64815e-05, + "loss": 3.4887, + "step": 27050 + }, + { + "epoch": 0.542, + "grad_norm": 6.49476957321167, + "learning_rate": 3.64565e-05, + "loss": 3.6123, + "step": 27100 + }, + { + "epoch": 0.543, + "grad_norm": 6.915765285491943, + "learning_rate": 3.64315e-05, + "loss": 3.4393, + "step": 27150 + }, + { + "epoch": 0.544, + "grad_norm": 6.593012809753418, + "learning_rate": 3.6406500000000005e-05, + "loss": 3.5151, + "step": 27200 + }, + { + "epoch": 0.545, + "grad_norm": 6.4061760902404785, + "learning_rate": 3.6382e-05, + "loss": 3.4307, + "step": 27250 + }, + { + "epoch": 0.546, + "grad_norm": 6.907027721405029, + "learning_rate": 3.6357000000000004e-05, + "loss": 3.4679, + "step": 27300 + }, + { + "epoch": 0.547, + "grad_norm": 5.774991989135742, + "learning_rate": 3.6332e-05, + "loss": 3.3226, + "step": 27350 + }, + { + "epoch": 0.548, + "grad_norm": 5.766547679901123, + "learning_rate": 3.6307e-05, + "loss": 3.4283, + "step": 27400 + }, + { + "epoch": 0.549, + "grad_norm": 7.284669876098633, + "learning_rate": 3.6282e-05, + "loss": 3.4481, + "step": 27450 + }, + { + "epoch": 0.55, + "grad_norm": 6.108999729156494, + "learning_rate": 3.6257e-05, + "loss": 3.5764, + "step": 27500 + }, + { + "epoch": 0.551, + "grad_norm": 8.253243446350098, + "learning_rate": 3.62325e-05, + "loss": 3.4882, + "step": 27550 + }, + { + "epoch": 0.552, + "grad_norm": 8.250670433044434, + "learning_rate": 3.62075e-05, + "loss": 3.3237, + "step": 27600 + }, + { + "epoch": 0.553, + "grad_norm": 7.141805648803711, + "learning_rate": 3.61825e-05, + "loss": 3.5175, + "step": 27650 + }, + { + "epoch": 0.554, + "grad_norm": 7.745401859283447, + "learning_rate": 3.61575e-05, + "loss": 3.3908, + "step": 27700 + }, + { + "epoch": 0.555, + "grad_norm": 5.771395683288574, + "learning_rate": 3.61325e-05, + "loss": 3.2266, + "step": 27750 + }, + { + "epoch": 0.556, + "grad_norm": 7.92910623550415, + "learning_rate": 3.61075e-05, + "loss": 3.1754, + "step": 27800 + }, + { + "epoch": 0.557, + "grad_norm": 7.521282196044922, + "learning_rate": 3.6082500000000006e-05, + "loss": 3.5141, + "step": 27850 + }, + { + "epoch": 0.558, + "grad_norm": 6.065182209014893, + "learning_rate": 3.60575e-05, + "loss": 3.5914, + "step": 27900 + }, + { + "epoch": 0.559, + "grad_norm": 7.105879306793213, + "learning_rate": 3.6032500000000004e-05, + "loss": 3.2975, + "step": 27950 + }, + { + "epoch": 0.56, + "grad_norm": 7.2546515464782715, + "learning_rate": 3.60075e-05, + "loss": 3.5552, + "step": 28000 + }, + { + "epoch": 0.561, + "grad_norm": 7.344614505767822, + "learning_rate": 3.59825e-05, + "loss": 3.3722, + "step": 28050 + }, + { + "epoch": 0.562, + "grad_norm": 5.996744155883789, + "learning_rate": 3.59575e-05, + "loss": 3.4005, + "step": 28100 + }, + { + "epoch": 0.563, + "grad_norm": 5.243777751922607, + "learning_rate": 3.59325e-05, + "loss": 3.5161, + "step": 28150 + }, + { + "epoch": 0.564, + "grad_norm": 5.685503959655762, + "learning_rate": 3.59075e-05, + "loss": 3.4874, + "step": 28200 + }, + { + "epoch": 0.565, + "grad_norm": 6.744235038757324, + "learning_rate": 3.58825e-05, + "loss": 3.4842, + "step": 28250 + }, + { + "epoch": 0.566, + "grad_norm": 6.83828592300415, + "learning_rate": 3.58575e-05, + "loss": 3.3306, + "step": 28300 + }, + { + "epoch": 0.567, + "grad_norm": 6.369629859924316, + "learning_rate": 3.58325e-05, + "loss": 3.4862, + "step": 28350 + }, + { + "epoch": 0.568, + "grad_norm": 6.583117485046387, + "learning_rate": 3.58075e-05, + "loss": 3.2907, + "step": 28400 + }, + { + "epoch": 0.569, + "grad_norm": 8.253291130065918, + "learning_rate": 3.5782500000000005e-05, + "loss": 3.3894, + "step": 28450 + }, + { + "epoch": 0.57, + "grad_norm": 6.858260154724121, + "learning_rate": 3.57575e-05, + "loss": 3.4645, + "step": 28500 + }, + { + "epoch": 0.571, + "grad_norm": 6.5981221199035645, + "learning_rate": 3.5732500000000004e-05, + "loss": 3.5327, + "step": 28550 + }, + { + "epoch": 0.572, + "grad_norm": 6.859750270843506, + "learning_rate": 3.5707499999999997e-05, + "loss": 3.3362, + "step": 28600 + }, + { + "epoch": 0.573, + "grad_norm": 7.866117000579834, + "learning_rate": 3.56825e-05, + "loss": 3.6273, + "step": 28650 + }, + { + "epoch": 0.574, + "grad_norm": 5.867875576019287, + "learning_rate": 3.56575e-05, + "loss": 3.3157, + "step": 28700 + }, + { + "epoch": 0.575, + "grad_norm": 6.136321544647217, + "learning_rate": 3.56325e-05, + "loss": 3.345, + "step": 28750 + }, + { + "epoch": 0.576, + "grad_norm": 6.829803466796875, + "learning_rate": 3.56075e-05, + "loss": 3.3066, + "step": 28800 + }, + { + "epoch": 0.577, + "grad_norm": 6.177272319793701, + "learning_rate": 3.55825e-05, + "loss": 3.4357, + "step": 28850 + }, + { + "epoch": 0.578, + "grad_norm": 6.678221225738525, + "learning_rate": 3.55575e-05, + "loss": 3.4919, + "step": 28900 + }, + { + "epoch": 0.579, + "grad_norm": 5.453572750091553, + "learning_rate": 3.55325e-05, + "loss": 3.3439, + "step": 28950 + }, + { + "epoch": 0.58, + "grad_norm": 8.180026054382324, + "learning_rate": 3.55075e-05, + "loss": 3.4776, + "step": 29000 + }, + { + "epoch": 0.581, + "grad_norm": 6.252904415130615, + "learning_rate": 3.5482500000000005e-05, + "loss": 3.4176, + "step": 29050 + }, + { + "epoch": 0.582, + "grad_norm": 7.99691104888916, + "learning_rate": 3.54575e-05, + "loss": 3.3812, + "step": 29100 + }, + { + "epoch": 0.583, + "grad_norm": 6.05564022064209, + "learning_rate": 3.54325e-05, + "loss": 3.5308, + "step": 29150 + }, + { + "epoch": 0.584, + "grad_norm": 6.284146308898926, + "learning_rate": 3.5407499999999996e-05, + "loss": 3.3615, + "step": 29200 + }, + { + "epoch": 0.585, + "grad_norm": 10.308747291564941, + "learning_rate": 3.53825e-05, + "loss": 3.2485, + "step": 29250 + }, + { + "epoch": 0.586, + "grad_norm": 5.75729513168335, + "learning_rate": 3.53575e-05, + "loss": 3.3904, + "step": 29300 + }, + { + "epoch": 0.587, + "grad_norm": 6.623788356781006, + "learning_rate": 3.53325e-05, + "loss": 3.4116, + "step": 29350 + }, + { + "epoch": 0.588, + "grad_norm": 5.757374286651611, + "learning_rate": 3.530750000000001e-05, + "loss": 3.4118, + "step": 29400 + }, + { + "epoch": 0.589, + "grad_norm": 6.021833896636963, + "learning_rate": 3.52825e-05, + "loss": 3.3119, + "step": 29450 + }, + { + "epoch": 0.59, + "grad_norm": 6.991232395172119, + "learning_rate": 3.5257500000000006e-05, + "loss": 3.4385, + "step": 29500 + }, + { + "epoch": 0.591, + "grad_norm": 6.228766441345215, + "learning_rate": 3.52325e-05, + "loss": 3.3973, + "step": 29550 + }, + { + "epoch": 0.592, + "grad_norm": 6.7667131423950195, + "learning_rate": 3.5207500000000005e-05, + "loss": 3.492, + "step": 29600 + }, + { + "epoch": 0.593, + "grad_norm": 7.218946933746338, + "learning_rate": 3.5182500000000004e-05, + "loss": 3.4921, + "step": 29650 + }, + { + "epoch": 0.594, + "grad_norm": 6.98897123336792, + "learning_rate": 3.5157500000000003e-05, + "loss": 3.432, + "step": 29700 + }, + { + "epoch": 0.595, + "grad_norm": 7.497246265411377, + "learning_rate": 3.51325e-05, + "loss": 3.2246, + "step": 29750 + }, + { + "epoch": 0.596, + "grad_norm": 6.9698686599731445, + "learning_rate": 3.51075e-05, + "loss": 3.4781, + "step": 29800 + }, + { + "epoch": 0.597, + "grad_norm": 5.121500492095947, + "learning_rate": 3.50825e-05, + "loss": 3.4499, + "step": 29850 + }, + { + "epoch": 0.598, + "grad_norm": 6.432731628417969, + "learning_rate": 3.50575e-05, + "loss": 3.2076, + "step": 29900 + }, + { + "epoch": 0.599, + "grad_norm": 5.066705703735352, + "learning_rate": 3.50325e-05, + "loss": 3.4198, + "step": 29950 + }, + { + "epoch": 0.6, + "grad_norm": 6.430960655212402, + "learning_rate": 3.5007500000000007e-05, + "loss": 3.4104, + "step": 30000 + }, + { + "epoch": 0.601, + "grad_norm": 9.220848083496094, + "learning_rate": 3.49825e-05, + "loss": 3.4114, + "step": 30050 + }, + { + "epoch": 0.602, + "grad_norm": 6.596940517425537, + "learning_rate": 3.4957500000000005e-05, + "loss": 3.3112, + "step": 30100 + }, + { + "epoch": 0.603, + "grad_norm": 8.433357238769531, + "learning_rate": 3.49325e-05, + "loss": 3.499, + "step": 30150 + }, + { + "epoch": 0.604, + "grad_norm": 7.3862738609313965, + "learning_rate": 3.4907500000000004e-05, + "loss": 3.2682, + "step": 30200 + }, + { + "epoch": 0.605, + "grad_norm": 7.238501071929932, + "learning_rate": 3.4882500000000004e-05, + "loss": 3.4221, + "step": 30250 + }, + { + "epoch": 0.606, + "grad_norm": 6.758235931396484, + "learning_rate": 3.48575e-05, + "loss": 3.3936, + "step": 30300 + }, + { + "epoch": 0.607, + "grad_norm": 6.463011741638184, + "learning_rate": 3.48325e-05, + "loss": 3.5434, + "step": 30350 + }, + { + "epoch": 0.608, + "grad_norm": 8.514334678649902, + "learning_rate": 3.48075e-05, + "loss": 3.3497, + "step": 30400 + }, + { + "epoch": 0.609, + "grad_norm": 5.6331915855407715, + "learning_rate": 3.47825e-05, + "loss": 3.2981, + "step": 30450 + }, + { + "epoch": 0.61, + "grad_norm": 7.361815929412842, + "learning_rate": 3.475750000000001e-05, + "loss": 3.2069, + "step": 30500 + }, + { + "epoch": 0.611, + "grad_norm": 5.87959098815918, + "learning_rate": 3.47325e-05, + "loss": 3.3483, + "step": 30550 + }, + { + "epoch": 0.612, + "grad_norm": 7.229435443878174, + "learning_rate": 3.4707500000000006e-05, + "loss": 3.3584, + "step": 30600 + }, + { + "epoch": 0.613, + "grad_norm": 5.749815940856934, + "learning_rate": 3.46825e-05, + "loss": 3.5367, + "step": 30650 + }, + { + "epoch": 0.614, + "grad_norm": 7.94840145111084, + "learning_rate": 3.4657500000000005e-05, + "loss": 3.3931, + "step": 30700 + }, + { + "epoch": 0.615, + "grad_norm": 6.358616828918457, + "learning_rate": 3.4632500000000004e-05, + "loss": 3.2705, + "step": 30750 + }, + { + "epoch": 0.616, + "grad_norm": 6.277470111846924, + "learning_rate": 3.4607500000000004e-05, + "loss": 3.4044, + "step": 30800 + }, + { + "epoch": 0.617, + "grad_norm": 6.311099529266357, + "learning_rate": 3.45825e-05, + "loss": 3.3239, + "step": 30850 + }, + { + "epoch": 0.618, + "grad_norm": 7.405715465545654, + "learning_rate": 3.45575e-05, + "loss": 3.3971, + "step": 30900 + }, + { + "epoch": 0.619, + "grad_norm": 6.301971912384033, + "learning_rate": 3.45325e-05, + "loss": 3.4035, + "step": 30950 + }, + { + "epoch": 0.62, + "grad_norm": 6.6296844482421875, + "learning_rate": 3.45075e-05, + "loss": 3.4517, + "step": 31000 + }, + { + "epoch": 0.621, + "grad_norm": 6.209890842437744, + "learning_rate": 3.44825e-05, + "loss": 3.3755, + "step": 31050 + }, + { + "epoch": 0.622, + "grad_norm": 5.518044471740723, + "learning_rate": 3.445750000000001e-05, + "loss": 3.2734, + "step": 31100 + }, + { + "epoch": 0.623, + "grad_norm": 7.205735206604004, + "learning_rate": 3.44325e-05, + "loss": 3.3552, + "step": 31150 + }, + { + "epoch": 0.624, + "grad_norm": 6.634133815765381, + "learning_rate": 3.4407500000000006e-05, + "loss": 3.2855, + "step": 31200 + }, + { + "epoch": 0.625, + "grad_norm": 6.324268341064453, + "learning_rate": 3.43825e-05, + "loss": 3.4877, + "step": 31250 + }, + { + "epoch": 0.626, + "grad_norm": 5.836503505706787, + "learning_rate": 3.4357500000000004e-05, + "loss": 3.492, + "step": 31300 + }, + { + "epoch": 0.627, + "grad_norm": 6.615384101867676, + "learning_rate": 3.4332500000000004e-05, + "loss": 3.5195, + "step": 31350 + }, + { + "epoch": 0.628, + "grad_norm": 7.209743499755859, + "learning_rate": 3.43075e-05, + "loss": 3.4772, + "step": 31400 + }, + { + "epoch": 0.629, + "grad_norm": 8.48714828491211, + "learning_rate": 3.42825e-05, + "loss": 3.5931, + "step": 31450 + }, + { + "epoch": 0.63, + "grad_norm": 5.982743740081787, + "learning_rate": 3.42575e-05, + "loss": 3.5209, + "step": 31500 + }, + { + "epoch": 0.631, + "grad_norm": 6.397634029388428, + "learning_rate": 3.4233e-05, + "loss": 3.2492, + "step": 31550 + }, + { + "epoch": 0.632, + "grad_norm": 6.8665642738342285, + "learning_rate": 3.4208e-05, + "loss": 3.4509, + "step": 31600 + }, + { + "epoch": 0.633, + "grad_norm": 6.930782794952393, + "learning_rate": 3.4183e-05, + "loss": 3.2818, + "step": 31650 + }, + { + "epoch": 0.634, + "grad_norm": 5.9751296043396, + "learning_rate": 3.4158e-05, + "loss": 3.2229, + "step": 31700 + }, + { + "epoch": 0.635, + "grad_norm": 6.509876728057861, + "learning_rate": 3.413300000000001e-05, + "loss": 3.2482, + "step": 31750 + }, + { + "epoch": 0.636, + "grad_norm": 6.303530693054199, + "learning_rate": 3.4108e-05, + "loss": 3.3513, + "step": 31800 + }, + { + "epoch": 0.637, + "grad_norm": 5.964878559112549, + "learning_rate": 3.4083000000000006e-05, + "loss": 3.1939, + "step": 31850 + }, + { + "epoch": 0.638, + "grad_norm": 5.7284321784973145, + "learning_rate": 3.4058e-05, + "loss": 3.3163, + "step": 31900 + }, + { + "epoch": 0.639, + "grad_norm": 5.236353397369385, + "learning_rate": 3.4033000000000004e-05, + "loss": 3.3796, + "step": 31950 + }, + { + "epoch": 0.64, + "grad_norm": 6.227442741394043, + "learning_rate": 3.4008000000000004e-05, + "loss": 3.2888, + "step": 32000 + }, + { + "epoch": 0.641, + "grad_norm": 9.655413627624512, + "learning_rate": 3.3983e-05, + "loss": 3.5678, + "step": 32050 + }, + { + "epoch": 0.642, + "grad_norm": 5.565030574798584, + "learning_rate": 3.3958e-05, + "loss": 3.2497, + "step": 32100 + }, + { + "epoch": 0.643, + "grad_norm": 11.382092475891113, + "learning_rate": 3.3933e-05, + "loss": 3.2119, + "step": 32150 + }, + { + "epoch": 0.644, + "grad_norm": 6.986525058746338, + "learning_rate": 3.3908e-05, + "loss": 3.3342, + "step": 32200 + }, + { + "epoch": 0.645, + "grad_norm": 5.240586757659912, + "learning_rate": 3.3883e-05, + "loss": 3.2399, + "step": 32250 + }, + { + "epoch": 0.646, + "grad_norm": 6.879291534423828, + "learning_rate": 3.3858e-05, + "loss": 3.4391, + "step": 32300 + }, + { + "epoch": 0.647, + "grad_norm": 7.895921230316162, + "learning_rate": 3.3833000000000006e-05, + "loss": 3.3315, + "step": 32350 + }, + { + "epoch": 0.648, + "grad_norm": 6.591862678527832, + "learning_rate": 3.3808e-05, + "loss": 3.2601, + "step": 32400 + }, + { + "epoch": 0.649, + "grad_norm": 8.269440650939941, + "learning_rate": 3.3783000000000005e-05, + "loss": 3.1949, + "step": 32450 + }, + { + "epoch": 0.65, + "grad_norm": 7.437238693237305, + "learning_rate": 3.3758e-05, + "loss": 3.5878, + "step": 32500 + }, + { + "epoch": 0.651, + "grad_norm": 5.902334690093994, + "learning_rate": 3.3733000000000004e-05, + "loss": 3.2744, + "step": 32550 + }, + { + "epoch": 0.652, + "grad_norm": 8.443634986877441, + "learning_rate": 3.3708e-05, + "loss": 3.0574, + "step": 32600 + }, + { + "epoch": 0.653, + "grad_norm": 6.006880283355713, + "learning_rate": 3.3683e-05, + "loss": 3.2878, + "step": 32650 + }, + { + "epoch": 0.654, + "grad_norm": 6.461446762084961, + "learning_rate": 3.3658e-05, + "loss": 3.2513, + "step": 32700 + }, + { + "epoch": 0.655, + "grad_norm": 7.1918487548828125, + "learning_rate": 3.3633e-05, + "loss": 3.2637, + "step": 32750 + }, + { + "epoch": 0.656, + "grad_norm": 7.551942825317383, + "learning_rate": 3.3608e-05, + "loss": 3.3599, + "step": 32800 + }, + { + "epoch": 0.657, + "grad_norm": 6.817183971405029, + "learning_rate": 3.3583e-05, + "loss": 3.5034, + "step": 32850 + }, + { + "epoch": 0.658, + "grad_norm": 6.113791465759277, + "learning_rate": 3.3558e-05, + "loss": 3.2996, + "step": 32900 + }, + { + "epoch": 0.659, + "grad_norm": 6.195582389831543, + "learning_rate": 3.3533000000000006e-05, + "loss": 3.3898, + "step": 32950 + }, + { + "epoch": 0.66, + "grad_norm": 6.301150798797607, + "learning_rate": 3.3508e-05, + "loss": 3.2412, + "step": 33000 + }, + { + "epoch": 0.661, + "grad_norm": 5.854180812835693, + "learning_rate": 3.34835e-05, + "loss": 3.4688, + "step": 33050 + }, + { + "epoch": 0.662, + "grad_norm": 9.124200820922852, + "learning_rate": 3.3458500000000005e-05, + "loss": 3.3273, + "step": 33100 + }, + { + "epoch": 0.663, + "grad_norm": 7.126101970672607, + "learning_rate": 3.34335e-05, + "loss": 3.2921, + "step": 33150 + }, + { + "epoch": 0.664, + "grad_norm": 7.3969526290893555, + "learning_rate": 3.3408500000000004e-05, + "loss": 3.2588, + "step": 33200 + }, + { + "epoch": 0.665, + "grad_norm": 6.349921703338623, + "learning_rate": 3.33835e-05, + "loss": 3.1005, + "step": 33250 + }, + { + "epoch": 0.666, + "grad_norm": 7.069624900817871, + "learning_rate": 3.33585e-05, + "loss": 3.3315, + "step": 33300 + }, + { + "epoch": 0.667, + "grad_norm": 6.957205772399902, + "learning_rate": 3.33335e-05, + "loss": 3.0547, + "step": 33350 + }, + { + "epoch": 0.668, + "grad_norm": 6.064169406890869, + "learning_rate": 3.33085e-05, + "loss": 3.4743, + "step": 33400 + }, + { + "epoch": 0.669, + "grad_norm": 8.175496101379395, + "learning_rate": 3.32835e-05, + "loss": 3.1822, + "step": 33450 + }, + { + "epoch": 0.67, + "grad_norm": 8.576241493225098, + "learning_rate": 3.325850000000001e-05, + "loss": 3.1038, + "step": 33500 + }, + { + "epoch": 0.671, + "grad_norm": 6.801853179931641, + "learning_rate": 3.32335e-05, + "loss": 3.1863, + "step": 33550 + }, + { + "epoch": 0.672, + "grad_norm": 5.9864983558654785, + "learning_rate": 3.3208500000000006e-05, + "loss": 3.4432, + "step": 33600 + }, + { + "epoch": 0.673, + "grad_norm": 9.82041072845459, + "learning_rate": 3.31835e-05, + "loss": 3.3234, + "step": 33650 + }, + { + "epoch": 0.674, + "grad_norm": 6.94479513168335, + "learning_rate": 3.3158500000000004e-05, + "loss": 3.3856, + "step": 33700 + }, + { + "epoch": 0.675, + "grad_norm": 7.436299800872803, + "learning_rate": 3.3133500000000004e-05, + "loss": 3.4031, + "step": 33750 + }, + { + "epoch": 0.676, + "grad_norm": 7.951491832733154, + "learning_rate": 3.31085e-05, + "loss": 3.1939, + "step": 33800 + }, + { + "epoch": 0.677, + "grad_norm": 5.3420000076293945, + "learning_rate": 3.30835e-05, + "loss": 3.277, + "step": 33850 + }, + { + "epoch": 0.678, + "grad_norm": 6.747936725616455, + "learning_rate": 3.30585e-05, + "loss": 3.457, + "step": 33900 + }, + { + "epoch": 0.679, + "grad_norm": 7.466944217681885, + "learning_rate": 3.30335e-05, + "loss": 3.4629, + "step": 33950 + }, + { + "epoch": 0.68, + "grad_norm": 7.159815311431885, + "learning_rate": 3.30085e-05, + "loss": 3.3527, + "step": 34000 + }, + { + "epoch": 0.681, + "grad_norm": 6.799013137817383, + "learning_rate": 3.29835e-05, + "loss": 3.2996, + "step": 34050 + }, + { + "epoch": 0.682, + "grad_norm": 7.7746262550354, + "learning_rate": 3.2958500000000006e-05, + "loss": 3.2661, + "step": 34100 + }, + { + "epoch": 0.683, + "grad_norm": 6.598784923553467, + "learning_rate": 3.29335e-05, + "loss": 3.3371, + "step": 34150 + }, + { + "epoch": 0.684, + "grad_norm": 6.144473075866699, + "learning_rate": 3.2908500000000005e-05, + "loss": 3.2466, + "step": 34200 + }, + { + "epoch": 0.685, + "grad_norm": 7.488722324371338, + "learning_rate": 3.28835e-05, + "loss": 3.2008, + "step": 34250 + }, + { + "epoch": 0.686, + "grad_norm": 5.943243980407715, + "learning_rate": 3.2858500000000004e-05, + "loss": 3.1717, + "step": 34300 + }, + { + "epoch": 0.687, + "grad_norm": 7.279262542724609, + "learning_rate": 3.28335e-05, + "loss": 3.2673, + "step": 34350 + }, + { + "epoch": 0.688, + "grad_norm": 7.265892028808594, + "learning_rate": 3.28085e-05, + "loss": 3.3252, + "step": 34400 + }, + { + "epoch": 0.689, + "grad_norm": 7.290516376495361, + "learning_rate": 3.27835e-05, + "loss": 3.1889, + "step": 34450 + }, + { + "epoch": 0.69, + "grad_norm": 5.666197776794434, + "learning_rate": 3.27585e-05, + "loss": 3.2569, + "step": 34500 + }, + { + "epoch": 0.691, + "grad_norm": 6.722541809082031, + "learning_rate": 3.27335e-05, + "loss": 3.0594, + "step": 34550 + }, + { + "epoch": 0.692, + "grad_norm": 6.768825531005859, + "learning_rate": 3.27085e-05, + "loss": 3.2594, + "step": 34600 + }, + { + "epoch": 0.693, + "grad_norm": 6.368803024291992, + "learning_rate": 3.26835e-05, + "loss": 3.1851, + "step": 34650 + }, + { + "epoch": 0.694, + "grad_norm": 8.27947998046875, + "learning_rate": 3.2658500000000006e-05, + "loss": 3.2759, + "step": 34700 + }, + { + "epoch": 0.695, + "grad_norm": 6.843438625335693, + "learning_rate": 3.26335e-05, + "loss": 3.1578, + "step": 34750 + }, + { + "epoch": 0.696, + "grad_norm": 8.467235565185547, + "learning_rate": 3.2608500000000005e-05, + "loss": 3.2223, + "step": 34800 + }, + { + "epoch": 0.697, + "grad_norm": 5.828463554382324, + "learning_rate": 3.25835e-05, + "loss": 3.2224, + "step": 34850 + }, + { + "epoch": 0.698, + "grad_norm": 6.603434085845947, + "learning_rate": 3.2558500000000003e-05, + "loss": 3.3885, + "step": 34900 + }, + { + "epoch": 0.699, + "grad_norm": 6.739859580993652, + "learning_rate": 3.25335e-05, + "loss": 3.1141, + "step": 34950 + }, + { + "epoch": 0.7, + "grad_norm": 6.304507255554199, + "learning_rate": 3.25085e-05, + "loss": 3.2604, + "step": 35000 + }, + { + "epoch": 0.701, + "grad_norm": 7.21973180770874, + "learning_rate": 3.24835e-05, + "loss": 3.2029, + "step": 35050 + }, + { + "epoch": 0.702, + "grad_norm": 6.183377742767334, + "learning_rate": 3.24585e-05, + "loss": 3.3226, + "step": 35100 + }, + { + "epoch": 0.703, + "grad_norm": 7.0609540939331055, + "learning_rate": 3.24335e-05, + "loss": 3.186, + "step": 35150 + }, + { + "epoch": 0.704, + "grad_norm": 6.175162315368652, + "learning_rate": 3.24085e-05, + "loss": 3.2179, + "step": 35200 + }, + { + "epoch": 0.705, + "grad_norm": 5.934025764465332, + "learning_rate": 3.23835e-05, + "loss": 3.2335, + "step": 35250 + }, + { + "epoch": 0.706, + "grad_norm": 8.728473663330078, + "learning_rate": 3.2358500000000005e-05, + "loss": 3.2551, + "step": 35300 + }, + { + "epoch": 0.707, + "grad_norm": 7.110035419464111, + "learning_rate": 3.23335e-05, + "loss": 3.2937, + "step": 35350 + }, + { + "epoch": 0.708, + "grad_norm": 6.184364318847656, + "learning_rate": 3.2308500000000004e-05, + "loss": 3.0839, + "step": 35400 + }, + { + "epoch": 0.709, + "grad_norm": 6.328525066375732, + "learning_rate": 3.22835e-05, + "loss": 3.1893, + "step": 35450 + }, + { + "epoch": 0.71, + "grad_norm": 6.684478759765625, + "learning_rate": 3.22585e-05, + "loss": 3.1691, + "step": 35500 + }, + { + "epoch": 0.711, + "grad_norm": 5.313352584838867, + "learning_rate": 3.22335e-05, + "loss": 3.4243, + "step": 35550 + }, + { + "epoch": 0.712, + "grad_norm": 6.211653709411621, + "learning_rate": 3.22085e-05, + "loss": 3.5204, + "step": 35600 + }, + { + "epoch": 0.713, + "grad_norm": 5.643269062042236, + "learning_rate": 3.21835e-05, + "loss": 3.4294, + "step": 35650 + }, + { + "epoch": 0.714, + "grad_norm": 7.590113639831543, + "learning_rate": 3.21585e-05, + "loss": 3.4033, + "step": 35700 + }, + { + "epoch": 0.715, + "grad_norm": 7.15920352935791, + "learning_rate": 3.21335e-05, + "loss": 3.1821, + "step": 35750 + }, + { + "epoch": 0.716, + "grad_norm": 6.130990505218506, + "learning_rate": 3.21085e-05, + "loss": 3.2556, + "step": 35800 + }, + { + "epoch": 0.717, + "grad_norm": 6.418147563934326, + "learning_rate": 3.20835e-05, + "loss": 3.1433, + "step": 35850 + }, + { + "epoch": 0.718, + "grad_norm": 7.010365009307861, + "learning_rate": 3.2058500000000005e-05, + "loss": 3.3129, + "step": 35900 + }, + { + "epoch": 0.719, + "grad_norm": 6.206649303436279, + "learning_rate": 3.20335e-05, + "loss": 3.5057, + "step": 35950 + }, + { + "epoch": 0.72, + "grad_norm": 6.868955612182617, + "learning_rate": 3.2008500000000004e-05, + "loss": 3.2482, + "step": 36000 + }, + { + "epoch": 0.721, + "grad_norm": 6.5969343185424805, + "learning_rate": 3.1983499999999996e-05, + "loss": 3.2539, + "step": 36050 + }, + { + "epoch": 0.722, + "grad_norm": 6.978048801422119, + "learning_rate": 3.19585e-05, + "loss": 3.1573, + "step": 36100 + }, + { + "epoch": 0.723, + "grad_norm": 6.406214714050293, + "learning_rate": 3.19335e-05, + "loss": 3.3721, + "step": 36150 + }, + { + "epoch": 0.724, + "grad_norm": 9.857353210449219, + "learning_rate": 3.19085e-05, + "loss": 3.3179, + "step": 36200 + }, + { + "epoch": 0.725, + "grad_norm": 6.391420841217041, + "learning_rate": 3.18835e-05, + "loss": 3.3233, + "step": 36250 + }, + { + "epoch": 0.726, + "grad_norm": 8.015118598937988, + "learning_rate": 3.18585e-05, + "loss": 3.0956, + "step": 36300 + }, + { + "epoch": 0.727, + "grad_norm": 5.823697566986084, + "learning_rate": 3.18335e-05, + "loss": 3.1998, + "step": 36350 + }, + { + "epoch": 0.728, + "grad_norm": 7.559394359588623, + "learning_rate": 3.1808500000000006e-05, + "loss": 3.2908, + "step": 36400 + }, + { + "epoch": 0.729, + "grad_norm": 6.329631328582764, + "learning_rate": 3.17835e-05, + "loss": 3.2908, + "step": 36450 + }, + { + "epoch": 0.73, + "grad_norm": 5.789269924163818, + "learning_rate": 3.1758500000000004e-05, + "loss": 3.3667, + "step": 36500 + }, + { + "epoch": 0.731, + "grad_norm": 5.785370349884033, + "learning_rate": 3.17335e-05, + "loss": 3.1064, + "step": 36550 + }, + { + "epoch": 0.732, + "grad_norm": 7.6929755210876465, + "learning_rate": 3.17085e-05, + "loss": 2.9547, + "step": 36600 + }, + { + "epoch": 0.733, + "grad_norm": 9.009896278381348, + "learning_rate": 3.16835e-05, + "loss": 3.0989, + "step": 36650 + }, + { + "epoch": 0.734, + "grad_norm": 6.26234245300293, + "learning_rate": 3.16585e-05, + "loss": 3.2564, + "step": 36700 + }, + { + "epoch": 0.735, + "grad_norm": 5.810740947723389, + "learning_rate": 3.16335e-05, + "loss": 3.0043, + "step": 36750 + }, + { + "epoch": 0.736, + "grad_norm": 6.051753520965576, + "learning_rate": 3.16085e-05, + "loss": 3.1809, + "step": 36800 + }, + { + "epoch": 0.737, + "grad_norm": 7.068359851837158, + "learning_rate": 3.15835e-05, + "loss": 3.2625, + "step": 36850 + }, + { + "epoch": 0.738, + "grad_norm": 8.538505554199219, + "learning_rate": 3.15585e-05, + "loss": 3.1945, + "step": 36900 + }, + { + "epoch": 0.739, + "grad_norm": 8.019240379333496, + "learning_rate": 3.15335e-05, + "loss": 3.1444, + "step": 36950 + }, + { + "epoch": 0.74, + "grad_norm": 5.253057479858398, + "learning_rate": 3.1508500000000005e-05, + "loss": 2.8749, + "step": 37000 + }, + { + "epoch": 0.741, + "grad_norm": 6.1826491355896, + "learning_rate": 3.14835e-05, + "loss": 3.1814, + "step": 37050 + }, + { + "epoch": 0.742, + "grad_norm": 6.367573261260986, + "learning_rate": 3.1458500000000004e-05, + "loss": 3.3143, + "step": 37100 + }, + { + "epoch": 0.743, + "grad_norm": 7.386603832244873, + "learning_rate": 3.1433499999999996e-05, + "loss": 3.4178, + "step": 37150 + }, + { + "epoch": 0.744, + "grad_norm": 7.538274765014648, + "learning_rate": 3.1409e-05, + "loss": 3.2942, + "step": 37200 + }, + { + "epoch": 0.745, + "grad_norm": 8.587214469909668, + "learning_rate": 3.1384e-05, + "loss": 3.1972, + "step": 37250 + }, + { + "epoch": 0.746, + "grad_norm": 6.307685375213623, + "learning_rate": 3.1359e-05, + "loss": 3.2638, + "step": 37300 + }, + { + "epoch": 0.747, + "grad_norm": 7.901178359985352, + "learning_rate": 3.1334e-05, + "loss": 3.212, + "step": 37350 + }, + { + "epoch": 0.748, + "grad_norm": 7.572473526000977, + "learning_rate": 3.1309e-05, + "loss": 2.9569, + "step": 37400 + }, + { + "epoch": 0.749, + "grad_norm": 6.52233362197876, + "learning_rate": 3.1284e-05, + "loss": 3.3622, + "step": 37450 + }, + { + "epoch": 0.75, + "grad_norm": 7.932403564453125, + "learning_rate": 3.1259e-05, + "loss": 3.1381, + "step": 37500 + }, + { + "epoch": 0.751, + "grad_norm": 6.504958152770996, + "learning_rate": 3.1234e-05, + "loss": 3.0979, + "step": 37550 + }, + { + "epoch": 0.752, + "grad_norm": 8.710679054260254, + "learning_rate": 3.1209e-05, + "loss": 3.2664, + "step": 37600 + }, + { + "epoch": 0.753, + "grad_norm": 5.697458744049072, + "learning_rate": 3.1184000000000005e-05, + "loss": 3.3793, + "step": 37650 + }, + { + "epoch": 0.754, + "grad_norm": 7.049675941467285, + "learning_rate": 3.1159e-05, + "loss": 3.3189, + "step": 37700 + }, + { + "epoch": 0.755, + "grad_norm": 6.419721603393555, + "learning_rate": 3.1134000000000004e-05, + "loss": 3.1938, + "step": 37750 + }, + { + "epoch": 0.756, + "grad_norm": 10.290610313415527, + "learning_rate": 3.1108999999999996e-05, + "loss": 3.2407, + "step": 37800 + }, + { + "epoch": 0.757, + "grad_norm": 7.655055999755859, + "learning_rate": 3.1084e-05, + "loss": 3.1777, + "step": 37850 + }, + { + "epoch": 0.758, + "grad_norm": 7.728888034820557, + "learning_rate": 3.1059e-05, + "loss": 3.0303, + "step": 37900 + }, + { + "epoch": 0.759, + "grad_norm": 7.557064533233643, + "learning_rate": 3.1034e-05, + "loss": 3.1784, + "step": 37950 + }, + { + "epoch": 0.76, + "grad_norm": 7.15919303894043, + "learning_rate": 3.1009e-05, + "loss": 3.2102, + "step": 38000 + }, + { + "epoch": 0.761, + "grad_norm": 5.564249038696289, + "learning_rate": 3.0984e-05, + "loss": 3.3053, + "step": 38050 + }, + { + "epoch": 0.762, + "grad_norm": 8.526185035705566, + "learning_rate": 3.0959e-05, + "loss": 3.4698, + "step": 38100 + }, + { + "epoch": 0.763, + "grad_norm": 7.014064311981201, + "learning_rate": 3.0934e-05, + "loss": 3.1349, + "step": 38150 + }, + { + "epoch": 0.764, + "grad_norm": 6.149911880493164, + "learning_rate": 3.0909e-05, + "loss": 3.3659, + "step": 38200 + }, + { + "epoch": 0.765, + "grad_norm": 7.0216064453125, + "learning_rate": 3.0884000000000004e-05, + "loss": 3.3658, + "step": 38250 + }, + { + "epoch": 0.766, + "grad_norm": 6.487388610839844, + "learning_rate": 3.0859e-05, + "loss": 3.1718, + "step": 38300 + }, + { + "epoch": 0.767, + "grad_norm": 7.192234039306641, + "learning_rate": 3.0834e-05, + "loss": 3.1795, + "step": 38350 + }, + { + "epoch": 0.768, + "grad_norm": 6.610414981842041, + "learning_rate": 3.0808999999999996e-05, + "loss": 3.0035, + "step": 38400 + }, + { + "epoch": 0.769, + "grad_norm": 5.408251762390137, + "learning_rate": 3.0784e-05, + "loss": 3.4079, + "step": 38450 + }, + { + "epoch": 0.77, + "grad_norm": 6.19015645980835, + "learning_rate": 3.0759e-05, + "loss": 3.2097, + "step": 38500 + }, + { + "epoch": 0.771, + "grad_norm": 8.879510879516602, + "learning_rate": 3.0734e-05, + "loss": 3.1956, + "step": 38550 + }, + { + "epoch": 0.772, + "grad_norm": 6.8085198402404785, + "learning_rate": 3.0709e-05, + "loss": 3.191, + "step": 38600 + }, + { + "epoch": 0.773, + "grad_norm": 6.9796671867370605, + "learning_rate": 3.0684e-05, + "loss": 3.0009, + "step": 38650 + }, + { + "epoch": 0.774, + "grad_norm": 6.667499542236328, + "learning_rate": 3.0659e-05, + "loss": 3.1651, + "step": 38700 + }, + { + "epoch": 0.775, + "grad_norm": 7.873382091522217, + "learning_rate": 3.0634e-05, + "loss": 3.2115, + "step": 38750 + }, + { + "epoch": 0.776, + "grad_norm": 7.049197673797607, + "learning_rate": 3.0609e-05, + "loss": 3.1057, + "step": 38800 + }, + { + "epoch": 0.777, + "grad_norm": 5.474276065826416, + "learning_rate": 3.0584000000000004e-05, + "loss": 3.205, + "step": 38850 + }, + { + "epoch": 0.778, + "grad_norm": 7.194087028503418, + "learning_rate": 3.0558999999999997e-05, + "loss": 3.1913, + "step": 38900 + }, + { + "epoch": 0.779, + "grad_norm": 8.143173217773438, + "learning_rate": 3.0534e-05, + "loss": 3.3131, + "step": 38950 + }, + { + "epoch": 0.78, + "grad_norm": 5.855649948120117, + "learning_rate": 3.0509e-05, + "loss": 3.0642, + "step": 39000 + }, + { + "epoch": 0.781, + "grad_norm": 5.941763401031494, + "learning_rate": 3.0484e-05, + "loss": 3.0076, + "step": 39050 + }, + { + "epoch": 0.782, + "grad_norm": 5.700584888458252, + "learning_rate": 3.0459000000000004e-05, + "loss": 3.1064, + "step": 39100 + }, + { + "epoch": 0.783, + "grad_norm": 7.375488758087158, + "learning_rate": 3.0434e-05, + "loss": 3.4071, + "step": 39150 + }, + { + "epoch": 0.784, + "grad_norm": 7.246762752532959, + "learning_rate": 3.0409000000000003e-05, + "loss": 3.1041, + "step": 39200 + }, + { + "epoch": 0.785, + "grad_norm": 6.453239917755127, + "learning_rate": 3.03845e-05, + "loss": 3.3334, + "step": 39250 + }, + { + "epoch": 0.786, + "grad_norm": 6.580065727233887, + "learning_rate": 3.0359500000000003e-05, + "loss": 3.0096, + "step": 39300 + }, + { + "epoch": 0.787, + "grad_norm": 6.784661293029785, + "learning_rate": 3.03345e-05, + "loss": 3.345, + "step": 39350 + }, + { + "epoch": 0.788, + "grad_norm": 5.282735824584961, + "learning_rate": 3.0309500000000002e-05, + "loss": 3.2078, + "step": 39400 + }, + { + "epoch": 0.789, + "grad_norm": 6.952493190765381, + "learning_rate": 3.0284499999999998e-05, + "loss": 3.2125, + "step": 39450 + }, + { + "epoch": 0.79, + "grad_norm": 7.221185207366943, + "learning_rate": 3.02595e-05, + "loss": 3.1933, + "step": 39500 + }, + { + "epoch": 0.791, + "grad_norm": 6.6697564125061035, + "learning_rate": 3.02345e-05, + "loss": 3.2683, + "step": 39550 + }, + { + "epoch": 0.792, + "grad_norm": 7.7430243492126465, + "learning_rate": 3.0209500000000003e-05, + "loss": 3.2786, + "step": 39600 + }, + { + "epoch": 0.793, + "grad_norm": 7.560022830963135, + "learning_rate": 3.01845e-05, + "loss": 3.1485, + "step": 39650 + }, + { + "epoch": 0.794, + "grad_norm": 9.309632301330566, + "learning_rate": 3.01595e-05, + "loss": 3.3837, + "step": 39700 + }, + { + "epoch": 0.795, + "grad_norm": 6.466885089874268, + "learning_rate": 3.0134499999999998e-05, + "loss": 3.0223, + "step": 39750 + }, + { + "epoch": 0.796, + "grad_norm": 7.028127670288086, + "learning_rate": 3.01095e-05, + "loss": 3.2119, + "step": 39800 + }, + { + "epoch": 0.797, + "grad_norm": 6.102771282196045, + "learning_rate": 3.00845e-05, + "loss": 3.1982, + "step": 39850 + }, + { + "epoch": 0.798, + "grad_norm": 6.1855387687683105, + "learning_rate": 3.0059500000000002e-05, + "loss": 3.1221, + "step": 39900 + }, + { + "epoch": 0.799, + "grad_norm": 6.996290683746338, + "learning_rate": 3.00345e-05, + "loss": 3.1688, + "step": 39950 + }, + { + "epoch": 0.8, + "grad_norm": 5.62674617767334, + "learning_rate": 3.00095e-05, + "loss": 2.9733, + "step": 40000 + }, + { + "epoch": 0.801, + "grad_norm": 6.106110572814941, + "learning_rate": 2.9984499999999997e-05, + "loss": 3.1656, + "step": 40050 + }, + { + "epoch": 0.802, + "grad_norm": 6.850236415863037, + "learning_rate": 2.99595e-05, + "loss": 3.1764, + "step": 40100 + }, + { + "epoch": 0.803, + "grad_norm": 7.164166450500488, + "learning_rate": 2.9934500000000003e-05, + "loss": 3.1813, + "step": 40150 + }, + { + "epoch": 0.804, + "grad_norm": 8.257601737976074, + "learning_rate": 2.9909500000000002e-05, + "loss": 3.0087, + "step": 40200 + }, + { + "epoch": 0.805, + "grad_norm": 6.853578090667725, + "learning_rate": 2.9884500000000005e-05, + "loss": 3.1218, + "step": 40250 + }, + { + "epoch": 0.806, + "grad_norm": 6.926779270172119, + "learning_rate": 2.98595e-05, + "loss": 3.2044, + "step": 40300 + }, + { + "epoch": 0.807, + "grad_norm": 6.643428802490234, + "learning_rate": 2.9834500000000004e-05, + "loss": 3.2425, + "step": 40350 + }, + { + "epoch": 0.808, + "grad_norm": 7.304353713989258, + "learning_rate": 2.98095e-05, + "loss": 3.2175, + "step": 40400 + }, + { + "epoch": 0.809, + "grad_norm": 6.92768669128418, + "learning_rate": 2.9784500000000003e-05, + "loss": 3.2095, + "step": 40450 + }, + { + "epoch": 0.81, + "grad_norm": 7.586690425872803, + "learning_rate": 2.9759500000000002e-05, + "loss": 3.2259, + "step": 40500 + }, + { + "epoch": 0.811, + "grad_norm": 6.485148906707764, + "learning_rate": 2.9734500000000005e-05, + "loss": 3.1969, + "step": 40550 + }, + { + "epoch": 0.812, + "grad_norm": 7.481224060058594, + "learning_rate": 2.97095e-05, + "loss": 3.1681, + "step": 40600 + }, + { + "epoch": 0.813, + "grad_norm": 6.11465311050415, + "learning_rate": 2.9684500000000004e-05, + "loss": 3.2654, + "step": 40650 + }, + { + "epoch": 0.814, + "grad_norm": 7.569796562194824, + "learning_rate": 2.96595e-05, + "loss": 3.2122, + "step": 40700 + }, + { + "epoch": 0.815, + "grad_norm": 6.672354698181152, + "learning_rate": 2.9634500000000002e-05, + "loss": 3.0291, + "step": 40750 + }, + { + "epoch": 0.816, + "grad_norm": 14.324336051940918, + "learning_rate": 2.9609500000000002e-05, + "loss": 3.2593, + "step": 40800 + }, + { + "epoch": 0.817, + "grad_norm": 6.56348991394043, + "learning_rate": 2.9584500000000004e-05, + "loss": 3.2292, + "step": 40850 + }, + { + "epoch": 0.818, + "grad_norm": 7.05709981918335, + "learning_rate": 2.95595e-05, + "loss": 3.1446, + "step": 40900 + }, + { + "epoch": 0.819, + "grad_norm": 6.213717937469482, + "learning_rate": 2.9534500000000003e-05, + "loss": 3.1543, + "step": 40950 + }, + { + "epoch": 0.82, + "grad_norm": 9.178669929504395, + "learning_rate": 2.9509500000000003e-05, + "loss": 3.0867, + "step": 41000 + }, + { + "epoch": 0.821, + "grad_norm": 7.343588829040527, + "learning_rate": 2.9484500000000005e-05, + "loss": 3.0533, + "step": 41050 + }, + { + "epoch": 0.822, + "grad_norm": 8.41699504852295, + "learning_rate": 2.94595e-05, + "loss": 2.9823, + "step": 41100 + }, + { + "epoch": 0.823, + "grad_norm": 6.462010860443115, + "learning_rate": 2.9434500000000004e-05, + "loss": 3.134, + "step": 41150 + }, + { + "epoch": 0.824, + "grad_norm": 7.462036609649658, + "learning_rate": 2.94095e-05, + "loss": 3.2841, + "step": 41200 + }, + { + "epoch": 0.825, + "grad_norm": 5.845949649810791, + "learning_rate": 2.9385e-05, + "loss": 3.1668, + "step": 41250 + }, + { + "epoch": 0.826, + "grad_norm": 8.043434143066406, + "learning_rate": 2.9360000000000003e-05, + "loss": 3.092, + "step": 41300 + }, + { + "epoch": 0.827, + "grad_norm": 6.7618842124938965, + "learning_rate": 2.9335000000000003e-05, + "loss": 3.2226, + "step": 41350 + }, + { + "epoch": 0.828, + "grad_norm": 5.803843021392822, + "learning_rate": 2.9310000000000006e-05, + "loss": 3.0952, + "step": 41400 + }, + { + "epoch": 0.829, + "grad_norm": 8.092726707458496, + "learning_rate": 2.9285e-05, + "loss": 3.338, + "step": 41450 + }, + { + "epoch": 0.83, + "grad_norm": 6.214277267456055, + "learning_rate": 2.9260000000000004e-05, + "loss": 3.2117, + "step": 41500 + }, + { + "epoch": 0.831, + "grad_norm": 6.212014675140381, + "learning_rate": 2.9235e-05, + "loss": 3.1922, + "step": 41550 + }, + { + "epoch": 0.832, + "grad_norm": 6.231184482574463, + "learning_rate": 2.9210000000000003e-05, + "loss": 3.1626, + "step": 41600 + }, + { + "epoch": 0.833, + "grad_norm": 8.141480445861816, + "learning_rate": 2.9185000000000003e-05, + "loss": 3.2031, + "step": 41650 + }, + { + "epoch": 0.834, + "grad_norm": 5.561233043670654, + "learning_rate": 2.9160000000000005e-05, + "loss": 3.2058, + "step": 41700 + }, + { + "epoch": 0.835, + "grad_norm": 6.850268363952637, + "learning_rate": 2.9135e-05, + "loss": 3.1208, + "step": 41750 + }, + { + "epoch": 0.836, + "grad_norm": 6.550021171569824, + "learning_rate": 2.9110000000000004e-05, + "loss": 3.1654, + "step": 41800 + }, + { + "epoch": 0.837, + "grad_norm": 6.2863359451293945, + "learning_rate": 2.9085e-05, + "loss": 2.9412, + "step": 41850 + }, + { + "epoch": 0.838, + "grad_norm": 7.536694526672363, + "learning_rate": 2.9060000000000003e-05, + "loss": 3.3024, + "step": 41900 + }, + { + "epoch": 0.839, + "grad_norm": 6.780869007110596, + "learning_rate": 2.9035000000000002e-05, + "loss": 3.0517, + "step": 41950 + }, + { + "epoch": 0.84, + "grad_norm": 6.739259243011475, + "learning_rate": 2.9010000000000005e-05, + "loss": 3.2168, + "step": 42000 + }, + { + "epoch": 0.841, + "grad_norm": 6.765299320220947, + "learning_rate": 2.8985e-05, + "loss": 3.2832, + "step": 42050 + }, + { + "epoch": 0.842, + "grad_norm": 9.602505683898926, + "learning_rate": 2.8960000000000004e-05, + "loss": 3.1125, + "step": 42100 + }, + { + "epoch": 0.843, + "grad_norm": 8.207633972167969, + "learning_rate": 2.8935e-05, + "loss": 3.1905, + "step": 42150 + }, + { + "epoch": 0.844, + "grad_norm": 6.844604015350342, + "learning_rate": 2.8910000000000003e-05, + "loss": 3.1824, + "step": 42200 + }, + { + "epoch": 0.845, + "grad_norm": 7.132315158843994, + "learning_rate": 2.8885000000000002e-05, + "loss": 3.197, + "step": 42250 + }, + { + "epoch": 0.846, + "grad_norm": 8.794624328613281, + "learning_rate": 2.8860000000000005e-05, + "loss": 3.074, + "step": 42300 + }, + { + "epoch": 0.847, + "grad_norm": 6.943325042724609, + "learning_rate": 2.8835e-05, + "loss": 3.1916, + "step": 42350 + }, + { + "epoch": 0.848, + "grad_norm": 6.533550262451172, + "learning_rate": 2.8810000000000004e-05, + "loss": 3.1118, + "step": 42400 + }, + { + "epoch": 0.849, + "grad_norm": 10.443181991577148, + "learning_rate": 2.8785e-05, + "loss": 3.0385, + "step": 42450 + }, + { + "epoch": 0.85, + "grad_norm": 6.8528642654418945, + "learning_rate": 2.8760000000000002e-05, + "loss": 3.301, + "step": 42500 + }, + { + "epoch": 0.851, + "grad_norm": 6.201817512512207, + "learning_rate": 2.8735000000000002e-05, + "loss": 3.1458, + "step": 42550 + }, + { + "epoch": 0.852, + "grad_norm": 5.921964645385742, + "learning_rate": 2.8710000000000005e-05, + "loss": 3.0353, + "step": 42600 + }, + { + "epoch": 0.853, + "grad_norm": 5.661495685577393, + "learning_rate": 2.8685e-05, + "loss": 3.1163, + "step": 42650 + }, + { + "epoch": 0.854, + "grad_norm": 6.364812850952148, + "learning_rate": 2.8660000000000003e-05, + "loss": 3.1634, + "step": 42700 + }, + { + "epoch": 0.855, + "grad_norm": 7.702561378479004, + "learning_rate": 2.8635e-05, + "loss": 3.2006, + "step": 42750 + }, + { + "epoch": 0.856, + "grad_norm": 7.330812454223633, + "learning_rate": 2.8610000000000002e-05, + "loss": 3.0284, + "step": 42800 + }, + { + "epoch": 0.857, + "grad_norm": 7.044076919555664, + "learning_rate": 2.8585e-05, + "loss": 2.958, + "step": 42850 + }, + { + "epoch": 0.858, + "grad_norm": 9.742399215698242, + "learning_rate": 2.8560000000000004e-05, + "loss": 3.0941, + "step": 42900 + }, + { + "epoch": 0.859, + "grad_norm": 6.272192001342773, + "learning_rate": 2.8535e-05, + "loss": 3.2671, + "step": 42950 + }, + { + "epoch": 0.86, + "grad_norm": 5.866217136383057, + "learning_rate": 2.8510000000000003e-05, + "loss": 3.099, + "step": 43000 + }, + { + "epoch": 0.861, + "grad_norm": 6.47220516204834, + "learning_rate": 2.8485e-05, + "loss": 3.1561, + "step": 43050 + }, + { + "epoch": 0.862, + "grad_norm": 7.465406894683838, + "learning_rate": 2.8460000000000002e-05, + "loss": 2.9667, + "step": 43100 + }, + { + "epoch": 0.863, + "grad_norm": 6.800641059875488, + "learning_rate": 2.8435e-05, + "loss": 3.1791, + "step": 43150 + }, + { + "epoch": 0.864, + "grad_norm": 7.830578804016113, + "learning_rate": 2.8410000000000004e-05, + "loss": 2.9727, + "step": 43200 + }, + { + "epoch": 0.865, + "grad_norm": 6.879873275756836, + "learning_rate": 2.8385500000000005e-05, + "loss": 3.1912, + "step": 43250 + }, + { + "epoch": 0.866, + "grad_norm": 6.994164943695068, + "learning_rate": 2.83605e-05, + "loss": 3.1734, + "step": 43300 + }, + { + "epoch": 0.867, + "grad_norm": 8.127955436706543, + "learning_rate": 2.8335500000000003e-05, + "loss": 3.1977, + "step": 43350 + }, + { + "epoch": 0.868, + "grad_norm": 6.18363618850708, + "learning_rate": 2.83105e-05, + "loss": 2.9296, + "step": 43400 + }, + { + "epoch": 0.869, + "grad_norm": 6.684689044952393, + "learning_rate": 2.8285500000000002e-05, + "loss": 3.0261, + "step": 43450 + }, + { + "epoch": 0.87, + "grad_norm": 9.57409954071045, + "learning_rate": 2.82605e-05, + "loss": 3.2812, + "step": 43500 + }, + { + "epoch": 0.871, + "grad_norm": 6.665359020233154, + "learning_rate": 2.8235500000000004e-05, + "loss": 3.1191, + "step": 43550 + }, + { + "epoch": 0.872, + "grad_norm": 7.3514533042907715, + "learning_rate": 2.82105e-05, + "loss": 3.0988, + "step": 43600 + }, + { + "epoch": 0.873, + "grad_norm": 7.701603889465332, + "learning_rate": 2.8185500000000003e-05, + "loss": 3.2266, + "step": 43650 + }, + { + "epoch": 0.874, + "grad_norm": 6.6192851066589355, + "learning_rate": 2.81605e-05, + "loss": 3.2071, + "step": 43700 + }, + { + "epoch": 0.875, + "grad_norm": 7.822471618652344, + "learning_rate": 2.8135500000000002e-05, + "loss": 3.2628, + "step": 43750 + }, + { + "epoch": 0.876, + "grad_norm": 6.103046417236328, + "learning_rate": 2.81105e-05, + "loss": 3.1475, + "step": 43800 + }, + { + "epoch": 0.877, + "grad_norm": 9.693628311157227, + "learning_rate": 2.8085500000000004e-05, + "loss": 3.2774, + "step": 43850 + }, + { + "epoch": 0.878, + "grad_norm": 8.551675796508789, + "learning_rate": 2.8061000000000005e-05, + "loss": 3.2343, + "step": 43900 + }, + { + "epoch": 0.879, + "grad_norm": 7.0858025550842285, + "learning_rate": 2.8036e-05, + "loss": 3.2596, + "step": 43950 + }, + { + "epoch": 0.88, + "grad_norm": 6.832662105560303, + "learning_rate": 2.8011000000000003e-05, + "loss": 3.1012, + "step": 44000 + }, + { + "epoch": 0.881, + "grad_norm": 8.389840126037598, + "learning_rate": 2.7986000000000003e-05, + "loss": 3.1796, + "step": 44050 + }, + { + "epoch": 0.882, + "grad_norm": 10.351311683654785, + "learning_rate": 2.7961000000000006e-05, + "loss": 2.9221, + "step": 44100 + }, + { + "epoch": 0.883, + "grad_norm": 6.427529811859131, + "learning_rate": 2.7936e-05, + "loss": 2.9733, + "step": 44150 + }, + { + "epoch": 0.884, + "grad_norm": 6.574181079864502, + "learning_rate": 2.7911000000000004e-05, + "loss": 3.1097, + "step": 44200 + }, + { + "epoch": 0.885, + "grad_norm": 7.591128826141357, + "learning_rate": 2.7886e-05, + "loss": 3.1482, + "step": 44250 + }, + { + "epoch": 0.886, + "grad_norm": 7.332815647125244, + "learning_rate": 2.7861000000000003e-05, + "loss": 3.1352, + "step": 44300 + }, + { + "epoch": 0.887, + "grad_norm": 8.689337730407715, + "learning_rate": 2.7836000000000003e-05, + "loss": 3.1673, + "step": 44350 + }, + { + "epoch": 0.888, + "grad_norm": 6.476491451263428, + "learning_rate": 2.7811000000000005e-05, + "loss": 3.0232, + "step": 44400 + }, + { + "epoch": 0.889, + "grad_norm": 5.135951519012451, + "learning_rate": 2.7786e-05, + "loss": 3.1024, + "step": 44450 + }, + { + "epoch": 0.89, + "grad_norm": 7.735162734985352, + "learning_rate": 2.7761000000000004e-05, + "loss": 3.1889, + "step": 44500 + }, + { + "epoch": 0.891, + "grad_norm": 7.481966972351074, + "learning_rate": 2.7736e-05, + "loss": 3.0462, + "step": 44550 + }, + { + "epoch": 0.892, + "grad_norm": 7.0432844161987305, + "learning_rate": 2.7711000000000003e-05, + "loss": 3.1846, + "step": 44600 + }, + { + "epoch": 0.893, + "grad_norm": 9.478955268859863, + "learning_rate": 2.7686000000000002e-05, + "loss": 3.2704, + "step": 44650 + }, + { + "epoch": 0.894, + "grad_norm": 6.461492538452148, + "learning_rate": 2.7661000000000005e-05, + "loss": 3.1446, + "step": 44700 + }, + { + "epoch": 0.895, + "grad_norm": 7.2203168869018555, + "learning_rate": 2.7636e-05, + "loss": 3.2167, + "step": 44750 + }, + { + "epoch": 0.896, + "grad_norm": 9.509076118469238, + "learning_rate": 2.7611000000000004e-05, + "loss": 2.979, + "step": 44800 + }, + { + "epoch": 0.897, + "grad_norm": 6.559743404388428, + "learning_rate": 2.7586e-05, + "loss": 3.1303, + "step": 44850 + }, + { + "epoch": 0.898, + "grad_norm": 9.804462432861328, + "learning_rate": 2.7561000000000003e-05, + "loss": 3.1583, + "step": 44900 + }, + { + "epoch": 0.899, + "grad_norm": 7.840511322021484, + "learning_rate": 2.7536000000000002e-05, + "loss": 3.1117, + "step": 44950 + }, + { + "epoch": 0.9, + "grad_norm": 6.799412250518799, + "learning_rate": 2.7511000000000005e-05, + "loss": 2.9047, + "step": 45000 + }, + { + "epoch": 0.901, + "grad_norm": 7.972790241241455, + "learning_rate": 2.7486e-05, + "loss": 3.0451, + "step": 45050 + }, + { + "epoch": 0.902, + "grad_norm": 10.409866333007812, + "learning_rate": 2.7461000000000004e-05, + "loss": 3.0812, + "step": 45100 + }, + { + "epoch": 0.903, + "grad_norm": 6.936646938323975, + "learning_rate": 2.7436e-05, + "loss": 3.1419, + "step": 45150 + }, + { + "epoch": 0.904, + "grad_norm": 10.039634704589844, + "learning_rate": 2.7411000000000002e-05, + "loss": 3.0438, + "step": 45200 + }, + { + "epoch": 0.905, + "grad_norm": 6.061675548553467, + "learning_rate": 2.7386000000000002e-05, + "loss": 3.0916, + "step": 45250 + }, + { + "epoch": 0.906, + "grad_norm": 8.285578727722168, + "learning_rate": 2.7361000000000005e-05, + "loss": 3.1964, + "step": 45300 + }, + { + "epoch": 0.907, + "grad_norm": 7.144978046417236, + "learning_rate": 2.7336e-05, + "loss": 2.9567, + "step": 45350 + }, + { + "epoch": 0.908, + "grad_norm": 8.918584823608398, + "learning_rate": 2.7311000000000003e-05, + "loss": 2.9987, + "step": 45400 + }, + { + "epoch": 0.909, + "grad_norm": 6.627841949462891, + "learning_rate": 2.7286e-05, + "loss": 3.1431, + "step": 45450 + }, + { + "epoch": 0.91, + "grad_norm": 7.455625057220459, + "learning_rate": 2.7261000000000002e-05, + "loss": 3.102, + "step": 45500 + }, + { + "epoch": 0.911, + "grad_norm": 8.264837265014648, + "learning_rate": 2.7236e-05, + "loss": 3.1442, + "step": 45550 + }, + { + "epoch": 0.912, + "grad_norm": 9.39218521118164, + "learning_rate": 2.7211000000000004e-05, + "loss": 3.0657, + "step": 45600 + }, + { + "epoch": 0.913, + "grad_norm": 8.049072265625, + "learning_rate": 2.7186e-05, + "loss": 3.0677, + "step": 45650 + }, + { + "epoch": 0.914, + "grad_norm": 6.083390235900879, + "learning_rate": 2.7161000000000003e-05, + "loss": 3.018, + "step": 45700 + }, + { + "epoch": 0.915, + "grad_norm": 6.058326244354248, + "learning_rate": 2.7136e-05, + "loss": 2.9001, + "step": 45750 + }, + { + "epoch": 0.916, + "grad_norm": 6.957967281341553, + "learning_rate": 2.7111000000000002e-05, + "loss": 3.1806, + "step": 45800 + }, + { + "epoch": 0.917, + "grad_norm": 9.043895721435547, + "learning_rate": 2.7086e-05, + "loss": 3.0095, + "step": 45850 + }, + { + "epoch": 0.918, + "grad_norm": 8.315635681152344, + "learning_rate": 2.7061000000000004e-05, + "loss": 3.2231, + "step": 45900 + }, + { + "epoch": 0.919, + "grad_norm": 6.859596252441406, + "learning_rate": 2.7036e-05, + "loss": 3.1016, + "step": 45950 + }, + { + "epoch": 0.92, + "grad_norm": 9.305853843688965, + "learning_rate": 2.7011000000000003e-05, + "loss": 3.1801, + "step": 46000 + }, + { + "epoch": 0.921, + "grad_norm": 7.9762797355651855, + "learning_rate": 2.6986e-05, + "loss": 2.9616, + "step": 46050 + }, + { + "epoch": 0.922, + "grad_norm": 6.378198623657227, + "learning_rate": 2.6961e-05, + "loss": 3.1772, + "step": 46100 + }, + { + "epoch": 0.923, + "grad_norm": 7.276421070098877, + "learning_rate": 2.6936e-05, + "loss": 3.1461, + "step": 46150 + }, + { + "epoch": 0.924, + "grad_norm": 7.234610080718994, + "learning_rate": 2.6911000000000004e-05, + "loss": 2.9373, + "step": 46200 + }, + { + "epoch": 0.925, + "grad_norm": 6.928109169006348, + "learning_rate": 2.6886e-05, + "loss": 3.0292, + "step": 46250 + }, + { + "epoch": 0.926, + "grad_norm": 8.218812942504883, + "learning_rate": 2.6861000000000003e-05, + "loss": 3.0692, + "step": 46300 + }, + { + "epoch": 0.927, + "grad_norm": 6.951169490814209, + "learning_rate": 2.6836e-05, + "loss": 2.983, + "step": 46350 + }, + { + "epoch": 0.928, + "grad_norm": 8.088844299316406, + "learning_rate": 2.6811e-05, + "loss": 3.0935, + "step": 46400 + }, + { + "epoch": 0.929, + "grad_norm": 6.722114086151123, + "learning_rate": 2.6786e-05, + "loss": 3.0447, + "step": 46450 + }, + { + "epoch": 0.93, + "grad_norm": 8.844832420349121, + "learning_rate": 2.6761000000000003e-05, + "loss": 3.1378, + "step": 46500 + }, + { + "epoch": 0.931, + "grad_norm": 5.869866371154785, + "learning_rate": 2.6736e-05, + "loss": 3.0179, + "step": 46550 + }, + { + "epoch": 0.932, + "grad_norm": 6.477950096130371, + "learning_rate": 2.6711000000000002e-05, + "loss": 2.9879, + "step": 46600 + }, + { + "epoch": 0.933, + "grad_norm": 8.349437713623047, + "learning_rate": 2.6685999999999998e-05, + "loss": 3.0, + "step": 46650 + }, + { + "epoch": 0.934, + "grad_norm": 6.469250679016113, + "learning_rate": 2.6661e-05, + "loss": 3.1057, + "step": 46700 + }, + { + "epoch": 0.935, + "grad_norm": 6.10085391998291, + "learning_rate": 2.6636e-05, + "loss": 3.0944, + "step": 46750 + }, + { + "epoch": 0.936, + "grad_norm": 7.693483352661133, + "learning_rate": 2.6611000000000003e-05, + "loss": 3.1472, + "step": 46800 + }, + { + "epoch": 0.937, + "grad_norm": 6.2554030418396, + "learning_rate": 2.6586e-05, + "loss": 3.1175, + "step": 46850 + }, + { + "epoch": 0.938, + "grad_norm": 10.338006973266602, + "learning_rate": 2.6561000000000002e-05, + "loss": 2.9336, + "step": 46900 + }, + { + "epoch": 0.939, + "grad_norm": 6.1178765296936035, + "learning_rate": 2.6536e-05, + "loss": 3.1405, + "step": 46950 + }, + { + "epoch": 0.94, + "grad_norm": 8.21044635772705, + "learning_rate": 2.6511000000000004e-05, + "loss": 3.046, + "step": 47000 + }, + { + "epoch": 0.941, + "grad_norm": 7.216266632080078, + "learning_rate": 2.6486e-05, + "loss": 3.129, + "step": 47050 + }, + { + "epoch": 0.942, + "grad_norm": 7.356600761413574, + "learning_rate": 2.6461000000000003e-05, + "loss": 3.0074, + "step": 47100 + }, + { + "epoch": 0.943, + "grad_norm": 7.9352264404296875, + "learning_rate": 2.6436e-05, + "loss": 2.9326, + "step": 47150 + }, + { + "epoch": 0.944, + "grad_norm": 6.63361120223999, + "learning_rate": 2.6411000000000002e-05, + "loss": 3.1064, + "step": 47200 + }, + { + "epoch": 0.945, + "grad_norm": 7.0836286544799805, + "learning_rate": 2.6386e-05, + "loss": 2.981, + "step": 47250 + }, + { + "epoch": 0.946, + "grad_norm": 6.541557312011719, + "learning_rate": 2.6361000000000004e-05, + "loss": 3.0636, + "step": 47300 + }, + { + "epoch": 0.947, + "grad_norm": 9.376893043518066, + "learning_rate": 2.6336e-05, + "loss": 2.9022, + "step": 47350 + }, + { + "epoch": 0.948, + "grad_norm": 6.6397929191589355, + "learning_rate": 2.6311000000000003e-05, + "loss": 3.1219, + "step": 47400 + }, + { + "epoch": 0.949, + "grad_norm": 7.550232410430908, + "learning_rate": 2.6286e-05, + "loss": 3.3295, + "step": 47450 + }, + { + "epoch": 0.95, + "grad_norm": 6.387320041656494, + "learning_rate": 2.6261e-05, + "loss": 3.1274, + "step": 47500 + }, + { + "epoch": 0.951, + "grad_norm": 6.654208183288574, + "learning_rate": 2.6236e-05, + "loss": 2.8878, + "step": 47550 + }, + { + "epoch": 0.952, + "grad_norm": 7.0465497970581055, + "learning_rate": 2.6211000000000004e-05, + "loss": 3.3068, + "step": 47600 + }, + { + "epoch": 0.953, + "grad_norm": 8.581622123718262, + "learning_rate": 2.6186e-05, + "loss": 2.9106, + "step": 47650 + }, + { + "epoch": 0.954, + "grad_norm": 7.229891777038574, + "learning_rate": 2.6161000000000002e-05, + "loss": 2.9841, + "step": 47700 + }, + { + "epoch": 0.955, + "grad_norm": 7.919389247894287, + "learning_rate": 2.6136e-05, + "loss": 2.9297, + "step": 47750 + }, + { + "epoch": 0.956, + "grad_norm": 7.118837833404541, + "learning_rate": 2.6111e-05, + "loss": 2.8457, + "step": 47800 + }, + { + "epoch": 0.957, + "grad_norm": 6.714677333831787, + "learning_rate": 2.6086e-05, + "loss": 2.9976, + "step": 47850 + }, + { + "epoch": 0.958, + "grad_norm": 9.282464027404785, + "learning_rate": 2.6061000000000003e-05, + "loss": 3.017, + "step": 47900 + }, + { + "epoch": 0.959, + "grad_norm": 6.635936260223389, + "learning_rate": 2.6036500000000004e-05, + "loss": 3.248, + "step": 47950 + }, + { + "epoch": 0.96, + "grad_norm": 6.59296178817749, + "learning_rate": 2.60115e-05, + "loss": 3.0149, + "step": 48000 + }, + { + "epoch": 0.961, + "grad_norm": 6.398571014404297, + "learning_rate": 2.5986500000000003e-05, + "loss": 3.2064, + "step": 48050 + }, + { + "epoch": 0.962, + "grad_norm": 8.443436622619629, + "learning_rate": 2.59615e-05, + "loss": 3.086, + "step": 48100 + }, + { + "epoch": 0.963, + "grad_norm": 6.965825080871582, + "learning_rate": 2.59365e-05, + "loss": 3.0896, + "step": 48150 + }, + { + "epoch": 0.964, + "grad_norm": 7.949388027191162, + "learning_rate": 2.59115e-05, + "loss": 3.1704, + "step": 48200 + }, + { + "epoch": 0.965, + "grad_norm": 8.066429138183594, + "learning_rate": 2.5886500000000004e-05, + "loss": 3.0161, + "step": 48250 + }, + { + "epoch": 0.966, + "grad_norm": 8.622272491455078, + "learning_rate": 2.58615e-05, + "loss": 3.1115, + "step": 48300 + }, + { + "epoch": 0.967, + "grad_norm": 6.805866241455078, + "learning_rate": 2.5836500000000002e-05, + "loss": 3.2632, + "step": 48350 + }, + { + "epoch": 0.968, + "grad_norm": 6.861439228057861, + "learning_rate": 2.58115e-05, + "loss": 3.139, + "step": 48400 + }, + { + "epoch": 0.969, + "grad_norm": 8.325223922729492, + "learning_rate": 2.57865e-05, + "loss": 3.0783, + "step": 48450 + }, + { + "epoch": 0.97, + "grad_norm": 6.219725131988525, + "learning_rate": 2.57615e-05, + "loss": 3.1166, + "step": 48500 + }, + { + "epoch": 0.971, + "grad_norm": 7.012028217315674, + "learning_rate": 2.5736500000000003e-05, + "loss": 3.1029, + "step": 48550 + }, + { + "epoch": 0.972, + "grad_norm": 7.103126525878906, + "learning_rate": 2.57115e-05, + "loss": 3.0094, + "step": 48600 + }, + { + "epoch": 0.973, + "grad_norm": 8.089792251586914, + "learning_rate": 2.5686500000000002e-05, + "loss": 3.0782, + "step": 48650 + }, + { + "epoch": 0.974, + "grad_norm": 8.976265907287598, + "learning_rate": 2.5661499999999998e-05, + "loss": 3.0816, + "step": 48700 + }, + { + "epoch": 0.975, + "grad_norm": 5.711278915405273, + "learning_rate": 2.56365e-05, + "loss": 3.0422, + "step": 48750 + }, + { + "epoch": 0.976, + "grad_norm": 8.107563972473145, + "learning_rate": 2.56115e-05, + "loss": 2.931, + "step": 48800 + }, + { + "epoch": 0.977, + "grad_norm": 6.930409908294678, + "learning_rate": 2.5586500000000003e-05, + "loss": 3.1535, + "step": 48850 + }, + { + "epoch": 0.978, + "grad_norm": 8.81825065612793, + "learning_rate": 2.55615e-05, + "loss": 3.1463, + "step": 48900 + }, + { + "epoch": 0.979, + "grad_norm": 7.318758010864258, + "learning_rate": 2.5536500000000002e-05, + "loss": 3.0381, + "step": 48950 + }, + { + "epoch": 0.98, + "grad_norm": 7.180883884429932, + "learning_rate": 2.5511499999999998e-05, + "loss": 3.0863, + "step": 49000 + }, + { + "epoch": 0.981, + "grad_norm": 8.624285697937012, + "learning_rate": 2.54865e-05, + "loss": 3.1038, + "step": 49050 + }, + { + "epoch": 0.982, + "grad_norm": 7.267908573150635, + "learning_rate": 2.54615e-05, + "loss": 2.8967, + "step": 49100 + }, + { + "epoch": 0.983, + "grad_norm": 7.4274773597717285, + "learning_rate": 2.5436500000000003e-05, + "loss": 2.9911, + "step": 49150 + }, + { + "epoch": 0.984, + "grad_norm": 8.504932403564453, + "learning_rate": 2.54115e-05, + "loss": 3.2724, + "step": 49200 + }, + { + "epoch": 0.985, + "grad_norm": 7.094997882843018, + "learning_rate": 2.53865e-05, + "loss": 3.1187, + "step": 49250 + }, + { + "epoch": 0.986, + "grad_norm": 6.276993751525879, + "learning_rate": 2.5361499999999998e-05, + "loss": 3.0965, + "step": 49300 + }, + { + "epoch": 0.987, + "grad_norm": 8.081740379333496, + "learning_rate": 2.53365e-05, + "loss": 3.0091, + "step": 49350 + }, + { + "epoch": 0.988, + "grad_norm": 7.968372821807861, + "learning_rate": 2.53115e-05, + "loss": 3.2251, + "step": 49400 + }, + { + "epoch": 0.989, + "grad_norm": 8.792092323303223, + "learning_rate": 2.5286500000000003e-05, + "loss": 2.912, + "step": 49450 + }, + { + "epoch": 0.99, + "grad_norm": 6.958602428436279, + "learning_rate": 2.52615e-05, + "loss": 3.1384, + "step": 49500 + }, + { + "epoch": 0.991, + "grad_norm": 7.0767059326171875, + "learning_rate": 2.52365e-05, + "loss": 3.0584, + "step": 49550 + }, + { + "epoch": 0.992, + "grad_norm": 7.639810085296631, + "learning_rate": 2.5211499999999997e-05, + "loss": 3.2435, + "step": 49600 + }, + { + "epoch": 0.993, + "grad_norm": 9.285743713378906, + "learning_rate": 2.51865e-05, + "loss": 3.1184, + "step": 49650 + }, + { + "epoch": 0.994, + "grad_norm": 6.950512409210205, + "learning_rate": 2.51615e-05, + "loss": 3.0376, + "step": 49700 + }, + { + "epoch": 0.995, + "grad_norm": 8.000362396240234, + "learning_rate": 2.5136500000000002e-05, + "loss": 3.1514, + "step": 49750 + }, + { + "epoch": 0.996, + "grad_norm": 7.425972938537598, + "learning_rate": 2.51115e-05, + "loss": 3.0469, + "step": 49800 + }, + { + "epoch": 0.997, + "grad_norm": 5.315979480743408, + "learning_rate": 2.50865e-05, + "loss": 3.0874, + "step": 49850 + }, + { + "epoch": 0.998, + "grad_norm": 6.839958190917969, + "learning_rate": 2.50615e-05, + "loss": 3.1059, + "step": 49900 + }, + { + "epoch": 0.999, + "grad_norm": 7.220871925354004, + "learning_rate": 2.5037e-05, + "loss": 3.0, + "step": 49950 + }, + { + "epoch": 1.0, + "grad_norm": 8.067781448364258, + "learning_rate": 2.5012e-05, + "loss": 3.1688, + "step": 50000 + }, + { + "epoch": 1.001, + "grad_norm": 6.941652297973633, + "learning_rate": 2.4987000000000003e-05, + "loss": 3.0667, + "step": 50050 + }, + { + "epoch": 1.002, + "grad_norm": 6.937021732330322, + "learning_rate": 2.4962000000000003e-05, + "loss": 3.0059, + "step": 50100 + }, + { + "epoch": 1.003, + "grad_norm": 7.390977382659912, + "learning_rate": 2.4937000000000002e-05, + "loss": 2.9124, + "step": 50150 + }, + { + "epoch": 1.004, + "grad_norm": 8.082268714904785, + "learning_rate": 2.4912e-05, + "loss": 3.1063, + "step": 50200 + }, + { + "epoch": 1.005, + "grad_norm": 10.308112144470215, + "learning_rate": 2.4887e-05, + "loss": 3.0824, + "step": 50250 + }, + { + "epoch": 1.006, + "grad_norm": 5.8875298500061035, + "learning_rate": 2.4862000000000004e-05, + "loss": 2.9386, + "step": 50300 + }, + { + "epoch": 1.007, + "grad_norm": 8.643927574157715, + "learning_rate": 2.4837000000000003e-05, + "loss": 3.0654, + "step": 50350 + }, + { + "epoch": 1.008, + "grad_norm": 8.515275955200195, + "learning_rate": 2.4812000000000002e-05, + "loss": 2.8866, + "step": 50400 + }, + { + "epoch": 1.009, + "grad_norm": 7.331372261047363, + "learning_rate": 2.4787000000000002e-05, + "loss": 2.9803, + "step": 50450 + }, + { + "epoch": 1.01, + "grad_norm": 8.064435958862305, + "learning_rate": 2.4762e-05, + "loss": 3.043, + "step": 50500 + }, + { + "epoch": 1.011, + "grad_norm": 6.975559711456299, + "learning_rate": 2.4737e-05, + "loss": 3.0534, + "step": 50550 + }, + { + "epoch": 1.012, + "grad_norm": 6.67125129699707, + "learning_rate": 2.4712000000000003e-05, + "loss": 3.022, + "step": 50600 + }, + { + "epoch": 1.013, + "grad_norm": 8.514412879943848, + "learning_rate": 2.4687000000000003e-05, + "loss": 3.0496, + "step": 50650 + }, + { + "epoch": 1.014, + "grad_norm": 6.08818244934082, + "learning_rate": 2.4662000000000002e-05, + "loss": 2.8703, + "step": 50700 + }, + { + "epoch": 1.015, + "grad_norm": 6.377881050109863, + "learning_rate": 2.4637e-05, + "loss": 2.8455, + "step": 50750 + }, + { + "epoch": 1.016, + "grad_norm": 6.975340366363525, + "learning_rate": 2.4612e-05, + "loss": 3.0349, + "step": 50800 + }, + { + "epoch": 1.017, + "grad_norm": 7.518290996551514, + "learning_rate": 2.4587e-05, + "loss": 2.9862, + "step": 50850 + }, + { + "epoch": 1.018, + "grad_norm": 6.725731372833252, + "learning_rate": 2.4562000000000003e-05, + "loss": 3.0017, + "step": 50900 + }, + { + "epoch": 1.019, + "grad_norm": 10.636852264404297, + "learning_rate": 2.4537000000000002e-05, + "loss": 2.911, + "step": 50950 + }, + { + "epoch": 1.02, + "grad_norm": 12.086158752441406, + "learning_rate": 2.4512000000000002e-05, + "loss": 3.0008, + "step": 51000 + }, + { + "epoch": 1.021, + "grad_norm": 6.38085412979126, + "learning_rate": 2.4487e-05, + "loss": 3.1455, + "step": 51050 + }, + { + "epoch": 1.022, + "grad_norm": 5.900729179382324, + "learning_rate": 2.4462e-05, + "loss": 2.9867, + "step": 51100 + }, + { + "epoch": 1.023, + "grad_norm": 6.688613414764404, + "learning_rate": 2.4437e-05, + "loss": 2.8085, + "step": 51150 + }, + { + "epoch": 1.024, + "grad_norm": 7.4456281661987305, + "learning_rate": 2.4412000000000003e-05, + "loss": 3.0203, + "step": 51200 + }, + { + "epoch": 1.025, + "grad_norm": 7.295360088348389, + "learning_rate": 2.4387000000000002e-05, + "loss": 3.0503, + "step": 51250 + }, + { + "epoch": 1.026, + "grad_norm": 6.788741111755371, + "learning_rate": 2.4362e-05, + "loss": 3.059, + "step": 51300 + }, + { + "epoch": 1.027, + "grad_norm": 6.674519062042236, + "learning_rate": 2.4337e-05, + "loss": 3.1171, + "step": 51350 + }, + { + "epoch": 1.028, + "grad_norm": 7.097243309020996, + "learning_rate": 2.4312e-05, + "loss": 2.8427, + "step": 51400 + }, + { + "epoch": 1.029, + "grad_norm": 8.2136869430542, + "learning_rate": 2.4287000000000003e-05, + "loss": 2.9196, + "step": 51450 + }, + { + "epoch": 1.03, + "grad_norm": 7.26527214050293, + "learning_rate": 2.4262000000000003e-05, + "loss": 3.1156, + "step": 51500 + }, + { + "epoch": 1.031, + "grad_norm": 8.166938781738281, + "learning_rate": 2.4237000000000002e-05, + "loss": 3.1713, + "step": 51550 + }, + { + "epoch": 1.032, + "grad_norm": 6.972194671630859, + "learning_rate": 2.4212e-05, + "loss": 2.8535, + "step": 51600 + }, + { + "epoch": 1.033, + "grad_norm": 9.820430755615234, + "learning_rate": 2.4187e-05, + "loss": 2.9944, + "step": 51650 + }, + { + "epoch": 1.034, + "grad_norm": 7.9938530921936035, + "learning_rate": 2.4162e-05, + "loss": 3.1709, + "step": 51700 + }, + { + "epoch": 1.035, + "grad_norm": 6.771377086639404, + "learning_rate": 2.4137000000000003e-05, + "loss": 3.1786, + "step": 51750 + }, + { + "epoch": 1.036, + "grad_norm": 8.01101016998291, + "learning_rate": 2.4112000000000002e-05, + "loss": 3.0196, + "step": 51800 + }, + { + "epoch": 1.037, + "grad_norm": 6.825372219085693, + "learning_rate": 2.4087e-05, + "loss": 3.1268, + "step": 51850 + }, + { + "epoch": 1.038, + "grad_norm": 6.796056747436523, + "learning_rate": 2.4062e-05, + "loss": 2.9989, + "step": 51900 + }, + { + "epoch": 1.039, + "grad_norm": Infinity, + "learning_rate": 2.4037e-05, + "loss": 2.8302, + "step": 51950 + }, + { + "epoch": 1.04, + "grad_norm": 6.67830228805542, + "learning_rate": 2.40125e-05, + "loss": 3.0738, + "step": 52000 + }, + { + "epoch": 1.041, + "grad_norm": 7.3164143562316895, + "learning_rate": 2.39875e-05, + "loss": 3.1385, + "step": 52050 + }, + { + "epoch": 1.042, + "grad_norm": 8.837868690490723, + "learning_rate": 2.3962500000000003e-05, + "loss": 2.98, + "step": 52100 + }, + { + "epoch": 1.043, + "grad_norm": 7.212620258331299, + "learning_rate": 2.3937500000000002e-05, + "loss": 2.9773, + "step": 52150 + }, + { + "epoch": 1.044, + "grad_norm": 7.9372878074646, + "learning_rate": 2.3912500000000002e-05, + "loss": 3.0351, + "step": 52200 + }, + { + "epoch": 1.045, + "grad_norm": 8.558395385742188, + "learning_rate": 2.38875e-05, + "loss": 3.0283, + "step": 52250 + }, + { + "epoch": 1.046, + "grad_norm": 9.92737102508545, + "learning_rate": 2.38625e-05, + "loss": 2.715, + "step": 52300 + }, + { + "epoch": 1.047, + "grad_norm": 6.824094772338867, + "learning_rate": 2.38375e-05, + "loss": 2.8445, + "step": 52350 + }, + { + "epoch": 1.048, + "grad_norm": 7.356767654418945, + "learning_rate": 2.3812500000000003e-05, + "loss": 2.9878, + "step": 52400 + }, + { + "epoch": 1.049, + "grad_norm": 9.474124908447266, + "learning_rate": 2.3787500000000002e-05, + "loss": 2.8968, + "step": 52450 + }, + { + "epoch": 1.05, + "grad_norm": 6.359553813934326, + "learning_rate": 2.37625e-05, + "loss": 2.8757, + "step": 52500 + }, + { + "epoch": 1.051, + "grad_norm": 7.467580795288086, + "learning_rate": 2.37375e-05, + "loss": 3.1544, + "step": 52550 + }, + { + "epoch": 1.052, + "grad_norm": 9.67093276977539, + "learning_rate": 2.37125e-05, + "loss": 3.0963, + "step": 52600 + }, + { + "epoch": 1.053, + "grad_norm": 8.854635238647461, + "learning_rate": 2.36875e-05, + "loss": 2.9702, + "step": 52650 + }, + { + "epoch": 1.054, + "grad_norm": 6.981558322906494, + "learning_rate": 2.3662500000000003e-05, + "loss": 3.143, + "step": 52700 + }, + { + "epoch": 1.055, + "grad_norm": 6.7144293785095215, + "learning_rate": 2.3637500000000002e-05, + "loss": 2.8273, + "step": 52750 + }, + { + "epoch": 1.056, + "grad_norm": 6.277219295501709, + "learning_rate": 2.36125e-05, + "loss": 2.9522, + "step": 52800 + }, + { + "epoch": 1.057, + "grad_norm": 9.357563972473145, + "learning_rate": 2.35875e-05, + "loss": 2.8644, + "step": 52850 + }, + { + "epoch": 1.058, + "grad_norm": 6.855267524719238, + "learning_rate": 2.35625e-05, + "loss": 2.9058, + "step": 52900 + }, + { + "epoch": 1.059, + "grad_norm": 6.813782691955566, + "learning_rate": 2.35375e-05, + "loss": 3.0752, + "step": 52950 + }, + { + "epoch": 1.06, + "grad_norm": 7.318324565887451, + "learning_rate": 2.3512500000000002e-05, + "loss": 2.9552, + "step": 53000 + }, + { + "epoch": 1.061, + "grad_norm": 8.61180305480957, + "learning_rate": 2.3487500000000002e-05, + "loss": 2.9447, + "step": 53050 + }, + { + "epoch": 1.062, + "grad_norm": 7.704663276672363, + "learning_rate": 2.34625e-05, + "loss": 3.1072, + "step": 53100 + }, + { + "epoch": 1.063, + "grad_norm": 9.428034782409668, + "learning_rate": 2.34375e-05, + "loss": 2.9625, + "step": 53150 + }, + { + "epoch": 1.064, + "grad_norm": 8.153590202331543, + "learning_rate": 2.34125e-05, + "loss": 3.1441, + "step": 53200 + }, + { + "epoch": 1.065, + "grad_norm": 7.686233997344971, + "learning_rate": 2.3387500000000003e-05, + "loss": 2.977, + "step": 53250 + }, + { + "epoch": 1.066, + "grad_norm": 7.506038188934326, + "learning_rate": 2.3362500000000002e-05, + "loss": 2.9525, + "step": 53300 + }, + { + "epoch": 1.067, + "grad_norm": 7.652656078338623, + "learning_rate": 2.33375e-05, + "loss": 3.0001, + "step": 53350 + }, + { + "epoch": 1.068, + "grad_norm": 6.947332382202148, + "learning_rate": 2.33125e-05, + "loss": 2.9472, + "step": 53400 + }, + { + "epoch": 1.069, + "grad_norm": 7.201016902923584, + "learning_rate": 2.32875e-05, + "loss": 2.9012, + "step": 53450 + }, + { + "epoch": 1.07, + "grad_norm": 7.253627777099609, + "learning_rate": 2.32625e-05, + "loss": 3.228, + "step": 53500 + }, + { + "epoch": 1.071, + "grad_norm": 7.386748313903809, + "learning_rate": 2.3237500000000002e-05, + "loss": 2.9931, + "step": 53550 + }, + { + "epoch": 1.072, + "grad_norm": 7.58491849899292, + "learning_rate": 2.3212500000000002e-05, + "loss": 3.0007, + "step": 53600 + }, + { + "epoch": 1.073, + "grad_norm": 8.3165922164917, + "learning_rate": 2.31875e-05, + "loss": 2.9794, + "step": 53650 + }, + { + "epoch": 1.074, + "grad_norm": 7.365329742431641, + "learning_rate": 2.31625e-05, + "loss": 2.9635, + "step": 53700 + }, + { + "epoch": 1.075, + "grad_norm": 6.985613822937012, + "learning_rate": 2.31375e-05, + "loss": 2.9755, + "step": 53750 + }, + { + "epoch": 1.076, + "grad_norm": 9.172526359558105, + "learning_rate": 2.31125e-05, + "loss": 2.8953, + "step": 53800 + }, + { + "epoch": 1.077, + "grad_norm": 9.151043891906738, + "learning_rate": 2.3087500000000002e-05, + "loss": 3.0558, + "step": 53850 + }, + { + "epoch": 1.078, + "grad_norm": 6.541784763336182, + "learning_rate": 2.30625e-05, + "loss": 2.8525, + "step": 53900 + }, + { + "epoch": 1.079, + "grad_norm": 8.261955261230469, + "learning_rate": 2.30375e-05, + "loss": 3.0542, + "step": 53950 + }, + { + "epoch": 1.08, + "grad_norm": 6.869956970214844, + "learning_rate": 2.3013e-05, + "loss": 2.939, + "step": 54000 + }, + { + "epoch": 1.081, + "grad_norm": 7.964324474334717, + "learning_rate": 2.2988e-05, + "loss": 2.9387, + "step": 54050 + }, + { + "epoch": 1.082, + "grad_norm": 7.4598517417907715, + "learning_rate": 2.2963e-05, + "loss": 3.1512, + "step": 54100 + }, + { + "epoch": 1.083, + "grad_norm": 6.513663291931152, + "learning_rate": 2.2938e-05, + "loss": 2.9179, + "step": 54150 + }, + { + "epoch": 1.084, + "grad_norm": 9.144841194152832, + "learning_rate": 2.2913000000000002e-05, + "loss": 2.8925, + "step": 54200 + }, + { + "epoch": 1.085, + "grad_norm": 6.455650329589844, + "learning_rate": 2.2888000000000002e-05, + "loss": 2.8863, + "step": 54250 + }, + { + "epoch": 1.086, + "grad_norm": 7.622753620147705, + "learning_rate": 2.2863e-05, + "loss": 2.9365, + "step": 54300 + }, + { + "epoch": 1.087, + "grad_norm": 8.794417381286621, + "learning_rate": 2.2838e-05, + "loss": 2.9299, + "step": 54350 + }, + { + "epoch": 1.088, + "grad_norm": 7.10263204574585, + "learning_rate": 2.2813e-05, + "loss": 2.8352, + "step": 54400 + }, + { + "epoch": 1.089, + "grad_norm": 7.9989142417907715, + "learning_rate": 2.2788e-05, + "loss": 2.7885, + "step": 54450 + }, + { + "epoch": 1.09, + "grad_norm": 7.510954856872559, + "learning_rate": 2.2763000000000002e-05, + "loss": 3.0037, + "step": 54500 + }, + { + "epoch": 1.091, + "grad_norm": 7.432366371154785, + "learning_rate": 2.2738e-05, + "loss": 3.1301, + "step": 54550 + }, + { + "epoch": 1.092, + "grad_norm": 7.562934398651123, + "learning_rate": 2.2713e-05, + "loss": 3.112, + "step": 54600 + }, + { + "epoch": 1.093, + "grad_norm": 7.627951622009277, + "learning_rate": 2.2688e-05, + "loss": 2.9647, + "step": 54650 + }, + { + "epoch": 1.094, + "grad_norm": 8.570850372314453, + "learning_rate": 2.2663e-05, + "loss": 2.9396, + "step": 54700 + }, + { + "epoch": 1.095, + "grad_norm": 7.013098239898682, + "learning_rate": 2.2638000000000002e-05, + "loss": 2.8855, + "step": 54750 + }, + { + "epoch": 1.096, + "grad_norm": 6.925095558166504, + "learning_rate": 2.2613000000000002e-05, + "loss": 2.8753, + "step": 54800 + }, + { + "epoch": 1.097, + "grad_norm": 8.075315475463867, + "learning_rate": 2.2588e-05, + "loss": 3.0105, + "step": 54850 + }, + { + "epoch": 1.098, + "grad_norm": 7.342935085296631, + "learning_rate": 2.2563e-05, + "loss": 3.0008, + "step": 54900 + }, + { + "epoch": 1.099, + "grad_norm": 6.944976806640625, + "learning_rate": 2.2538e-05, + "loss": 3.0758, + "step": 54950 + }, + { + "epoch": 1.1, + "grad_norm": 7.054170608520508, + "learning_rate": 2.2513e-05, + "loss": 2.9099, + "step": 55000 + }, + { + "epoch": 1.101, + "grad_norm": 8.738309860229492, + "learning_rate": 2.2488000000000002e-05, + "loss": 3.031, + "step": 55050 + }, + { + "epoch": 1.102, + "grad_norm": 7.9822893142700195, + "learning_rate": 2.2463e-05, + "loss": 2.8642, + "step": 55100 + }, + { + "epoch": 1.103, + "grad_norm": 6.917319297790527, + "learning_rate": 2.2438e-05, + "loss": 3.0214, + "step": 55150 + }, + { + "epoch": 1.104, + "grad_norm": 8.734328269958496, + "learning_rate": 2.2413e-05, + "loss": 3.0149, + "step": 55200 + }, + { + "epoch": 1.105, + "grad_norm": 6.920499801635742, + "learning_rate": 2.2388e-05, + "loss": 2.7432, + "step": 55250 + }, + { + "epoch": 1.106, + "grad_norm": 6.7622809410095215, + "learning_rate": 2.2363e-05, + "loss": 2.7757, + "step": 55300 + }, + { + "epoch": 1.107, + "grad_norm": 8.261820793151855, + "learning_rate": 2.2338000000000002e-05, + "loss": 2.983, + "step": 55350 + }, + { + "epoch": 1.108, + "grad_norm": 7.027587890625, + "learning_rate": 2.2313e-05, + "loss": 2.8707, + "step": 55400 + }, + { + "epoch": 1.109, + "grad_norm": 6.786430835723877, + "learning_rate": 2.2288e-05, + "loss": 3.0307, + "step": 55450 + }, + { + "epoch": 1.11, + "grad_norm": 7.964380264282227, + "learning_rate": 2.2263e-05, + "loss": 2.8678, + "step": 55500 + }, + { + "epoch": 1.111, + "grad_norm": 6.880892753601074, + "learning_rate": 2.2238e-05, + "loss": 2.9177, + "step": 55550 + }, + { + "epoch": 1.112, + "grad_norm": 8.703950881958008, + "learning_rate": 2.2213e-05, + "loss": 2.7289, + "step": 55600 + }, + { + "epoch": 1.113, + "grad_norm": 7.1236090660095215, + "learning_rate": 2.2188e-05, + "loss": 2.8935, + "step": 55650 + }, + { + "epoch": 1.114, + "grad_norm": 7.747103691101074, + "learning_rate": 2.2163e-05, + "loss": 2.805, + "step": 55700 + }, + { + "epoch": 1.115, + "grad_norm": 7.6112260818481445, + "learning_rate": 2.2138e-05, + "loss": 2.9563, + "step": 55750 + }, + { + "epoch": 1.116, + "grad_norm": 6.807455539703369, + "learning_rate": 2.2113e-05, + "loss": 2.9746, + "step": 55800 + }, + { + "epoch": 1.117, + "grad_norm": 7.226533889770508, + "learning_rate": 2.2088e-05, + "loss": 3.0062, + "step": 55850 + }, + { + "epoch": 1.1179999999999999, + "grad_norm": 7.821582794189453, + "learning_rate": 2.2063e-05, + "loss": 2.8594, + "step": 55900 + }, + { + "epoch": 1.119, + "grad_norm": 7.580191135406494, + "learning_rate": 2.2038e-05, + "loss": 2.9352, + "step": 55950 + }, + { + "epoch": 1.12, + "grad_norm": 7.944276809692383, + "learning_rate": 2.2013500000000002e-05, + "loss": 2.8431, + "step": 56000 + }, + { + "epoch": 1.121, + "grad_norm": 8.267626762390137, + "learning_rate": 2.19885e-05, + "loss": 2.9671, + "step": 56050 + }, + { + "epoch": 1.1219999999999999, + "grad_norm": 6.602245330810547, + "learning_rate": 2.19635e-05, + "loss": 2.8601, + "step": 56100 + }, + { + "epoch": 1.123, + "grad_norm": 7.23159122467041, + "learning_rate": 2.19385e-05, + "loss": 3.2612, + "step": 56150 + }, + { + "epoch": 1.124, + "grad_norm": 8.431252479553223, + "learning_rate": 2.19135e-05, + "loss": 2.8907, + "step": 56200 + }, + { + "epoch": 1.125, + "grad_norm": 7.482148170471191, + "learning_rate": 2.1888500000000002e-05, + "loss": 2.8308, + "step": 56250 + }, + { + "epoch": 1.126, + "grad_norm": 7.137932300567627, + "learning_rate": 2.18635e-05, + "loss": 2.9463, + "step": 56300 + }, + { + "epoch": 1.127, + "grad_norm": 8.12203311920166, + "learning_rate": 2.18385e-05, + "loss": 3.0064, + "step": 56350 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 6.96235466003418, + "learning_rate": 2.18135e-05, + "loss": 2.7659, + "step": 56400 + }, + { + "epoch": 1.129, + "grad_norm": 6.59172248840332, + "learning_rate": 2.17885e-05, + "loss": 2.9026, + "step": 56450 + }, + { + "epoch": 1.13, + "grad_norm": 7.312923431396484, + "learning_rate": 2.17635e-05, + "loss": 3.0246, + "step": 56500 + }, + { + "epoch": 1.131, + "grad_norm": 7.342343330383301, + "learning_rate": 2.1738500000000002e-05, + "loss": 2.9613, + "step": 56550 + }, + { + "epoch": 1.1320000000000001, + "grad_norm": 8.1234130859375, + "learning_rate": 2.17135e-05, + "loss": 2.9619, + "step": 56600 + }, + { + "epoch": 1.133, + "grad_norm": 7.191281318664551, + "learning_rate": 2.16885e-05, + "loss": 2.9403, + "step": 56650 + }, + { + "epoch": 1.134, + "grad_norm": 8.372027397155762, + "learning_rate": 2.16635e-05, + "loss": 3.0187, + "step": 56700 + }, + { + "epoch": 1.135, + "grad_norm": 6.654972553253174, + "learning_rate": 2.16385e-05, + "loss": 3.056, + "step": 56750 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 5.851999759674072, + "learning_rate": 2.16135e-05, + "loss": 2.8419, + "step": 56800 + }, + { + "epoch": 1.137, + "grad_norm": 8.843352317810059, + "learning_rate": 2.1588500000000002e-05, + "loss": 2.9252, + "step": 56850 + }, + { + "epoch": 1.138, + "grad_norm": 8.799443244934082, + "learning_rate": 2.15635e-05, + "loss": 2.9716, + "step": 56900 + }, + { + "epoch": 1.139, + "grad_norm": 7.772215843200684, + "learning_rate": 2.15385e-05, + "loss": 2.9954, + "step": 56950 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 8.679427146911621, + "learning_rate": 2.15135e-05, + "loss": 2.7281, + "step": 57000 + }, + { + "epoch": 1.141, + "grad_norm": 7.537428379058838, + "learning_rate": 2.14885e-05, + "loss": 3.0369, + "step": 57050 + }, + { + "epoch": 1.142, + "grad_norm": 7.318295478820801, + "learning_rate": 2.14635e-05, + "loss": 3.0333, + "step": 57100 + }, + { + "epoch": 1.143, + "grad_norm": 7.262914657592773, + "learning_rate": 2.14385e-05, + "loss": 2.8413, + "step": 57150 + }, + { + "epoch": 1.144, + "grad_norm": 7.274723529815674, + "learning_rate": 2.14135e-05, + "loss": 2.8387, + "step": 57200 + }, + { + "epoch": 1.145, + "grad_norm": 8.164852142333984, + "learning_rate": 2.13885e-05, + "loss": 2.7594, + "step": 57250 + }, + { + "epoch": 1.146, + "grad_norm": 7.805911540985107, + "learning_rate": 2.13635e-05, + "loss": 3.0005, + "step": 57300 + }, + { + "epoch": 1.147, + "grad_norm": 6.96591329574585, + "learning_rate": 2.13385e-05, + "loss": 2.8565, + "step": 57350 + }, + { + "epoch": 1.148, + "grad_norm": 7.270416259765625, + "learning_rate": 2.13135e-05, + "loss": 3.0091, + "step": 57400 + }, + { + "epoch": 1.149, + "grad_norm": 7.759993076324463, + "learning_rate": 2.12885e-05, + "loss": 2.9593, + "step": 57450 + }, + { + "epoch": 1.15, + "grad_norm": 7.487631320953369, + "learning_rate": 2.12635e-05, + "loss": 2.9263, + "step": 57500 + }, + { + "epoch": 1.151, + "grad_norm": 6.525301456451416, + "learning_rate": 2.12385e-05, + "loss": 2.8832, + "step": 57550 + }, + { + "epoch": 1.152, + "grad_norm": 7.14619779586792, + "learning_rate": 2.12135e-05, + "loss": 2.8469, + "step": 57600 + }, + { + "epoch": 1.153, + "grad_norm": 8.395821571350098, + "learning_rate": 2.11885e-05, + "loss": 2.9667, + "step": 57650 + }, + { + "epoch": 1.154, + "grad_norm": 7.4564900398254395, + "learning_rate": 2.11635e-05, + "loss": 2.9125, + "step": 57700 + }, + { + "epoch": 1.155, + "grad_norm": 7.6553120613098145, + "learning_rate": 2.11385e-05, + "loss": 3.046, + "step": 57750 + }, + { + "epoch": 1.156, + "grad_norm": 7.374424934387207, + "learning_rate": 2.11135e-05, + "loss": 2.7586, + "step": 57800 + }, + { + "epoch": 1.157, + "grad_norm": 8.02834415435791, + "learning_rate": 2.10885e-05, + "loss": 2.8078, + "step": 57850 + }, + { + "epoch": 1.158, + "grad_norm": 8.8065767288208, + "learning_rate": 2.10635e-05, + "loss": 2.7314, + "step": 57900 + }, + { + "epoch": 1.159, + "grad_norm": 7.677451133728027, + "learning_rate": 2.10385e-05, + "loss": 2.8307, + "step": 57950 + }, + { + "epoch": 1.16, + "grad_norm": 7.3158464431762695, + "learning_rate": 2.10135e-05, + "loss": 2.8309, + "step": 58000 + }, + { + "epoch": 1.161, + "grad_norm": 8.057973861694336, + "learning_rate": 2.0989000000000002e-05, + "loss": 2.8944, + "step": 58050 + }, + { + "epoch": 1.162, + "grad_norm": 8.379156112670898, + "learning_rate": 2.0964e-05, + "loss": 2.9813, + "step": 58100 + }, + { + "epoch": 1.163, + "grad_norm": 6.20780611038208, + "learning_rate": 2.0939e-05, + "loss": 2.8746, + "step": 58150 + }, + { + "epoch": 1.164, + "grad_norm": 6.247302532196045, + "learning_rate": 2.0914e-05, + "loss": 3.1069, + "step": 58200 + }, + { + "epoch": 1.165, + "grad_norm": 10.308443069458008, + "learning_rate": 2.0889e-05, + "loss": 2.9991, + "step": 58250 + }, + { + "epoch": 1.166, + "grad_norm": 7.055253982543945, + "learning_rate": 2.0864e-05, + "loss": 3.0063, + "step": 58300 + }, + { + "epoch": 1.167, + "grad_norm": 8.708407402038574, + "learning_rate": 2.0839e-05, + "loss": 3.0384, + "step": 58350 + }, + { + "epoch": 1.168, + "grad_norm": 9.416001319885254, + "learning_rate": 2.0814e-05, + "loss": 3.0263, + "step": 58400 + }, + { + "epoch": 1.169, + "grad_norm": 7.336208820343018, + "learning_rate": 2.0789e-05, + "loss": 2.8834, + "step": 58450 + }, + { + "epoch": 1.17, + "grad_norm": 7.314346790313721, + "learning_rate": 2.0764e-05, + "loss": 2.9417, + "step": 58500 + }, + { + "epoch": 1.171, + "grad_norm": 7.871457576751709, + "learning_rate": 2.0739e-05, + "loss": 2.7274, + "step": 58550 + }, + { + "epoch": 1.172, + "grad_norm": 7.405073165893555, + "learning_rate": 2.0714e-05, + "loss": 2.9292, + "step": 58600 + }, + { + "epoch": 1.173, + "grad_norm": 7.039045333862305, + "learning_rate": 2.0689e-05, + "loss": 2.9214, + "step": 58650 + }, + { + "epoch": 1.174, + "grad_norm": 8.36693000793457, + "learning_rate": 2.0664e-05, + "loss": 2.7912, + "step": 58700 + }, + { + "epoch": 1.175, + "grad_norm": 6.045433521270752, + "learning_rate": 2.0639e-05, + "loss": 2.9735, + "step": 58750 + }, + { + "epoch": 1.176, + "grad_norm": 6.981970310211182, + "learning_rate": 2.0614e-05, + "loss": 3.0241, + "step": 58800 + }, + { + "epoch": 1.177, + "grad_norm": 8.423614501953125, + "learning_rate": 2.0589e-05, + "loss": 3.1809, + "step": 58850 + }, + { + "epoch": 1.178, + "grad_norm": 7.818522930145264, + "learning_rate": 2.0563999999999998e-05, + "loss": 2.8685, + "step": 58900 + }, + { + "epoch": 1.179, + "grad_norm": 7.669625759124756, + "learning_rate": 2.0539e-05, + "loss": 2.993, + "step": 58950 + }, + { + "epoch": 1.18, + "grad_norm": 7.713820457458496, + "learning_rate": 2.0514e-05, + "loss": 2.8745, + "step": 59000 + }, + { + "epoch": 1.181, + "grad_norm": 8.473793983459473, + "learning_rate": 2.0489e-05, + "loss": 2.9275, + "step": 59050 + }, + { + "epoch": 1.182, + "grad_norm": 7.552177429199219, + "learning_rate": 2.0464e-05, + "loss": 2.9823, + "step": 59100 + }, + { + "epoch": 1.183, + "grad_norm": 8.08139419555664, + "learning_rate": 2.0439e-05, + "loss": 2.8557, + "step": 59150 + }, + { + "epoch": 1.184, + "grad_norm": 7.447910785675049, + "learning_rate": 2.0414e-05, + "loss": 2.9044, + "step": 59200 + }, + { + "epoch": 1.185, + "grad_norm": 8.74582290649414, + "learning_rate": 2.0389e-05, + "loss": 2.8841, + "step": 59250 + }, + { + "epoch": 1.186, + "grad_norm": 6.812131404876709, + "learning_rate": 2.0364e-05, + "loss": 2.7652, + "step": 59300 + }, + { + "epoch": 1.187, + "grad_norm": 6.931506156921387, + "learning_rate": 2.0339e-05, + "loss": 2.9071, + "step": 59350 + }, + { + "epoch": 1.188, + "grad_norm": 7.589115619659424, + "learning_rate": 2.0314e-05, + "loss": 3.0316, + "step": 59400 + }, + { + "epoch": 1.189, + "grad_norm": 7.810254096984863, + "learning_rate": 2.0289e-05, + "loss": 2.89, + "step": 59450 + }, + { + "epoch": 1.19, + "grad_norm": 6.848294258117676, + "learning_rate": 2.0264e-05, + "loss": 2.938, + "step": 59500 + }, + { + "epoch": 1.191, + "grad_norm": 8.115826606750488, + "learning_rate": 2.0239e-05, + "loss": 2.958, + "step": 59550 + }, + { + "epoch": 1.192, + "grad_norm": 7.386075973510742, + "learning_rate": 2.0214e-05, + "loss": 2.902, + "step": 59600 + }, + { + "epoch": 1.193, + "grad_norm": 5.866041660308838, + "learning_rate": 2.0189e-05, + "loss": 2.883, + "step": 59650 + }, + { + "epoch": 1.194, + "grad_norm": 6.591806411743164, + "learning_rate": 2.0164000000000002e-05, + "loss": 2.7417, + "step": 59700 + }, + { + "epoch": 1.195, + "grad_norm": 7.479393482208252, + "learning_rate": 2.0139e-05, + "loss": 2.8905, + "step": 59750 + }, + { + "epoch": 1.196, + "grad_norm": 6.727341651916504, + "learning_rate": 2.0114e-05, + "loss": 2.9524, + "step": 59800 + }, + { + "epoch": 1.197, + "grad_norm": 7.175183296203613, + "learning_rate": 2.0089000000000004e-05, + "loss": 2.7486, + "step": 59850 + }, + { + "epoch": 1.198, + "grad_norm": 8.155020713806152, + "learning_rate": 2.0064000000000003e-05, + "loss": 3.042, + "step": 59900 + }, + { + "epoch": 1.199, + "grad_norm": 6.671029567718506, + "learning_rate": 2.0039000000000002e-05, + "loss": 3.034, + "step": 59950 + }, + { + "epoch": 1.2, + "grad_norm": 8.177928924560547, + "learning_rate": 2.0014000000000002e-05, + "loss": 3.0329, + "step": 60000 + }, + { + "epoch": 1.201, + "grad_norm": 8.286439895629883, + "learning_rate": 1.9989e-05, + "loss": 3.051, + "step": 60050 + }, + { + "epoch": 1.202, + "grad_norm": 8.125651359558105, + "learning_rate": 1.99645e-05, + "loss": 3.1017, + "step": 60100 + }, + { + "epoch": 1.203, + "grad_norm": 6.588275909423828, + "learning_rate": 1.99395e-05, + "loss": 2.9559, + "step": 60150 + }, + { + "epoch": 1.204, + "grad_norm": 8.682856559753418, + "learning_rate": 1.99145e-05, + "loss": 2.799, + "step": 60200 + }, + { + "epoch": 1.205, + "grad_norm": 7.333592414855957, + "learning_rate": 1.98895e-05, + "loss": 2.9984, + "step": 60250 + }, + { + "epoch": 1.206, + "grad_norm": 6.971369743347168, + "learning_rate": 1.98645e-05, + "loss": 2.8116, + "step": 60300 + }, + { + "epoch": 1.207, + "grad_norm": 9.593948364257812, + "learning_rate": 1.98395e-05, + "loss": 2.7672, + "step": 60350 + }, + { + "epoch": 1.208, + "grad_norm": 6.947750091552734, + "learning_rate": 1.9814499999999998e-05, + "loss": 2.8234, + "step": 60400 + }, + { + "epoch": 1.209, + "grad_norm": 9.746689796447754, + "learning_rate": 1.97895e-05, + "loss": 2.8323, + "step": 60450 + }, + { + "epoch": 1.21, + "grad_norm": 7.871998310089111, + "learning_rate": 1.97645e-05, + "loss": 3.0075, + "step": 60500 + }, + { + "epoch": 1.211, + "grad_norm": 8.099973678588867, + "learning_rate": 1.97395e-05, + "loss": 2.894, + "step": 60550 + }, + { + "epoch": 1.212, + "grad_norm": 9.960383415222168, + "learning_rate": 1.97145e-05, + "loss": 3.0326, + "step": 60600 + }, + { + "epoch": 1.213, + "grad_norm": 7.341196060180664, + "learning_rate": 1.96895e-05, + "loss": 2.7773, + "step": 60650 + }, + { + "epoch": 1.214, + "grad_norm": 7.2196125984191895, + "learning_rate": 1.96645e-05, + "loss": 2.9141, + "step": 60700 + }, + { + "epoch": 1.215, + "grad_norm": 7.776864051818848, + "learning_rate": 1.96395e-05, + "loss": 3.0412, + "step": 60750 + }, + { + "epoch": 1.216, + "grad_norm": 9.311785697937012, + "learning_rate": 1.9614500000000003e-05, + "loss": 2.9752, + "step": 60800 + }, + { + "epoch": 1.217, + "grad_norm": 9.076855659484863, + "learning_rate": 1.9589500000000003e-05, + "loss": 2.7665, + "step": 60850 + }, + { + "epoch": 1.218, + "grad_norm": 7.043571472167969, + "learning_rate": 1.9564500000000002e-05, + "loss": 2.8502, + "step": 60900 + }, + { + "epoch": 1.219, + "grad_norm": 8.467164993286133, + "learning_rate": 1.95395e-05, + "loss": 2.8783, + "step": 60950 + }, + { + "epoch": 1.22, + "grad_norm": 6.73739767074585, + "learning_rate": 1.95145e-05, + "loss": 2.7169, + "step": 61000 + }, + { + "epoch": 1.221, + "grad_norm": 8.30189037322998, + "learning_rate": 1.9489500000000004e-05, + "loss": 2.8347, + "step": 61050 + }, + { + "epoch": 1.222, + "grad_norm": 8.079895973205566, + "learning_rate": 1.9464500000000003e-05, + "loss": 3.0235, + "step": 61100 + }, + { + "epoch": 1.223, + "grad_norm": 8.46454906463623, + "learning_rate": 1.9439500000000003e-05, + "loss": 2.7273, + "step": 61150 + }, + { + "epoch": 1.224, + "grad_norm": 7.661693096160889, + "learning_rate": 1.9414500000000002e-05, + "loss": 3.0257, + "step": 61200 + }, + { + "epoch": 1.225, + "grad_norm": 8.443622589111328, + "learning_rate": 1.93895e-05, + "loss": 2.9563, + "step": 61250 + }, + { + "epoch": 1.226, + "grad_norm": 6.420770168304443, + "learning_rate": 1.93645e-05, + "loss": 3.0847, + "step": 61300 + }, + { + "epoch": 1.227, + "grad_norm": 7.339363098144531, + "learning_rate": 1.9339500000000003e-05, + "loss": 2.9745, + "step": 61350 + }, + { + "epoch": 1.228, + "grad_norm": 10.756458282470703, + "learning_rate": 1.9314500000000003e-05, + "loss": 2.8565, + "step": 61400 + }, + { + "epoch": 1.229, + "grad_norm": 7.621661186218262, + "learning_rate": 1.9289500000000002e-05, + "loss": 2.8492, + "step": 61450 + }, + { + "epoch": 1.23, + "grad_norm": 8.242738723754883, + "learning_rate": 1.92645e-05, + "loss": 2.9573, + "step": 61500 + }, + { + "epoch": 1.231, + "grad_norm": 8.807046890258789, + "learning_rate": 1.92395e-05, + "loss": 2.8937, + "step": 61550 + }, + { + "epoch": 1.232, + "grad_norm": 7.321935653686523, + "learning_rate": 1.92145e-05, + "loss": 2.7142, + "step": 61600 + }, + { + "epoch": 1.233, + "grad_norm": 7.530791759490967, + "learning_rate": 1.9189500000000003e-05, + "loss": 2.8536, + "step": 61650 + }, + { + "epoch": 1.234, + "grad_norm": 9.09104061126709, + "learning_rate": 1.9164500000000003e-05, + "loss": 2.9084, + "step": 61700 + }, + { + "epoch": 1.2349999999999999, + "grad_norm": 8.539567947387695, + "learning_rate": 1.9139500000000002e-05, + "loss": 3.0237, + "step": 61750 + }, + { + "epoch": 1.236, + "grad_norm": 7.933023929595947, + "learning_rate": 1.91145e-05, + "loss": 2.8838, + "step": 61800 + }, + { + "epoch": 1.237, + "grad_norm": 7.881004810333252, + "learning_rate": 1.90895e-05, + "loss": 2.6865, + "step": 61850 + }, + { + "epoch": 1.238, + "grad_norm": 7.823519706726074, + "learning_rate": 1.90645e-05, + "loss": 2.8898, + "step": 61900 + }, + { + "epoch": 1.2389999999999999, + "grad_norm": 7.689711570739746, + "learning_rate": 1.9039500000000003e-05, + "loss": 2.9576, + "step": 61950 + }, + { + "epoch": 1.24, + "grad_norm": 11.04930305480957, + "learning_rate": 1.9014500000000002e-05, + "loss": 2.7864, + "step": 62000 + }, + { + "epoch": 1.241, + "grad_norm": 7.104748249053955, + "learning_rate": 1.8989500000000002e-05, + "loss": 2.8336, + "step": 62050 + }, + { + "epoch": 1.242, + "grad_norm": 8.643699645996094, + "learning_rate": 1.8965000000000002e-05, + "loss": 2.9855, + "step": 62100 + }, + { + "epoch": 1.2429999999999999, + "grad_norm": 8.03945255279541, + "learning_rate": 1.894e-05, + "loss": 2.9015, + "step": 62150 + }, + { + "epoch": 1.244, + "grad_norm": 8.157134056091309, + "learning_rate": 1.8915e-05, + "loss": 3.0076, + "step": 62200 + }, + { + "epoch": 1.245, + "grad_norm": 7.763930797576904, + "learning_rate": 1.8890000000000004e-05, + "loss": 2.9506, + "step": 62250 + }, + { + "epoch": 1.246, + "grad_norm": 7.697092056274414, + "learning_rate": 1.8865000000000003e-05, + "loss": 2.8397, + "step": 62300 + }, + { + "epoch": 1.2469999999999999, + "grad_norm": 6.57480001449585, + "learning_rate": 1.8840000000000003e-05, + "loss": 2.7468, + "step": 62350 + }, + { + "epoch": 1.248, + "grad_norm": 8.595124244689941, + "learning_rate": 1.8815000000000002e-05, + "loss": 2.9574, + "step": 62400 + }, + { + "epoch": 1.249, + "grad_norm": 7.175886631011963, + "learning_rate": 1.879e-05, + "loss": 2.9085, + "step": 62450 + }, + { + "epoch": 1.25, + "grad_norm": 8.397321701049805, + "learning_rate": 1.8765e-05, + "loss": 2.9286, + "step": 62500 + }, + { + "epoch": 1.251, + "grad_norm": 6.980536937713623, + "learning_rate": 1.8740000000000004e-05, + "loss": 2.747, + "step": 62550 + }, + { + "epoch": 1.252, + "grad_norm": 7.271572113037109, + "learning_rate": 1.8715000000000003e-05, + "loss": 2.7768, + "step": 62600 + }, + { + "epoch": 1.2530000000000001, + "grad_norm": 7.7483696937561035, + "learning_rate": 1.8690000000000002e-05, + "loss": 3.0488, + "step": 62650 + }, + { + "epoch": 1.254, + "grad_norm": 11.277056694030762, + "learning_rate": 1.8665000000000002e-05, + "loss": 3.0649, + "step": 62700 + }, + { + "epoch": 1.255, + "grad_norm": 8.678947448730469, + "learning_rate": 1.864e-05, + "loss": 2.9212, + "step": 62750 + }, + { + "epoch": 1.256, + "grad_norm": 7.193722724914551, + "learning_rate": 1.8615e-05, + "loss": 2.7391, + "step": 62800 + }, + { + "epoch": 1.2570000000000001, + "grad_norm": 12.467113494873047, + "learning_rate": 1.8590000000000003e-05, + "loss": 2.9668, + "step": 62850 + }, + { + "epoch": 1.258, + "grad_norm": 7.693551063537598, + "learning_rate": 1.8565000000000003e-05, + "loss": 2.7936, + "step": 62900 + }, + { + "epoch": 1.259, + "grad_norm": 7.1652445793151855, + "learning_rate": 1.8540000000000002e-05, + "loss": 2.9067, + "step": 62950 + }, + { + "epoch": 1.26, + "grad_norm": 6.875372409820557, + "learning_rate": 1.8515e-05, + "loss": 2.9379, + "step": 63000 + }, + { + "epoch": 1.2610000000000001, + "grad_norm": 8.923434257507324, + "learning_rate": 1.849e-05, + "loss": 2.7059, + "step": 63050 + }, + { + "epoch": 1.262, + "grad_norm": 11.432680130004883, + "learning_rate": 1.8465e-05, + "loss": 2.9926, + "step": 63100 + }, + { + "epoch": 1.263, + "grad_norm": 10.260088920593262, + "learning_rate": 1.8440000000000003e-05, + "loss": 3.0907, + "step": 63150 + }, + { + "epoch": 1.264, + "grad_norm": 8.717737197875977, + "learning_rate": 1.8415000000000002e-05, + "loss": 2.9059, + "step": 63200 + }, + { + "epoch": 1.2650000000000001, + "grad_norm": 8.755745887756348, + "learning_rate": 1.8390000000000002e-05, + "loss": 2.8056, + "step": 63250 + }, + { + "epoch": 1.266, + "grad_norm": 6.1573286056518555, + "learning_rate": 1.8365e-05, + "loss": 3.0321, + "step": 63300 + }, + { + "epoch": 1.267, + "grad_norm": 8.053323745727539, + "learning_rate": 1.834e-05, + "loss": 2.8293, + "step": 63350 + }, + { + "epoch": 1.268, + "grad_norm": 7.611751556396484, + "learning_rate": 1.8315e-05, + "loss": 2.9095, + "step": 63400 + }, + { + "epoch": 1.2690000000000001, + "grad_norm": 9.002436637878418, + "learning_rate": 1.8290000000000003e-05, + "loss": 2.8894, + "step": 63450 + }, + { + "epoch": 1.27, + "grad_norm": 8.82766056060791, + "learning_rate": 1.8265000000000002e-05, + "loss": 3.1194, + "step": 63500 + }, + { + "epoch": 1.271, + "grad_norm": 7.163455486297607, + "learning_rate": 1.824e-05, + "loss": 2.8781, + "step": 63550 + }, + { + "epoch": 1.272, + "grad_norm": 7.2303032875061035, + "learning_rate": 1.8215e-05, + "loss": 2.7364, + "step": 63600 + }, + { + "epoch": 1.2730000000000001, + "grad_norm": 7.012924671173096, + "learning_rate": 1.819e-05, + "loss": 2.7426, + "step": 63650 + }, + { + "epoch": 1.274, + "grad_norm": 8.245091438293457, + "learning_rate": 1.8165000000000003e-05, + "loss": 2.7933, + "step": 63700 + }, + { + "epoch": 1.275, + "grad_norm": 8.162280082702637, + "learning_rate": 1.8140000000000003e-05, + "loss": 2.9715, + "step": 63750 + }, + { + "epoch": 1.276, + "grad_norm": 6.616583824157715, + "learning_rate": 1.8115000000000002e-05, + "loss": 2.7865, + "step": 63800 + }, + { + "epoch": 1.2770000000000001, + "grad_norm": 7.844861030578613, + "learning_rate": 1.809e-05, + "loss": 2.9921, + "step": 63850 + }, + { + "epoch": 1.278, + "grad_norm": 5.926692008972168, + "learning_rate": 1.8065e-05, + "loss": 2.7858, + "step": 63900 + }, + { + "epoch": 1.279, + "grad_norm": 8.044204711914062, + "learning_rate": 1.804e-05, + "loss": 2.9768, + "step": 63950 + }, + { + "epoch": 1.28, + "grad_norm": 7.818645477294922, + "learning_rate": 1.8015000000000003e-05, + "loss": 2.7761, + "step": 64000 + }, + { + "epoch": 1.2810000000000001, + "grad_norm": 6.930302143096924, + "learning_rate": 1.7990000000000002e-05, + "loss": 2.9969, + "step": 64050 + }, + { + "epoch": 1.282, + "grad_norm": 7.224350929260254, + "learning_rate": 1.7965e-05, + "loss": 2.8126, + "step": 64100 + }, + { + "epoch": 1.283, + "grad_norm": 8.74736499786377, + "learning_rate": 1.7940500000000002e-05, + "loss": 3.0012, + "step": 64150 + }, + { + "epoch": 1.284, + "grad_norm": 7.229979038238525, + "learning_rate": 1.79155e-05, + "loss": 2.7311, + "step": 64200 + }, + { + "epoch": 1.285, + "grad_norm": 7.284625053405762, + "learning_rate": 1.78905e-05, + "loss": 3.009, + "step": 64250 + }, + { + "epoch": 1.286, + "grad_norm": 7.718355178833008, + "learning_rate": 1.78655e-05, + "loss": 2.874, + "step": 64300 + }, + { + "epoch": 1.287, + "grad_norm": 7.456071376800537, + "learning_rate": 1.7840500000000003e-05, + "loss": 2.9934, + "step": 64350 + }, + { + "epoch": 1.288, + "grad_norm": 10.048409461975098, + "learning_rate": 1.7815500000000002e-05, + "loss": 2.7551, + "step": 64400 + }, + { + "epoch": 1.289, + "grad_norm": 6.842342853546143, + "learning_rate": 1.7790500000000002e-05, + "loss": 2.9174, + "step": 64450 + }, + { + "epoch": 1.29, + "grad_norm": 8.162269592285156, + "learning_rate": 1.77655e-05, + "loss": 2.9437, + "step": 64500 + }, + { + "epoch": 1.291, + "grad_norm": 6.678775787353516, + "learning_rate": 1.77405e-05, + "loss": 3.0262, + "step": 64550 + }, + { + "epoch": 1.292, + "grad_norm": 7.563632011413574, + "learning_rate": 1.77155e-05, + "loss": 2.8644, + "step": 64600 + }, + { + "epoch": 1.293, + "grad_norm": 6.082263946533203, + "learning_rate": 1.7690500000000003e-05, + "loss": 2.7633, + "step": 64650 + }, + { + "epoch": 1.294, + "grad_norm": 6.205733299255371, + "learning_rate": 1.7665500000000002e-05, + "loss": 2.7663, + "step": 64700 + }, + { + "epoch": 1.295, + "grad_norm": 8.632345199584961, + "learning_rate": 1.76405e-05, + "loss": 2.7141, + "step": 64750 + }, + { + "epoch": 1.296, + "grad_norm": 6.342571258544922, + "learning_rate": 1.76155e-05, + "loss": 2.8406, + "step": 64800 + }, + { + "epoch": 1.297, + "grad_norm": 6.610126495361328, + "learning_rate": 1.75905e-05, + "loss": 2.87, + "step": 64850 + }, + { + "epoch": 1.298, + "grad_norm": 7.278533458709717, + "learning_rate": 1.75655e-05, + "loss": 2.8149, + "step": 64900 + }, + { + "epoch": 1.299, + "grad_norm": 8.10925006866455, + "learning_rate": 1.7540500000000003e-05, + "loss": 2.8956, + "step": 64950 + }, + { + "epoch": 1.3, + "grad_norm": 8.207983016967773, + "learning_rate": 1.7515500000000002e-05, + "loss": 2.9087, + "step": 65000 + }, + { + "epoch": 1.301, + "grad_norm": 8.794236183166504, + "learning_rate": 1.74905e-05, + "loss": 3.0564, + "step": 65050 + }, + { + "epoch": 1.302, + "grad_norm": 15.703174591064453, + "learning_rate": 1.74655e-05, + "loss": 2.9178, + "step": 65100 + }, + { + "epoch": 1.303, + "grad_norm": 8.117988586425781, + "learning_rate": 1.74405e-05, + "loss": 2.8114, + "step": 65150 + }, + { + "epoch": 1.304, + "grad_norm": 7.884382247924805, + "learning_rate": 1.7415500000000003e-05, + "loss": 2.7444, + "step": 65200 + }, + { + "epoch": 1.305, + "grad_norm": 8.498047828674316, + "learning_rate": 1.7390500000000002e-05, + "loss": 2.8569, + "step": 65250 + }, + { + "epoch": 1.306, + "grad_norm": 7.473427772521973, + "learning_rate": 1.7365500000000002e-05, + "loss": 2.9345, + "step": 65300 + }, + { + "epoch": 1.307, + "grad_norm": 7.29819917678833, + "learning_rate": 1.73405e-05, + "loss": 3.117, + "step": 65350 + }, + { + "epoch": 1.308, + "grad_norm": 8.479053497314453, + "learning_rate": 1.73155e-05, + "loss": 2.8155, + "step": 65400 + }, + { + "epoch": 1.309, + "grad_norm": 8.74229907989502, + "learning_rate": 1.72905e-05, + "loss": 2.9308, + "step": 65450 + }, + { + "epoch": 1.31, + "grad_norm": 8.387805938720703, + "learning_rate": 1.7265500000000003e-05, + "loss": 2.9132, + "step": 65500 + }, + { + "epoch": 1.311, + "grad_norm": 8.244154930114746, + "learning_rate": 1.7240500000000002e-05, + "loss": 2.8277, + "step": 65550 + }, + { + "epoch": 1.312, + "grad_norm": 6.698085308074951, + "learning_rate": 1.72155e-05, + "loss": 3.0865, + "step": 65600 + }, + { + "epoch": 1.313, + "grad_norm": 10.616350173950195, + "learning_rate": 1.71905e-05, + "loss": 2.8149, + "step": 65650 + }, + { + "epoch": 1.314, + "grad_norm": 9.140168190002441, + "learning_rate": 1.71655e-05, + "loss": 2.8297, + "step": 65700 + }, + { + "epoch": 1.315, + "grad_norm": 9.049459457397461, + "learning_rate": 1.71405e-05, + "loss": 2.6896, + "step": 65750 + }, + { + "epoch": 1.316, + "grad_norm": 6.275703430175781, + "learning_rate": 1.7115500000000002e-05, + "loss": 2.8437, + "step": 65800 + }, + { + "epoch": 1.317, + "grad_norm": 9.357379913330078, + "learning_rate": 1.7090500000000002e-05, + "loss": 3.0127, + "step": 65850 + }, + { + "epoch": 1.318, + "grad_norm": 7.831468105316162, + "learning_rate": 1.70655e-05, + "loss": 2.792, + "step": 65900 + }, + { + "epoch": 1.319, + "grad_norm": 9.359698295593262, + "learning_rate": 1.70405e-05, + "loss": 3.0451, + "step": 65950 + }, + { + "epoch": 1.32, + "grad_norm": 6.813355922698975, + "learning_rate": 1.70155e-05, + "loss": 2.7334, + "step": 66000 + }, + { + "epoch": 1.321, + "grad_norm": 8.375040054321289, + "learning_rate": 1.69905e-05, + "loss": 2.7643, + "step": 66050 + }, + { + "epoch": 1.322, + "grad_norm": 6.97425651550293, + "learning_rate": 1.6965500000000002e-05, + "loss": 2.9207, + "step": 66100 + }, + { + "epoch": 1.323, + "grad_norm": 9.489729881286621, + "learning_rate": 1.6941000000000003e-05, + "loss": 2.8561, + "step": 66150 + }, + { + "epoch": 1.324, + "grad_norm": 8.421103477478027, + "learning_rate": 1.6916000000000002e-05, + "loss": 3.0594, + "step": 66200 + }, + { + "epoch": 1.325, + "grad_norm": 7.924808502197266, + "learning_rate": 1.6891e-05, + "loss": 2.9725, + "step": 66250 + }, + { + "epoch": 1.326, + "grad_norm": 8.143972396850586, + "learning_rate": 1.6866e-05, + "loss": 2.7149, + "step": 66300 + }, + { + "epoch": 1.327, + "grad_norm": 11.077095031738281, + "learning_rate": 1.6841e-05, + "loss": 2.858, + "step": 66350 + }, + { + "epoch": 1.328, + "grad_norm": 7.736219882965088, + "learning_rate": 1.6816e-05, + "loss": 2.9161, + "step": 66400 + }, + { + "epoch": 1.329, + "grad_norm": 9.575567245483398, + "learning_rate": 1.6791000000000002e-05, + "loss": 2.8953, + "step": 66450 + }, + { + "epoch": 1.33, + "grad_norm": 6.864261150360107, + "learning_rate": 1.6766000000000002e-05, + "loss": 2.7205, + "step": 66500 + }, + { + "epoch": 1.331, + "grad_norm": 7.764801502227783, + "learning_rate": 1.6741e-05, + "loss": 2.9188, + "step": 66550 + }, + { + "epoch": 1.332, + "grad_norm": 10.629688262939453, + "learning_rate": 1.6716e-05, + "loss": 2.9199, + "step": 66600 + }, + { + "epoch": 1.333, + "grad_norm": 8.512171745300293, + "learning_rate": 1.6691e-05, + "loss": 2.8188, + "step": 66650 + }, + { + "epoch": 1.334, + "grad_norm": 8.378336906433105, + "learning_rate": 1.6666000000000003e-05, + "loss": 2.7863, + "step": 66700 + }, + { + "epoch": 1.335, + "grad_norm": 8.283653259277344, + "learning_rate": 1.6641000000000002e-05, + "loss": 2.9134, + "step": 66750 + }, + { + "epoch": 1.336, + "grad_norm": 6.067685604095459, + "learning_rate": 1.6616e-05, + "loss": 2.661, + "step": 66800 + }, + { + "epoch": 1.337, + "grad_norm": 7.018548488616943, + "learning_rate": 1.6591e-05, + "loss": 2.7564, + "step": 66850 + }, + { + "epoch": 1.338, + "grad_norm": 8.71909236907959, + "learning_rate": 1.6566e-05, + "loss": 2.7492, + "step": 66900 + }, + { + "epoch": 1.339, + "grad_norm": 7.719536304473877, + "learning_rate": 1.6541e-05, + "loss": 2.9521, + "step": 66950 + }, + { + "epoch": 1.34, + "grad_norm": 9.370909690856934, + "learning_rate": 1.6516000000000002e-05, + "loss": 2.9058, + "step": 67000 + }, + { + "epoch": 1.341, + "grad_norm": 6.47285795211792, + "learning_rate": 1.6491000000000002e-05, + "loss": 2.8832, + "step": 67050 + }, + { + "epoch": 1.342, + "grad_norm": 6.0031890869140625, + "learning_rate": 1.6466e-05, + "loss": 2.6373, + "step": 67100 + }, + { + "epoch": 1.343, + "grad_norm": 7.42556095123291, + "learning_rate": 1.6441e-05, + "loss": 2.9758, + "step": 67150 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 6.864173889160156, + "learning_rate": 1.6416e-05, + "loss": 2.8131, + "step": 67200 + }, + { + "epoch": 1.345, + "grad_norm": 8.983753204345703, + "learning_rate": 1.6391e-05, + "loss": 2.7171, + "step": 67250 + }, + { + "epoch": 1.346, + "grad_norm": 7.330747604370117, + "learning_rate": 1.6366000000000002e-05, + "loss": 2.7697, + "step": 67300 + }, + { + "epoch": 1.347, + "grad_norm": 8.947467803955078, + "learning_rate": 1.6341e-05, + "loss": 2.836, + "step": 67350 + }, + { + "epoch": 1.3479999999999999, + "grad_norm": 8.130457878112793, + "learning_rate": 1.6316e-05, + "loss": 2.8659, + "step": 67400 + }, + { + "epoch": 1.349, + "grad_norm": 7.954550743103027, + "learning_rate": 1.6291e-05, + "loss": 2.7594, + "step": 67450 + }, + { + "epoch": 1.35, + "grad_norm": 7.327019691467285, + "learning_rate": 1.6266e-05, + "loss": 2.956, + "step": 67500 + }, + { + "epoch": 1.351, + "grad_norm": 8.24224853515625, + "learning_rate": 1.6241e-05, + "loss": 2.7842, + "step": 67550 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 7.031774997711182, + "learning_rate": 1.6216000000000002e-05, + "loss": 2.8288, + "step": 67600 + }, + { + "epoch": 1.353, + "grad_norm": 17.19927978515625, + "learning_rate": 1.6191e-05, + "loss": 2.9684, + "step": 67650 + }, + { + "epoch": 1.354, + "grad_norm": 9.650681495666504, + "learning_rate": 1.6166e-05, + "loss": 2.853, + "step": 67700 + }, + { + "epoch": 1.355, + "grad_norm": 7.688391208648682, + "learning_rate": 1.6141e-05, + "loss": 2.9444, + "step": 67750 + }, + { + "epoch": 1.3559999999999999, + "grad_norm": 6.4372406005859375, + "learning_rate": 1.6116e-05, + "loss": 2.7925, + "step": 67800 + }, + { + "epoch": 1.357, + "grad_norm": 8.255843162536621, + "learning_rate": 1.6091e-05, + "loss": 2.9575, + "step": 67850 + }, + { + "epoch": 1.358, + "grad_norm": 8.876387596130371, + "learning_rate": 1.6066e-05, + "loss": 2.8265, + "step": 67900 + }, + { + "epoch": 1.359, + "grad_norm": 6.5156941413879395, + "learning_rate": 1.6041e-05, + "loss": 2.984, + "step": 67950 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 10.278947830200195, + "learning_rate": 1.6016e-05, + "loss": 2.8802, + "step": 68000 + }, + { + "epoch": 1.361, + "grad_norm": 8.730320930480957, + "learning_rate": 1.5991e-05, + "loss": 2.9155, + "step": 68050 + }, + { + "epoch": 1.362, + "grad_norm": 7.59114408493042, + "learning_rate": 1.5966e-05, + "loss": 2.9146, + "step": 68100 + }, + { + "epoch": 1.363, + "grad_norm": 7.220095157623291, + "learning_rate": 1.5941000000000002e-05, + "loss": 2.6534, + "step": 68150 + }, + { + "epoch": 1.3639999999999999, + "grad_norm": 8.136276245117188, + "learning_rate": 1.5916500000000003e-05, + "loss": 2.9809, + "step": 68200 + }, + { + "epoch": 1.365, + "grad_norm": 7.093013763427734, + "learning_rate": 1.5891500000000002e-05, + "loss": 2.7613, + "step": 68250 + }, + { + "epoch": 1.366, + "grad_norm": 8.815773963928223, + "learning_rate": 1.58665e-05, + "loss": 3.066, + "step": 68300 + }, + { + "epoch": 1.367, + "grad_norm": 7.6265668869018555, + "learning_rate": 1.58415e-05, + "loss": 2.8987, + "step": 68350 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 8.445241928100586, + "learning_rate": 1.58165e-05, + "loss": 2.7399, + "step": 68400 + }, + { + "epoch": 1.369, + "grad_norm": 9.16798210144043, + "learning_rate": 1.57915e-05, + "loss": 2.9594, + "step": 68450 + }, + { + "epoch": 1.37, + "grad_norm": 7.255063533782959, + "learning_rate": 1.5766500000000002e-05, + "loss": 2.7959, + "step": 68500 + }, + { + "epoch": 1.371, + "grad_norm": 7.769148349761963, + "learning_rate": 1.57415e-05, + "loss": 2.802, + "step": 68550 + }, + { + "epoch": 1.3719999999999999, + "grad_norm": 8.59789752960205, + "learning_rate": 1.57165e-05, + "loss": 2.7217, + "step": 68600 + }, + { + "epoch": 1.373, + "grad_norm": 10.071572303771973, + "learning_rate": 1.56915e-05, + "loss": 2.9676, + "step": 68650 + }, + { + "epoch": 1.374, + "grad_norm": 10.081389427185059, + "learning_rate": 1.56665e-05, + "loss": 2.7645, + "step": 68700 + }, + { + "epoch": 1.375, + "grad_norm": 8.29869270324707, + "learning_rate": 1.56415e-05, + "loss": 2.9042, + "step": 68750 + }, + { + "epoch": 1.376, + "grad_norm": 7.104742527008057, + "learning_rate": 1.5616500000000002e-05, + "loss": 3.0615, + "step": 68800 + }, + { + "epoch": 1.377, + "grad_norm": 6.897149085998535, + "learning_rate": 1.55915e-05, + "loss": 2.8236, + "step": 68850 + }, + { + "epoch": 1.3780000000000001, + "grad_norm": 6.750307083129883, + "learning_rate": 1.55665e-05, + "loss": 2.8946, + "step": 68900 + }, + { + "epoch": 1.379, + "grad_norm": 9.587626457214355, + "learning_rate": 1.55415e-05, + "loss": 2.8499, + "step": 68950 + }, + { + "epoch": 1.38, + "grad_norm": 9.258056640625, + "learning_rate": 1.55165e-05, + "loss": 2.7918, + "step": 69000 + }, + { + "epoch": 1.381, + "grad_norm": 7.532637596130371, + "learning_rate": 1.54915e-05, + "loss": 2.6911, + "step": 69050 + }, + { + "epoch": 1.3820000000000001, + "grad_norm": 8.102679252624512, + "learning_rate": 1.5466500000000002e-05, + "loss": 3.051, + "step": 69100 + }, + { + "epoch": 1.383, + "grad_norm": 8.062738418579102, + "learning_rate": 1.54415e-05, + "loss": 2.9364, + "step": 69150 + }, + { + "epoch": 1.384, + "grad_norm": 8.074463844299316, + "learning_rate": 1.54165e-05, + "loss": 2.9216, + "step": 69200 + }, + { + "epoch": 1.385, + "grad_norm": 7.463252544403076, + "learning_rate": 1.53915e-05, + "loss": 2.7501, + "step": 69250 + }, + { + "epoch": 1.3860000000000001, + "grad_norm": 7.215155124664307, + "learning_rate": 1.53665e-05, + "loss": 2.6774, + "step": 69300 + }, + { + "epoch": 1.387, + "grad_norm": 8.170955657958984, + "learning_rate": 1.53415e-05, + "loss": 2.7927, + "step": 69350 + }, + { + "epoch": 1.388, + "grad_norm": 8.302571296691895, + "learning_rate": 1.53165e-05, + "loss": 2.7675, + "step": 69400 + }, + { + "epoch": 1.389, + "grad_norm": 6.916486740112305, + "learning_rate": 1.52915e-05, + "loss": 2.9605, + "step": 69450 + }, + { + "epoch": 1.3900000000000001, + "grad_norm": 6.986796855926514, + "learning_rate": 1.52665e-05, + "loss": 2.9328, + "step": 69500 + }, + { + "epoch": 1.391, + "grad_norm": 7.800340175628662, + "learning_rate": 1.52415e-05, + "loss": 2.8924, + "step": 69550 + }, + { + "epoch": 1.392, + "grad_norm": 9.064175605773926, + "learning_rate": 1.52165e-05, + "loss": 2.8534, + "step": 69600 + }, + { + "epoch": 1.393, + "grad_norm": 8.305397033691406, + "learning_rate": 1.51915e-05, + "loss": 2.8954, + "step": 69650 + }, + { + "epoch": 1.3940000000000001, + "grad_norm": 7.492577075958252, + "learning_rate": 1.51665e-05, + "loss": 2.8653, + "step": 69700 + }, + { + "epoch": 1.395, + "grad_norm": 7.92070198059082, + "learning_rate": 1.5142e-05, + "loss": 2.7273, + "step": 69750 + }, + { + "epoch": 1.396, + "grad_norm": 7.920971393585205, + "learning_rate": 1.5117000000000001e-05, + "loss": 2.9048, + "step": 69800 + }, + { + "epoch": 1.397, + "grad_norm": 9.150144577026367, + "learning_rate": 1.5092e-05, + "loss": 2.6779, + "step": 69850 + }, + { + "epoch": 1.3980000000000001, + "grad_norm": 8.129121780395508, + "learning_rate": 1.5067e-05, + "loss": 2.9142, + "step": 69900 + }, + { + "epoch": 1.399, + "grad_norm": 8.334802627563477, + "learning_rate": 1.5042000000000001e-05, + "loss": 2.6763, + "step": 69950 + }, + { + "epoch": 1.4, + "grad_norm": 7.4681806564331055, + "learning_rate": 1.5017e-05, + "loss": 2.93, + "step": 70000 + }, + { + "epoch": 1.401, + "grad_norm": 7.846401214599609, + "learning_rate": 1.4992e-05, + "loss": 2.7229, + "step": 70050 + }, + { + "epoch": 1.4020000000000001, + "grad_norm": 7.523430347442627, + "learning_rate": 1.4967000000000001e-05, + "loss": 2.8843, + "step": 70100 + }, + { + "epoch": 1.403, + "grad_norm": 9.278549194335938, + "learning_rate": 1.4942e-05, + "loss": 2.7884, + "step": 70150 + }, + { + "epoch": 1.404, + "grad_norm": 7.723369121551514, + "learning_rate": 1.4917e-05, + "loss": 2.853, + "step": 70200 + }, + { + "epoch": 1.405, + "grad_norm": 8.88216781616211, + "learning_rate": 1.4892e-05, + "loss": 2.9649, + "step": 70250 + }, + { + "epoch": 1.4060000000000001, + "grad_norm": 6.627724647521973, + "learning_rate": 1.4867e-05, + "loss": 2.7964, + "step": 70300 + }, + { + "epoch": 1.407, + "grad_norm": 8.70402717590332, + "learning_rate": 1.4842e-05, + "loss": 2.7892, + "step": 70350 + }, + { + "epoch": 1.408, + "grad_norm": 7.616790294647217, + "learning_rate": 1.4817e-05, + "loss": 2.9413, + "step": 70400 + }, + { + "epoch": 1.409, + "grad_norm": 8.156023025512695, + "learning_rate": 1.4792e-05, + "loss": 2.9861, + "step": 70450 + }, + { + "epoch": 1.41, + "grad_norm": 8.509369850158691, + "learning_rate": 1.4767000000000001e-05, + "loss": 2.8202, + "step": 70500 + }, + { + "epoch": 1.411, + "grad_norm": 7.710995674133301, + "learning_rate": 1.4742e-05, + "loss": 2.9804, + "step": 70550 + }, + { + "epoch": 1.412, + "grad_norm": 6.933726787567139, + "learning_rate": 1.4717e-05, + "loss": 2.802, + "step": 70600 + }, + { + "epoch": 1.413, + "grad_norm": 8.056614875793457, + "learning_rate": 1.4692000000000001e-05, + "loss": 2.8331, + "step": 70650 + }, + { + "epoch": 1.414, + "grad_norm": 7.54365348815918, + "learning_rate": 1.4667e-05, + "loss": 2.8463, + "step": 70700 + }, + { + "epoch": 1.415, + "grad_norm": 7.553271770477295, + "learning_rate": 1.4642e-05, + "loss": 2.7486, + "step": 70750 + }, + { + "epoch": 1.416, + "grad_norm": 8.390814781188965, + "learning_rate": 1.4617e-05, + "loss": 2.8333, + "step": 70800 + }, + { + "epoch": 1.417, + "grad_norm": 7.930309772491455, + "learning_rate": 1.4592e-05, + "loss": 2.91, + "step": 70850 + }, + { + "epoch": 1.418, + "grad_norm": 8.551098823547363, + "learning_rate": 1.4567e-05, + "loss": 2.8685, + "step": 70900 + }, + { + "epoch": 1.419, + "grad_norm": 8.798742294311523, + "learning_rate": 1.4542e-05, + "loss": 2.8343, + "step": 70950 + }, + { + "epoch": 1.42, + "grad_norm": 8.70396900177002, + "learning_rate": 1.4517e-05, + "loss": 2.9567, + "step": 71000 + }, + { + "epoch": 1.421, + "grad_norm": 9.252113342285156, + "learning_rate": 1.4492e-05, + "loss": 2.9396, + "step": 71050 + }, + { + "epoch": 1.422, + "grad_norm": 8.939455032348633, + "learning_rate": 1.4467e-05, + "loss": 2.5287, + "step": 71100 + }, + { + "epoch": 1.423, + "grad_norm": 9.436803817749023, + "learning_rate": 1.4442e-05, + "loss": 2.8976, + "step": 71150 + }, + { + "epoch": 1.424, + "grad_norm": 8.383010864257812, + "learning_rate": 1.4417e-05, + "loss": 2.7318, + "step": 71200 + }, + { + "epoch": 1.425, + "grad_norm": 8.16480541229248, + "learning_rate": 1.4392e-05, + "loss": 2.7685, + "step": 71250 + }, + { + "epoch": 1.426, + "grad_norm": 9.6614351272583, + "learning_rate": 1.4367e-05, + "loss": 2.7841, + "step": 71300 + }, + { + "epoch": 1.427, + "grad_norm": 7.361180305480957, + "learning_rate": 1.4342e-05, + "loss": 2.8441, + "step": 71350 + }, + { + "epoch": 1.428, + "grad_norm": 10.832164764404297, + "learning_rate": 1.4317e-05, + "loss": 2.875, + "step": 71400 + }, + { + "epoch": 1.429, + "grad_norm": 6.530023097991943, + "learning_rate": 1.4292e-05, + "loss": 2.8486, + "step": 71450 + }, + { + "epoch": 1.43, + "grad_norm": 8.04982852935791, + "learning_rate": 1.4266999999999999e-05, + "loss": 3.0097, + "step": 71500 + }, + { + "epoch": 1.431, + "grad_norm": 7.762668132781982, + "learning_rate": 1.4242e-05, + "loss": 2.8063, + "step": 71550 + }, + { + "epoch": 1.432, + "grad_norm": 6.672072887420654, + "learning_rate": 1.4217e-05, + "loss": 2.752, + "step": 71600 + }, + { + "epoch": 1.433, + "grad_norm": 7.793580532073975, + "learning_rate": 1.4191999999999999e-05, + "loss": 2.7347, + "step": 71650 + }, + { + "epoch": 1.434, + "grad_norm": 7.609199523925781, + "learning_rate": 1.4167e-05, + "loss": 2.904, + "step": 71700 + }, + { + "epoch": 1.435, + "grad_norm": 8.817373275756836, + "learning_rate": 1.4142e-05, + "loss": 2.7914, + "step": 71750 + }, + { + "epoch": 1.436, + "grad_norm": 9.75827693939209, + "learning_rate": 1.4116999999999999e-05, + "loss": 2.8292, + "step": 71800 + }, + { + "epoch": 1.437, + "grad_norm": 6.9642863273620605, + "learning_rate": 1.4092e-05, + "loss": 2.6669, + "step": 71850 + }, + { + "epoch": 1.438, + "grad_norm": 9.276019096374512, + "learning_rate": 1.4067e-05, + "loss": 2.984, + "step": 71900 + }, + { + "epoch": 1.439, + "grad_norm": 7.211697578430176, + "learning_rate": 1.4042e-05, + "loss": 2.9085, + "step": 71950 + }, + { + "epoch": 1.44, + "grad_norm": 7.778343200683594, + "learning_rate": 1.4017e-05, + "loss": 2.897, + "step": 72000 + }, + { + "epoch": 1.441, + "grad_norm": 8.795747756958008, + "learning_rate": 1.3992e-05, + "loss": 2.7523, + "step": 72050 + }, + { + "epoch": 1.442, + "grad_norm": 7.971023082733154, + "learning_rate": 1.3967e-05, + "loss": 2.895, + "step": 72100 + }, + { + "epoch": 1.443, + "grad_norm": 8.681403160095215, + "learning_rate": 1.3942e-05, + "loss": 3.0107, + "step": 72150 + }, + { + "epoch": 1.444, + "grad_norm": 7.388852596282959, + "learning_rate": 1.3916999999999999e-05, + "loss": 2.7337, + "step": 72200 + }, + { + "epoch": 1.445, + "grad_norm": 6.82358980178833, + "learning_rate": 1.3892e-05, + "loss": 2.7527, + "step": 72250 + }, + { + "epoch": 1.446, + "grad_norm": 7.868986129760742, + "learning_rate": 1.3867e-05, + "loss": 2.8683, + "step": 72300 + }, + { + "epoch": 1.447, + "grad_norm": 7.227389812469482, + "learning_rate": 1.3841999999999999e-05, + "loss": 2.8543, + "step": 72350 + }, + { + "epoch": 1.448, + "grad_norm": 6.622435569763184, + "learning_rate": 1.3817e-05, + "loss": 2.8388, + "step": 72400 + }, + { + "epoch": 1.449, + "grad_norm": 6.9106903076171875, + "learning_rate": 1.3792e-05, + "loss": 2.7931, + "step": 72450 + }, + { + "epoch": 1.45, + "grad_norm": 7.011186599731445, + "learning_rate": 1.3766999999999999e-05, + "loss": 2.9716, + "step": 72500 + }, + { + "epoch": 1.451, + "grad_norm": 8.601675987243652, + "learning_rate": 1.3742e-05, + "loss": 2.8998, + "step": 72550 + }, + { + "epoch": 1.452, + "grad_norm": 6.609960079193115, + "learning_rate": 1.3717e-05, + "loss": 2.5795, + "step": 72600 + }, + { + "epoch": 1.453, + "grad_norm": 7.050849914550781, + "learning_rate": 1.3691999999999999e-05, + "loss": 2.8272, + "step": 72650 + }, + { + "epoch": 1.454, + "grad_norm": 9.343693733215332, + "learning_rate": 1.3667000000000001e-05, + "loss": 2.6923, + "step": 72700 + }, + { + "epoch": 1.455, + "grad_norm": 7.410576343536377, + "learning_rate": 1.3642000000000003e-05, + "loss": 2.7491, + "step": 72750 + }, + { + "epoch": 1.456, + "grad_norm": 7.9082159996032715, + "learning_rate": 1.3617000000000002e-05, + "loss": 3.0432, + "step": 72800 + }, + { + "epoch": 1.457, + "grad_norm": 10.245935440063477, + "learning_rate": 1.3592000000000001e-05, + "loss": 2.7326, + "step": 72850 + }, + { + "epoch": 1.458, + "grad_norm": 7.3419318199157715, + "learning_rate": 1.3567000000000002e-05, + "loss": 2.7541, + "step": 72900 + }, + { + "epoch": 1.459, + "grad_norm": 7.791906833648682, + "learning_rate": 1.3542000000000002e-05, + "loss": 2.8323, + "step": 72950 + }, + { + "epoch": 1.46, + "grad_norm": 8.035292625427246, + "learning_rate": 1.3517000000000001e-05, + "loss": 2.5501, + "step": 73000 + }, + { + "epoch": 1.461, + "grad_norm": 8.372005462646484, + "learning_rate": 1.3492000000000002e-05, + "loss": 2.7617, + "step": 73050 + }, + { + "epoch": 1.462, + "grad_norm": 8.679768562316895, + "learning_rate": 1.3467000000000002e-05, + "loss": 2.7125, + "step": 73100 + }, + { + "epoch": 1.463, + "grad_norm": 15.464470863342285, + "learning_rate": 1.3442000000000001e-05, + "loss": 2.9627, + "step": 73150 + }, + { + "epoch": 1.464, + "grad_norm": 7.9514007568359375, + "learning_rate": 1.3417000000000002e-05, + "loss": 3.045, + "step": 73200 + }, + { + "epoch": 1.465, + "grad_norm": 6.683189868927002, + "learning_rate": 1.3392000000000002e-05, + "loss": 2.7564, + "step": 73250 + }, + { + "epoch": 1.466, + "grad_norm": 11.89483642578125, + "learning_rate": 1.3367000000000001e-05, + "loss": 2.9175, + "step": 73300 + }, + { + "epoch": 1.467, + "grad_norm": 8.024170875549316, + "learning_rate": 1.3342000000000002e-05, + "loss": 2.6941, + "step": 73350 + }, + { + "epoch": 1.468, + "grad_norm": 8.924114227294922, + "learning_rate": 1.3317000000000001e-05, + "loss": 2.8156, + "step": 73400 + }, + { + "epoch": 1.4689999999999999, + "grad_norm": 8.561711311340332, + "learning_rate": 1.3292000000000003e-05, + "loss": 2.8866, + "step": 73450 + }, + { + "epoch": 1.47, + "grad_norm": 7.818349838256836, + "learning_rate": 1.3267000000000002e-05, + "loss": 2.7045, + "step": 73500 + }, + { + "epoch": 1.471, + "grad_norm": 9.295713424682617, + "learning_rate": 1.3242000000000001e-05, + "loss": 2.8804, + "step": 73550 + }, + { + "epoch": 1.472, + "grad_norm": 6.626889705657959, + "learning_rate": 1.3217000000000002e-05, + "loss": 2.8654, + "step": 73600 + }, + { + "epoch": 1.4729999999999999, + "grad_norm": 10.385565757751465, + "learning_rate": 1.3192000000000002e-05, + "loss": 2.8314, + "step": 73650 + }, + { + "epoch": 1.474, + "grad_norm": 8.731403350830078, + "learning_rate": 1.3167000000000001e-05, + "loss": 2.8624, + "step": 73700 + }, + { + "epoch": 1.475, + "grad_norm": 7.520509719848633, + "learning_rate": 1.31425e-05, + "loss": 2.8683, + "step": 73750 + }, + { + "epoch": 1.476, + "grad_norm": 8.531784057617188, + "learning_rate": 1.3118e-05, + "loss": 2.812, + "step": 73800 + }, + { + "epoch": 1.4769999999999999, + "grad_norm": 7.280162334442139, + "learning_rate": 1.3093e-05, + "loss": 2.756, + "step": 73850 + }, + { + "epoch": 1.478, + "grad_norm": 8.175580024719238, + "learning_rate": 1.3068e-05, + "loss": 2.7574, + "step": 73900 + }, + { + "epoch": 1.479, + "grad_norm": 7.7262091636657715, + "learning_rate": 1.3043e-05, + "loss": 2.6983, + "step": 73950 + }, + { + "epoch": 1.48, + "grad_norm": 7.174941062927246, + "learning_rate": 1.3018e-05, + "loss": 2.8821, + "step": 74000 + }, + { + "epoch": 1.4809999999999999, + "grad_norm": 9.21951961517334, + "learning_rate": 1.2992999999999999e-05, + "loss": 2.8512, + "step": 74050 + }, + { + "epoch": 1.482, + "grad_norm": 13.13577651977539, + "learning_rate": 1.2968e-05, + "loss": 2.7765, + "step": 74100 + }, + { + "epoch": 1.483, + "grad_norm": 7.468155384063721, + "learning_rate": 1.2943e-05, + "loss": 2.8334, + "step": 74150 + }, + { + "epoch": 1.484, + "grad_norm": 7.371670722961426, + "learning_rate": 1.2917999999999999e-05, + "loss": 2.5598, + "step": 74200 + }, + { + "epoch": 1.4849999999999999, + "grad_norm": 7.9383745193481445, + "learning_rate": 1.2893e-05, + "loss": 2.8735, + "step": 74250 + }, + { + "epoch": 1.486, + "grad_norm": 7.459591865539551, + "learning_rate": 1.2868e-05, + "loss": 2.8774, + "step": 74300 + }, + { + "epoch": 1.487, + "grad_norm": 9.01418685913086, + "learning_rate": 1.2842999999999999e-05, + "loss": 2.9084, + "step": 74350 + }, + { + "epoch": 1.488, + "grad_norm": 8.918654441833496, + "learning_rate": 1.2818e-05, + "loss": 2.7615, + "step": 74400 + }, + { + "epoch": 1.4889999999999999, + "grad_norm": 8.913796424865723, + "learning_rate": 1.2793e-05, + "loss": 2.5816, + "step": 74450 + }, + { + "epoch": 1.49, + "grad_norm": 7.444356918334961, + "learning_rate": 1.2767999999999999e-05, + "loss": 2.7854, + "step": 74500 + }, + { + "epoch": 1.491, + "grad_norm": 11.2793550491333, + "learning_rate": 1.2743e-05, + "loss": 2.7344, + "step": 74550 + }, + { + "epoch": 1.492, + "grad_norm": 6.877553939819336, + "learning_rate": 1.2718e-05, + "loss": 2.81, + "step": 74600 + }, + { + "epoch": 1.4929999999999999, + "grad_norm": 8.273252487182617, + "learning_rate": 1.2692999999999999e-05, + "loss": 2.7148, + "step": 74650 + }, + { + "epoch": 1.494, + "grad_norm": 6.85491418838501, + "learning_rate": 1.2668e-05, + "loss": 2.848, + "step": 74700 + }, + { + "epoch": 1.495, + "grad_norm": 7.391516208648682, + "learning_rate": 1.2642999999999999e-05, + "loss": 2.9437, + "step": 74750 + }, + { + "epoch": 1.496, + "grad_norm": 8.175915718078613, + "learning_rate": 1.2617999999999998e-05, + "loss": 2.5342, + "step": 74800 + }, + { + "epoch": 1.4969999999999999, + "grad_norm": 7.925018310546875, + "learning_rate": 1.2593e-05, + "loss": 2.8257, + "step": 74850 + }, + { + "epoch": 1.498, + "grad_norm": 7.637334823608398, + "learning_rate": 1.2567999999999999e-05, + "loss": 2.7577, + "step": 74900 + }, + { + "epoch": 1.499, + "grad_norm": 6.907679557800293, + "learning_rate": 1.2543000000000002e-05, + "loss": 2.6929, + "step": 74950 + }, + { + "epoch": 1.5, + "grad_norm": 12.479944229125977, + "learning_rate": 1.2518000000000001e-05, + "loss": 2.6092, + "step": 75000 + }, + { + "epoch": 1.501, + "grad_norm": 8.500955581665039, + "learning_rate": 1.2493e-05, + "loss": 2.9479, + "step": 75050 + }, + { + "epoch": 1.502, + "grad_norm": 8.31230640411377, + "learning_rate": 1.2468e-05, + "loss": 2.7979, + "step": 75100 + }, + { + "epoch": 1.5030000000000001, + "grad_norm": 7.513004779815674, + "learning_rate": 1.2443000000000001e-05, + "loss": 2.8403, + "step": 75150 + }, + { + "epoch": 1.504, + "grad_norm": 9.051568984985352, + "learning_rate": 1.2418e-05, + "loss": 2.5605, + "step": 75200 + }, + { + "epoch": 1.505, + "grad_norm": 9.6289644241333, + "learning_rate": 1.2393e-05, + "loss": 2.91, + "step": 75250 + }, + { + "epoch": 1.506, + "grad_norm": 7.6557440757751465, + "learning_rate": 1.2368e-05, + "loss": 2.7622, + "step": 75300 + }, + { + "epoch": 1.5070000000000001, + "grad_norm": 9.397058486938477, + "learning_rate": 1.2343e-05, + "loss": 2.5693, + "step": 75350 + }, + { + "epoch": 1.508, + "grad_norm": 7.9182634353637695, + "learning_rate": 1.2318e-05, + "loss": 2.7578, + "step": 75400 + }, + { + "epoch": 1.509, + "grad_norm": 8.215141296386719, + "learning_rate": 1.2293e-05, + "loss": 2.8933, + "step": 75450 + }, + { + "epoch": 1.51, + "grad_norm": 6.805432319641113, + "learning_rate": 1.2268e-05, + "loss": 2.8146, + "step": 75500 + }, + { + "epoch": 1.5110000000000001, + "grad_norm": 8.983750343322754, + "learning_rate": 1.2243e-05, + "loss": 2.8299, + "step": 75550 + }, + { + "epoch": 1.512, + "grad_norm": 8.060677528381348, + "learning_rate": 1.2218e-05, + "loss": 2.8027, + "step": 75600 + }, + { + "epoch": 1.513, + "grad_norm": 7.529645919799805, + "learning_rate": 1.2193e-05, + "loss": 2.752, + "step": 75650 + }, + { + "epoch": 1.514, + "grad_norm": 8.330324172973633, + "learning_rate": 1.2168000000000001e-05, + "loss": 2.8057, + "step": 75700 + }, + { + "epoch": 1.5150000000000001, + "grad_norm": 8.606231689453125, + "learning_rate": 1.2143e-05, + "loss": 2.881, + "step": 75750 + }, + { + "epoch": 1.516, + "grad_norm": 7.919892311096191, + "learning_rate": 1.2118e-05, + "loss": 2.7935, + "step": 75800 + }, + { + "epoch": 1.517, + "grad_norm": 7.2072954177856445, + "learning_rate": 1.2093000000000001e-05, + "loss": 2.6488, + "step": 75850 + }, + { + "epoch": 1.518, + "grad_norm": 7.602199077606201, + "learning_rate": 1.2068e-05, + "loss": 2.7267, + "step": 75900 + }, + { + "epoch": 1.5190000000000001, + "grad_norm": 7.478748321533203, + "learning_rate": 1.2043e-05, + "loss": 2.8507, + "step": 75950 + }, + { + "epoch": 1.52, + "grad_norm": 8.189718246459961, + "learning_rate": 1.2018e-05, + "loss": 2.797, + "step": 76000 + }, + { + "epoch": 1.521, + "grad_norm": 9.281253814697266, + "learning_rate": 1.1993e-05, + "loss": 2.7886, + "step": 76050 + }, + { + "epoch": 1.522, + "grad_norm": 7.495121002197266, + "learning_rate": 1.1968e-05, + "loss": 2.8119, + "step": 76100 + }, + { + "epoch": 1.5230000000000001, + "grad_norm": 7.031370162963867, + "learning_rate": 1.1943e-05, + "loss": 2.9879, + "step": 76150 + }, + { + "epoch": 1.524, + "grad_norm": 9.18350601196289, + "learning_rate": 1.1918e-05, + "loss": 2.8631, + "step": 76200 + }, + { + "epoch": 1.525, + "grad_norm": 7.201645851135254, + "learning_rate": 1.1893e-05, + "loss": 2.5649, + "step": 76250 + }, + { + "epoch": 1.526, + "grad_norm": 9.630722045898438, + "learning_rate": 1.1868e-05, + "loss": 2.9442, + "step": 76300 + }, + { + "epoch": 1.5270000000000001, + "grad_norm": 8.107288360595703, + "learning_rate": 1.1843e-05, + "loss": 2.7623, + "step": 76350 + }, + { + "epoch": 1.528, + "grad_norm": 7.947506427764893, + "learning_rate": 1.1818e-05, + "loss": 2.5865, + "step": 76400 + }, + { + "epoch": 1.529, + "grad_norm": 10.158778190612793, + "learning_rate": 1.1793e-05, + "loss": 2.7721, + "step": 76450 + }, + { + "epoch": 1.53, + "grad_norm": 11.925228118896484, + "learning_rate": 1.1768000000000002e-05, + "loss": 2.8884, + "step": 76500 + }, + { + "epoch": 1.5310000000000001, + "grad_norm": 7.677661418914795, + "learning_rate": 1.1743000000000001e-05, + "loss": 2.5829, + "step": 76550 + }, + { + "epoch": 1.532, + "grad_norm": 9.155203819274902, + "learning_rate": 1.1718000000000002e-05, + "loss": 2.8175, + "step": 76600 + }, + { + "epoch": 1.533, + "grad_norm": 8.825461387634277, + "learning_rate": 1.1693000000000001e-05, + "loss": 2.8015, + "step": 76650 + }, + { + "epoch": 1.534, + "grad_norm": 7.908650875091553, + "learning_rate": 1.1668e-05, + "loss": 2.8491, + "step": 76700 + }, + { + "epoch": 1.5350000000000001, + "grad_norm": 8.90393352508545, + "learning_rate": 1.1643000000000002e-05, + "loss": 2.6554, + "step": 76750 + }, + { + "epoch": 1.536, + "grad_norm": 12.58026123046875, + "learning_rate": 1.1618000000000001e-05, + "loss": 2.9377, + "step": 76800 + }, + { + "epoch": 1.537, + "grad_norm": 7.935201644897461, + "learning_rate": 1.1593e-05, + "loss": 2.9783, + "step": 76850 + }, + { + "epoch": 1.538, + "grad_norm": 14.243797302246094, + "learning_rate": 1.1568000000000002e-05, + "loss": 2.7387, + "step": 76900 + }, + { + "epoch": 1.5390000000000001, + "grad_norm": 7.501498699188232, + "learning_rate": 1.1543000000000001e-05, + "loss": 2.8034, + "step": 76950 + }, + { + "epoch": 1.54, + "grad_norm": 9.914602279663086, + "learning_rate": 1.1518e-05, + "loss": 2.9279, + "step": 77000 + }, + { + "epoch": 1.541, + "grad_norm": 8.507695198059082, + "learning_rate": 1.1493000000000002e-05, + "loss": 2.9641, + "step": 77050 + }, + { + "epoch": 1.542, + "grad_norm": 7.860219955444336, + "learning_rate": 1.1468000000000001e-05, + "loss": 2.8456, + "step": 77100 + }, + { + "epoch": 1.5430000000000001, + "grad_norm": 8.061058044433594, + "learning_rate": 1.1443e-05, + "loss": 2.6333, + "step": 77150 + }, + { + "epoch": 1.544, + "grad_norm": 9.513175964355469, + "learning_rate": 1.1418000000000001e-05, + "loss": 2.6512, + "step": 77200 + }, + { + "epoch": 1.545, + "grad_norm": 7.933878421783447, + "learning_rate": 1.1393000000000001e-05, + "loss": 2.8595, + "step": 77250 + }, + { + "epoch": 1.546, + "grad_norm": 9.191105842590332, + "learning_rate": 1.1368e-05, + "loss": 2.9499, + "step": 77300 + }, + { + "epoch": 1.5470000000000002, + "grad_norm": 8.587617874145508, + "learning_rate": 1.13435e-05, + "loss": 2.7946, + "step": 77350 + }, + { + "epoch": 1.548, + "grad_norm": 8.40201187133789, + "learning_rate": 1.13185e-05, + "loss": 2.935, + "step": 77400 + }, + { + "epoch": 1.549, + "grad_norm": 9.933843612670898, + "learning_rate": 1.12935e-05, + "loss": 2.53, + "step": 77450 + }, + { + "epoch": 1.55, + "grad_norm": 8.645332336425781, + "learning_rate": 1.12685e-05, + "loss": 2.7493, + "step": 77500 + }, + { + "epoch": 1.5510000000000002, + "grad_norm": 8.894145011901855, + "learning_rate": 1.12435e-05, + "loss": 2.6933, + "step": 77550 + }, + { + "epoch": 1.552, + "grad_norm": 7.8815460205078125, + "learning_rate": 1.1218500000000001e-05, + "loss": 2.9458, + "step": 77600 + }, + { + "epoch": 1.553, + "grad_norm": 7.892411231994629, + "learning_rate": 1.11935e-05, + "loss": 2.6212, + "step": 77650 + }, + { + "epoch": 1.554, + "grad_norm": 9.983649253845215, + "learning_rate": 1.1168500000000002e-05, + "loss": 2.7327, + "step": 77700 + }, + { + "epoch": 1.5550000000000002, + "grad_norm": 8.785431861877441, + "learning_rate": 1.1143500000000001e-05, + "loss": 2.8999, + "step": 77750 + }, + { + "epoch": 1.556, + "grad_norm": 7.402805805206299, + "learning_rate": 1.11185e-05, + "loss": 3.0402, + "step": 77800 + }, + { + "epoch": 1.557, + "grad_norm": 8.64169692993164, + "learning_rate": 1.1093500000000001e-05, + "loss": 2.8152, + "step": 77850 + }, + { + "epoch": 1.558, + "grad_norm": 6.422133445739746, + "learning_rate": 1.10685e-05, + "loss": 2.7819, + "step": 77900 + }, + { + "epoch": 1.5590000000000002, + "grad_norm": 8.386688232421875, + "learning_rate": 1.10435e-05, + "loss": 2.8271, + "step": 77950 + }, + { + "epoch": 1.56, + "grad_norm": 9.966951370239258, + "learning_rate": 1.1018500000000001e-05, + "loss": 2.8392, + "step": 78000 + }, + { + "epoch": 1.561, + "grad_norm": 8.008108139038086, + "learning_rate": 1.09935e-05, + "loss": 2.6764, + "step": 78050 + }, + { + "epoch": 1.562, + "grad_norm": 6.586634159088135, + "learning_rate": 1.0968500000000002e-05, + "loss": 2.6548, + "step": 78100 + }, + { + "epoch": 1.563, + "grad_norm": 6.59722375869751, + "learning_rate": 1.0943500000000001e-05, + "loss": 2.9959, + "step": 78150 + }, + { + "epoch": 1.564, + "grad_norm": 8.587728500366211, + "learning_rate": 1.09185e-05, + "loss": 2.8217, + "step": 78200 + }, + { + "epoch": 1.565, + "grad_norm": 8.256272315979004, + "learning_rate": 1.0893500000000002e-05, + "loss": 2.8172, + "step": 78250 + }, + { + "epoch": 1.5659999999999998, + "grad_norm": 9.824472427368164, + "learning_rate": 1.0868500000000001e-05, + "loss": 2.7618, + "step": 78300 + }, + { + "epoch": 1.567, + "grad_norm": 6.0259785652160645, + "learning_rate": 1.08435e-05, + "loss": 2.776, + "step": 78350 + }, + { + "epoch": 1.568, + "grad_norm": 8.524801254272461, + "learning_rate": 1.0818500000000002e-05, + "loss": 2.8886, + "step": 78400 + }, + { + "epoch": 1.569, + "grad_norm": 7.844742298126221, + "learning_rate": 1.0793500000000001e-05, + "loss": 2.6163, + "step": 78450 + }, + { + "epoch": 1.5699999999999998, + "grad_norm": 8.0889253616333, + "learning_rate": 1.07685e-05, + "loss": 2.6713, + "step": 78500 + }, + { + "epoch": 1.571, + "grad_norm": 8.610453605651855, + "learning_rate": 1.0743500000000001e-05, + "loss": 2.6546, + "step": 78550 + }, + { + "epoch": 1.572, + "grad_norm": 10.174454689025879, + "learning_rate": 1.07185e-05, + "loss": 2.8693, + "step": 78600 + }, + { + "epoch": 1.573, + "grad_norm": 9.715279579162598, + "learning_rate": 1.06935e-05, + "loss": 2.8965, + "step": 78650 + }, + { + "epoch": 1.5739999999999998, + "grad_norm": 8.34848403930664, + "learning_rate": 1.0668500000000001e-05, + "loss": 2.7681, + "step": 78700 + }, + { + "epoch": 1.575, + "grad_norm": 10.231193542480469, + "learning_rate": 1.06435e-05, + "loss": 3.1195, + "step": 78750 + }, + { + "epoch": 1.576, + "grad_norm": 7.847792625427246, + "learning_rate": 1.06185e-05, + "loss": 2.9861, + "step": 78800 + }, + { + "epoch": 1.577, + "grad_norm": 12.434806823730469, + "learning_rate": 1.0593500000000001e-05, + "loss": 2.7446, + "step": 78850 + }, + { + "epoch": 1.5779999999999998, + "grad_norm": 7.213706970214844, + "learning_rate": 1.05685e-05, + "loss": 2.8863, + "step": 78900 + }, + { + "epoch": 1.579, + "grad_norm": 13.66837215423584, + "learning_rate": 1.05435e-05, + "loss": 2.8019, + "step": 78950 + }, + { + "epoch": 1.58, + "grad_norm": 12.315549850463867, + "learning_rate": 1.0518500000000001e-05, + "loss": 2.6926, + "step": 79000 + }, + { + "epoch": 1.581, + "grad_norm": 10.095964431762695, + "learning_rate": 1.04935e-05, + "loss": 2.6185, + "step": 79050 + }, + { + "epoch": 1.5819999999999999, + "grad_norm": 7.9094648361206055, + "learning_rate": 1.04685e-05, + "loss": 2.9309, + "step": 79100 + }, + { + "epoch": 1.583, + "grad_norm": 9.682660102844238, + "learning_rate": 1.0443500000000001e-05, + "loss": 2.7369, + "step": 79150 + }, + { + "epoch": 1.584, + "grad_norm": 8.518179893493652, + "learning_rate": 1.04185e-05, + "loss": 2.6872, + "step": 79200 + }, + { + "epoch": 1.585, + "grad_norm": 8.616283416748047, + "learning_rate": 1.03935e-05, + "loss": 2.9505, + "step": 79250 + }, + { + "epoch": 1.5859999999999999, + "grad_norm": 8.388751983642578, + "learning_rate": 1.03685e-05, + "loss": 3.0227, + "step": 79300 + }, + { + "epoch": 1.587, + "grad_norm": 7.643990993499756, + "learning_rate": 1.03435e-05, + "loss": 2.684, + "step": 79350 + }, + { + "epoch": 1.588, + "grad_norm": 7.016181945800781, + "learning_rate": 1.03185e-05, + "loss": 3.0096, + "step": 79400 + }, + { + "epoch": 1.589, + "grad_norm": 7.859565258026123, + "learning_rate": 1.02935e-05, + "loss": 2.8234, + "step": 79450 + }, + { + "epoch": 1.5899999999999999, + "grad_norm": 8.977822303771973, + "learning_rate": 1.02685e-05, + "loss": 2.8249, + "step": 79500 + }, + { + "epoch": 1.591, + "grad_norm": 9.124774932861328, + "learning_rate": 1.0243500000000001e-05, + "loss": 2.786, + "step": 79550 + }, + { + "epoch": 1.592, + "grad_norm": 7.759701728820801, + "learning_rate": 1.02185e-05, + "loss": 2.6182, + "step": 79600 + }, + { + "epoch": 1.593, + "grad_norm": 9.396001815795898, + "learning_rate": 1.01935e-05, + "loss": 2.7735, + "step": 79650 + }, + { + "epoch": 1.5939999999999999, + "grad_norm": 7.322061538696289, + "learning_rate": 1.0168500000000001e-05, + "loss": 2.749, + "step": 79700 + }, + { + "epoch": 1.595, + "grad_norm": 10.232261657714844, + "learning_rate": 1.01435e-05, + "loss": 2.5499, + "step": 79750 + }, + { + "epoch": 1.596, + "grad_norm": 7.829950332641602, + "learning_rate": 1.01185e-05, + "loss": 2.6453, + "step": 79800 + }, + { + "epoch": 1.597, + "grad_norm": 7.685505390167236, + "learning_rate": 1.0093500000000001e-05, + "loss": 2.7082, + "step": 79850 + }, + { + "epoch": 1.5979999999999999, + "grad_norm": 7.936647415161133, + "learning_rate": 1.00685e-05, + "loss": 2.734, + "step": 79900 + }, + { + "epoch": 1.599, + "grad_norm": 8.177151679992676, + "learning_rate": 1.00435e-05, + "loss": 2.8945, + "step": 79950 + }, + { + "epoch": 1.6, + "grad_norm": 7.7781782150268555, + "learning_rate": 1.00185e-05, + "loss": 2.8666, + "step": 80000 + }, + { + "epoch": 1.601, + "grad_norm": 8.397016525268555, + "learning_rate": 9.9935e-06, + "loss": 2.7994, + "step": 80050 + }, + { + "epoch": 1.6019999999999999, + "grad_norm": 8.55648422241211, + "learning_rate": 9.9685e-06, + "loss": 2.8561, + "step": 80100 + }, + { + "epoch": 1.603, + "grad_norm": 10.180916786193848, + "learning_rate": 9.9435e-06, + "loss": 2.7149, + "step": 80150 + }, + { + "epoch": 1.604, + "grad_norm": 7.665710926055908, + "learning_rate": 9.9185e-06, + "loss": 2.7395, + "step": 80200 + }, + { + "epoch": 1.605, + "grad_norm": 14.644109725952148, + "learning_rate": 9.8935e-06, + "loss": 2.7051, + "step": 80250 + }, + { + "epoch": 1.6059999999999999, + "grad_norm": 9.358149528503418, + "learning_rate": 9.8685e-06, + "loss": 2.9969, + "step": 80300 + }, + { + "epoch": 1.607, + "grad_norm": 9.007706642150879, + "learning_rate": 9.8435e-06, + "loss": 2.7834, + "step": 80350 + }, + { + "epoch": 1.608, + "grad_norm": 7.31819486618042, + "learning_rate": 9.8185e-06, + "loss": 2.8133, + "step": 80400 + }, + { + "epoch": 1.609, + "grad_norm": 10.879515647888184, + "learning_rate": 9.7935e-06, + "loss": 2.6609, + "step": 80450 + }, + { + "epoch": 1.6099999999999999, + "grad_norm": 7.164587020874023, + "learning_rate": 9.7685e-06, + "loss": 2.7477, + "step": 80500 + }, + { + "epoch": 1.611, + "grad_norm": 7.594228744506836, + "learning_rate": 9.743499999999999e-06, + "loss": 2.6883, + "step": 80550 + }, + { + "epoch": 1.612, + "grad_norm": 8.859107971191406, + "learning_rate": 9.7185e-06, + "loss": 2.7275, + "step": 80600 + }, + { + "epoch": 1.613, + "grad_norm": 8.09995174407959, + "learning_rate": 9.6935e-06, + "loss": 2.6976, + "step": 80650 + }, + { + "epoch": 1.6139999999999999, + "grad_norm": 8.347760200500488, + "learning_rate": 9.668499999999999e-06, + "loss": 2.7801, + "step": 80700 + }, + { + "epoch": 1.615, + "grad_norm": 9.736233711242676, + "learning_rate": 9.643500000000002e-06, + "loss": 2.812, + "step": 80750 + }, + { + "epoch": 1.616, + "grad_norm": 8.58044719696045, + "learning_rate": 9.618500000000001e-06, + "loss": 2.8104, + "step": 80800 + }, + { + "epoch": 1.617, + "grad_norm": 8.394573211669922, + "learning_rate": 9.5935e-06, + "loss": 2.9928, + "step": 80850 + }, + { + "epoch": 1.6179999999999999, + "grad_norm": 8.22284984588623, + "learning_rate": 9.568500000000002e-06, + "loss": 2.7226, + "step": 80900 + }, + { + "epoch": 1.619, + "grad_norm": 8.150736808776855, + "learning_rate": 9.543500000000001e-06, + "loss": 2.6672, + "step": 80950 + }, + { + "epoch": 1.62, + "grad_norm": 7.5970635414123535, + "learning_rate": 9.5185e-06, + "loss": 2.8622, + "step": 81000 + }, + { + "epoch": 1.621, + "grad_norm": 10.968223571777344, + "learning_rate": 9.493500000000002e-06, + "loss": 2.6366, + "step": 81050 + }, + { + "epoch": 1.6219999999999999, + "grad_norm": 6.821518898010254, + "learning_rate": 9.468500000000001e-06, + "loss": 2.8406, + "step": 81100 + }, + { + "epoch": 1.623, + "grad_norm": 7.469738483428955, + "learning_rate": 9.4435e-06, + "loss": 2.8246, + "step": 81150 + }, + { + "epoch": 1.624, + "grad_norm": 8.006429672241211, + "learning_rate": 9.418500000000001e-06, + "loss": 2.8267, + "step": 81200 + }, + { + "epoch": 1.625, + "grad_norm": 8.809013366699219, + "learning_rate": 9.3935e-06, + "loss": 2.7887, + "step": 81250 + }, + { + "epoch": 1.626, + "grad_norm": 12.106205940246582, + "learning_rate": 9.3685e-06, + "loss": 2.6218, + "step": 81300 + }, + { + "epoch": 1.627, + "grad_norm": 10.05888843536377, + "learning_rate": 9.344e-06, + "loss": 2.8547, + "step": 81350 + }, + { + "epoch": 1.6280000000000001, + "grad_norm": 8.236129760742188, + "learning_rate": 9.319e-06, + "loss": 2.872, + "step": 81400 + }, + { + "epoch": 1.629, + "grad_norm": 8.157784461975098, + "learning_rate": 9.294e-06, + "loss": 2.701, + "step": 81450 + }, + { + "epoch": 1.63, + "grad_norm": 6.6969218254089355, + "learning_rate": 9.269e-06, + "loss": 2.9475, + "step": 81500 + }, + { + "epoch": 1.631, + "grad_norm": 8.2749662399292, + "learning_rate": 9.244e-06, + "loss": 2.7482, + "step": 81550 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 9.816543579101562, + "learning_rate": 9.219e-06, + "loss": 2.8068, + "step": 81600 + }, + { + "epoch": 1.633, + "grad_norm": 8.033219337463379, + "learning_rate": 9.194e-06, + "loss": 2.9093, + "step": 81650 + }, + { + "epoch": 1.634, + "grad_norm": 8.458338737487793, + "learning_rate": 9.169e-06, + "loss": 2.8396, + "step": 81700 + }, + { + "epoch": 1.635, + "grad_norm": 8.59599781036377, + "learning_rate": 9.144e-06, + "loss": 2.9586, + "step": 81750 + }, + { + "epoch": 1.6360000000000001, + "grad_norm": 7.180260181427002, + "learning_rate": 9.119e-06, + "loss": 2.599, + "step": 81800 + }, + { + "epoch": 1.637, + "grad_norm": 6.488671779632568, + "learning_rate": 9.094e-06, + "loss": 2.8477, + "step": 81850 + }, + { + "epoch": 1.638, + "grad_norm": 8.595635414123535, + "learning_rate": 9.069e-06, + "loss": 2.8529, + "step": 81900 + }, + { + "epoch": 1.639, + "grad_norm": 7.423949718475342, + "learning_rate": 9.044000000000002e-06, + "loss": 2.698, + "step": 81950 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 8.720772743225098, + "learning_rate": 9.019000000000001e-06, + "loss": 2.8964, + "step": 82000 + }, + { + "epoch": 1.641, + "grad_norm": 11.61228084564209, + "learning_rate": 8.994e-06, + "loss": 2.9746, + "step": 82050 + }, + { + "epoch": 1.642, + "grad_norm": 6.805294990539551, + "learning_rate": 8.969000000000002e-06, + "loss": 2.7932, + "step": 82100 + }, + { + "epoch": 1.643, + "grad_norm": 9.611919403076172, + "learning_rate": 8.944000000000001e-06, + "loss": 2.8123, + "step": 82150 + }, + { + "epoch": 1.6440000000000001, + "grad_norm": 9.956771850585938, + "learning_rate": 8.919e-06, + "loss": 2.8532, + "step": 82200 + }, + { + "epoch": 1.645, + "grad_norm": 7.504790306091309, + "learning_rate": 8.894000000000002e-06, + "loss": 2.6689, + "step": 82250 + }, + { + "epoch": 1.646, + "grad_norm": 9.280957221984863, + "learning_rate": 8.869000000000001e-06, + "loss": 2.7067, + "step": 82300 + }, + { + "epoch": 1.647, + "grad_norm": 7.816487789154053, + "learning_rate": 8.844e-06, + "loss": 2.7388, + "step": 82350 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 6.874303340911865, + "learning_rate": 8.819000000000001e-06, + "loss": 2.742, + "step": 82400 + }, + { + "epoch": 1.649, + "grad_norm": 9.157050132751465, + "learning_rate": 8.794e-06, + "loss": 2.7184, + "step": 82450 + }, + { + "epoch": 1.65, + "grad_norm": 7.791154861450195, + "learning_rate": 8.769e-06, + "loss": 2.5517, + "step": 82500 + }, + { + "epoch": 1.651, + "grad_norm": 8.14765453338623, + "learning_rate": 8.744000000000001e-06, + "loss": 2.9349, + "step": 82550 + }, + { + "epoch": 1.6520000000000001, + "grad_norm": 7.623929023742676, + "learning_rate": 8.719e-06, + "loss": 2.6772, + "step": 82600 + }, + { + "epoch": 1.653, + "grad_norm": 8.879040718078613, + "learning_rate": 8.694e-06, + "loss": 2.8483, + "step": 82650 + }, + { + "epoch": 1.654, + "grad_norm": 12.571939468383789, + "learning_rate": 8.669000000000001e-06, + "loss": 2.6692, + "step": 82700 + }, + { + "epoch": 1.655, + "grad_norm": 8.578469276428223, + "learning_rate": 8.644e-06, + "loss": 2.6946, + "step": 82750 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 8.490697860717773, + "learning_rate": 8.619e-06, + "loss": 2.8827, + "step": 82800 + }, + { + "epoch": 1.657, + "grad_norm": 8.043312072753906, + "learning_rate": 8.594000000000001e-06, + "loss": 2.648, + "step": 82850 + }, + { + "epoch": 1.658, + "grad_norm": 8.180224418640137, + "learning_rate": 8.569e-06, + "loss": 2.6368, + "step": 82900 + }, + { + "epoch": 1.659, + "grad_norm": 7.162864685058594, + "learning_rate": 8.544e-06, + "loss": 2.7297, + "step": 82950 + }, + { + "epoch": 1.6600000000000001, + "grad_norm": 8.248373031616211, + "learning_rate": 8.519000000000001e-06, + "loss": 2.869, + "step": 83000 + }, + { + "epoch": 1.661, + "grad_norm": 6.883781909942627, + "learning_rate": 8.494e-06, + "loss": 2.7607, + "step": 83050 + }, + { + "epoch": 1.662, + "grad_norm": 9.59625244140625, + "learning_rate": 8.469e-06, + "loss": 2.6657, + "step": 83100 + }, + { + "epoch": 1.663, + "grad_norm": 8.114121437072754, + "learning_rate": 8.444e-06, + "loss": 2.7106, + "step": 83150 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 9.549163818359375, + "learning_rate": 8.419e-06, + "loss": 2.8075, + "step": 83200 + }, + { + "epoch": 1.665, + "grad_norm": 8.121686935424805, + "learning_rate": 8.394000000000001e-06, + "loss": 2.8488, + "step": 83250 + }, + { + "epoch": 1.666, + "grad_norm": 10.259820938110352, + "learning_rate": 8.369e-06, + "loss": 2.7512, + "step": 83300 + }, + { + "epoch": 1.667, + "grad_norm": 10.132661819458008, + "learning_rate": 8.344500000000001e-06, + "loss": 2.7909, + "step": 83350 + }, + { + "epoch": 1.6680000000000001, + "grad_norm": 7.976057052612305, + "learning_rate": 8.3195e-06, + "loss": 2.6088, + "step": 83400 + }, + { + "epoch": 1.669, + "grad_norm": 9.32275104522705, + "learning_rate": 8.294500000000002e-06, + "loss": 2.669, + "step": 83450 + }, + { + "epoch": 1.67, + "grad_norm": 8.177163124084473, + "learning_rate": 8.269500000000001e-06, + "loss": 2.7145, + "step": 83500 + }, + { + "epoch": 1.671, + "grad_norm": 6.75959587097168, + "learning_rate": 8.2445e-06, + "loss": 2.8134, + "step": 83550 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 8.769372940063477, + "learning_rate": 8.219500000000002e-06, + "loss": 2.6065, + "step": 83600 + }, + { + "epoch": 1.673, + "grad_norm": 7.8767900466918945, + "learning_rate": 8.194500000000001e-06, + "loss": 2.7516, + "step": 83650 + }, + { + "epoch": 1.674, + "grad_norm": 8.258367538452148, + "learning_rate": 8.1695e-06, + "loss": 2.8359, + "step": 83700 + }, + { + "epoch": 1.675, + "grad_norm": 6.696470260620117, + "learning_rate": 8.144500000000001e-06, + "loss": 2.7341, + "step": 83750 + }, + { + "epoch": 1.6760000000000002, + "grad_norm": 10.333334922790527, + "learning_rate": 8.1195e-06, + "loss": 2.8067, + "step": 83800 + }, + { + "epoch": 1.677, + "grad_norm": 6.307063102722168, + "learning_rate": 8.0945e-06, + "loss": 2.9026, + "step": 83850 + }, + { + "epoch": 1.678, + "grad_norm": 8.203290939331055, + "learning_rate": 8.069500000000001e-06, + "loss": 2.7649, + "step": 83900 + }, + { + "epoch": 1.679, + "grad_norm": 7.663438320159912, + "learning_rate": 8.0445e-06, + "loss": 2.7698, + "step": 83950 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 17.893810272216797, + "learning_rate": 8.0195e-06, + "loss": 2.7126, + "step": 84000 + }, + { + "epoch": 1.681, + "grad_norm": 7.697596549987793, + "learning_rate": 7.994500000000001e-06, + "loss": 2.7735, + "step": 84050 + }, + { + "epoch": 1.682, + "grad_norm": 8.048392295837402, + "learning_rate": 7.9695e-06, + "loss": 2.8021, + "step": 84100 + }, + { + "epoch": 1.683, + "grad_norm": 7.380740642547607, + "learning_rate": 7.9445e-06, + "loss": 2.856, + "step": 84150 + }, + { + "epoch": 1.6840000000000002, + "grad_norm": 7.892630577087402, + "learning_rate": 7.919500000000001e-06, + "loss": 2.7246, + "step": 84200 + }, + { + "epoch": 1.685, + "grad_norm": 8.605746269226074, + "learning_rate": 7.8945e-06, + "loss": 2.8434, + "step": 84250 + }, + { + "epoch": 1.686, + "grad_norm": 9.128484725952148, + "learning_rate": 7.8695e-06, + "loss": 2.7519, + "step": 84300 + }, + { + "epoch": 1.687, + "grad_norm": 10.054672241210938, + "learning_rate": 7.844500000000001e-06, + "loss": 2.6398, + "step": 84350 + }, + { + "epoch": 1.688, + "grad_norm": 9.681634902954102, + "learning_rate": 7.8195e-06, + "loss": 2.8255, + "step": 84400 + }, + { + "epoch": 1.689, + "grad_norm": 10.02967357635498, + "learning_rate": 7.7945e-06, + "loss": 2.5538, + "step": 84450 + }, + { + "epoch": 1.69, + "grad_norm": 7.61908483505249, + "learning_rate": 7.7695e-06, + "loss": 2.7887, + "step": 84500 + }, + { + "epoch": 1.6909999999999998, + "grad_norm": 8.514749526977539, + "learning_rate": 7.7445e-06, + "loss": 2.8793, + "step": 84550 + }, + { + "epoch": 1.692, + "grad_norm": 7.921048164367676, + "learning_rate": 7.7195e-06, + "loss": 2.8279, + "step": 84600 + }, + { + "epoch": 1.693, + "grad_norm": 5.613151550292969, + "learning_rate": 7.6945e-06, + "loss": 2.6869, + "step": 84650 + }, + { + "epoch": 1.694, + "grad_norm": 7.689486503601074, + "learning_rate": 7.6695e-06, + "loss": 2.9914, + "step": 84700 + }, + { + "epoch": 1.6949999999999998, + "grad_norm": 8.673030853271484, + "learning_rate": 7.644500000000001e-06, + "loss": 2.7578, + "step": 84750 + }, + { + "epoch": 1.696, + "grad_norm": 7.168893814086914, + "learning_rate": 7.6195000000000005e-06, + "loss": 2.7542, + "step": 84800 + }, + { + "epoch": 1.697, + "grad_norm": 6.62797212600708, + "learning_rate": 7.5945e-06, + "loss": 2.5553, + "step": 84850 + }, + { + "epoch": 1.698, + "grad_norm": 8.141118049621582, + "learning_rate": 7.5695e-06, + "loss": 2.6875, + "step": 84900 + }, + { + "epoch": 1.6989999999999998, + "grad_norm": 9.107691764831543, + "learning_rate": 7.5445e-06, + "loss": 2.6884, + "step": 84950 + }, + { + "epoch": 1.7, + "grad_norm": 7.87322998046875, + "learning_rate": 7.5195e-06, + "loss": 2.8789, + "step": 85000 + }, + { + "epoch": 1.701, + "grad_norm": 7.496004104614258, + "learning_rate": 7.4945e-06, + "loss": 2.6971, + "step": 85050 + }, + { + "epoch": 1.702, + "grad_norm": 9.35421371459961, + "learning_rate": 7.4695e-06, + "loss": 2.5611, + "step": 85100 + }, + { + "epoch": 1.7029999999999998, + "grad_norm": 9.931110382080078, + "learning_rate": 7.4445000000000005e-06, + "loss": 2.9122, + "step": 85150 + }, + { + "epoch": 1.704, + "grad_norm": 8.53326416015625, + "learning_rate": 7.4195e-06, + "loss": 2.8646, + "step": 85200 + }, + { + "epoch": 1.705, + "grad_norm": 7.935835838317871, + "learning_rate": 7.3945e-06, + "loss": 2.7549, + "step": 85250 + }, + { + "epoch": 1.706, + "grad_norm": 8.31574821472168, + "learning_rate": 7.3695e-06, + "loss": 2.7098, + "step": 85300 + }, + { + "epoch": 1.7069999999999999, + "grad_norm": 8.184920310974121, + "learning_rate": 7.3445e-06, + "loss": 2.6436, + "step": 85350 + }, + { + "epoch": 1.708, + "grad_norm": 6.512944221496582, + "learning_rate": 7.32e-06, + "loss": 2.6321, + "step": 85400 + }, + { + "epoch": 1.709, + "grad_norm": 7.280327796936035, + "learning_rate": 7.2950000000000005e-06, + "loss": 2.9698, + "step": 85450 + }, + { + "epoch": 1.71, + "grad_norm": 11.694329261779785, + "learning_rate": 7.270000000000001e-06, + "loss": 2.8015, + "step": 85500 + }, + { + "epoch": 1.7109999999999999, + "grad_norm": 10.256030082702637, + "learning_rate": 7.245e-06, + "loss": 2.6892, + "step": 85550 + }, + { + "epoch": 1.712, + "grad_norm": 8.213347434997559, + "learning_rate": 7.22e-06, + "loss": 2.7122, + "step": 85600 + }, + { + "epoch": 1.713, + "grad_norm": 10.824342727661133, + "learning_rate": 7.1950000000000006e-06, + "loss": 2.7303, + "step": 85650 + }, + { + "epoch": 1.714, + "grad_norm": 10.240262985229492, + "learning_rate": 7.17e-06, + "loss": 2.8331, + "step": 85700 + }, + { + "epoch": 1.7149999999999999, + "grad_norm": 10.220215797424316, + "learning_rate": 7.145e-06, + "loss": 2.7955, + "step": 85750 + }, + { + "epoch": 1.716, + "grad_norm": 11.018972396850586, + "learning_rate": 7.1200000000000004e-06, + "loss": 2.5736, + "step": 85800 + }, + { + "epoch": 1.717, + "grad_norm": 9.33096694946289, + "learning_rate": 7.095000000000001e-06, + "loss": 2.5607, + "step": 85850 + }, + { + "epoch": 1.718, + "grad_norm": 8.596012115478516, + "learning_rate": 7.07e-06, + "loss": 2.8816, + "step": 85900 + }, + { + "epoch": 1.7189999999999999, + "grad_norm": 6.443115234375, + "learning_rate": 7.045e-06, + "loss": 2.5941, + "step": 85950 + }, + { + "epoch": 1.72, + "grad_norm": 8.386388778686523, + "learning_rate": 7.0200000000000006e-06, + "loss": 2.67, + "step": 86000 + }, + { + "epoch": 1.721, + "grad_norm": 8.550931930541992, + "learning_rate": 6.995e-06, + "loss": 2.8201, + "step": 86050 + }, + { + "epoch": 1.722, + "grad_norm": 7.8057475090026855, + "learning_rate": 6.97e-06, + "loss": 2.672, + "step": 86100 + }, + { + "epoch": 1.7229999999999999, + "grad_norm": 7.748495578765869, + "learning_rate": 6.945e-06, + "loss": 2.7135, + "step": 86150 + }, + { + "epoch": 1.724, + "grad_norm": 9.472148895263672, + "learning_rate": 6.92e-06, + "loss": 2.8646, + "step": 86200 + }, + { + "epoch": 1.725, + "grad_norm": 8.10639762878418, + "learning_rate": 6.895e-06, + "loss": 2.6441, + "step": 86250 + }, + { + "epoch": 1.726, + "grad_norm": 16.613393783569336, + "learning_rate": 6.87e-06, + "loss": 2.7735, + "step": 86300 + }, + { + "epoch": 1.7269999999999999, + "grad_norm": 8.373213768005371, + "learning_rate": 6.845e-06, + "loss": 2.7388, + "step": 86350 + }, + { + "epoch": 1.728, + "grad_norm": 10.412379264831543, + "learning_rate": 6.82e-06, + "loss": 2.8095, + "step": 86400 + }, + { + "epoch": 1.729, + "grad_norm": 8.624077796936035, + "learning_rate": 6.795e-06, + "loss": 2.8076, + "step": 86450 + }, + { + "epoch": 1.73, + "grad_norm": 7.361799240112305, + "learning_rate": 6.7699999999999996e-06, + "loss": 2.7069, + "step": 86500 + }, + { + "epoch": 1.7309999999999999, + "grad_norm": 9.244890213012695, + "learning_rate": 6.745e-06, + "loss": 2.8612, + "step": 86550 + }, + { + "epoch": 1.732, + "grad_norm": 9.704329490661621, + "learning_rate": 6.72e-06, + "loss": 2.7387, + "step": 86600 + }, + { + "epoch": 1.733, + "grad_norm": 6.897952079772949, + "learning_rate": 6.695e-06, + "loss": 2.7645, + "step": 86650 + }, + { + "epoch": 1.734, + "grad_norm": 8.001317024230957, + "learning_rate": 6.67e-06, + "loss": 2.7385, + "step": 86700 + }, + { + "epoch": 1.7349999999999999, + "grad_norm": 7.381739616394043, + "learning_rate": 6.645e-06, + "loss": 2.862, + "step": 86750 + }, + { + "epoch": 1.736, + "grad_norm": 10.89158821105957, + "learning_rate": 6.62e-06, + "loss": 2.732, + "step": 86800 + }, + { + "epoch": 1.737, + "grad_norm": 7.905108451843262, + "learning_rate": 6.5949999999999995e-06, + "loss": 2.7843, + "step": 86850 + }, + { + "epoch": 1.738, + "grad_norm": 6.913460731506348, + "learning_rate": 6.57e-06, + "loss": 2.7391, + "step": 86900 + }, + { + "epoch": 1.7389999999999999, + "grad_norm": 7.739956378936768, + "learning_rate": 6.545e-06, + "loss": 2.6727, + "step": 86950 + }, + { + "epoch": 1.74, + "grad_norm": 9.15610122680664, + "learning_rate": 6.519999999999999e-06, + "loss": 2.686, + "step": 87000 + }, + { + "epoch": 1.741, + "grad_norm": 7.165774822235107, + "learning_rate": 6.495e-06, + "loss": 2.7024, + "step": 87050 + }, + { + "epoch": 1.742, + "grad_norm": 7.682003021240234, + "learning_rate": 6.47e-06, + "loss": 2.6132, + "step": 87100 + }, + { + "epoch": 1.7429999999999999, + "grad_norm": 8.491655349731445, + "learning_rate": 6.444999999999999e-06, + "loss": 2.7714, + "step": 87150 + }, + { + "epoch": 1.744, + "grad_norm": 8.800849914550781, + "learning_rate": 6.4199999999999995e-06, + "loss": 2.7004, + "step": 87200 + }, + { + "epoch": 1.745, + "grad_norm": 8.72658920288086, + "learning_rate": 6.395000000000001e-06, + "loss": 2.8618, + "step": 87250 + }, + { + "epoch": 1.746, + "grad_norm": 10.49486255645752, + "learning_rate": 6.370000000000001e-06, + "loss": 2.9177, + "step": 87300 + }, + { + "epoch": 1.7469999999999999, + "grad_norm": 7.540380001068115, + "learning_rate": 6.345000000000001e-06, + "loss": 2.7355, + "step": 87350 + }, + { + "epoch": 1.748, + "grad_norm": Infinity, + "learning_rate": 6.320000000000001e-06, + "loss": 3.01, + "step": 87400 + }, + { + "epoch": 1.749, + "grad_norm": 8.305073738098145, + "learning_rate": 6.2955e-06, + "loss": 2.8643, + "step": 87450 + }, + { + "epoch": 1.75, + "grad_norm": 7.897730827331543, + "learning_rate": 6.2705e-06, + "loss": 2.7766, + "step": 87500 + }, + { + "epoch": 1.751, + "grad_norm": 7.82082462310791, + "learning_rate": 6.2455e-06, + "loss": 2.8744, + "step": 87550 + }, + { + "epoch": 1.752, + "grad_norm": 8.460726737976074, + "learning_rate": 6.220500000000001e-06, + "loss": 2.8135, + "step": 87600 + }, + { + "epoch": 1.7530000000000001, + "grad_norm": 9.274124145507812, + "learning_rate": 6.1955e-06, + "loss": 2.6903, + "step": 87650 + }, + { + "epoch": 1.754, + "grad_norm": 8.046218872070312, + "learning_rate": 6.1705000000000005e-06, + "loss": 2.5912, + "step": 87700 + }, + { + "epoch": 1.755, + "grad_norm": 8.15789794921875, + "learning_rate": 6.145500000000001e-06, + "loss": 2.5456, + "step": 87750 + }, + { + "epoch": 1.756, + "grad_norm": 11.076677322387695, + "learning_rate": 6.120500000000001e-06, + "loss": 2.7817, + "step": 87800 + }, + { + "epoch": 1.7570000000000001, + "grad_norm": 7.190091609954834, + "learning_rate": 6.0955e-06, + "loss": 2.775, + "step": 87850 + }, + { + "epoch": 1.758, + "grad_norm": 8.741618156433105, + "learning_rate": 6.070500000000001e-06, + "loss": 2.8138, + "step": 87900 + }, + { + "epoch": 1.759, + "grad_norm": 7.122415065765381, + "learning_rate": 6.045500000000001e-06, + "loss": 2.7268, + "step": 87950 + }, + { + "epoch": 1.76, + "grad_norm": 6.823777198791504, + "learning_rate": 6.0205e-06, + "loss": 2.6694, + "step": 88000 + }, + { + "epoch": 1.7610000000000001, + "grad_norm": 7.419735908508301, + "learning_rate": 5.9955000000000004e-06, + "loss": 2.8717, + "step": 88050 + }, + { + "epoch": 1.762, + "grad_norm": 6.787914276123047, + "learning_rate": 5.970500000000001e-06, + "loss": 2.7379, + "step": 88100 + }, + { + "epoch": 1.763, + "grad_norm": 7.9896979331970215, + "learning_rate": 5.9455e-06, + "loss": 2.8077, + "step": 88150 + }, + { + "epoch": 1.764, + "grad_norm": 8.55472469329834, + "learning_rate": 5.9205e-06, + "loss": 2.7669, + "step": 88200 + }, + { + "epoch": 1.7650000000000001, + "grad_norm": 7.805665969848633, + "learning_rate": 5.8955000000000006e-06, + "loss": 2.7029, + "step": 88250 + }, + { + "epoch": 1.766, + "grad_norm": 7.704306125640869, + "learning_rate": 5.8705e-06, + "loss": 2.7733, + "step": 88300 + }, + { + "epoch": 1.767, + "grad_norm": 6.891770362854004, + "learning_rate": 5.8455e-06, + "loss": 2.7838, + "step": 88350 + }, + { + "epoch": 1.768, + "grad_norm": 7.864100456237793, + "learning_rate": 5.8205000000000004e-06, + "loss": 2.8568, + "step": 88400 + }, + { + "epoch": 1.7690000000000001, + "grad_norm": 10.85545539855957, + "learning_rate": 5.7955e-06, + "loss": 2.7936, + "step": 88450 + }, + { + "epoch": 1.77, + "grad_norm": 7.065977573394775, + "learning_rate": 5.7705e-06, + "loss": 3.0453, + "step": 88500 + }, + { + "epoch": 1.771, + "grad_norm": 8.579532623291016, + "learning_rate": 5.7455e-06, + "loss": 2.9797, + "step": 88550 + }, + { + "epoch": 1.772, + "grad_norm": 7.2842183113098145, + "learning_rate": 5.7205000000000005e-06, + "loss": 2.6284, + "step": 88600 + }, + { + "epoch": 1.7730000000000001, + "grad_norm": 9.140216827392578, + "learning_rate": 5.6955e-06, + "loss": 2.7266, + "step": 88650 + }, + { + "epoch": 1.774, + "grad_norm": 7.360970497131348, + "learning_rate": 5.6705e-06, + "loss": 2.798, + "step": 88700 + }, + { + "epoch": 1.775, + "grad_norm": 8.7395658493042, + "learning_rate": 5.6455e-06, + "loss": 2.8203, + "step": 88750 + }, + { + "epoch": 1.776, + "grad_norm": 8.213441848754883, + "learning_rate": 5.6205e-06, + "loss": 2.8288, + "step": 88800 + }, + { + "epoch": 1.7770000000000001, + "grad_norm": 7.257961750030518, + "learning_rate": 5.5955e-06, + "loss": 2.8347, + "step": 88850 + }, + { + "epoch": 1.778, + "grad_norm": 9.789780616760254, + "learning_rate": 5.5705e-06, + "loss": 2.8285, + "step": 88900 + }, + { + "epoch": 1.779, + "grad_norm": 7.806823253631592, + "learning_rate": 5.5455e-06, + "loss": 2.6556, + "step": 88950 + }, + { + "epoch": 1.78, + "grad_norm": 7.615421772003174, + "learning_rate": 5.5205e-06, + "loss": 2.6485, + "step": 89000 + }, + { + "epoch": 1.7810000000000001, + "grad_norm": 8.010266304016113, + "learning_rate": 5.4955e-06, + "loss": 2.6579, + "step": 89050 + }, + { + "epoch": 1.782, + "grad_norm": 8.119599342346191, + "learning_rate": 5.4704999999999995e-06, + "loss": 2.6313, + "step": 89100 + }, + { + "epoch": 1.783, + "grad_norm": 11.232370376586914, + "learning_rate": 5.445500000000001e-06, + "loss": 2.6671, + "step": 89150 + }, + { + "epoch": 1.784, + "grad_norm": 6.684316635131836, + "learning_rate": 5.420500000000001e-06, + "loss": 2.707, + "step": 89200 + }, + { + "epoch": 1.7850000000000001, + "grad_norm": 8.435420036315918, + "learning_rate": 5.3955e-06, + "loss": 2.4425, + "step": 89250 + }, + { + "epoch": 1.786, + "grad_norm": 7.559298992156982, + "learning_rate": 5.3705000000000005e-06, + "loss": 2.7967, + "step": 89300 + }, + { + "epoch": 1.787, + "grad_norm": 7.989433765411377, + "learning_rate": 5.345500000000001e-06, + "loss": 2.7226, + "step": 89350 + }, + { + "epoch": 1.788, + "grad_norm": 8.866364479064941, + "learning_rate": 5.3205e-06, + "loss": 2.6818, + "step": 89400 + }, + { + "epoch": 1.7890000000000001, + "grad_norm": 8.641900062561035, + "learning_rate": 5.296000000000001e-06, + "loss": 2.8602, + "step": 89450 + }, + { + "epoch": 1.79, + "grad_norm": 7.063851356506348, + "learning_rate": 5.271e-06, + "loss": 2.6172, + "step": 89500 + }, + { + "epoch": 1.791, + "grad_norm": 8.536163330078125, + "learning_rate": 5.246e-06, + "loss": 2.6273, + "step": 89550 + }, + { + "epoch": 1.792, + "grad_norm": 8.912287712097168, + "learning_rate": 5.2210000000000005e-06, + "loss": 2.7513, + "step": 89600 + }, + { + "epoch": 1.7930000000000001, + "grad_norm": 8.023194313049316, + "learning_rate": 5.196e-06, + "loss": 2.5922, + "step": 89650 + }, + { + "epoch": 1.794, + "grad_norm": 7.167906761169434, + "learning_rate": 5.171e-06, + "loss": 2.7784, + "step": 89700 + }, + { + "epoch": 1.795, + "grad_norm": 7.181880950927734, + "learning_rate": 5.1465e-06, + "loss": 2.9942, + "step": 89750 + }, + { + "epoch": 1.796, + "grad_norm": 9.94197940826416, + "learning_rate": 5.1215e-06, + "loss": 2.6009, + "step": 89800 + }, + { + "epoch": 1.7970000000000002, + "grad_norm": 8.615551948547363, + "learning_rate": 5.0965e-06, + "loss": 2.6638, + "step": 89850 + }, + { + "epoch": 1.798, + "grad_norm": 8.286895751953125, + "learning_rate": 5.0715000000000005e-06, + "loss": 2.6442, + "step": 89900 + }, + { + "epoch": 1.799, + "grad_norm": 7.661980628967285, + "learning_rate": 5.046500000000001e-06, + "loss": 2.718, + "step": 89950 + }, + { + "epoch": 1.8, + "grad_norm": 8.584294319152832, + "learning_rate": 5.0215e-06, + "loss": 2.7903, + "step": 90000 + }, + { + "epoch": 1.8010000000000002, + "grad_norm": 6.850968360900879, + "learning_rate": 4.9965e-06, + "loss": 2.7178, + "step": 90050 + }, + { + "epoch": 1.802, + "grad_norm": 8.273259162902832, + "learning_rate": 4.971500000000001e-06, + "loss": 2.8497, + "step": 90100 + }, + { + "epoch": 1.803, + "grad_norm": 7.811845779418945, + "learning_rate": 4.946500000000001e-06, + "loss": 2.652, + "step": 90150 + }, + { + "epoch": 1.804, + "grad_norm": 6.695546627044678, + "learning_rate": 4.9215e-06, + "loss": 2.6411, + "step": 90200 + }, + { + "epoch": 1.8050000000000002, + "grad_norm": 11.059279441833496, + "learning_rate": 4.8965000000000005e-06, + "loss": 2.5529, + "step": 90250 + }, + { + "epoch": 1.806, + "grad_norm": 9.72866153717041, + "learning_rate": 4.871500000000001e-06, + "loss": 2.9216, + "step": 90300 + }, + { + "epoch": 1.807, + "grad_norm": 10.693723678588867, + "learning_rate": 4.8465e-06, + "loss": 2.8347, + "step": 90350 + }, + { + "epoch": 1.808, + "grad_norm": 10.578742027282715, + "learning_rate": 4.8215e-06, + "loss": 2.8648, + "step": 90400 + }, + { + "epoch": 1.8090000000000002, + "grad_norm": 9.165677070617676, + "learning_rate": 4.796500000000001e-06, + "loss": 2.5032, + "step": 90450 + }, + { + "epoch": 1.81, + "grad_norm": 7.2900471687316895, + "learning_rate": 4.7715e-06, + "loss": 2.7891, + "step": 90500 + }, + { + "epoch": 1.811, + "grad_norm": 7.986770153045654, + "learning_rate": 4.7465e-06, + "loss": 2.8503, + "step": 90550 + }, + { + "epoch": 1.812, + "grad_norm": 8.599214553833008, + "learning_rate": 4.7215000000000004e-06, + "loss": 2.7908, + "step": 90600 + }, + { + "epoch": 1.813, + "grad_norm": 8.372736930847168, + "learning_rate": 4.6965e-06, + "loss": 2.7009, + "step": 90650 + }, + { + "epoch": 1.814, + "grad_norm": 8.184940338134766, + "learning_rate": 4.6715e-06, + "loss": 2.9262, + "step": 90700 + }, + { + "epoch": 1.815, + "grad_norm": 8.346232414245605, + "learning_rate": 4.6465e-06, + "loss": 2.7132, + "step": 90750 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 9.309367179870605, + "learning_rate": 4.6215e-06, + "loss": 2.7366, + "step": 90800 + }, + { + "epoch": 1.817, + "grad_norm": 8.382227897644043, + "learning_rate": 4.5965e-06, + "loss": 2.6953, + "step": 90850 + }, + { + "epoch": 1.818, + "grad_norm": 9.15678882598877, + "learning_rate": 4.5715e-06, + "loss": 2.5969, + "step": 90900 + }, + { + "epoch": 1.819, + "grad_norm": 9.202027320861816, + "learning_rate": 4.5465000000000004e-06, + "loss": 2.7032, + "step": 90950 + }, + { + "epoch": 1.8199999999999998, + "grad_norm": 9.636245727539062, + "learning_rate": 4.5215e-06, + "loss": 2.6589, + "step": 91000 + }, + { + "epoch": 1.821, + "grad_norm": 8.213318824768066, + "learning_rate": 4.4965e-06, + "loss": 2.667, + "step": 91050 + }, + { + "epoch": 1.822, + "grad_norm": 7.809316635131836, + "learning_rate": 4.4715e-06, + "loss": 2.8199, + "step": 91100 + }, + { + "epoch": 1.823, + "grad_norm": 9.173991203308105, + "learning_rate": 4.4465e-06, + "loss": 2.7688, + "step": 91150 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 8.543388366699219, + "learning_rate": 4.4215e-06, + "loss": 2.6941, + "step": 91200 + }, + { + "epoch": 1.825, + "grad_norm": 8.782526969909668, + "learning_rate": 4.3965e-06, + "loss": 2.5714, + "step": 91250 + }, + { + "epoch": 1.826, + "grad_norm": 8.245438575744629, + "learning_rate": 4.3714999999999996e-06, + "loss": 2.9752, + "step": 91300 + }, + { + "epoch": 1.827, + "grad_norm": 8.939053535461426, + "learning_rate": 4.3465e-06, + "loss": 2.7425, + "step": 91350 + }, + { + "epoch": 1.8279999999999998, + "grad_norm": 9.105463981628418, + "learning_rate": 4.321500000000001e-06, + "loss": 2.951, + "step": 91400 + }, + { + "epoch": 1.829, + "grad_norm": 7.773897171020508, + "learning_rate": 4.2965e-06, + "loss": 2.8498, + "step": 91450 + }, + { + "epoch": 1.83, + "grad_norm": 8.115293502807617, + "learning_rate": 4.2715000000000005e-06, + "loss": 2.984, + "step": 91500 + }, + { + "epoch": 1.831, + "grad_norm": 10.776023864746094, + "learning_rate": 4.246500000000001e-06, + "loss": 2.6162, + "step": 91550 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 10.591742515563965, + "learning_rate": 4.2215e-06, + "loss": 2.8948, + "step": 91600 + }, + { + "epoch": 1.833, + "grad_norm": 7.4504241943359375, + "learning_rate": 4.1965e-06, + "loss": 2.7548, + "step": 91650 + }, + { + "epoch": 1.834, + "grad_norm": 7.637448310852051, + "learning_rate": 4.171500000000001e-06, + "loss": 2.874, + "step": 91700 + }, + { + "epoch": 1.835, + "grad_norm": 9.06761360168457, + "learning_rate": 4.1465e-06, + "loss": 2.5051, + "step": 91750 + }, + { + "epoch": 1.8359999999999999, + "grad_norm": 6.866405487060547, + "learning_rate": 4.1215e-06, + "loss": 2.5928, + "step": 91800 + }, + { + "epoch": 1.837, + "grad_norm": 12.77744197845459, + "learning_rate": 4.0965000000000005e-06, + "loss": 2.7717, + "step": 91850 + }, + { + "epoch": 1.838, + "grad_norm": 9.051816940307617, + "learning_rate": 4.0715e-06, + "loss": 2.6854, + "step": 91900 + }, + { + "epoch": 1.839, + "grad_norm": 8.761739730834961, + "learning_rate": 4.0465e-06, + "loss": 2.6948, + "step": 91950 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 8.750024795532227, + "learning_rate": 4.0215e-06, + "loss": 2.7667, + "step": 92000 + }, + { + "epoch": 1.841, + "grad_norm": 8.970072746276855, + "learning_rate": 3.996500000000001e-06, + "loss": 2.6667, + "step": 92050 + }, + { + "epoch": 1.842, + "grad_norm": 8.1638822555542, + "learning_rate": 3.9715e-06, + "loss": 2.8341, + "step": 92100 + }, + { + "epoch": 1.843, + "grad_norm": 7.813655376434326, + "learning_rate": 3.9465e-06, + "loss": 2.6207, + "step": 92150 + }, + { + "epoch": 1.8439999999999999, + "grad_norm": 10.0947265625, + "learning_rate": 3.9215000000000005e-06, + "loss": 2.8325, + "step": 92200 + }, + { + "epoch": 1.845, + "grad_norm": 7.8281569480896, + "learning_rate": 3.8965e-06, + "loss": 2.7996, + "step": 92250 + }, + { + "epoch": 1.846, + "grad_norm": 8.70777702331543, + "learning_rate": 3.8715e-06, + "loss": 2.7297, + "step": 92300 + }, + { + "epoch": 1.847, + "grad_norm": 7.831836223602295, + "learning_rate": 3.8465e-06, + "loss": 2.7486, + "step": 92350 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 7.55377197265625, + "learning_rate": 3.8215e-06, + "loss": 2.802, + "step": 92400 + }, + { + "epoch": 1.849, + "grad_norm": 7.586316108703613, + "learning_rate": 3.7965e-06, + "loss": 2.7082, + "step": 92450 + }, + { + "epoch": 1.85, + "grad_norm": 7.920407772064209, + "learning_rate": 3.7715000000000002e-06, + "loss": 2.687, + "step": 92500 + }, + { + "epoch": 1.851, + "grad_norm": 7.852161884307861, + "learning_rate": 3.7465e-06, + "loss": 2.7005, + "step": 92550 + }, + { + "epoch": 1.8519999999999999, + "grad_norm": 8.755575180053711, + "learning_rate": 3.7215e-06, + "loss": 2.6285, + "step": 92600 + }, + { + "epoch": 1.853, + "grad_norm": 10.1644868850708, + "learning_rate": 3.6965e-06, + "loss": 2.7896, + "step": 92650 + }, + { + "epoch": 1.854, + "grad_norm": 8.134500503540039, + "learning_rate": 3.6715e-06, + "loss": 2.8607, + "step": 92700 + }, + { + "epoch": 1.855, + "grad_norm": 8.263608932495117, + "learning_rate": 3.6464999999999997e-06, + "loss": 2.5583, + "step": 92750 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 8.864056587219238, + "learning_rate": 3.6215e-06, + "loss": 2.7813, + "step": 92800 + }, + { + "epoch": 1.857, + "grad_norm": 9.94490909576416, + "learning_rate": 3.5964999999999998e-06, + "loss": 2.6101, + "step": 92850 + }, + { + "epoch": 1.858, + "grad_norm": 8.170188903808594, + "learning_rate": 3.5715e-06, + "loss": 2.9997, + "step": 92900 + }, + { + "epoch": 1.859, + "grad_norm": 10.034306526184082, + "learning_rate": 3.5465000000000007e-06, + "loss": 2.7796, + "step": 92950 + }, + { + "epoch": 1.8599999999999999, + "grad_norm": 8.248494148254395, + "learning_rate": 3.5215000000000005e-06, + "loss": 2.8471, + "step": 93000 + }, + { + "epoch": 1.861, + "grad_norm": 8.490750312805176, + "learning_rate": 3.4965000000000003e-06, + "loss": 2.549, + "step": 93050 + }, + { + "epoch": 1.862, + "grad_norm": 7.735034465789795, + "learning_rate": 3.4715000000000006e-06, + "loss": 2.8037, + "step": 93100 + }, + { + "epoch": 1.863, + "grad_norm": 9.583478927612305, + "learning_rate": 3.4465000000000004e-06, + "loss": 2.7986, + "step": 93150 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 7.504220008850098, + "learning_rate": 3.4215e-06, + "loss": 2.8001, + "step": 93200 + }, + { + "epoch": 1.865, + "grad_norm": 8.959890365600586, + "learning_rate": 3.3965000000000004e-06, + "loss": 2.5646, + "step": 93250 + }, + { + "epoch": 1.866, + "grad_norm": 8.324706077575684, + "learning_rate": 3.3715000000000002e-06, + "loss": 2.7756, + "step": 93300 + }, + { + "epoch": 1.867, + "grad_norm": 9.705889701843262, + "learning_rate": 3.3465000000000005e-06, + "loss": 2.6537, + "step": 93350 + }, + { + "epoch": 1.8679999999999999, + "grad_norm": 12.64687728881836, + "learning_rate": 3.3215000000000003e-06, + "loss": 2.76, + "step": 93400 + }, + { + "epoch": 1.869, + "grad_norm": 8.79395580291748, + "learning_rate": 3.2965e-06, + "loss": 2.7001, + "step": 93450 + }, + { + "epoch": 1.87, + "grad_norm": 9.220784187316895, + "learning_rate": 3.2715000000000004e-06, + "loss": 2.8881, + "step": 93500 + }, + { + "epoch": 1.871, + "grad_norm": 9.860733985900879, + "learning_rate": 3.2465e-06, + "loss": 2.7711, + "step": 93550 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 7.789809703826904, + "learning_rate": 3.2215000000000004e-06, + "loss": 2.6483, + "step": 93600 + }, + { + "epoch": 1.873, + "grad_norm": 7.488524436950684, + "learning_rate": 3.1965000000000002e-06, + "loss": 2.6206, + "step": 93650 + }, + { + "epoch": 1.874, + "grad_norm": 10.260944366455078, + "learning_rate": 3.1715e-06, + "loss": 2.6502, + "step": 93700 + }, + { + "epoch": 1.875, + "grad_norm": 8.944072723388672, + "learning_rate": 3.147e-06, + "loss": 3.0729, + "step": 93750 + }, + { + "epoch": 1.876, + "grad_norm": 8.896251678466797, + "learning_rate": 3.1220000000000003e-06, + "loss": 2.5911, + "step": 93800 + }, + { + "epoch": 1.877, + "grad_norm": 8.604350090026855, + "learning_rate": 3.097e-06, + "loss": 2.6665, + "step": 93850 + }, + { + "epoch": 1.8780000000000001, + "grad_norm": 8.519975662231445, + "learning_rate": 3.0720000000000004e-06, + "loss": 2.81, + "step": 93900 + }, + { + "epoch": 1.879, + "grad_norm": 7.503047943115234, + "learning_rate": 3.0470000000000002e-06, + "loss": 2.7349, + "step": 93950 + }, + { + "epoch": 1.88, + "grad_norm": 8.925019264221191, + "learning_rate": 3.022e-06, + "loss": 2.6914, + "step": 94000 + }, + { + "epoch": 1.881, + "grad_norm": 7.482077121734619, + "learning_rate": 2.9970000000000003e-06, + "loss": 2.7253, + "step": 94050 + }, + { + "epoch": 1.8820000000000001, + "grad_norm": 8.396530151367188, + "learning_rate": 2.972e-06, + "loss": 3.0853, + "step": 94100 + }, + { + "epoch": 1.883, + "grad_norm": 10.036365509033203, + "learning_rate": 2.947e-06, + "loss": 2.811, + "step": 94150 + }, + { + "epoch": 1.884, + "grad_norm": 9.501229286193848, + "learning_rate": 2.922e-06, + "loss": 2.7846, + "step": 94200 + }, + { + "epoch": 1.885, + "grad_norm": 7.217312335968018, + "learning_rate": 2.897e-06, + "loss": 2.7255, + "step": 94250 + }, + { + "epoch": 1.8860000000000001, + "grad_norm": 9.453542709350586, + "learning_rate": 2.872e-06, + "loss": 2.5492, + "step": 94300 + }, + { + "epoch": 1.887, + "grad_norm": 17.13677406311035, + "learning_rate": 2.847e-06, + "loss": 2.8925, + "step": 94350 + }, + { + "epoch": 1.888, + "grad_norm": 7.6179962158203125, + "learning_rate": 2.822e-06, + "loss": 2.5682, + "step": 94400 + }, + { + "epoch": 1.889, + "grad_norm": 7.958014488220215, + "learning_rate": 2.797e-06, + "loss": 2.6691, + "step": 94450 + }, + { + "epoch": 1.8900000000000001, + "grad_norm": 9.357193946838379, + "learning_rate": 2.7720000000000003e-06, + "loss": 2.4874, + "step": 94500 + }, + { + "epoch": 1.891, + "grad_norm": 11.814285278320312, + "learning_rate": 2.747e-06, + "loss": 2.8565, + "step": 94550 + }, + { + "epoch": 1.892, + "grad_norm": 8.839566230773926, + "learning_rate": 2.7220000000000004e-06, + "loss": 2.8531, + "step": 94600 + }, + { + "epoch": 1.893, + "grad_norm": 8.386056900024414, + "learning_rate": 2.697e-06, + "loss": 2.5458, + "step": 94650 + }, + { + "epoch": 1.8940000000000001, + "grad_norm": 7.575038433074951, + "learning_rate": 2.672e-06, + "loss": 2.6183, + "step": 94700 + }, + { + "epoch": 1.895, + "grad_norm": 7.668638229370117, + "learning_rate": 2.6470000000000002e-06, + "loss": 2.6924, + "step": 94750 + }, + { + "epoch": 1.896, + "grad_norm": 8.719830513000488, + "learning_rate": 2.622e-06, + "loss": 2.6432, + "step": 94800 + }, + { + "epoch": 1.897, + "grad_norm": 10.851173400878906, + "learning_rate": 2.5970000000000003e-06, + "loss": 2.9338, + "step": 94850 + }, + { + "epoch": 1.8980000000000001, + "grad_norm": 7.968566417694092, + "learning_rate": 2.572e-06, + "loss": 2.8757, + "step": 94900 + }, + { + "epoch": 1.899, + "grad_norm": 7.560086727142334, + "learning_rate": 2.547e-06, + "loss": 2.7454, + "step": 94950 + }, + { + "epoch": 1.9, + "grad_norm": 8.198180198669434, + "learning_rate": 2.522e-06, + "loss": 2.766, + "step": 95000 + }, + { + "epoch": 1.901, + "grad_norm": 7.824384689331055, + "learning_rate": 2.497e-06, + "loss": 2.7285, + "step": 95050 + }, + { + "epoch": 1.9020000000000001, + "grad_norm": 7.9695281982421875, + "learning_rate": 2.4720000000000002e-06, + "loss": 2.6896, + "step": 95100 + }, + { + "epoch": 1.903, + "grad_norm": 7.852643013000488, + "learning_rate": 2.447e-06, + "loss": 2.6197, + "step": 95150 + }, + { + "epoch": 1.904, + "grad_norm": 8.303346633911133, + "learning_rate": 2.422e-06, + "loss": 2.6494, + "step": 95200 + }, + { + "epoch": 1.905, + "grad_norm": 11.907207489013672, + "learning_rate": 2.397e-06, + "loss": 2.7931, + "step": 95250 + }, + { + "epoch": 1.9060000000000001, + "grad_norm": 8.8694486618042, + "learning_rate": 2.3720000000000003e-06, + "loss": 2.644, + "step": 95300 + }, + { + "epoch": 1.907, + "grad_norm": 7.289035797119141, + "learning_rate": 2.347e-06, + "loss": 2.7105, + "step": 95350 + }, + { + "epoch": 1.908, + "grad_norm": 8.548935890197754, + "learning_rate": 2.3220000000000004e-06, + "loss": 2.8733, + "step": 95400 + }, + { + "epoch": 1.909, + "grad_norm": 8.640983581542969, + "learning_rate": 2.297e-06, + "loss": 2.7284, + "step": 95450 + }, + { + "epoch": 1.9100000000000001, + "grad_norm": 8.68470573425293, + "learning_rate": 2.2725e-06, + "loss": 2.7682, + "step": 95500 + }, + { + "epoch": 1.911, + "grad_norm": 8.883843421936035, + "learning_rate": 2.2475e-06, + "loss": 2.7007, + "step": 95550 + }, + { + "epoch": 1.912, + "grad_norm": 8.315601348876953, + "learning_rate": 2.2225000000000003e-06, + "loss": 2.8442, + "step": 95600 + }, + { + "epoch": 1.913, + "grad_norm": 7.067404270172119, + "learning_rate": 2.1975e-06, + "loss": 2.6383, + "step": 95650 + }, + { + "epoch": 1.9140000000000001, + "grad_norm": 9.462407112121582, + "learning_rate": 2.1725000000000004e-06, + "loss": 2.7837, + "step": 95700 + }, + { + "epoch": 1.915, + "grad_norm": 7.8980021476745605, + "learning_rate": 2.1475e-06, + "loss": 2.8218, + "step": 95750 + }, + { + "epoch": 1.916, + "grad_norm": 8.796659469604492, + "learning_rate": 2.1225e-06, + "loss": 2.5969, + "step": 95800 + }, + { + "epoch": 1.917, + "grad_norm": 7.042479038238525, + "learning_rate": 2.0975000000000002e-06, + "loss": 2.7725, + "step": 95850 + }, + { + "epoch": 1.9180000000000001, + "grad_norm": 7.105340480804443, + "learning_rate": 2.0725e-06, + "loss": 2.7451, + "step": 95900 + }, + { + "epoch": 1.919, + "grad_norm": 8.806255340576172, + "learning_rate": 2.0475000000000003e-06, + "loss": 2.7912, + "step": 95950 + }, + { + "epoch": 1.92, + "grad_norm": 9.678021430969238, + "learning_rate": 2.0225e-06, + "loss": 2.9036, + "step": 96000 + }, + { + "epoch": 1.921, + "grad_norm": 12.134692192077637, + "learning_rate": 1.9975e-06, + "loss": 2.7814, + "step": 96050 + }, + { + "epoch": 1.9220000000000002, + "grad_norm": 9.222649574279785, + "learning_rate": 1.9725e-06, + "loss": 2.619, + "step": 96100 + }, + { + "epoch": 1.923, + "grad_norm": 8.665557861328125, + "learning_rate": 1.9475e-06, + "loss": 2.7856, + "step": 96150 + }, + { + "epoch": 1.924, + "grad_norm": 10.79527759552002, + "learning_rate": 1.9225e-06, + "loss": 2.7519, + "step": 96200 + }, + { + "epoch": 1.925, + "grad_norm": 10.203853607177734, + "learning_rate": 1.8975e-06, + "loss": 2.605, + "step": 96250 + }, + { + "epoch": 1.9260000000000002, + "grad_norm": 8.2529296875, + "learning_rate": 1.8724999999999999e-06, + "loss": 2.7996, + "step": 96300 + }, + { + "epoch": 1.927, + "grad_norm": 7.979179382324219, + "learning_rate": 1.8474999999999999e-06, + "loss": 2.8076, + "step": 96350 + }, + { + "epoch": 1.928, + "grad_norm": 9.023404121398926, + "learning_rate": 1.8225000000000003e-06, + "loss": 2.5915, + "step": 96400 + }, + { + "epoch": 1.929, + "grad_norm": 9.803990364074707, + "learning_rate": 1.7975000000000002e-06, + "loss": 2.8453, + "step": 96450 + }, + { + "epoch": 1.9300000000000002, + "grad_norm": 9.243449211120605, + "learning_rate": 1.7725000000000002e-06, + "loss": 2.7708, + "step": 96500 + }, + { + "epoch": 1.931, + "grad_norm": 8.036003112792969, + "learning_rate": 1.7475000000000002e-06, + "loss": 2.814, + "step": 96550 + }, + { + "epoch": 1.932, + "grad_norm": 7.145360469818115, + "learning_rate": 1.7225000000000002e-06, + "loss": 2.8895, + "step": 96600 + }, + { + "epoch": 1.933, + "grad_norm": 7.846442699432373, + "learning_rate": 1.6975e-06, + "loss": 2.8026, + "step": 96650 + }, + { + "epoch": 1.9340000000000002, + "grad_norm": 10.024850845336914, + "learning_rate": 1.6725e-06, + "loss": 2.8142, + "step": 96700 + }, + { + "epoch": 1.935, + "grad_norm": 8.556451797485352, + "learning_rate": 1.6475000000000001e-06, + "loss": 2.9857, + "step": 96750 + }, + { + "epoch": 1.936, + "grad_norm": 8.042387008666992, + "learning_rate": 1.6225000000000001e-06, + "loss": 2.6511, + "step": 96800 + }, + { + "epoch": 1.937, + "grad_norm": 9.563372611999512, + "learning_rate": 1.5975e-06, + "loss": 2.7634, + "step": 96850 + }, + { + "epoch": 1.938, + "grad_norm": 8.98689079284668, + "learning_rate": 1.5725e-06, + "loss": 2.6179, + "step": 96900 + }, + { + "epoch": 1.939, + "grad_norm": 9.716072082519531, + "learning_rate": 1.5475e-06, + "loss": 2.7824, + "step": 96950 + }, + { + "epoch": 1.94, + "grad_norm": 8.737848281860352, + "learning_rate": 1.5225000000000002e-06, + "loss": 2.6499, + "step": 97000 + }, + { + "epoch": 1.9409999999999998, + "grad_norm": 11.896839141845703, + "learning_rate": 1.4975e-06, + "loss": 2.5394, + "step": 97050 + }, + { + "epoch": 1.942, + "grad_norm": 6.5530853271484375, + "learning_rate": 1.4725e-06, + "loss": 2.6731, + "step": 97100 + }, + { + "epoch": 1.943, + "grad_norm": 6.974987030029297, + "learning_rate": 1.4475000000000001e-06, + "loss": 2.8178, + "step": 97150 + }, + { + "epoch": 1.944, + "grad_norm": 6.8209614753723145, + "learning_rate": 1.4225000000000001e-06, + "loss": 2.6135, + "step": 97200 + }, + { + "epoch": 1.9449999999999998, + "grad_norm": 9.504709243774414, + "learning_rate": 1.3975e-06, + "loss": 2.7069, + "step": 97250 + }, + { + "epoch": 1.946, + "grad_norm": 10.736202239990234, + "learning_rate": 1.3725e-06, + "loss": 2.7967, + "step": 97300 + }, + { + "epoch": 1.947, + "grad_norm": 8.686497688293457, + "learning_rate": 1.3475000000000002e-06, + "loss": 2.6214, + "step": 97350 + }, + { + "epoch": 1.948, + "grad_norm": 10.431570053100586, + "learning_rate": 1.3225e-06, + "loss": 2.6681, + "step": 97400 + }, + { + "epoch": 1.9489999999999998, + "grad_norm": 11.108994483947754, + "learning_rate": 1.2975e-06, + "loss": 2.8571, + "step": 97450 + }, + { + "epoch": 1.95, + "grad_norm": 9.426375389099121, + "learning_rate": 1.2725e-06, + "loss": 2.6336, + "step": 97500 + }, + { + "epoch": 1.951, + "grad_norm": 9.929731369018555, + "learning_rate": 1.2475000000000001e-06, + "loss": 2.8664, + "step": 97550 + }, + { + "epoch": 1.952, + "grad_norm": 5.834004878997803, + "learning_rate": 1.2225e-06, + "loss": 2.4785, + "step": 97600 + }, + { + "epoch": 1.9529999999999998, + "grad_norm": 10.382954597473145, + "learning_rate": 1.1975e-06, + "loss": 2.884, + "step": 97650 + }, + { + "epoch": 1.954, + "grad_norm": 9.039685249328613, + "learning_rate": 1.1725e-06, + "loss": 2.7769, + "step": 97700 + }, + { + "epoch": 1.955, + "grad_norm": 10.186605453491211, + "learning_rate": 1.1475000000000002e-06, + "loss": 2.8527, + "step": 97750 + }, + { + "epoch": 1.956, + "grad_norm": 7.526618480682373, + "learning_rate": 1.1225e-06, + "loss": 2.4586, + "step": 97800 + }, + { + "epoch": 1.9569999999999999, + "grad_norm": 7.888932704925537, + "learning_rate": 1.0975e-06, + "loss": 2.7046, + "step": 97850 + }, + { + "epoch": 1.958, + "grad_norm": 8.395057678222656, + "learning_rate": 1.0725000000000001e-06, + "loss": 2.8441, + "step": 97900 + }, + { + "epoch": 1.959, + "grad_norm": 7.477360725402832, + "learning_rate": 1.0475000000000001e-06, + "loss": 2.6665, + "step": 97950 + }, + { + "epoch": 1.96, + "grad_norm": 8.053496360778809, + "learning_rate": 1.0225e-06, + "loss": 2.7325, + "step": 98000 + }, + { + "epoch": 1.9609999999999999, + "grad_norm": 8.144623756408691, + "learning_rate": 9.975e-07, + "loss": 2.9861, + "step": 98050 + }, + { + "epoch": 1.962, + "grad_norm": 8.730600357055664, + "learning_rate": 9.725e-07, + "loss": 2.7406, + "step": 98100 + }, + { + "epoch": 1.963, + "grad_norm": 8.985459327697754, + "learning_rate": 9.475000000000001e-07, + "loss": 2.9892, + "step": 98150 + }, + { + "epoch": 1.964, + "grad_norm": 11.969583511352539, + "learning_rate": 9.225000000000001e-07, + "loss": 2.7578, + "step": 98200 + }, + { + "epoch": 1.9649999999999999, + "grad_norm": 7.816971778869629, + "learning_rate": 8.975000000000001e-07, + "loss": 2.6194, + "step": 98250 + }, + { + "epoch": 1.966, + "grad_norm": 8.625751495361328, + "learning_rate": 8.725e-07, + "loss": 2.8595, + "step": 98300 + }, + { + "epoch": 1.967, + "grad_norm": 8.965986251831055, + "learning_rate": 8.475e-07, + "loss": 2.5684, + "step": 98350 + }, + { + "epoch": 1.968, + "grad_norm": 8.818077087402344, + "learning_rate": 8.225e-07, + "loss": 2.789, + "step": 98400 + }, + { + "epoch": 1.9689999999999999, + "grad_norm": 8.827529907226562, + "learning_rate": 7.975e-07, + "loss": 2.7706, + "step": 98450 + }, + { + "epoch": 1.97, + "grad_norm": 8.630989074707031, + "learning_rate": 7.725e-07, + "loss": 2.6708, + "step": 98500 + }, + { + "epoch": 1.971, + "grad_norm": 7.110955715179443, + "learning_rate": 7.475e-07, + "loss": 2.628, + "step": 98550 + }, + { + "epoch": 1.972, + "grad_norm": 7.431270122528076, + "learning_rate": 7.225e-07, + "loss": 2.7951, + "step": 98600 + }, + { + "epoch": 1.9729999999999999, + "grad_norm": 7.719696521759033, + "learning_rate": 6.975000000000001e-07, + "loss": 2.9401, + "step": 98650 + }, + { + "epoch": 1.974, + "grad_norm": 8.271461486816406, + "learning_rate": 6.725e-07, + "loss": 2.7169, + "step": 98700 + }, + { + "epoch": 1.975, + "grad_norm": 9.32712173461914, + "learning_rate": 6.475e-07, + "loss": 2.7377, + "step": 98750 + }, + { + "epoch": 1.976, + "grad_norm": 7.909132957458496, + "learning_rate": 6.225e-07, + "loss": 2.7912, + "step": 98800 + }, + { + "epoch": 1.9769999999999999, + "grad_norm": 12.058829307556152, + "learning_rate": 5.975000000000001e-07, + "loss": 2.809, + "step": 98850 + }, + { + "epoch": 1.978, + "grad_norm": 9.400949478149414, + "learning_rate": 5.725e-07, + "loss": 2.6873, + "step": 98900 + }, + { + "epoch": 1.979, + "grad_norm": 8.208101272583008, + "learning_rate": 5.48e-07, + "loss": 2.6407, + "step": 98950 + }, + { + "epoch": 1.98, + "grad_norm": 8.864774703979492, + "learning_rate": 5.23e-07, + "loss": 2.8138, + "step": 99000 + }, + { + "epoch": 1.9809999999999999, + "grad_norm": 7.672954559326172, + "learning_rate": 4.98e-07, + "loss": 3.0117, + "step": 99050 + }, + { + "epoch": 1.982, + "grad_norm": 8.58277702331543, + "learning_rate": 4.73e-07, + "loss": 2.7453, + "step": 99100 + }, + { + "epoch": 1.983, + "grad_norm": 8.067780494689941, + "learning_rate": 4.48e-07, + "loss": 2.7547, + "step": 99150 + }, + { + "epoch": 1.984, + "grad_norm": 8.917376518249512, + "learning_rate": 4.2300000000000007e-07, + "loss": 2.7548, + "step": 99200 + }, + { + "epoch": 1.9849999999999999, + "grad_norm": 7.516794681549072, + "learning_rate": 3.9800000000000004e-07, + "loss": 2.7478, + "step": 99250 + }, + { + "epoch": 1.986, + "grad_norm": 9.242195129394531, + "learning_rate": 3.73e-07, + "loss": 2.7344, + "step": 99300 + }, + { + "epoch": 1.987, + "grad_norm": 8.246512413024902, + "learning_rate": 3.48e-07, + "loss": 2.6711, + "step": 99350 + }, + { + "epoch": 1.988, + "grad_norm": 8.762199401855469, + "learning_rate": 3.23e-07, + "loss": 2.6444, + "step": 99400 + }, + { + "epoch": 1.9889999999999999, + "grad_norm": 7.332550525665283, + "learning_rate": 2.98e-07, + "loss": 2.5687, + "step": 99450 + }, + { + "epoch": 1.99, + "grad_norm": 12.712468147277832, + "learning_rate": 2.7299999999999997e-07, + "loss": 2.8477, + "step": 99500 + } + ], + "logging_steps": 50, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 482880651264000.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}