| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.655, | |
| "eval_steps": 500, | |
| "global_step": 13100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0025, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 0.00025, | |
| "loss": 10.4938, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.005, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 0.0005, | |
| "loss": 9.1324, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0075, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 0.00075, | |
| "loss": 7.883, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 0.001, | |
| "loss": 7.3925, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0125, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 0.0009999842657116666, | |
| "loss": 7.062, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.015, | |
| "grad_norm": 0.498046875, | |
| "learning_rate": 0.0009999370638369377, | |
| "loss": 6.8396, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0175, | |
| "grad_norm": 0.4296875, | |
| "learning_rate": 0.0009998583973465647, | |
| "loss": 6.6879, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 0.0009997482711915926, | |
| "loss": 6.6086, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.0225, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.0009996066923030483, | |
| "loss": 6.5239, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 0.000999433669591504, | |
| "loss": 6.552, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0275, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 0.0009992292139465165, | |
| "loss": 6.4607, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 0.0009989933382359422, | |
| "loss": 6.3974, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.0325, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.0009987260573051267, | |
| "loss": 6.3249, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.035, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 0.0009984273879759713, | |
| "loss": 6.3374, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.0375, | |
| "grad_norm": 0.482421875, | |
| "learning_rate": 0.0009980973490458728, | |
| "loss": 6.3157, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 0.0009977359612865424, | |
| "loss": 6.2525, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.0425, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 0.0009973432474426967, | |
| "loss": 6.2116, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.045, | |
| "grad_norm": 0.439453125, | |
| "learning_rate": 0.000996919232230627, | |
| "loss": 6.1993, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.0475, | |
| "grad_norm": 0.384765625, | |
| "learning_rate": 0.0009964639423366442, | |
| "loss": 6.0863, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 0.0009959774064153978, | |
| "loss": 6.049, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.0525, | |
| "grad_norm": 0.44140625, | |
| "learning_rate": 0.0009954596550880734, | |
| "loss": 6.0222, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.055, | |
| "grad_norm": 0.453125, | |
| "learning_rate": 0.0009949107209404665, | |
| "loss": 6.0078, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.0575, | |
| "grad_norm": 0.494140625, | |
| "learning_rate": 0.000994330638520929, | |
| "loss": 6.0114, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.46484375, | |
| "learning_rate": 0.0009937194443381972, | |
| "loss": 5.9764, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.0625, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 0.0009930771768590933, | |
| "loss": 5.9146, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.065, | |
| "grad_norm": 0.65625, | |
| "learning_rate": 0.000992403876506104, | |
| "loss": 5.9021, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.0675, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 0.0009916995856548369, | |
| "loss": 5.9053, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.49609375, | |
| "learning_rate": 0.0009909643486313534, | |
| "loss": 5.8727, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.0725, | |
| "grad_norm": 0.484375, | |
| "learning_rate": 0.0009901982117093786, | |
| "loss": 5.8429, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.075, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0009894012231073895, | |
| "loss": 5.8162, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.0775, | |
| "grad_norm": 0.48828125, | |
| "learning_rate": 0.0009885734329855799, | |
| "loss": 5.8881, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 0.0009877148934427035, | |
| "loss": 5.7903, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.0825, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 0.0009868256585127955, | |
| "loss": 5.7843, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.085, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 0.000985905784161771, | |
| "loss": 5.7443, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.0875, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0009849553282839025, | |
| "loss": 5.749, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 0.0009839743506981783, | |
| "loss": 5.8367, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.0925, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 0.0009829629131445341, | |
| "loss": 5.7485, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.095, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 0.000981921079279971, | |
| "loss": 5.7068, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.0975, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 0.0009808489146745465, | |
| "loss": 5.7167, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.462890625, | |
| "learning_rate": 0.0009797464868072487, | |
| "loss": 5.7298, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.1025, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 0.0009786138650617494, | |
| "loss": 5.7332, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.105, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 0.0009774511207220368, | |
| "loss": 5.6218, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.1075, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 0.0009762583269679303, | |
| "loss": 5.662, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 0.0009750355588704727, | |
| "loss": 5.6434, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.1125, | |
| "grad_norm": 0.49609375, | |
| "learning_rate": 0.0009737828933872075, | |
| "loss": 5.585, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.115, | |
| "grad_norm": 0.46484375, | |
| "learning_rate": 0.0009725004093573342, | |
| "loss": 5.537, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.1175, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 0.000971188187496747, | |
| "loss": 5.605, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 0.0009698463103929542, | |
| "loss": 5.5706, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.1225, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 0.000968474862499881, | |
| "loss": 5.5877, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 0.0009670739301325534, | |
| "loss": 5.61, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.1275, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.000965643601461667, | |
| "loss": 5.6129, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.4375, | |
| "learning_rate": 0.0009641839665080363, | |
| "loss": 5.5661, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.1325, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 0.0009626951171369304, | |
| "loss": 5.6271, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.135, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 0.0009611771470522907, | |
| "loss": 5.5214, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.1375, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 0.0009596301517908328, | |
| "loss": 5.625, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.453125, | |
| "learning_rate": 0.0009580542287160348, | |
| "loss": 5.5264, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.1425, | |
| "grad_norm": 0.453125, | |
| "learning_rate": 0.0009564494770120089, | |
| "loss": 5.5348, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.145, | |
| "grad_norm": 0.462890625, | |
| "learning_rate": 0.0009548159976772592, | |
| "loss": 5.4634, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.1475, | |
| "grad_norm": 0.48828125, | |
| "learning_rate": 0.0009531538935183251, | |
| "loss": 5.4421, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.439453125, | |
| "learning_rate": 0.0009514632691433108, | |
| "loss": 5.47, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.1525, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 0.0009497442309553016, | |
| "loss": 5.4528, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.155, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 0.0009479968871456679, | |
| "loss": 5.3831, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.1575, | |
| "grad_norm": 0.439453125, | |
| "learning_rate": 0.000946221347687255, | |
| "loss": 5.3801, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 0.0009444177243274617, | |
| "loss": 5.394, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.1625, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 0.0009425861305812082, | |
| "loss": 5.4232, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.165, | |
| "grad_norm": 0.5, | |
| "learning_rate": 0.000940726681723791, | |
| "loss": 5.4551, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.1675, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 0.0009388394947836278, | |
| "loss": 5.4354, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.455078125, | |
| "learning_rate": 0.0009369246885348925, | |
| "loss": 5.4606, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.1725, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 0.0009349823834900395, | |
| "loss": 5.3785, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.175, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 0.0009330127018922195, | |
| "loss": 5.3472, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.1775, | |
| "grad_norm": 0.40625, | |
| "learning_rate": 0.0009310157677075847, | |
| "loss": 5.4156, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.423828125, | |
| "learning_rate": 0.0009289917066174886, | |
| "loss": 5.2857, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.1825, | |
| "grad_norm": 0.470703125, | |
| "learning_rate": 0.000926940646010574, | |
| "loss": 5.3815, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.185, | |
| "grad_norm": 0.46484375, | |
| "learning_rate": 0.0009248627149747573, | |
| "loss": 5.3596, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.1875, | |
| "grad_norm": 0.455078125, | |
| "learning_rate": 0.0009227580442891022, | |
| "loss": 5.3197, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.49609375, | |
| "learning_rate": 0.0009206267664155906, | |
| "loss": 5.2859, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.1925, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0009184690154907849, | |
| "loss": 5.3134, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.195, | |
| "grad_norm": 0.625, | |
| "learning_rate": 0.0009162849273173857, | |
| "loss": 5.218, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.1975, | |
| "grad_norm": 0.9296875, | |
| "learning_rate": 0.0009140746393556853, | |
| "loss": 5.3191, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 0.0009118382907149164, | |
| "loss": 5.2908, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.2025, | |
| "grad_norm": 0.498046875, | |
| "learning_rate": 0.0009095760221444959, | |
| "loss": 5.1909, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.205, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 0.0009072879760251679, | |
| "loss": 5.2562, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.2075, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 0.0009049742963600418, | |
| "loss": 5.2542, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.43359375, | |
| "learning_rate": 0.0009026351287655293, | |
| "loss": 5.1161, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.2125, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 0.0009002706204621802, | |
| "loss": 5.1941, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.215, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 0.0008978809202654162, | |
| "loss": 5.2478, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.2175, | |
| "grad_norm": 0.46875, | |
| "learning_rate": 0.0008954661785761646, | |
| "loss": 5.2331, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 0.0008930265473713938, | |
| "loss": 5.1296, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.2225, | |
| "grad_norm": 0.4765625, | |
| "learning_rate": 0.0008905621801945467, | |
| "loss": 5.194, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.225, | |
| "grad_norm": 0.48828125, | |
| "learning_rate": 0.0008880732321458784, | |
| "loss": 5.1855, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.2275, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 0.0008855598598726938, | |
| "loss": 5.1613, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 0.000883022221559489, | |
| "loss": 5.1545, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.2325, | |
| "grad_norm": 0.41015625, | |
| "learning_rate": 0.0008804604769179958, | |
| "loss": 5.1522, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.235, | |
| "grad_norm": 0.44140625, | |
| "learning_rate": 0.0008778747871771292, | |
| "loss": 5.1246, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.2375, | |
| "grad_norm": 0.625, | |
| "learning_rate": 0.0008752653150728412, | |
| "loss": 5.1371, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 0.0008726322248378774, | |
| "loss": 5.0847, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.2425, | |
| "grad_norm": 0.44140625, | |
| "learning_rate": 0.0008699756821914419, | |
| "loss": 5.0916, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.245, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 0.0008672958543287666, | |
| "loss": 5.0772, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.2475, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 0.0008645929099105886, | |
| "loss": 5.1363, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.451171875, | |
| "learning_rate": 0.000861867019052535, | |
| "loss": 5.075, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.2525, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 0.0008591183533144171, | |
| "loss": 5.1377, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.255, | |
| "grad_norm": 0.470703125, | |
| "learning_rate": 0.0008563470856894315, | |
| "loss": 5.0829, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.2575, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 0.0008535533905932737, | |
| "loss": 5.0808, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.49609375, | |
| "learning_rate": 0.0008507374438531607, | |
| "loss": 5.0014, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.2625, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 0.0008478994226967638, | |
| "loss": 5.0243, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.265, | |
| "grad_norm": 0.4140625, | |
| "learning_rate": 0.000845039505741056, | |
| "loss": 5.128, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.2675, | |
| "grad_norm": 0.453125, | |
| "learning_rate": 0.0008421578729810691, | |
| "loss": 5.0646, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 0.0008392547057785661, | |
| "loss": 5.0641, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.2725, | |
| "grad_norm": 0.48046875, | |
| "learning_rate": 0.0008363301868506264, | |
| "loss": 5.0466, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.275, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 0.0008333845002581458, | |
| "loss": 5.0462, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.2775, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 0.0008304178313942535, | |
| "loss": 5.1451, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 0.0008274303669726426, | |
| "loss": 5.1001, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.2825, | |
| "grad_norm": 0.4921875, | |
| "learning_rate": 0.0008244222950158193, | |
| "loss": 4.9816, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.285, | |
| "grad_norm": 0.466796875, | |
| "learning_rate": 0.0008213938048432696, | |
| "loss": 5.0636, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.2875, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 0.0008183450870595441, | |
| "loss": 4.977, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 0.0008152763335422613, | |
| "loss": 4.9954, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.2925, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 0.0008121877374300317, | |
| "loss": 4.9429, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.295, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 0.0008090794931103026, | |
| "loss": 4.9819, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.2975, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 0.0008059517962071233, | |
| "loss": 5.0432, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 0.0008028048435688333, | |
| "loss": 4.8642, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.3025, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.0007996388332556734, | |
| "loss": 5.0357, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.305, | |
| "grad_norm": 0.46484375, | |
| "learning_rate": 0.0007964539645273203, | |
| "loss": 4.9587, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.3075, | |
| "grad_norm": 0.470703125, | |
| "learning_rate": 0.0007932504378303451, | |
| "loss": 4.9245, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.625, | |
| "learning_rate": 0.0007900284547855992, | |
| "loss": 4.9813, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.3125, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 0.0007867882181755231, | |
| "loss": 5.0188, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.315, | |
| "grad_norm": 0.6953125, | |
| "learning_rate": 0.0007835299319313853, | |
| "loss": 4.9338, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.3175, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 0.000780253801120447, | |
| "loss": 4.8996, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0007769600319330552, | |
| "loss": 4.9161, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.3225, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 0.0007736488316696662, | |
| "loss": 4.9648, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.325, | |
| "grad_norm": 0.625, | |
| "learning_rate": 0.0007703204087277988, | |
| "loss": 4.908, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.3275, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 0.0007669749725889182, | |
| "loss": 4.9536, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 0.0007636127338052513, | |
| "loss": 4.8755, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.3325, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 0.0007602339039865362, | |
| "loss": 4.8819, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.335, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 0.0007568386957867032, | |
| "loss": 4.8511, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.3375, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 0.0007534273228904916, | |
| "loss": 4.9324, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 0.00075, | |
| "loss": 4.8898, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.3425, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.0007465569428211752, | |
| "loss": 4.9447, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.345, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 0.0007430983680502344, | |
| "loss": 4.8954, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.3475, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 0.0007396244933600284, | |
| "loss": 4.8977, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.0007361355373863414, | |
| "loss": 4.7837, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.3525, | |
| "grad_norm": 0.458984375, | |
| "learning_rate": 0.0007326317197141304, | |
| "loss": 4.9241, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.355, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 0.0007291132608637052, | |
| "loss": 4.9113, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.3575, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0007255803822768504, | |
| "loss": 4.9187, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 0.0007220333063028871, | |
| "loss": 4.8632, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.3625, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 0.0007184722561846798, | |
| "loss": 5.0078, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.365, | |
| "grad_norm": 0.50390625, | |
| "learning_rate": 0.0007148974560445859, | |
| "loss": 4.7923, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.3675, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 0.0007113091308703497, | |
| "loss": 4.8948, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.466796875, | |
| "learning_rate": 0.0007077075065009433, | |
| "loss": 4.8259, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.3725, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 0.0007040928096123516, | |
| "loss": 4.8009, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.375, | |
| "grad_norm": 0.4375, | |
| "learning_rate": 0.0007004652677033068, | |
| "loss": 4.847, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.3775, | |
| "grad_norm": 0.486328125, | |
| "learning_rate": 0.0006968251090809707, | |
| "loss": 4.835, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.455078125, | |
| "learning_rate": 0.0006931725628465643, | |
| "loss": 4.8838, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.3825, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 0.0006895078588809502, | |
| "loss": 4.8705, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.385, | |
| "grad_norm": 0.46875, | |
| "learning_rate": 0.0006858312278301637, | |
| "loss": 4.763, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.3875, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 0.0006821429010908972, | |
| "loss": 4.8961, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 0.0006784431107959359, | |
| "loss": 4.8141, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.3925, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 0.0006747320897995492, | |
| "loss": 4.8718, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 0.395, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 0.0006710100716628344, | |
| "loss": 4.8394, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.3975, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0006672772906390176, | |
| "loss": 4.8264, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 0.0006635339816587109, | |
| "loss": 4.769, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.4025, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 0.000659780380315125, | |
| "loss": 4.8428, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 0.405, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 0.0006560167228492435, | |
| "loss": 4.8591, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.4075, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 0.0006522432461349536, | |
| "loss": 4.9224, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 0.0006484601876641375, | |
| "loss": 4.7773, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.4125, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 0.0006446677855317265, | |
| "loss": 4.8107, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 0.415, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 0.0006408662784207149, | |
| "loss": 4.8252, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.4175, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 0.0006370559055871389, | |
| "loss": 4.8562, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 0.0006332369068450174, | |
| "loss": 4.7781, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.4225, | |
| "grad_norm": 0.462890625, | |
| "learning_rate": 0.0006294095225512603, | |
| "loss": 4.8325, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 0.425, | |
| "grad_norm": 0.45703125, | |
| "learning_rate": 0.0006255739935905395, | |
| "loss": 4.8363, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.4275, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 0.0006217305613601295, | |
| "loss": 4.8252, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 0.0006178794677547138, | |
| "loss": 4.8079, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.4325, | |
| "grad_norm": 0.470703125, | |
| "learning_rate": 0.0006140209551511608, | |
| "loss": 4.8508, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 0.435, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 0.0006101552663932703, | |
| "loss": 4.8389, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.4375, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0006062826447764884, | |
| "loss": 4.7695, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.455078125, | |
| "learning_rate": 0.0006024033340325954, | |
| "loss": 4.7953, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.4425, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 0.0005985175783143666, | |
| "loss": 4.8144, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 0.445, | |
| "grad_norm": 0.48046875, | |
| "learning_rate": 0.0005946256221802051, | |
| "loss": 4.8123, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.4475, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 0.0005907277105787513, | |
| "loss": 4.7778, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 0.0005868240888334653, | |
| "loss": 4.7237, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.4525, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 0.0005829150026271871, | |
| "loss": 4.7938, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 0.455, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 0.000579000697986675, | |
| "loss": 4.7879, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.4575, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 0.0005750814212671201, | |
| "loss": 4.7317, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 0.0005711574191366427, | |
| "loss": 4.7819, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.4625, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 0.0005672289385607659, | |
| "loss": 4.8393, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 0.465, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0005632962267868747, | |
| "loss": 4.7797, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.4675, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0005593595313286526, | |
| "loss": 4.764, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.4765625, | |
| "learning_rate": 0.0005554190999505056, | |
| "loss": 4.7639, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.4725, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 0.0005514751806519673, | |
| "loss": 4.8052, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 0.475, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 0.0005475280216520913, | |
| "loss": 4.8682, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.4775, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 0.0005435778713738292, | |
| "loss": 4.748, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.0005396249784283942, | |
| "loss": 4.7625, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.4825, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 0.0005356695915996161, | |
| "loss": 4.824, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 0.485, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 0.0005317119598282822, | |
| "loss": 4.7946, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.4875, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0005277523321964701, | |
| "loss": 4.8476, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 0.0005237909579118712, | |
| "loss": 4.735, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.4925, | |
| "grad_norm": 0.47265625, | |
| "learning_rate": 0.0005198280862921062, | |
| "loss": 4.8119, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 0.495, | |
| "grad_norm": 0.49609375, | |
| "learning_rate": 0.0005158639667490339, | |
| "loss": 4.794, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.4975, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 0.0005118988487730537, | |
| "loss": 4.7226, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 0.000507932981917404, | |
| "loss": 4.9084, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.5025, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 0.0005039666157824549, | |
| "loss": 4.6737, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 0.505, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 0.0005, | |
| "loss": 4.7808, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.5075, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 0.0004960333842175453, | |
| "loss": 4.8202, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 0.000492067018082596, | |
| "loss": 4.7873, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.5125, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 0.00048810115122694634, | |
| "loss": 4.7753, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 0.515, | |
| "grad_norm": 0.6796875, | |
| "learning_rate": 0.0004841360332509663, | |
| "loss": 4.767, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.5175, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 0.00048017191370789385, | |
| "loss": 4.7159, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 0.0004762090420881289, | |
| "loss": 4.8013, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.5225, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 0.00047224766780353, | |
| "loss": 4.7983, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 0.525, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 0.00046828804017171776, | |
| "loss": 4.7126, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.5275, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 0.00046433040840038387, | |
| "loss": 4.7993, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 0.00046037502157160573, | |
| "loss": 4.6986, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.5325, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 0.00045642212862617086, | |
| "loss": 4.7849, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 0.535, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0004524719783479088, | |
| "loss": 4.7535, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.5375, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 0.0004485248193480328, | |
| "loss": 4.7556, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 0.00044458090004949454, | |
| "loss": 4.696, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.5425, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 0.00044064046867134756, | |
| "loss": 4.7343, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 0.545, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 0.00043670377321312535, | |
| "loss": 4.8243, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.5475, | |
| "grad_norm": 0.47265625, | |
| "learning_rate": 0.0004327710614392341, | |
| "loss": 4.7604, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 0.0004288425808633575, | |
| "loss": 4.8299, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.5525, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 0.0004249185787328798, | |
| "loss": 4.7282, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 0.555, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 0.000420999302013325, | |
| "loss": 4.7481, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.5575, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 0.00041708499737281305, | |
| "loss": 4.8119, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.48046875, | |
| "learning_rate": 0.00041317591116653486, | |
| "loss": 4.817, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.5625, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 0.0004092722894212487, | |
| "loss": 4.7253, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 0.565, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 0.0004053743778197951, | |
| "loss": 4.7405, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.5675, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 0.00040148242168563356, | |
| "loss": 4.7959, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 0.0003975966659674047, | |
| "loss": 4.6902, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.5725, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 0.00039371735522351166, | |
| "loss": 4.7793, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 0.575, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.00038984473360672965, | |
| "loss": 4.7839, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.5775, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0003859790448488394, | |
| "loss": 4.6369, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 0.0003821205322452863, | |
| "loss": 4.7955, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.5825, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.00037826943863987055, | |
| "loss": 4.7773, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 0.585, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 0.00037442600640946044, | |
| "loss": 4.7528, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.5875, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 0.0003705904774487396, | |
| "loss": 4.6866, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 0.0003667630931549826, | |
| "loss": 4.6988, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.5925, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 0.0003629440944128613, | |
| "loss": 4.6835, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 0.595, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0003591337215792851, | |
| "loss": 4.7863, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.5975, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 0.0003553322144682737, | |
| "loss": 4.7801, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 0.00035153981233586274, | |
| "loss": 4.711, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.6025, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 0.00034775675386504657, | |
| "loss": 4.7176, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 0.605, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0003439832771507565, | |
| "loss": 4.8043, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.6075, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0003402196196848751, | |
| "loss": 4.7839, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0003364660183412892, | |
| "loss": 4.7492, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.6125, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 0.0003327227093609824, | |
| "loss": 4.7638, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 0.615, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.0003289899283371657, | |
| "loss": 4.7149, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.6175, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 0.0003252679102004509, | |
| "loss": 4.736, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.515625, | |
| "learning_rate": 0.00032155688920406414, | |
| "loss": 4.8095, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.6225, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 0.0003178570989091028, | |
| "loss": 4.7903, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 0.625, | |
| "grad_norm": 0.478515625, | |
| "learning_rate": 0.0003141687721698363, | |
| "loss": 4.805, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.6275, | |
| "grad_norm": 0.494140625, | |
| "learning_rate": 0.00031049214111904987, | |
| "loss": 4.7314, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 0.00030682743715343565, | |
| "loss": 4.7352, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.6325, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 0.00030317489091902933, | |
| "loss": 4.6685, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 0.635, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 0.00029953473229669324, | |
| "loss": 4.7614, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.6375, | |
| "grad_norm": 0.494140625, | |
| "learning_rate": 0.00029590719038764856, | |
| "loss": 4.7522, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0002922924934990568, | |
| "loss": 4.7078, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.6425, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 0.0002886908691296504, | |
| "loss": 4.7599, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 0.645, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 0.0002851025439554142, | |
| "loss": 4.7683, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.6475, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0002815277438153203, | |
| "loss": 4.714, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 0.0002779666936971129, | |
| "loss": 4.7515, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.6525, | |
| "grad_norm": 0.71484375, | |
| "learning_rate": 0.0002744196177231498, | |
| "loss": 4.7995, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 0.655, | |
| "grad_norm": 0.625, | |
| "learning_rate": 0.0002708867391362948, | |
| "loss": 4.8059, | |
| "step": 13100 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 20000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.90944289161216e+16, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |