{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.59, "eval_steps": 500, "global_step": 11800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025, "grad_norm": 1.0703125, "learning_rate": 0.00025, "loss": 10.4938, "step": 50 }, { "epoch": 0.005, "grad_norm": 0.90625, "learning_rate": 0.0005, "loss": 9.1324, "step": 100 }, { "epoch": 0.0075, "grad_norm": 0.62890625, "learning_rate": 0.00075, "loss": 7.883, "step": 150 }, { "epoch": 0.01, "grad_norm": 0.42578125, "learning_rate": 0.001, "loss": 7.3925, "step": 200 }, { "epoch": 0.0125, "grad_norm": 0.6171875, "learning_rate": 0.0009999842657116666, "loss": 7.062, "step": 250 }, { "epoch": 0.015, "grad_norm": 0.498046875, "learning_rate": 0.0009999370638369377, "loss": 6.8396, "step": 300 }, { "epoch": 0.0175, "grad_norm": 0.4296875, "learning_rate": 0.0009998583973465647, "loss": 6.6879, "step": 350 }, { "epoch": 0.02, "grad_norm": 0.435546875, "learning_rate": 0.0009997482711915926, "loss": 6.6086, "step": 400 }, { "epoch": 0.0225, "grad_norm": 0.63671875, "learning_rate": 0.0009996066923030483, "loss": 6.5239, "step": 450 }, { "epoch": 0.025, "grad_norm": 0.37109375, "learning_rate": 0.000999433669591504, "loss": 6.552, "step": 500 }, { "epoch": 0.0275, "grad_norm": 0.5234375, "learning_rate": 0.0009992292139465165, "loss": 6.4607, "step": 550 }, { "epoch": 0.03, "grad_norm": 0.61328125, "learning_rate": 0.0009989933382359422, "loss": 6.3974, "step": 600 }, { "epoch": 0.0325, "grad_norm": 0.67578125, "learning_rate": 0.0009987260573051267, "loss": 6.3249, "step": 650 }, { "epoch": 0.035, "grad_norm": 0.51953125, "learning_rate": 0.0009984273879759713, "loss": 6.3374, "step": 700 }, { "epoch": 0.0375, "grad_norm": 0.482421875, "learning_rate": 0.0009980973490458728, "loss": 6.3157, "step": 750 }, { "epoch": 0.04, "grad_norm": 0.55859375, "learning_rate": 0.0009977359612865424, "loss": 6.2525, "step": 800 }, { "epoch": 0.0425, "grad_norm": 0.53125, "learning_rate": 0.0009973432474426967, "loss": 6.2116, "step": 850 }, { "epoch": 0.045, "grad_norm": 0.439453125, "learning_rate": 0.000996919232230627, "loss": 6.1993, "step": 900 }, { "epoch": 0.0475, "grad_norm": 0.384765625, "learning_rate": 0.0009964639423366442, "loss": 6.0863, "step": 950 }, { "epoch": 0.05, "grad_norm": 0.349609375, "learning_rate": 0.0009959774064153978, "loss": 6.049, "step": 1000 }, { "epoch": 0.0525, "grad_norm": 0.44140625, "learning_rate": 0.0009954596550880734, "loss": 6.0222, "step": 1050 }, { "epoch": 0.055, "grad_norm": 0.453125, "learning_rate": 0.0009949107209404665, "loss": 6.0078, "step": 1100 }, { "epoch": 0.0575, "grad_norm": 0.494140625, "learning_rate": 0.000994330638520929, "loss": 6.0114, "step": 1150 }, { "epoch": 0.06, "grad_norm": 0.46484375, "learning_rate": 0.0009937194443381972, "loss": 5.9764, "step": 1200 }, { "epoch": 0.0625, "grad_norm": 0.53515625, "learning_rate": 0.0009930771768590933, "loss": 5.9146, "step": 1250 }, { "epoch": 0.065, "grad_norm": 0.65625, "learning_rate": 0.000992403876506104, "loss": 5.9021, "step": 1300 }, { "epoch": 0.0675, "grad_norm": 0.396484375, "learning_rate": 0.0009916995856548369, "loss": 5.9053, "step": 1350 }, { "epoch": 0.07, "grad_norm": 0.49609375, "learning_rate": 0.0009909643486313534, "loss": 5.8727, "step": 1400 }, { "epoch": 0.0725, "grad_norm": 0.484375, "learning_rate": 0.0009901982117093786, "loss": 5.8429, "step": 1450 }, { "epoch": 0.075, "grad_norm": 0.6640625, "learning_rate": 0.0009894012231073895, "loss": 5.8162, "step": 1500 }, { "epoch": 0.0775, "grad_norm": 0.48828125, "learning_rate": 0.0009885734329855799, "loss": 5.8881, "step": 1550 }, { "epoch": 0.08, "grad_norm": 0.56640625, "learning_rate": 0.0009877148934427035, "loss": 5.7903, "step": 1600 }, { "epoch": 0.0825, "grad_norm": 0.5390625, "learning_rate": 0.0009868256585127955, "loss": 5.7843, "step": 1650 }, { "epoch": 0.085, "grad_norm": 0.52734375, "learning_rate": 0.000985905784161771, "loss": 5.7443, "step": 1700 }, { "epoch": 0.0875, "grad_norm": 0.69921875, "learning_rate": 0.0009849553282839025, "loss": 5.749, "step": 1750 }, { "epoch": 0.09, "grad_norm": 0.37109375, "learning_rate": 0.0009839743506981783, "loss": 5.8367, "step": 1800 }, { "epoch": 0.0925, "grad_norm": 0.5703125, "learning_rate": 0.0009829629131445341, "loss": 5.7485, "step": 1850 }, { "epoch": 0.095, "grad_norm": 0.5625, "learning_rate": 0.000981921079279971, "loss": 5.7068, "step": 1900 }, { "epoch": 0.0975, "grad_norm": 0.474609375, "learning_rate": 0.0009808489146745465, "loss": 5.7167, "step": 1950 }, { "epoch": 0.1, "grad_norm": 0.462890625, "learning_rate": 0.0009797464868072487, "loss": 5.7298, "step": 2000 }, { "epoch": 0.1025, "grad_norm": 0.53515625, "learning_rate": 0.0009786138650617494, "loss": 5.7332, "step": 2050 }, { "epoch": 0.105, "grad_norm": 0.50390625, "learning_rate": 0.0009774511207220368, "loss": 5.6218, "step": 2100 }, { "epoch": 0.1075, "grad_norm": 0.55859375, "learning_rate": 0.0009762583269679303, "loss": 5.662, "step": 2150 }, { "epoch": 0.11, "grad_norm": 0.435546875, "learning_rate": 0.0009750355588704727, "loss": 5.6434, "step": 2200 }, { "epoch": 0.1125, "grad_norm": 0.49609375, "learning_rate": 0.0009737828933872075, "loss": 5.585, "step": 2250 }, { "epoch": 0.115, "grad_norm": 0.46484375, "learning_rate": 0.0009725004093573342, "loss": 5.537, "step": 2300 }, { "epoch": 0.1175, "grad_norm": 0.51953125, "learning_rate": 0.000971188187496747, "loss": 5.605, "step": 2350 }, { "epoch": 0.12, "grad_norm": 0.431640625, "learning_rate": 0.0009698463103929542, "loss": 5.5706, "step": 2400 }, { "epoch": 0.1225, "grad_norm": 0.58984375, "learning_rate": 0.000968474862499881, "loss": 5.5877, "step": 2450 }, { "epoch": 0.125, "grad_norm": 0.60546875, "learning_rate": 0.0009670739301325534, "loss": 5.61, "step": 2500 }, { "epoch": 0.1275, "grad_norm": 0.70703125, "learning_rate": 0.000965643601461667, "loss": 5.6129, "step": 2550 }, { "epoch": 0.13, "grad_norm": 0.4375, "learning_rate": 0.0009641839665080363, "loss": 5.5661, "step": 2600 }, { "epoch": 0.1325, "grad_norm": 0.396484375, "learning_rate": 0.0009626951171369304, "loss": 5.6271, "step": 2650 }, { "epoch": 0.135, "grad_norm": 0.51171875, "learning_rate": 0.0009611771470522907, "loss": 5.5214, "step": 2700 }, { "epoch": 0.1375, "grad_norm": 0.5234375, "learning_rate": 0.0009596301517908328, "loss": 5.625, "step": 2750 }, { "epoch": 0.14, "grad_norm": 0.453125, "learning_rate": 0.0009580542287160348, "loss": 5.5264, "step": 2800 }, { "epoch": 0.1425, "grad_norm": 0.453125, "learning_rate": 0.0009564494770120089, "loss": 5.5348, "step": 2850 }, { "epoch": 0.145, "grad_norm": 0.462890625, "learning_rate": 0.0009548159976772592, "loss": 5.4634, "step": 2900 }, { "epoch": 0.1475, "grad_norm": 0.48828125, "learning_rate": 0.0009531538935183251, "loss": 5.4421, "step": 2950 }, { "epoch": 0.15, "grad_norm": 0.439453125, "learning_rate": 0.0009514632691433108, "loss": 5.47, "step": 3000 }, { "epoch": 0.1525, "grad_norm": 0.84375, "learning_rate": 0.0009497442309553016, "loss": 5.4528, "step": 3050 }, { "epoch": 0.155, "grad_norm": 0.61328125, "learning_rate": 0.0009479968871456679, "loss": 5.3831, "step": 3100 }, { "epoch": 0.1575, "grad_norm": 0.439453125, "learning_rate": 0.000946221347687255, "loss": 5.3801, "step": 3150 }, { "epoch": 0.16, "grad_norm": 0.474609375, "learning_rate": 0.0009444177243274617, "loss": 5.394, "step": 3200 }, { "epoch": 0.1625, "grad_norm": 0.90234375, "learning_rate": 0.0009425861305812082, "loss": 5.4232, "step": 3250 }, { "epoch": 0.165, "grad_norm": 0.5, "learning_rate": 0.000940726681723791, "loss": 5.4551, "step": 3300 }, { "epoch": 0.1675, "grad_norm": 0.5625, "learning_rate": 0.0009388394947836278, "loss": 5.4354, "step": 3350 }, { "epoch": 0.17, "grad_norm": 0.455078125, "learning_rate": 0.0009369246885348925, "loss": 5.4606, "step": 3400 }, { "epoch": 0.1725, "grad_norm": 0.474609375, "learning_rate": 0.0009349823834900395, "loss": 5.3785, "step": 3450 }, { "epoch": 0.175, "grad_norm": 0.6328125, "learning_rate": 0.0009330127018922195, "loss": 5.3472, "step": 3500 }, { "epoch": 0.1775, "grad_norm": 0.40625, "learning_rate": 0.0009310157677075847, "loss": 5.4156, "step": 3550 }, { "epoch": 0.18, "grad_norm": 0.423828125, "learning_rate": 0.0009289917066174886, "loss": 5.2857, "step": 3600 }, { "epoch": 0.1825, "grad_norm": 0.470703125, "learning_rate": 0.000926940646010574, "loss": 5.3815, "step": 3650 }, { "epoch": 0.185, "grad_norm": 0.46484375, "learning_rate": 0.0009248627149747573, "loss": 5.3596, "step": 3700 }, { "epoch": 0.1875, "grad_norm": 0.455078125, "learning_rate": 0.0009227580442891022, "loss": 5.3197, "step": 3750 }, { "epoch": 0.19, "grad_norm": 0.49609375, "learning_rate": 0.0009206267664155906, "loss": 5.2859, "step": 3800 }, { "epoch": 0.1925, "grad_norm": 0.66015625, "learning_rate": 0.0009184690154907849, "loss": 5.3134, "step": 3850 }, { "epoch": 0.195, "grad_norm": 0.625, "learning_rate": 0.0009162849273173857, "loss": 5.218, "step": 3900 }, { "epoch": 0.1975, "grad_norm": 0.9296875, "learning_rate": 0.0009140746393556853, "loss": 5.3191, "step": 3950 }, { "epoch": 0.2, "grad_norm": 0.416015625, "learning_rate": 0.0009118382907149164, "loss": 5.2908, "step": 4000 }, { "epoch": 0.2025, "grad_norm": 0.498046875, "learning_rate": 0.0009095760221444959, "loss": 5.1909, "step": 4050 }, { "epoch": 0.205, "grad_norm": 1.0234375, "learning_rate": 0.0009072879760251679, "loss": 5.2562, "step": 4100 }, { "epoch": 0.2075, "grad_norm": 0.5859375, "learning_rate": 0.0009049742963600418, "loss": 5.2542, "step": 4150 }, { "epoch": 0.21, "grad_norm": 0.43359375, "learning_rate": 0.0009026351287655293, "loss": 5.1161, "step": 4200 }, { "epoch": 0.2125, "grad_norm": 0.51953125, "learning_rate": 0.0009002706204621802, "loss": 5.1941, "step": 4250 }, { "epoch": 0.215, "grad_norm": 0.6015625, "learning_rate": 0.0008978809202654162, "loss": 5.2478, "step": 4300 }, { "epoch": 0.2175, "grad_norm": 0.46875, "learning_rate": 0.0008954661785761646, "loss": 5.2331, "step": 4350 }, { "epoch": 0.22, "grad_norm": 0.5078125, "learning_rate": 0.0008930265473713938, "loss": 5.1296, "step": 4400 }, { "epoch": 0.2225, "grad_norm": 0.4765625, "learning_rate": 0.0008905621801945467, "loss": 5.194, "step": 4450 }, { "epoch": 0.225, "grad_norm": 0.48828125, "learning_rate": 0.0008880732321458784, "loss": 5.1855, "step": 4500 }, { "epoch": 0.2275, "grad_norm": 0.42578125, "learning_rate": 0.0008855598598726938, "loss": 5.1613, "step": 4550 }, { "epoch": 0.23, "grad_norm": 0.578125, "learning_rate": 0.000883022221559489, "loss": 5.1545, "step": 4600 }, { "epoch": 0.2325, "grad_norm": 0.41015625, "learning_rate": 0.0008804604769179958, "loss": 5.1522, "step": 4650 }, { "epoch": 0.235, "grad_norm": 0.44140625, "learning_rate": 0.0008778747871771292, "loss": 5.1246, "step": 4700 }, { "epoch": 0.2375, "grad_norm": 0.625, "learning_rate": 0.0008752653150728412, "loss": 5.1371, "step": 4750 }, { "epoch": 0.24, "grad_norm": 0.50390625, "learning_rate": 0.0008726322248378774, "loss": 5.0847, "step": 4800 }, { "epoch": 0.2425, "grad_norm": 0.44140625, "learning_rate": 0.0008699756821914419, "loss": 5.0916, "step": 4850 }, { "epoch": 0.245, "grad_norm": 0.59765625, "learning_rate": 0.0008672958543287666, "loss": 5.0772, "step": 4900 }, { "epoch": 0.2475, "grad_norm": 0.51953125, "learning_rate": 0.0008645929099105886, "loss": 5.1363, "step": 4950 }, { "epoch": 0.25, "grad_norm": 0.451171875, "learning_rate": 0.000861867019052535, "loss": 5.075, "step": 5000 }, { "epoch": 0.2525, "grad_norm": 0.51171875, "learning_rate": 0.0008591183533144171, "loss": 5.1377, "step": 5050 }, { "epoch": 0.255, "grad_norm": 0.470703125, "learning_rate": 0.0008563470856894315, "loss": 5.0829, "step": 5100 }, { "epoch": 0.2575, "grad_norm": 0.59375, "learning_rate": 0.0008535533905932737, "loss": 5.0808, "step": 5150 }, { "epoch": 0.26, "grad_norm": 0.49609375, "learning_rate": 0.0008507374438531607, "loss": 5.0014, "step": 5200 }, { "epoch": 0.2625, "grad_norm": 0.5859375, "learning_rate": 0.0008478994226967638, "loss": 5.0243, "step": 5250 }, { "epoch": 0.265, "grad_norm": 0.4140625, "learning_rate": 0.000845039505741056, "loss": 5.128, "step": 5300 }, { "epoch": 0.2675, "grad_norm": 0.453125, "learning_rate": 0.0008421578729810691, "loss": 5.0646, "step": 5350 }, { "epoch": 0.27, "grad_norm": 0.50390625, "learning_rate": 0.0008392547057785661, "loss": 5.0641, "step": 5400 }, { "epoch": 0.2725, "grad_norm": 0.48046875, "learning_rate": 0.0008363301868506264, "loss": 5.0466, "step": 5450 }, { "epoch": 0.275, "grad_norm": 0.51171875, "learning_rate": 0.0008333845002581458, "loss": 5.0462, "step": 5500 }, { "epoch": 0.2775, "grad_norm": 0.5390625, "learning_rate": 0.0008304178313942535, "loss": 5.1451, "step": 5550 }, { "epoch": 0.28, "grad_norm": 0.5859375, "learning_rate": 0.0008274303669726426, "loss": 5.1001, "step": 5600 }, { "epoch": 0.2825, "grad_norm": 0.4921875, "learning_rate": 0.0008244222950158193, "loss": 4.9816, "step": 5650 }, { "epoch": 0.285, "grad_norm": 0.466796875, "learning_rate": 0.0008213938048432696, "loss": 5.0636, "step": 5700 }, { "epoch": 0.2875, "grad_norm": 0.60546875, "learning_rate": 0.0008183450870595441, "loss": 4.977, "step": 5750 }, { "epoch": 0.29, "grad_norm": 0.52734375, "learning_rate": 0.0008152763335422613, "loss": 4.9954, "step": 5800 }, { "epoch": 0.2925, "grad_norm": 0.5546875, "learning_rate": 0.0008121877374300317, "loss": 4.9429, "step": 5850 }, { "epoch": 0.295, "grad_norm": 0.51171875, "learning_rate": 0.0008090794931103026, "loss": 4.9819, "step": 5900 }, { "epoch": 0.2975, "grad_norm": 0.57421875, "learning_rate": 0.0008059517962071233, "loss": 5.0432, "step": 5950 }, { "epoch": 0.3, "grad_norm": 0.50390625, "learning_rate": 0.0008028048435688333, "loss": 4.8642, "step": 6000 }, { "epoch": 0.3025, "grad_norm": 0.7421875, "learning_rate": 0.0007996388332556734, "loss": 5.0357, "step": 6050 }, { "epoch": 0.305, "grad_norm": 0.46484375, "learning_rate": 0.0007964539645273203, "loss": 4.9587, "step": 6100 }, { "epoch": 0.3075, "grad_norm": 0.470703125, "learning_rate": 0.0007932504378303451, "loss": 4.9245, "step": 6150 }, { "epoch": 0.31, "grad_norm": 0.625, "learning_rate": 0.0007900284547855992, "loss": 4.9813, "step": 6200 }, { "epoch": 0.3125, "grad_norm": 0.51171875, "learning_rate": 0.0007867882181755231, "loss": 5.0188, "step": 6250 }, { "epoch": 0.315, "grad_norm": 0.6953125, "learning_rate": 0.0007835299319313853, "loss": 4.9338, "step": 6300 }, { "epoch": 0.3175, "grad_norm": 0.59375, "learning_rate": 0.000780253801120447, "loss": 4.8996, "step": 6350 }, { "epoch": 0.32, "grad_norm": 0.671875, "learning_rate": 0.0007769600319330552, "loss": 4.9161, "step": 6400 }, { "epoch": 0.3225, "grad_norm": 0.5078125, "learning_rate": 0.0007736488316696662, "loss": 4.9648, "step": 6450 }, { "epoch": 0.325, "grad_norm": 0.625, "learning_rate": 0.0007703204087277988, "loss": 4.908, "step": 6500 }, { "epoch": 0.3275, "grad_norm": 0.50390625, "learning_rate": 0.0007669749725889182, "loss": 4.9536, "step": 6550 }, { "epoch": 0.33, "grad_norm": 0.87890625, "learning_rate": 0.0007636127338052513, "loss": 4.8755, "step": 6600 }, { "epoch": 0.3325, "grad_norm": 0.5546875, "learning_rate": 0.0007602339039865362, "loss": 4.8819, "step": 6650 }, { "epoch": 0.335, "grad_norm": 0.53125, "learning_rate": 0.0007568386957867032, "loss": 4.8511, "step": 6700 }, { "epoch": 0.3375, "grad_norm": 0.515625, "learning_rate": 0.0007534273228904916, "loss": 4.9324, "step": 6750 }, { "epoch": 0.34, "grad_norm": 0.58203125, "learning_rate": 0.00075, "loss": 4.8898, "step": 6800 }, { "epoch": 0.3425, "grad_norm": 0.63671875, "learning_rate": 0.0007465569428211752, "loss": 4.9447, "step": 6850 }, { "epoch": 0.345, "grad_norm": 0.58203125, "learning_rate": 0.0007430983680502344, "loss": 4.8954, "step": 6900 }, { "epoch": 0.3475, "grad_norm": 0.435546875, "learning_rate": 0.0007396244933600284, "loss": 4.8977, "step": 6950 }, { "epoch": 0.35, "grad_norm": 0.7734375, "learning_rate": 0.0007361355373863414, "loss": 4.7837, "step": 7000 }, { "epoch": 0.3525, "grad_norm": 0.458984375, "learning_rate": 0.0007326317197141304, "loss": 4.9241, "step": 7050 }, { "epoch": 0.355, "grad_norm": 0.58203125, "learning_rate": 0.0007291132608637052, "loss": 4.9113, "step": 7100 }, { "epoch": 0.3575, "grad_norm": 0.65234375, "learning_rate": 0.0007255803822768504, "loss": 4.9187, "step": 7150 }, { "epoch": 0.36, "grad_norm": 0.60546875, "learning_rate": 0.0007220333063028871, "loss": 4.8632, "step": 7200 }, { "epoch": 0.3625, "grad_norm": 0.58203125, "learning_rate": 0.0007184722561846798, "loss": 5.0078, "step": 7250 }, { "epoch": 0.365, "grad_norm": 0.50390625, "learning_rate": 0.0007148974560445859, "loss": 4.7923, "step": 7300 }, { "epoch": 0.3675, "grad_norm": 0.52734375, "learning_rate": 0.0007113091308703497, "loss": 4.8948, "step": 7350 }, { "epoch": 0.37, "grad_norm": 0.466796875, "learning_rate": 0.0007077075065009433, "loss": 4.8259, "step": 7400 }, { "epoch": 0.3725, "grad_norm": 0.55859375, "learning_rate": 0.0007040928096123516, "loss": 4.8009, "step": 7450 }, { "epoch": 0.375, "grad_norm": 0.4375, "learning_rate": 0.0007004652677033068, "loss": 4.847, "step": 7500 }, { "epoch": 0.3775, "grad_norm": 0.486328125, "learning_rate": 0.0006968251090809707, "loss": 4.835, "step": 7550 }, { "epoch": 0.38, "grad_norm": 0.455078125, "learning_rate": 0.0006931725628465643, "loss": 4.8838, "step": 7600 }, { "epoch": 0.3825, "grad_norm": 0.68359375, "learning_rate": 0.0006895078588809502, "loss": 4.8705, "step": 7650 }, { "epoch": 0.385, "grad_norm": 0.46875, "learning_rate": 0.0006858312278301637, "loss": 4.763, "step": 7700 }, { "epoch": 0.3875, "grad_norm": 0.53125, "learning_rate": 0.0006821429010908972, "loss": 4.8961, "step": 7750 }, { "epoch": 0.39, "grad_norm": 0.53125, "learning_rate": 0.0006784431107959359, "loss": 4.8141, "step": 7800 }, { "epoch": 0.3925, "grad_norm": 0.5078125, "learning_rate": 0.0006747320897995492, "loss": 4.8718, "step": 7850 }, { "epoch": 0.395, "grad_norm": 0.5390625, "learning_rate": 0.0006710100716628344, "loss": 4.8394, "step": 7900 }, { "epoch": 0.3975, "grad_norm": 0.65234375, "learning_rate": 0.0006672772906390176, "loss": 4.8264, "step": 7950 }, { "epoch": 0.4, "grad_norm": 0.80078125, "learning_rate": 0.0006635339816587109, "loss": 4.769, "step": 8000 }, { "epoch": 0.4025, "grad_norm": 0.61328125, "learning_rate": 0.000659780380315125, "loss": 4.8428, "step": 8050 }, { "epoch": 0.405, "grad_norm": 0.515625, "learning_rate": 0.0006560167228492435, "loss": 4.8591, "step": 8100 }, { "epoch": 0.4075, "grad_norm": 0.80078125, "learning_rate": 0.0006522432461349536, "loss": 4.9224, "step": 8150 }, { "epoch": 0.41, "grad_norm": 0.640625, "learning_rate": 0.0006484601876641375, "loss": 4.7773, "step": 8200 }, { "epoch": 0.4125, "grad_norm": 0.5703125, "learning_rate": 0.0006446677855317265, "loss": 4.8107, "step": 8250 }, { "epoch": 0.415, "grad_norm": 0.53515625, "learning_rate": 0.0006408662784207149, "loss": 4.8252, "step": 8300 }, { "epoch": 0.4175, "grad_norm": 0.546875, "learning_rate": 0.0006370559055871389, "loss": 4.8562, "step": 8350 }, { "epoch": 0.42, "grad_norm": 0.5625, "learning_rate": 0.0006332369068450174, "loss": 4.7781, "step": 8400 }, { "epoch": 0.4225, "grad_norm": 0.462890625, "learning_rate": 0.0006294095225512603, "loss": 4.8325, "step": 8450 }, { "epoch": 0.425, "grad_norm": 0.45703125, "learning_rate": 0.0006255739935905395, "loss": 4.8363, "step": 8500 }, { "epoch": 0.4275, "grad_norm": 0.6015625, "learning_rate": 0.0006217305613601295, "loss": 4.8252, "step": 8550 }, { "epoch": 0.43, "grad_norm": 0.609375, "learning_rate": 0.0006178794677547138, "loss": 4.8079, "step": 8600 }, { "epoch": 0.4325, "grad_norm": 0.470703125, "learning_rate": 0.0006140209551511608, "loss": 4.8508, "step": 8650 }, { "epoch": 0.435, "grad_norm": 0.53515625, "learning_rate": 0.0006101552663932703, "loss": 4.8389, "step": 8700 }, { "epoch": 0.4375, "grad_norm": 0.66796875, "learning_rate": 0.0006062826447764884, "loss": 4.7695, "step": 8750 }, { "epoch": 0.44, "grad_norm": 0.455078125, "learning_rate": 0.0006024033340325954, "loss": 4.7953, "step": 8800 }, { "epoch": 0.4425, "grad_norm": 0.56640625, "learning_rate": 0.0005985175783143666, "loss": 4.8144, "step": 8850 }, { "epoch": 0.445, "grad_norm": 0.48046875, "learning_rate": 0.0005946256221802051, "loss": 4.8123, "step": 8900 }, { "epoch": 0.4475, "grad_norm": 0.52734375, "learning_rate": 0.0005907277105787513, "loss": 4.7778, "step": 8950 }, { "epoch": 0.45, "grad_norm": 0.53515625, "learning_rate": 0.0005868240888334653, "loss": 4.7237, "step": 9000 }, { "epoch": 0.4525, "grad_norm": 0.58984375, "learning_rate": 0.0005829150026271871, "loss": 4.7938, "step": 9050 }, { "epoch": 0.455, "grad_norm": 0.56640625, "learning_rate": 0.000579000697986675, "loss": 4.7879, "step": 9100 }, { "epoch": 0.4575, "grad_norm": 0.578125, "learning_rate": 0.0005750814212671201, "loss": 4.7317, "step": 9150 }, { "epoch": 0.46, "grad_norm": 0.578125, "learning_rate": 0.0005711574191366427, "loss": 4.7819, "step": 9200 }, { "epoch": 0.4625, "grad_norm": 0.61328125, "learning_rate": 0.0005672289385607659, "loss": 4.8393, "step": 9250 }, { "epoch": 0.465, "grad_norm": 0.69921875, "learning_rate": 0.0005632962267868747, "loss": 4.7797, "step": 9300 }, { "epoch": 0.4675, "grad_norm": 0.71484375, "learning_rate": 0.0005593595313286526, "loss": 4.764, "step": 9350 }, { "epoch": 0.47, "grad_norm": 0.4765625, "learning_rate": 0.0005554190999505056, "loss": 4.7639, "step": 9400 }, { "epoch": 0.4725, "grad_norm": 0.53125, "learning_rate": 0.0005514751806519673, "loss": 4.8052, "step": 9450 }, { "epoch": 0.475, "grad_norm": 0.6328125, "learning_rate": 0.0005475280216520913, "loss": 4.8682, "step": 9500 }, { "epoch": 0.4775, "grad_norm": 0.640625, "learning_rate": 0.0005435778713738292, "loss": 4.748, "step": 9550 }, { "epoch": 0.48, "grad_norm": 0.66015625, "learning_rate": 0.0005396249784283942, "loss": 4.7625, "step": 9600 }, { "epoch": 0.4825, "grad_norm": 0.51171875, "learning_rate": 0.0005356695915996161, "loss": 4.824, "step": 9650 }, { "epoch": 0.485, "grad_norm": 0.54296875, "learning_rate": 0.0005317119598282822, "loss": 4.7946, "step": 9700 }, { "epoch": 0.4875, "grad_norm": 0.671875, "learning_rate": 0.0005277523321964701, "loss": 4.8476, "step": 9750 }, { "epoch": 0.49, "grad_norm": 0.55078125, "learning_rate": 0.0005237909579118712, "loss": 4.735, "step": 9800 }, { "epoch": 0.4925, "grad_norm": 0.47265625, "learning_rate": 0.0005198280862921062, "loss": 4.8119, "step": 9850 }, { "epoch": 0.495, "grad_norm": 0.49609375, "learning_rate": 0.0005158639667490339, "loss": 4.794, "step": 9900 }, { "epoch": 0.4975, "grad_norm": 1.0546875, "learning_rate": 0.0005118988487730537, "loss": 4.7226, "step": 9950 }, { "epoch": 0.5, "grad_norm": 0.56640625, "learning_rate": 0.000507932981917404, "loss": 4.9084, "step": 10000 }, { "epoch": 0.5025, "grad_norm": 0.54296875, "learning_rate": 0.0005039666157824549, "loss": 4.6737, "step": 10050 }, { "epoch": 0.505, "grad_norm": 0.59765625, "learning_rate": 0.0005, "loss": 4.7808, "step": 10100 }, { "epoch": 0.5075, "grad_norm": 0.86328125, "learning_rate": 0.0004960333842175453, "loss": 4.8202, "step": 10150 }, { "epoch": 0.51, "grad_norm": 0.546875, "learning_rate": 0.000492067018082596, "loss": 4.7873, "step": 10200 }, { "epoch": 0.5125, "grad_norm": 0.58984375, "learning_rate": 0.00048810115122694634, "loss": 4.7753, "step": 10250 }, { "epoch": 0.515, "grad_norm": 0.6796875, "learning_rate": 0.0004841360332509663, "loss": 4.767, "step": 10300 }, { "epoch": 0.5175, "grad_norm": 0.60546875, "learning_rate": 0.00048017191370789385, "loss": 4.7159, "step": 10350 }, { "epoch": 0.52, "grad_norm": 0.53125, "learning_rate": 0.0004762090420881289, "loss": 4.8013, "step": 10400 }, { "epoch": 0.5225, "grad_norm": 0.55859375, "learning_rate": 0.00047224766780353, "loss": 4.7983, "step": 10450 }, { "epoch": 0.525, "grad_norm": 0.5703125, "learning_rate": 0.00046828804017171776, "loss": 4.7126, "step": 10500 }, { "epoch": 0.5275, "grad_norm": 0.474609375, "learning_rate": 0.00046433040840038387, "loss": 4.7993, "step": 10550 }, { "epoch": 0.53, "grad_norm": 0.78515625, "learning_rate": 0.00046037502157160573, "loss": 4.6986, "step": 10600 }, { "epoch": 0.5325, "grad_norm": 0.5546875, "learning_rate": 0.00045642212862617086, "loss": 4.7849, "step": 10650 }, { "epoch": 0.535, "grad_norm": 0.7109375, "learning_rate": 0.0004524719783479088, "loss": 4.7535, "step": 10700 }, { "epoch": 0.5375, "grad_norm": 0.64453125, "learning_rate": 0.0004485248193480328, "loss": 4.7556, "step": 10750 }, { "epoch": 0.54, "grad_norm": 0.6328125, "learning_rate": 0.00044458090004949454, "loss": 4.696, "step": 10800 }, { "epoch": 0.5425, "grad_norm": 0.59765625, "learning_rate": 0.00044064046867134756, "loss": 4.7343, "step": 10850 }, { "epoch": 0.545, "grad_norm": 0.52734375, "learning_rate": 0.00043670377321312535, "loss": 4.8243, "step": 10900 }, { "epoch": 0.5475, "grad_norm": 0.47265625, "learning_rate": 0.0004327710614392341, "loss": 4.7604, "step": 10950 }, { "epoch": 0.55, "grad_norm": 0.58984375, "learning_rate": 0.0004288425808633575, "loss": 4.8299, "step": 11000 }, { "epoch": 0.5525, "grad_norm": 0.55078125, "learning_rate": 0.0004249185787328798, "loss": 4.7282, "step": 11050 }, { "epoch": 0.555, "grad_norm": 0.5234375, "learning_rate": 0.000420999302013325, "loss": 4.7481, "step": 11100 }, { "epoch": 0.5575, "grad_norm": 0.6328125, "learning_rate": 0.00041708499737281305, "loss": 4.8119, "step": 11150 }, { "epoch": 0.56, "grad_norm": 0.48046875, "learning_rate": 0.00041317591116653486, "loss": 4.817, "step": 11200 }, { "epoch": 0.5625, "grad_norm": 0.53515625, "learning_rate": 0.0004092722894212487, "loss": 4.7253, "step": 11250 }, { "epoch": 0.565, "grad_norm": 0.5859375, "learning_rate": 0.0004053743778197951, "loss": 4.7405, "step": 11300 }, { "epoch": 0.5675, "grad_norm": 0.51171875, "learning_rate": 0.00040148242168563356, "loss": 4.7959, "step": 11350 }, { "epoch": 0.57, "grad_norm": 0.5703125, "learning_rate": 0.0003975966659674047, "loss": 4.6902, "step": 11400 }, { "epoch": 0.5725, "grad_norm": 0.52734375, "learning_rate": 0.00039371735522351166, "loss": 4.7793, "step": 11450 }, { "epoch": 0.575, "grad_norm": 0.671875, "learning_rate": 0.00038984473360672965, "loss": 4.7839, "step": 11500 }, { "epoch": 0.5775, "grad_norm": 0.6640625, "learning_rate": 0.0003859790448488394, "loss": 4.6369, "step": 11550 }, { "epoch": 0.58, "grad_norm": 0.5234375, "learning_rate": 0.0003821205322452863, "loss": 4.7955, "step": 11600 }, { "epoch": 0.5825, "grad_norm": 0.65234375, "learning_rate": 0.00037826943863987055, "loss": 4.7773, "step": 11650 }, { "epoch": 0.585, "grad_norm": 0.59375, "learning_rate": 0.00037442600640946044, "loss": 4.7528, "step": 11700 }, { "epoch": 0.5875, "grad_norm": 0.61328125, "learning_rate": 0.0003705904774487396, "loss": 4.6866, "step": 11750 }, { "epoch": 0.59, "grad_norm": 0.4609375, "learning_rate": 0.0003667630931549826, "loss": 4.6988, "step": 11800 } ], "logging_steps": 50, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.7193201106944e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }