diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4430 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.80384, + "eval_steps": 500, + "global_step": 628, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00128, + "grad_norm": 2.8870697066158506, + "learning_rate": 0.0, + "loss": 0.8422, + "step": 1 + }, + { + "epoch": 0.00256, + "grad_norm": 2.88484389829891, + "learning_rate": 6.329113924050633e-07, + "loss": 0.8541, + "step": 2 + }, + { + "epoch": 0.00384, + "grad_norm": 2.858151965789657, + "learning_rate": 1.2658227848101265e-06, + "loss": 0.8376, + "step": 3 + }, + { + "epoch": 0.00512, + "grad_norm": 2.759628117182127, + "learning_rate": 1.8987341772151901e-06, + "loss": 0.8334, + "step": 4 + }, + { + "epoch": 0.0064, + "grad_norm": 2.796990062811218, + "learning_rate": 2.531645569620253e-06, + "loss": 0.8256, + "step": 5 + }, + { + "epoch": 0.00768, + "grad_norm": 2.5779298795445023, + "learning_rate": 3.1645569620253167e-06, + "loss": 0.8301, + "step": 6 + }, + { + "epoch": 0.00896, + "grad_norm": 2.182261607936066, + "learning_rate": 3.7974683544303802e-06, + "loss": 0.8156, + "step": 7 + }, + { + "epoch": 0.01024, + "grad_norm": 1.9615896152651355, + "learning_rate": 4.430379746835443e-06, + "loss": 0.7982, + "step": 8 + }, + { + "epoch": 0.01152, + "grad_norm": 1.452541644948315, + "learning_rate": 5.063291139240506e-06, + "loss": 0.7819, + "step": 9 + }, + { + "epoch": 0.0128, + "grad_norm": 1.4723286808630864, + "learning_rate": 5.69620253164557e-06, + "loss": 0.7906, + "step": 10 + }, + { + "epoch": 0.01408, + "grad_norm": 1.3529636617858944, + "learning_rate": 6.329113924050633e-06, + "loss": 0.7724, + "step": 11 + }, + { + "epoch": 0.01536, + "grad_norm": 1.960737179905222, + "learning_rate": 6.9620253164556965e-06, + "loss": 0.7495, + "step": 12 + }, + { + "epoch": 0.01664, + "grad_norm": 2.2349101055406337, + "learning_rate": 7.5949367088607605e-06, + "loss": 0.7581, + "step": 13 + }, + { + "epoch": 0.01792, + "grad_norm": 2.0897577150322477, + "learning_rate": 8.227848101265822e-06, + "loss": 0.7404, + "step": 14 + }, + { + "epoch": 0.0192, + "grad_norm": 1.8227218322635887, + "learning_rate": 8.860759493670886e-06, + "loss": 0.7382, + "step": 15 + }, + { + "epoch": 0.02048, + "grad_norm": 1.2099951464458898, + "learning_rate": 9.49367088607595e-06, + "loss": 0.7231, + "step": 16 + }, + { + "epoch": 0.02176, + "grad_norm": 1.2177037129914572, + "learning_rate": 1.0126582278481012e-05, + "loss": 0.7259, + "step": 17 + }, + { + "epoch": 0.02304, + "grad_norm": 1.1031346132830708, + "learning_rate": 1.0759493670886076e-05, + "loss": 0.7059, + "step": 18 + }, + { + "epoch": 0.02432, + "grad_norm": 0.9194779600801882, + "learning_rate": 1.139240506329114e-05, + "loss": 0.7137, + "step": 19 + }, + { + "epoch": 0.0256, + "grad_norm": 0.8679468005972053, + "learning_rate": 1.2025316455696203e-05, + "loss": 0.7036, + "step": 20 + }, + { + "epoch": 0.02688, + "grad_norm": 0.7227287276969042, + "learning_rate": 1.2658227848101267e-05, + "loss": 0.696, + "step": 21 + }, + { + "epoch": 0.02816, + "grad_norm": 0.7425882516811844, + "learning_rate": 1.3291139240506329e-05, + "loss": 0.6888, + "step": 22 + }, + { + "epoch": 0.02944, + "grad_norm": 0.7093793012252196, + "learning_rate": 1.3924050632911393e-05, + "loss": 0.6791, + "step": 23 + }, + { + "epoch": 0.03072, + "grad_norm": 0.6018215463147907, + "learning_rate": 1.4556962025316457e-05, + "loss": 0.6783, + "step": 24 + }, + { + "epoch": 0.032, + "grad_norm": 0.5846346732378257, + "learning_rate": 1.5189873417721521e-05, + "loss": 0.6811, + "step": 25 + }, + { + "epoch": 0.03328, + "grad_norm": 0.5855419788452784, + "learning_rate": 1.5822784810126583e-05, + "loss": 0.683, + "step": 26 + }, + { + "epoch": 0.03456, + "grad_norm": 0.5096689891724868, + "learning_rate": 1.6455696202531644e-05, + "loss": 0.6589, + "step": 27 + }, + { + "epoch": 0.03584, + "grad_norm": 0.4871170504081146, + "learning_rate": 1.7088607594936708e-05, + "loss": 0.6582, + "step": 28 + }, + { + "epoch": 0.03712, + "grad_norm": 0.4949600697144217, + "learning_rate": 1.7721518987341772e-05, + "loss": 0.669, + "step": 29 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5082926031630941, + "learning_rate": 1.8354430379746836e-05, + "loss": 0.666, + "step": 30 + }, + { + "epoch": 0.03968, + "grad_norm": 0.49381475380567175, + "learning_rate": 1.89873417721519e-05, + "loss": 0.6556, + "step": 31 + }, + { + "epoch": 0.04096, + "grad_norm": 0.4265624784331274, + "learning_rate": 1.962025316455696e-05, + "loss": 0.646, + "step": 32 + }, + { + "epoch": 0.04224, + "grad_norm": 0.39190416547723717, + "learning_rate": 2.0253164556962025e-05, + "loss": 0.6473, + "step": 33 + }, + { + "epoch": 0.04352, + "grad_norm": 0.4631353399929371, + "learning_rate": 2.088607594936709e-05, + "loss": 0.6441, + "step": 34 + }, + { + "epoch": 0.0448, + "grad_norm": 0.3928335126997034, + "learning_rate": 2.1518987341772153e-05, + "loss": 0.6352, + "step": 35 + }, + { + "epoch": 0.04608, + "grad_norm": 0.36295027582313966, + "learning_rate": 2.2151898734177217e-05, + "loss": 0.6333, + "step": 36 + }, + { + "epoch": 0.04736, + "grad_norm": 0.35026852064181846, + "learning_rate": 2.278481012658228e-05, + "loss": 0.6399, + "step": 37 + }, + { + "epoch": 0.04864, + "grad_norm": 0.39778614916835536, + "learning_rate": 2.341772151898734e-05, + "loss": 0.6298, + "step": 38 + }, + { + "epoch": 0.04992, + "grad_norm": 0.33278348666417684, + "learning_rate": 2.4050632911392405e-05, + "loss": 0.6301, + "step": 39 + }, + { + "epoch": 0.0512, + "grad_norm": 0.31444068712551376, + "learning_rate": 2.468354430379747e-05, + "loss": 0.6263, + "step": 40 + }, + { + "epoch": 0.05248, + "grad_norm": 0.36059728676958264, + "learning_rate": 2.5316455696202533e-05, + "loss": 0.6458, + "step": 41 + }, + { + "epoch": 0.05376, + "grad_norm": 0.3916144552301749, + "learning_rate": 2.5949367088607597e-05, + "loss": 0.6331, + "step": 42 + }, + { + "epoch": 0.05504, + "grad_norm": 0.32338566356420756, + "learning_rate": 2.6582278481012658e-05, + "loss": 0.6332, + "step": 43 + }, + { + "epoch": 0.05632, + "grad_norm": 0.33704233729853356, + "learning_rate": 2.7215189873417722e-05, + "loss": 0.6348, + "step": 44 + }, + { + "epoch": 0.0576, + "grad_norm": 0.36015399213900634, + "learning_rate": 2.7848101265822786e-05, + "loss": 0.6392, + "step": 45 + }, + { + "epoch": 0.05888, + "grad_norm": 0.31471331803021757, + "learning_rate": 2.848101265822785e-05, + "loss": 0.6272, + "step": 46 + }, + { + "epoch": 0.06016, + "grad_norm": 0.3225170654156012, + "learning_rate": 2.9113924050632914e-05, + "loss": 0.635, + "step": 47 + }, + { + "epoch": 0.06144, + "grad_norm": 0.3064473735810606, + "learning_rate": 2.9746835443037974e-05, + "loss": 0.6284, + "step": 48 + }, + { + "epoch": 0.06272, + "grad_norm": 0.3038289969291092, + "learning_rate": 3.0379746835443042e-05, + "loss": 0.6149, + "step": 49 + }, + { + "epoch": 0.064, + "grad_norm": 0.3226803690164346, + "learning_rate": 3.10126582278481e-05, + "loss": 0.626, + "step": 50 + }, + { + "epoch": 0.06528, + "grad_norm": 0.3096398144524693, + "learning_rate": 3.1645569620253167e-05, + "loss": 0.621, + "step": 51 + }, + { + "epoch": 0.06656, + "grad_norm": 0.2754757429130796, + "learning_rate": 3.227848101265823e-05, + "loss": 0.6185, + "step": 52 + }, + { + "epoch": 0.06784, + "grad_norm": 0.3262507218160328, + "learning_rate": 3.291139240506329e-05, + "loss": 0.6171, + "step": 53 + }, + { + "epoch": 0.06912, + "grad_norm": 0.34971068352090656, + "learning_rate": 3.354430379746836e-05, + "loss": 0.616, + "step": 54 + }, + { + "epoch": 0.0704, + "grad_norm": 0.2841621281043231, + "learning_rate": 3.4177215189873416e-05, + "loss": 0.5995, + "step": 55 + }, + { + "epoch": 0.07168, + "grad_norm": 0.4003223636484448, + "learning_rate": 3.4810126582278487e-05, + "loss": 0.6169, + "step": 56 + }, + { + "epoch": 0.07296, + "grad_norm": 0.31868860231705426, + "learning_rate": 3.5443037974683544e-05, + "loss": 0.6077, + "step": 57 + }, + { + "epoch": 0.07424, + "grad_norm": 0.3960425782005289, + "learning_rate": 3.607594936708861e-05, + "loss": 0.6164, + "step": 58 + }, + { + "epoch": 0.07552, + "grad_norm": 0.363865574596696, + "learning_rate": 3.670886075949367e-05, + "loss": 0.6118, + "step": 59 + }, + { + "epoch": 0.0768, + "grad_norm": 0.33961478774466697, + "learning_rate": 3.7341772151898736e-05, + "loss": 0.6137, + "step": 60 + }, + { + "epoch": 0.07808, + "grad_norm": 0.4212164741206082, + "learning_rate": 3.79746835443038e-05, + "loss": 0.6275, + "step": 61 + }, + { + "epoch": 0.07936, + "grad_norm": 0.29878729710395663, + "learning_rate": 3.8607594936708864e-05, + "loss": 0.6084, + "step": 62 + }, + { + "epoch": 0.08064, + "grad_norm": 0.36745026817379894, + "learning_rate": 3.924050632911392e-05, + "loss": 0.607, + "step": 63 + }, + { + "epoch": 0.08192, + "grad_norm": 0.38983571508393644, + "learning_rate": 3.987341772151899e-05, + "loss": 0.6176, + "step": 64 + }, + { + "epoch": 0.0832, + "grad_norm": 0.37337392917475115, + "learning_rate": 4.050632911392405e-05, + "loss": 0.6184, + "step": 65 + }, + { + "epoch": 0.08448, + "grad_norm": 0.3668068115925863, + "learning_rate": 4.113924050632912e-05, + "loss": 0.6194, + "step": 66 + }, + { + "epoch": 0.08576, + "grad_norm": 0.36138503055306903, + "learning_rate": 4.177215189873418e-05, + "loss": 0.6077, + "step": 67 + }, + { + "epoch": 0.08704, + "grad_norm": 0.43361127462043814, + "learning_rate": 4.240506329113924e-05, + "loss": 0.6147, + "step": 68 + }, + { + "epoch": 0.08832, + "grad_norm": 0.33520423726109644, + "learning_rate": 4.3037974683544305e-05, + "loss": 0.6118, + "step": 69 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4381154362148859, + "learning_rate": 4.367088607594937e-05, + "loss": 0.6031, + "step": 70 + }, + { + "epoch": 0.09088, + "grad_norm": 0.3717345864324632, + "learning_rate": 4.430379746835443e-05, + "loss": 0.6031, + "step": 71 + }, + { + "epoch": 0.09216, + "grad_norm": 0.4861728465398392, + "learning_rate": 4.49367088607595e-05, + "loss": 0.6006, + "step": 72 + }, + { + "epoch": 0.09344, + "grad_norm": 0.3264992939190504, + "learning_rate": 4.556962025316456e-05, + "loss": 0.6151, + "step": 73 + }, + { + "epoch": 0.09472, + "grad_norm": 0.4319794925001871, + "learning_rate": 4.6202531645569625e-05, + "loss": 0.6058, + "step": 74 + }, + { + "epoch": 0.096, + "grad_norm": 0.4616345840492333, + "learning_rate": 4.683544303797468e-05, + "loss": 0.5967, + "step": 75 + }, + { + "epoch": 0.09728, + "grad_norm": 0.4405721152587957, + "learning_rate": 4.7468354430379746e-05, + "loss": 0.6002, + "step": 76 + }, + { + "epoch": 0.09856, + "grad_norm": 0.5122605377853799, + "learning_rate": 4.810126582278481e-05, + "loss": 0.6076, + "step": 77 + }, + { + "epoch": 0.09984, + "grad_norm": 0.45313870340097556, + "learning_rate": 4.8734177215189874e-05, + "loss": 0.6074, + "step": 78 + }, + { + "epoch": 0.10112, + "grad_norm": 0.4340044755876676, + "learning_rate": 4.936708860759494e-05, + "loss": 0.606, + "step": 79 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4987172862476422, + "learning_rate": 5e-05, + "loss": 0.6158, + "step": 80 + }, + { + "epoch": 0.10368, + "grad_norm": 0.6226880208665108, + "learning_rate": 4.999974965737065e-05, + "loss": 0.621, + "step": 81 + }, + { + "epoch": 0.10496, + "grad_norm": 0.5448293131914782, + "learning_rate": 4.999899863449631e-05, + "loss": 0.6014, + "step": 82 + }, + { + "epoch": 0.10624, + "grad_norm": 0.3427022601926917, + "learning_rate": 4.999774694641803e-05, + "loss": 0.6198, + "step": 83 + }, + { + "epoch": 0.10752, + "grad_norm": 0.5005152113593655, + "learning_rate": 4.999599461820387e-05, + "loss": 0.6054, + "step": 84 + }, + { + "epoch": 0.1088, + "grad_norm": 0.5702968806820528, + "learning_rate": 4.999374168494844e-05, + "loss": 0.6069, + "step": 85 + }, + { + "epoch": 0.11008, + "grad_norm": 0.4671310661706222, + "learning_rate": 4.999098819177214e-05, + "loss": 0.6017, + "step": 86 + }, + { + "epoch": 0.11136, + "grad_norm": 0.46081768174689064, + "learning_rate": 4.9987734193820324e-05, + "loss": 0.5988, + "step": 87 + }, + { + "epoch": 0.11264, + "grad_norm": 0.5448729856183013, + "learning_rate": 4.9983979756262136e-05, + "loss": 0.6181, + "step": 88 + }, + { + "epoch": 0.11392, + "grad_norm": 0.5095775592779056, + "learning_rate": 4.9979724954289244e-05, + "loss": 0.608, + "step": 89 + }, + { + "epoch": 0.1152, + "grad_norm": 0.41119162739543413, + "learning_rate": 4.997496987311431e-05, + "loss": 0.5979, + "step": 90 + }, + { + "epoch": 0.11648, + "grad_norm": 0.45501958535738946, + "learning_rate": 4.996971460796929e-05, + "loss": 0.6019, + "step": 91 + }, + { + "epoch": 0.11776, + "grad_norm": 0.4287172104360816, + "learning_rate": 4.9963959264103544e-05, + "loss": 0.5955, + "step": 92 + }, + { + "epoch": 0.11904, + "grad_norm": 0.409872269342458, + "learning_rate": 4.995770395678171e-05, + "loss": 0.5927, + "step": 93 + }, + { + "epoch": 0.12032, + "grad_norm": 0.4304173966206036, + "learning_rate": 4.995094881128138e-05, + "loss": 0.5967, + "step": 94 + }, + { + "epoch": 0.1216, + "grad_norm": 0.4229799776298517, + "learning_rate": 4.994369396289063e-05, + "loss": 0.6084, + "step": 95 + }, + { + "epoch": 0.12288, + "grad_norm": 0.4509596954971553, + "learning_rate": 4.9935939556905295e-05, + "loss": 0.6134, + "step": 96 + }, + { + "epoch": 0.12416, + "grad_norm": 0.467661146414229, + "learning_rate": 4.992768574862603e-05, + "loss": 0.5986, + "step": 97 + }, + { + "epoch": 0.12544, + "grad_norm": 0.42432875998240194, + "learning_rate": 4.9918932703355256e-05, + "loss": 0.6028, + "step": 98 + }, + { + "epoch": 0.12672, + "grad_norm": 0.43479377184835605, + "learning_rate": 4.990968059639379e-05, + "loss": 0.5942, + "step": 99 + }, + { + "epoch": 0.128, + "grad_norm": 0.3680676685801686, + "learning_rate": 4.989992961303738e-05, + "loss": 0.5994, + "step": 100 + }, + { + "epoch": 0.12928, + "grad_norm": 0.3956815409903461, + "learning_rate": 4.9889679948572974e-05, + "loss": 0.5871, + "step": 101 + }, + { + "epoch": 0.13056, + "grad_norm": 0.34354949934586104, + "learning_rate": 4.98789318082748e-05, + "loss": 0.5873, + "step": 102 + }, + { + "epoch": 0.13184, + "grad_norm": 0.3608260963951222, + "learning_rate": 4.986768540740028e-05, + "loss": 0.5883, + "step": 103 + }, + { + "epoch": 0.13312, + "grad_norm": 0.3937004101078116, + "learning_rate": 4.98559409711857e-05, + "loss": 0.6029, + "step": 104 + }, + { + "epoch": 0.1344, + "grad_norm": 0.3401718481532899, + "learning_rate": 4.9843698734841705e-05, + "loss": 0.5983, + "step": 105 + }, + { + "epoch": 0.13568, + "grad_norm": 0.4371868869288284, + "learning_rate": 4.983095894354858e-05, + "loss": 0.5866, + "step": 106 + }, + { + "epoch": 0.13696, + "grad_norm": 0.3722813571279646, + "learning_rate": 4.981772185245135e-05, + "loss": 0.5954, + "step": 107 + }, + { + "epoch": 0.13824, + "grad_norm": 0.36493596395606354, + "learning_rate": 4.980398772665468e-05, + "loss": 0.5806, + "step": 108 + }, + { + "epoch": 0.13952, + "grad_norm": 0.43678937522389644, + "learning_rate": 4.9789756841217546e-05, + "loss": 0.595, + "step": 109 + }, + { + "epoch": 0.1408, + "grad_norm": 0.34968596729530604, + "learning_rate": 4.977502948114772e-05, + "loss": 0.5999, + "step": 110 + }, + { + "epoch": 0.14208, + "grad_norm": 0.4035249077012057, + "learning_rate": 4.9759805941396075e-05, + "loss": 0.582, + "step": 111 + }, + { + "epoch": 0.14336, + "grad_norm": 0.3396387531525401, + "learning_rate": 4.974408652685072e-05, + "loss": 0.5912, + "step": 112 + }, + { + "epoch": 0.14464, + "grad_norm": 0.3888124435581031, + "learning_rate": 4.9727871552330794e-05, + "loss": 0.5994, + "step": 113 + }, + { + "epoch": 0.14592, + "grad_norm": 0.3487289265208422, + "learning_rate": 4.971116134258025e-05, + "loss": 0.598, + "step": 114 + }, + { + "epoch": 0.1472, + "grad_norm": 0.34084258932596606, + "learning_rate": 4.969395623226133e-05, + "loss": 0.5965, + "step": 115 + }, + { + "epoch": 0.14848, + "grad_norm": 0.33211872605390524, + "learning_rate": 4.967625656594782e-05, + "loss": 0.5984, + "step": 116 + }, + { + "epoch": 0.14976, + "grad_norm": 0.31055192632357626, + "learning_rate": 4.9658062698118213e-05, + "loss": 0.593, + "step": 117 + }, + { + "epoch": 0.15104, + "grad_norm": 0.35790400007793166, + "learning_rate": 4.963937499314857e-05, + "loss": 0.6035, + "step": 118 + }, + { + "epoch": 0.15232, + "grad_norm": 0.31118450185510343, + "learning_rate": 4.962019382530521e-05, + "loss": 0.5811, + "step": 119 + }, + { + "epoch": 0.1536, + "grad_norm": 0.3326176465041298, + "learning_rate": 4.960051957873725e-05, + "loss": 0.581, + "step": 120 + }, + { + "epoch": 0.15488, + "grad_norm": 0.30210249377153575, + "learning_rate": 4.958035264746893e-05, + "loss": 0.5837, + "step": 121 + }, + { + "epoch": 0.15616, + "grad_norm": 0.3480385124671555, + "learning_rate": 4.955969343539162e-05, + "loss": 0.5768, + "step": 122 + }, + { + "epoch": 0.15744, + "grad_norm": 0.3003392569743352, + "learning_rate": 4.9538542356255866e-05, + "loss": 0.5938, + "step": 123 + }, + { + "epoch": 0.15872, + "grad_norm": 0.32082565179488104, + "learning_rate": 4.9516899833663e-05, + "loss": 0.5948, + "step": 124 + }, + { + "epoch": 0.16, + "grad_norm": 0.3564349708048278, + "learning_rate": 4.949476630105669e-05, + "loss": 0.595, + "step": 125 + }, + { + "epoch": 0.16128, + "grad_norm": 0.32049541972124757, + "learning_rate": 4.94721422017143e-05, + "loss": 0.5838, + "step": 126 + }, + { + "epoch": 0.16256, + "grad_norm": 0.3317680882353993, + "learning_rate": 4.944902798873794e-05, + "loss": 0.5952, + "step": 127 + }, + { + "epoch": 0.16384, + "grad_norm": 0.3381465061198974, + "learning_rate": 4.942542412504543e-05, + "loss": 0.6004, + "step": 128 + }, + { + "epoch": 0.16512, + "grad_norm": 0.38351657127693595, + "learning_rate": 4.940133108336105e-05, + "loss": 0.6014, + "step": 129 + }, + { + "epoch": 0.1664, + "grad_norm": 0.3276142738951724, + "learning_rate": 4.9376749346206006e-05, + "loss": 0.5853, + "step": 130 + }, + { + "epoch": 0.16768, + "grad_norm": 0.37146400882939534, + "learning_rate": 4.935167940588887e-05, + "loss": 0.5995, + "step": 131 + }, + { + "epoch": 0.16896, + "grad_norm": 0.32804274509201087, + "learning_rate": 4.9326121764495596e-05, + "loss": 0.5955, + "step": 132 + }, + { + "epoch": 0.17024, + "grad_norm": 0.3344845806030499, + "learning_rate": 4.9300076933879574e-05, + "loss": 0.5818, + "step": 133 + }, + { + "epoch": 0.17152, + "grad_norm": 0.3479572078392269, + "learning_rate": 4.92735454356513e-05, + "loss": 0.5941, + "step": 134 + }, + { + "epoch": 0.1728, + "grad_norm": 0.34868252062960353, + "learning_rate": 4.924652780116799e-05, + "loss": 0.5898, + "step": 135 + }, + { + "epoch": 0.17408, + "grad_norm": 0.35674279058993497, + "learning_rate": 4.921902457152289e-05, + "loss": 0.5899, + "step": 136 + }, + { + "epoch": 0.17536, + "grad_norm": 0.3672614416380493, + "learning_rate": 4.9191036297534454e-05, + "loss": 0.585, + "step": 137 + }, + { + "epoch": 0.17664, + "grad_norm": 0.4039478601084677, + "learning_rate": 4.916256353973535e-05, + "loss": 0.5994, + "step": 138 + }, + { + "epoch": 0.17792, + "grad_norm": 0.3428958061155067, + "learning_rate": 4.913360686836117e-05, + "loss": 0.575, + "step": 139 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4024960602256603, + "learning_rate": 4.910416686333906e-05, + "loss": 0.5913, + "step": 140 + }, + { + "epoch": 0.18048, + "grad_norm": 0.31040065034832104, + "learning_rate": 4.907424411427608e-05, + "loss": 0.5761, + "step": 141 + }, + { + "epoch": 0.18176, + "grad_norm": 0.359237099401051, + "learning_rate": 4.90438392204474e-05, + "loss": 0.5885, + "step": 142 + }, + { + "epoch": 0.18304, + "grad_norm": 0.3357545415879296, + "learning_rate": 4.901295279078431e-05, + "loss": 0.5907, + "step": 143 + }, + { + "epoch": 0.18432, + "grad_norm": 0.2846403022642179, + "learning_rate": 4.898158544386201e-05, + "loss": 0.5886, + "step": 144 + }, + { + "epoch": 0.1856, + "grad_norm": 0.3636245125193307, + "learning_rate": 4.894973780788722e-05, + "loss": 0.5816, + "step": 145 + }, + { + "epoch": 0.18688, + "grad_norm": 0.25440894793562924, + "learning_rate": 4.8917410520685635e-05, + "loss": 0.576, + "step": 146 + }, + { + "epoch": 0.18816, + "grad_norm": 0.3380189678855273, + "learning_rate": 4.888460422968908e-05, + "loss": 0.5931, + "step": 147 + }, + { + "epoch": 0.18944, + "grad_norm": 0.3096794617975588, + "learning_rate": 4.885131959192262e-05, + "loss": 0.5829, + "step": 148 + }, + { + "epoch": 0.19072, + "grad_norm": 0.280174710159943, + "learning_rate": 4.881755727399134e-05, + "loss": 0.5794, + "step": 149 + }, + { + "epoch": 0.192, + "grad_norm": 0.31769340776297994, + "learning_rate": 4.878331795206705e-05, + "loss": 0.5729, + "step": 150 + }, + { + "epoch": 0.19328, + "grad_norm": 0.31671973855902796, + "learning_rate": 4.8748602311874694e-05, + "loss": 0.5905, + "step": 151 + }, + { + "epoch": 0.19456, + "grad_norm": 0.32614211009906474, + "learning_rate": 4.8713411048678635e-05, + "loss": 0.5855, + "step": 152 + }, + { + "epoch": 0.19584, + "grad_norm": 0.29921149443441614, + "learning_rate": 4.8677744867268764e-05, + "loss": 0.5779, + "step": 153 + }, + { + "epoch": 0.19712, + "grad_norm": 0.3558339409344647, + "learning_rate": 4.8641604481946314e-05, + "loss": 0.5892, + "step": 154 + }, + { + "epoch": 0.1984, + "grad_norm": 0.285079025062, + "learning_rate": 4.8604990616509616e-05, + "loss": 0.5912, + "step": 155 + }, + { + "epoch": 0.19968, + "grad_norm": 0.32189736402098207, + "learning_rate": 4.856790400423958e-05, + "loss": 0.5881, + "step": 156 + }, + { + "epoch": 0.20096, + "grad_norm": 0.3293153125716864, + "learning_rate": 4.8530345387885004e-05, + "loss": 0.5679, + "step": 157 + }, + { + "epoch": 0.20224, + "grad_norm": 0.3020107126026594, + "learning_rate": 4.849231551964771e-05, + "loss": 0.5892, + "step": 158 + }, + { + "epoch": 0.20352, + "grad_norm": 0.37859431920215386, + "learning_rate": 4.845381516116748e-05, + "loss": 0.5752, + "step": 159 + }, + { + "epoch": 0.2048, + "grad_norm": 0.3310159661704287, + "learning_rate": 4.841484508350679e-05, + "loss": 0.5746, + "step": 160 + }, + { + "epoch": 0.20608, + "grad_norm": 0.29255683283621947, + "learning_rate": 4.837540606713538e-05, + "loss": 0.5822, + "step": 161 + }, + { + "epoch": 0.20736, + "grad_norm": 0.35293526712063267, + "learning_rate": 4.83354989019146e-05, + "loss": 0.5872, + "step": 162 + }, + { + "epoch": 0.20864, + "grad_norm": 0.31406471157688987, + "learning_rate": 4.829512438708163e-05, + "loss": 0.5837, + "step": 163 + }, + { + "epoch": 0.20992, + "grad_norm": 0.3746042872826804, + "learning_rate": 4.8254283331233464e-05, + "loss": 0.5795, + "step": 164 + }, + { + "epoch": 0.2112, + "grad_norm": 0.36631879902112746, + "learning_rate": 4.821297655231066e-05, + "loss": 0.5917, + "step": 165 + }, + { + "epoch": 0.21248, + "grad_norm": 0.31527709590813463, + "learning_rate": 4.817120487758104e-05, + "loss": 0.583, + "step": 166 + }, + { + "epoch": 0.21376, + "grad_norm": 0.3311399262333245, + "learning_rate": 4.812896914362309e-05, + "loss": 0.5806, + "step": 167 + }, + { + "epoch": 0.21504, + "grad_norm": 0.2797000752887155, + "learning_rate": 4.808627019630917e-05, + "loss": 0.5818, + "step": 168 + }, + { + "epoch": 0.21632, + "grad_norm": 0.34099518073149715, + "learning_rate": 4.804310889078861e-05, + "loss": 0.5827, + "step": 169 + }, + { + "epoch": 0.2176, + "grad_norm": 0.2858567724381781, + "learning_rate": 4.799948609147061e-05, + "loss": 0.574, + "step": 170 + }, + { + "epoch": 0.21888, + "grad_norm": 0.3714847802025429, + "learning_rate": 4.7955402672006854e-05, + "loss": 0.568, + "step": 171 + }, + { + "epoch": 0.22016, + "grad_norm": 0.3629775316062746, + "learning_rate": 4.791085951527408e-05, + "loss": 0.5908, + "step": 172 + }, + { + "epoch": 0.22144, + "grad_norm": 0.3308553993376432, + "learning_rate": 4.786585751335637e-05, + "loss": 0.5822, + "step": 173 + }, + { + "epoch": 0.22272, + "grad_norm": 0.3583796747287126, + "learning_rate": 4.782039756752727e-05, + "loss": 0.5849, + "step": 174 + }, + { + "epoch": 0.224, + "grad_norm": 0.3200425393735649, + "learning_rate": 4.777448058823179e-05, + "loss": 0.5975, + "step": 175 + }, + { + "epoch": 0.22528, + "grad_norm": 0.292626050602806, + "learning_rate": 4.77281074950681e-05, + "loss": 0.5858, + "step": 176 + }, + { + "epoch": 0.22656, + "grad_norm": 0.35199899958158143, + "learning_rate": 4.768127921676916e-05, + "loss": 0.5956, + "step": 177 + }, + { + "epoch": 0.22784, + "grad_norm": 0.3049078150389153, + "learning_rate": 4.763399669118414e-05, + "loss": 0.5818, + "step": 178 + }, + { + "epoch": 0.22912, + "grad_norm": 0.3171759989327936, + "learning_rate": 4.758626086525956e-05, + "loss": 0.5788, + "step": 179 + }, + { + "epoch": 0.2304, + "grad_norm": 0.37616755806228974, + "learning_rate": 4.753807269502041e-05, + "loss": 0.5764, + "step": 180 + }, + { + "epoch": 0.23168, + "grad_norm": 0.3111683206800814, + "learning_rate": 4.748943314555093e-05, + "loss": 0.5904, + "step": 181 + }, + { + "epoch": 0.23296, + "grad_norm": 0.33324357602957505, + "learning_rate": 4.744034319097535e-05, + "loss": 0.5837, + "step": 182 + }, + { + "epoch": 0.23424, + "grad_norm": 0.3033759415187109, + "learning_rate": 4.739080381443834e-05, + "loss": 0.5872, + "step": 183 + }, + { + "epoch": 0.23552, + "grad_norm": 0.31221903432674414, + "learning_rate": 4.734081600808531e-05, + "loss": 0.5788, + "step": 184 + }, + { + "epoch": 0.2368, + "grad_norm": 0.3521014284242888, + "learning_rate": 4.7290380773042575e-05, + "loss": 0.5876, + "step": 185 + }, + { + "epoch": 0.23808, + "grad_norm": 0.310657917751192, + "learning_rate": 4.723949911939728e-05, + "loss": 0.5799, + "step": 186 + }, + { + "epoch": 0.23936, + "grad_norm": 0.3571637318567018, + "learning_rate": 4.718817206617718e-05, + "loss": 0.5768, + "step": 187 + }, + { + "epoch": 0.24064, + "grad_norm": 0.3019122458668047, + "learning_rate": 4.713640064133025e-05, + "loss": 0.569, + "step": 188 + }, + { + "epoch": 0.24192, + "grad_norm": 0.3867595871931151, + "learning_rate": 4.7084185881704037e-05, + "loss": 0.5807, + "step": 189 + }, + { + "epoch": 0.2432, + "grad_norm": 0.31887858233709804, + "learning_rate": 4.7031528833024976e-05, + "loss": 0.5853, + "step": 190 + }, + { + "epoch": 0.24448, + "grad_norm": 0.33241965371410676, + "learning_rate": 4.697843054987737e-05, + "loss": 0.5843, + "step": 191 + }, + { + "epoch": 0.24576, + "grad_norm": 0.3186271240585135, + "learning_rate": 4.692489209568234e-05, + "loss": 0.58, + "step": 192 + }, + { + "epoch": 0.24704, + "grad_norm": 0.27257060138401595, + "learning_rate": 4.687091454267646e-05, + "loss": 0.5782, + "step": 193 + }, + { + "epoch": 0.24832, + "grad_norm": 0.3068678257607851, + "learning_rate": 4.681649897189036e-05, + "loss": 0.5715, + "step": 194 + }, + { + "epoch": 0.2496, + "grad_norm": 0.2895553434845913, + "learning_rate": 4.6761646473126985e-05, + "loss": 0.5864, + "step": 195 + }, + { + "epoch": 0.25088, + "grad_norm": 0.3320177973067205, + "learning_rate": 4.670635814493984e-05, + "loss": 0.5895, + "step": 196 + }, + { + "epoch": 0.25216, + "grad_norm": 0.28847585573804785, + "learning_rate": 4.665063509461097e-05, + "loss": 0.5906, + "step": 197 + }, + { + "epoch": 0.25344, + "grad_norm": 0.36087996937741346, + "learning_rate": 4.6594478438128757e-05, + "loss": 0.5827, + "step": 198 + }, + { + "epoch": 0.25472, + "grad_norm": 0.2999930376743691, + "learning_rate": 4.653788930016562e-05, + "loss": 0.5661, + "step": 199 + }, + { + "epoch": 0.256, + "grad_norm": 0.3488298254178776, + "learning_rate": 4.6480868814055424e-05, + "loss": 0.5793, + "step": 200 + }, + { + "epoch": 0.25728, + "grad_norm": 0.3489687054350795, + "learning_rate": 4.6423418121770855e-05, + "loss": 0.5804, + "step": 201 + }, + { + "epoch": 0.25856, + "grad_norm": 0.32040065196445794, + "learning_rate": 4.636553837390051e-05, + "loss": 0.5885, + "step": 202 + }, + { + "epoch": 0.25984, + "grad_norm": 0.3211301436932052, + "learning_rate": 4.630723072962584e-05, + "loss": 0.5712, + "step": 203 + }, + { + "epoch": 0.26112, + "grad_norm": 0.359318175552222, + "learning_rate": 4.6248496356697966e-05, + "loss": 0.5708, + "step": 204 + }, + { + "epoch": 0.2624, + "grad_norm": 0.28862834349726524, + "learning_rate": 4.618933643141428e-05, + "loss": 0.5761, + "step": 205 + }, + { + "epoch": 0.26368, + "grad_norm": 0.37248048616519797, + "learning_rate": 4.6129752138594874e-05, + "loss": 0.5853, + "step": 206 + }, + { + "epoch": 0.26496, + "grad_norm": 0.26505735554799875, + "learning_rate": 4.6069744671558835e-05, + "loss": 0.5711, + "step": 207 + }, + { + "epoch": 0.26624, + "grad_norm": 0.4313554228926337, + "learning_rate": 4.6009315232100324e-05, + "loss": 0.5805, + "step": 208 + }, + { + "epoch": 0.26752, + "grad_norm": 0.25367772845462405, + "learning_rate": 4.5948465030464536e-05, + "loss": 0.5804, + "step": 209 + }, + { + "epoch": 0.2688, + "grad_norm": 0.4241960582103298, + "learning_rate": 4.588719528532342e-05, + "loss": 0.5781, + "step": 210 + }, + { + "epoch": 0.27008, + "grad_norm": 0.26959374978777695, + "learning_rate": 4.58255072237513e-05, + "loss": 0.5872, + "step": 211 + }, + { + "epoch": 0.27136, + "grad_norm": 0.32686893212462864, + "learning_rate": 4.5763402081200294e-05, + "loss": 0.5865, + "step": 212 + }, + { + "epoch": 0.27264, + "grad_norm": 0.23434165675561655, + "learning_rate": 4.570088110147559e-05, + "loss": 0.5605, + "step": 213 + }, + { + "epoch": 0.27392, + "grad_norm": 0.29221024109189686, + "learning_rate": 4.56379455367105e-05, + "loss": 0.5798, + "step": 214 + }, + { + "epoch": 0.2752, + "grad_norm": 0.24948843663318818, + "learning_rate": 4.557459664734141e-05, + "loss": 0.5647, + "step": 215 + }, + { + "epoch": 0.27648, + "grad_norm": 0.2697506211061829, + "learning_rate": 4.551083570208252e-05, + "loss": 0.574, + "step": 216 + }, + { + "epoch": 0.27776, + "grad_norm": 0.26982678855787623, + "learning_rate": 4.544666397790043e-05, + "loss": 0.5769, + "step": 217 + }, + { + "epoch": 0.27904, + "grad_norm": 0.26312816652400706, + "learning_rate": 4.538208275998861e-05, + "loss": 0.5794, + "step": 218 + }, + { + "epoch": 0.28032, + "grad_norm": 0.23961265146671684, + "learning_rate": 4.531709334174161e-05, + "loss": 0.5713, + "step": 219 + }, + { + "epoch": 0.2816, + "grad_norm": 0.2521656675623654, + "learning_rate": 4.5251697024729165e-05, + "loss": 0.5799, + "step": 220 + }, + { + "epoch": 0.28288, + "grad_norm": 0.2554557690482637, + "learning_rate": 4.518589511867017e-05, + "loss": 0.5612, + "step": 221 + }, + { + "epoch": 0.28416, + "grad_norm": 0.2575994504688328, + "learning_rate": 4.511968894140639e-05, + "loss": 0.5646, + "step": 222 + }, + { + "epoch": 0.28544, + "grad_norm": 0.24148588167926954, + "learning_rate": 4.50530798188761e-05, + "loss": 0.5752, + "step": 223 + }, + { + "epoch": 0.28672, + "grad_norm": 0.24795970441926052, + "learning_rate": 4.498606908508754e-05, + "loss": 0.5651, + "step": 224 + }, + { + "epoch": 0.288, + "grad_norm": 0.2594616590711217, + "learning_rate": 4.491865808209215e-05, + "loss": 0.5677, + "step": 225 + }, + { + "epoch": 0.28928, + "grad_norm": 0.25674541102342535, + "learning_rate": 4.485084815995778e-05, + "loss": 0.5612, + "step": 226 + }, + { + "epoch": 0.29056, + "grad_norm": 0.24985240413836018, + "learning_rate": 4.478264067674155e-05, + "loss": 0.5699, + "step": 227 + }, + { + "epoch": 0.29184, + "grad_norm": 0.2565786888027258, + "learning_rate": 4.471403699846272e-05, + "loss": 0.5649, + "step": 228 + }, + { + "epoch": 0.29312, + "grad_norm": 0.24372178698061564, + "learning_rate": 4.4645038499075296e-05, + "loss": 0.5727, + "step": 229 + }, + { + "epoch": 0.2944, + "grad_norm": 0.26957885657754116, + "learning_rate": 4.457564656044056e-05, + "loss": 0.5673, + "step": 230 + }, + { + "epoch": 0.29568, + "grad_norm": 0.2607171265747611, + "learning_rate": 4.4505862572299315e-05, + "loss": 0.5745, + "step": 231 + }, + { + "epoch": 0.29696, + "grad_norm": 0.24863876648291447, + "learning_rate": 4.443568793224415e-05, + "loss": 0.5681, + "step": 232 + }, + { + "epoch": 0.29824, + "grad_norm": 0.263062932590105, + "learning_rate": 4.436512404569136e-05, + "loss": 0.5644, + "step": 233 + }, + { + "epoch": 0.29952, + "grad_norm": 0.2414635376635183, + "learning_rate": 4.429417232585288e-05, + "loss": 0.5701, + "step": 234 + }, + { + "epoch": 0.3008, + "grad_norm": 0.25798244731348213, + "learning_rate": 4.422283419370789e-05, + "loss": 0.5749, + "step": 235 + }, + { + "epoch": 0.30208, + "grad_norm": 0.2705236848186794, + "learning_rate": 4.415111107797445e-05, + "loss": 0.5722, + "step": 236 + }, + { + "epoch": 0.30336, + "grad_norm": 0.2532310162226822, + "learning_rate": 4.407900441508084e-05, + "loss": 0.5613, + "step": 237 + }, + { + "epoch": 0.30464, + "grad_norm": 0.27015507001673206, + "learning_rate": 4.400651564913676e-05, + "loss": 0.5764, + "step": 238 + }, + { + "epoch": 0.30592, + "grad_norm": 0.2600356410106743, + "learning_rate": 4.3933646231904504e-05, + "loss": 0.5621, + "step": 239 + }, + { + "epoch": 0.3072, + "grad_norm": 0.2947154143151425, + "learning_rate": 4.3860397622769756e-05, + "loss": 0.5776, + "step": 240 + }, + { + "epoch": 0.30848, + "grad_norm": 0.278006374370773, + "learning_rate": 4.37867712887125e-05, + "loss": 0.5779, + "step": 241 + }, + { + "epoch": 0.30976, + "grad_norm": 0.2737057808752138, + "learning_rate": 4.371276870427753e-05, + "loss": 0.5691, + "step": 242 + }, + { + "epoch": 0.31104, + "grad_norm": 0.2779579791922396, + "learning_rate": 4.363839135154497e-05, + "loss": 0.5561, + "step": 243 + }, + { + "epoch": 0.31232, + "grad_norm": 0.2900125824723869, + "learning_rate": 4.356364072010059e-05, + "loss": 0.5862, + "step": 244 + }, + { + "epoch": 0.3136, + "grad_norm": 0.2864520071161508, + "learning_rate": 4.348851830700593e-05, + "loss": 0.5622, + "step": 245 + }, + { + "epoch": 0.31488, + "grad_norm": 0.26378798903912654, + "learning_rate": 4.3413025616768424e-05, + "loss": 0.5709, + "step": 246 + }, + { + "epoch": 0.31616, + "grad_norm": 0.3060187732302713, + "learning_rate": 4.333716416131115e-05, + "loss": 0.5669, + "step": 247 + }, + { + "epoch": 0.31744, + "grad_norm": 0.29432171614697805, + "learning_rate": 4.3260935459942584e-05, + "loss": 0.5682, + "step": 248 + }, + { + "epoch": 0.31872, + "grad_norm": 0.26017905775079875, + "learning_rate": 4.318434103932622e-05, + "loss": 0.5885, + "step": 249 + }, + { + "epoch": 0.32, + "grad_norm": 0.31505146670137313, + "learning_rate": 4.310738243344996e-05, + "loss": 0.5761, + "step": 250 + }, + { + "epoch": 0.32128, + "grad_norm": 0.28372810809515076, + "learning_rate": 4.303006118359537e-05, + "loss": 0.5592, + "step": 251 + }, + { + "epoch": 0.32256, + "grad_norm": 0.2651991449577939, + "learning_rate": 4.295237883830685e-05, + "loss": 0.5697, + "step": 252 + }, + { + "epoch": 0.32384, + "grad_norm": 0.24913000845932673, + "learning_rate": 4.2874336953360616e-05, + "loss": 0.5776, + "step": 253 + }, + { + "epoch": 0.32512, + "grad_norm": 0.2430027683011546, + "learning_rate": 4.2795937091733515e-05, + "loss": 0.5677, + "step": 254 + }, + { + "epoch": 0.3264, + "grad_norm": 0.2568626552616791, + "learning_rate": 4.271718082357175e-05, + "loss": 0.582, + "step": 255 + }, + { + "epoch": 0.32768, + "grad_norm": 0.23741828351399039, + "learning_rate": 4.2638069726159424e-05, + "loss": 0.5609, + "step": 256 + }, + { + "epoch": 0.32896, + "grad_norm": 0.23355776394833105, + "learning_rate": 4.255860538388694e-05, + "loss": 0.572, + "step": 257 + }, + { + "epoch": 0.33024, + "grad_norm": 0.28500006430456526, + "learning_rate": 4.247878938821929e-05, + "loss": 0.5783, + "step": 258 + }, + { + "epoch": 0.33152, + "grad_norm": 0.23832612238075213, + "learning_rate": 4.2398623337664176e-05, + "loss": 0.5591, + "step": 259 + }, + { + "epoch": 0.3328, + "grad_norm": 0.3027829669410571, + "learning_rate": 4.231810883773999e-05, + "loss": 0.5723, + "step": 260 + }, + { + "epoch": 0.33408, + "grad_norm": 0.24742812416416485, + "learning_rate": 4.223724750094366e-05, + "loss": 0.562, + "step": 261 + }, + { + "epoch": 0.33536, + "grad_norm": 0.281426189909417, + "learning_rate": 4.215604094671835e-05, + "loss": 0.5711, + "step": 262 + }, + { + "epoch": 0.33664, + "grad_norm": 0.3464883424573956, + "learning_rate": 4.207449080142104e-05, + "loss": 0.56, + "step": 263 + }, + { + "epoch": 0.33792, + "grad_norm": 0.23536130124235047, + "learning_rate": 4.199259869828998e-05, + "loss": 0.5714, + "step": 264 + }, + { + "epoch": 0.3392, + "grad_norm": 0.32901239458674364, + "learning_rate": 4.191036627741191e-05, + "loss": 0.5656, + "step": 265 + }, + { + "epoch": 0.34048, + "grad_norm": 0.2513197034442368, + "learning_rate": 4.182779518568926e-05, + "loss": 0.5549, + "step": 266 + }, + { + "epoch": 0.34176, + "grad_norm": 0.2956698427587275, + "learning_rate": 4.174488707680717e-05, + "loss": 0.565, + "step": 267 + }, + { + "epoch": 0.34304, + "grad_norm": 0.2767183631207283, + "learning_rate": 4.1661643611200366e-05, + "loss": 0.5751, + "step": 268 + }, + { + "epoch": 0.34432, + "grad_norm": 0.24523313623813994, + "learning_rate": 4.157806645601988e-05, + "loss": 0.5742, + "step": 269 + }, + { + "epoch": 0.3456, + "grad_norm": 0.2908460538863046, + "learning_rate": 4.149415728509971e-05, + "loss": 0.5593, + "step": 270 + }, + { + "epoch": 0.34688, + "grad_norm": 0.4424923362280487, + "learning_rate": 4.140991777892324e-05, + "loss": 0.5814, + "step": 271 + }, + { + "epoch": 0.34816, + "grad_norm": 0.2575686169084144, + "learning_rate": 4.132534962458962e-05, + "loss": 0.5678, + "step": 272 + }, + { + "epoch": 0.34944, + "grad_norm": 0.2629805233702073, + "learning_rate": 4.124045451578001e-05, + "loss": 0.5629, + "step": 273 + }, + { + "epoch": 0.35072, + "grad_norm": 0.22898476571255702, + "learning_rate": 4.115523415272358e-05, + "loss": 0.569, + "step": 274 + }, + { + "epoch": 0.352, + "grad_norm": 0.2443223155892946, + "learning_rate": 4.1069690242163484e-05, + "loss": 0.5624, + "step": 275 + }, + { + "epoch": 0.35328, + "grad_norm": 0.24146954716830435, + "learning_rate": 4.0983824497322755e-05, + "loss": 0.5783, + "step": 276 + }, + { + "epoch": 0.35456, + "grad_norm": 0.23532353314194146, + "learning_rate": 4.0897638637869874e-05, + "loss": 0.5846, + "step": 277 + }, + { + "epoch": 0.35584, + "grad_norm": 0.21666728547483338, + "learning_rate": 4.0811134389884433e-05, + "loss": 0.5788, + "step": 278 + }, + { + "epoch": 0.35712, + "grad_norm": 0.2296647805474214, + "learning_rate": 4.07243134858225e-05, + "loss": 0.5679, + "step": 279 + }, + { + "epoch": 0.3584, + "grad_norm": 0.23340342966367908, + "learning_rate": 4.063717766448194e-05, + "loss": 0.5645, + "step": 280 + }, + { + "epoch": 0.35968, + "grad_norm": 0.23757239204061076, + "learning_rate": 4.05497286709676e-05, + "loss": 0.5745, + "step": 281 + }, + { + "epoch": 0.36096, + "grad_norm": 0.24284616004111326, + "learning_rate": 4.0461968256656376e-05, + "loss": 0.5659, + "step": 282 + }, + { + "epoch": 0.36224, + "grad_norm": 0.2317609526061923, + "learning_rate": 4.037389817916208e-05, + "loss": 0.5657, + "step": 283 + }, + { + "epoch": 0.36352, + "grad_norm": 0.2779219267379158, + "learning_rate": 4.028552020230031e-05, + "loss": 0.5612, + "step": 284 + }, + { + "epoch": 0.3648, + "grad_norm": 0.2568685752720875, + "learning_rate": 4.019683609605305e-05, + "loss": 0.5618, + "step": 285 + }, + { + "epoch": 0.36608, + "grad_norm": 0.24357977645804754, + "learning_rate": 4.010784763653331e-05, + "loss": 0.5532, + "step": 286 + }, + { + "epoch": 0.36736, + "grad_norm": 0.2854087705449231, + "learning_rate": 4.001855660594948e-05, + "loss": 0.5598, + "step": 287 + }, + { + "epoch": 0.36864, + "grad_norm": 0.27153778848296095, + "learning_rate": 3.9928964792569655e-05, + "loss": 0.5661, + "step": 288 + }, + { + "epoch": 0.36992, + "grad_norm": 0.3013570035330345, + "learning_rate": 3.983907399068587e-05, + "loss": 0.576, + "step": 289 + }, + { + "epoch": 0.3712, + "grad_norm": 0.23579225525517492, + "learning_rate": 3.974888600057808e-05, + "loss": 0.5685, + "step": 290 + }, + { + "epoch": 0.37248, + "grad_norm": 0.29344672148557993, + "learning_rate": 3.965840262847818e-05, + "loss": 0.5652, + "step": 291 + }, + { + "epoch": 0.37376, + "grad_norm": 0.2771114973156099, + "learning_rate": 3.956762568653378e-05, + "loss": 0.5615, + "step": 292 + }, + { + "epoch": 0.37504, + "grad_norm": 0.2475727777258234, + "learning_rate": 3.947655699277197e-05, + "loss": 0.5622, + "step": 293 + }, + { + "epoch": 0.37632, + "grad_norm": 0.3283313482469374, + "learning_rate": 3.9385198371062845e-05, + "loss": 0.5688, + "step": 294 + }, + { + "epoch": 0.3776, + "grad_norm": 0.2567128249078116, + "learning_rate": 3.929355165108299e-05, + "loss": 0.5685, + "step": 295 + }, + { + "epoch": 0.37888, + "grad_norm": 0.33868790476694477, + "learning_rate": 3.920161866827889e-05, + "loss": 0.5603, + "step": 296 + }, + { + "epoch": 0.38016, + "grad_norm": 0.23159577247265578, + "learning_rate": 3.910940126383013e-05, + "loss": 0.5614, + "step": 297 + }, + { + "epoch": 0.38144, + "grad_norm": 0.261651084842406, + "learning_rate": 3.9016901284612474e-05, + "loss": 0.5634, + "step": 298 + }, + { + "epoch": 0.38272, + "grad_norm": 0.25918417083311507, + "learning_rate": 3.8924120583160985e-05, + "loss": 0.564, + "step": 299 + }, + { + "epoch": 0.384, + "grad_norm": 0.24530554345595257, + "learning_rate": 3.883106101763285e-05, + "loss": 0.5751, + "step": 300 + }, + { + "epoch": 0.38528, + "grad_norm": 0.25269762220161, + "learning_rate": 3.873772445177015e-05, + "loss": 0.5666, + "step": 301 + }, + { + "epoch": 0.38656, + "grad_norm": 0.2686509936643206, + "learning_rate": 3.8644112754862614e-05, + "loss": 0.5787, + "step": 302 + }, + { + "epoch": 0.38784, + "grad_norm": 0.2306788708136305, + "learning_rate": 3.85502278017101e-05, + "loss": 0.5639, + "step": 303 + }, + { + "epoch": 0.38912, + "grad_norm": 0.25921338779127046, + "learning_rate": 3.84560714725851e-05, + "loss": 0.5611, + "step": 304 + }, + { + "epoch": 0.3904, + "grad_norm": 0.2875441041443726, + "learning_rate": 3.8361645653195026e-05, + "loss": 0.5641, + "step": 305 + }, + { + "epoch": 0.39168, + "grad_norm": 0.24679004673798802, + "learning_rate": 3.8266952234644545e-05, + "loss": 0.5696, + "step": 306 + }, + { + "epoch": 0.39296, + "grad_norm": 0.32137055545965876, + "learning_rate": 3.817199311339759e-05, + "loss": 0.5642, + "step": 307 + }, + { + "epoch": 0.39424, + "grad_norm": 0.25511061251478745, + "learning_rate": 3.807677019123944e-05, + "loss": 0.5535, + "step": 308 + }, + { + "epoch": 0.39552, + "grad_norm": 0.28918335928655264, + "learning_rate": 3.798128537523865e-05, + "loss": 0.5628, + "step": 309 + }, + { + "epoch": 0.3968, + "grad_norm": 0.24654149918381577, + "learning_rate": 3.7885540577708804e-05, + "loss": 0.5693, + "step": 310 + }, + { + "epoch": 0.39808, + "grad_norm": 0.29445546859209903, + "learning_rate": 3.7789537716170256e-05, + "loss": 0.5603, + "step": 311 + }, + { + "epoch": 0.39936, + "grad_norm": 0.2546352881022481, + "learning_rate": 3.76932787133117e-05, + "loss": 0.5605, + "step": 312 + }, + { + "epoch": 0.40064, + "grad_norm": 0.2694209869990075, + "learning_rate": 3.759676549695168e-05, + "loss": 0.5631, + "step": 313 + }, + { + "epoch": 0.40192, + "grad_norm": 0.25374147038521105, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.5737, + "step": 314 + }, + { + "epoch": 0.4032, + "grad_norm": 0.2692940305682822, + "learning_rate": 3.740298416041898e-05, + "loss": 0.5592, + "step": 315 + }, + { + "epoch": 0.40448, + "grad_norm": 0.27292894144562746, + "learning_rate": 3.730571992118462e-05, + "loss": 0.56, + "step": 316 + }, + { + "epoch": 0.40576, + "grad_norm": 0.2831296008226712, + "learning_rate": 3.720820923024778e-05, + "loss": 0.556, + "step": 317 + }, + { + "epoch": 0.40704, + "grad_norm": 0.24842153386433, + "learning_rate": 3.711045404049507e-05, + "loss": 0.5615, + "step": 318 + }, + { + "epoch": 0.40832, + "grad_norm": 0.25137316651759817, + "learning_rate": 3.701245630970979e-05, + "loss": 0.5519, + "step": 319 + }, + { + "epoch": 0.4096, + "grad_norm": 0.256842976111285, + "learning_rate": 3.69142180005327e-05, + "loss": 0.559, + "step": 320 + }, + { + "epoch": 0.41088, + "grad_norm": 0.23856265603702867, + "learning_rate": 3.681574108042274e-05, + "loss": 0.5513, + "step": 321 + }, + { + "epoch": 0.41216, + "grad_norm": 0.2636505871798519, + "learning_rate": 3.6717027521617595e-05, + "loss": 0.5553, + "step": 322 + }, + { + "epoch": 0.41344, + "grad_norm": 0.23126506975873334, + "learning_rate": 3.6618079301094216e-05, + "loss": 0.5535, + "step": 323 + }, + { + "epoch": 0.41472, + "grad_norm": 0.2765627545708339, + "learning_rate": 3.6518898400529214e-05, + "loss": 0.5727, + "step": 324 + }, + { + "epoch": 0.416, + "grad_norm": 0.25065312946741247, + "learning_rate": 3.6419486806259194e-05, + "loss": 0.5642, + "step": 325 + }, + { + "epoch": 0.41728, + "grad_norm": 0.24801934318992586, + "learning_rate": 3.631984650924094e-05, + "loss": 0.5489, + "step": 326 + }, + { + "epoch": 0.41856, + "grad_norm": 0.2505101313201257, + "learning_rate": 3.621997950501156e-05, + "loss": 0.5575, + "step": 327 + }, + { + "epoch": 0.41984, + "grad_norm": 0.22423685749771013, + "learning_rate": 3.611988779364853e-05, + "loss": 0.5468, + "step": 328 + }, + { + "epoch": 0.42112, + "grad_norm": 0.2613413831880175, + "learning_rate": 3.6019573379729643e-05, + "loss": 0.5742, + "step": 329 + }, + { + "epoch": 0.4224, + "grad_norm": 0.23270975616366224, + "learning_rate": 3.591903827229282e-05, + "loss": 0.567, + "step": 330 + }, + { + "epoch": 0.42368, + "grad_norm": 0.22747265941762546, + "learning_rate": 3.5818284484795904e-05, + "loss": 0.5521, + "step": 331 + }, + { + "epoch": 0.42496, + "grad_norm": 0.21305915578050127, + "learning_rate": 3.5717314035076355e-05, + "loss": 0.5507, + "step": 332 + }, + { + "epoch": 0.42624, + "grad_norm": 0.22564541882328742, + "learning_rate": 3.56161289453108e-05, + "loss": 0.5612, + "step": 333 + }, + { + "epoch": 0.42752, + "grad_norm": 0.24927047862972154, + "learning_rate": 3.5514731241974544e-05, + "loss": 0.5511, + "step": 334 + }, + { + "epoch": 0.4288, + "grad_norm": 0.23391865199996986, + "learning_rate": 3.5413122955801005e-05, + "loss": 0.5638, + "step": 335 + }, + { + "epoch": 0.43008, + "grad_norm": 0.2538820004383474, + "learning_rate": 3.5311306121741015e-05, + "loss": 0.5629, + "step": 336 + }, + { + "epoch": 0.43136, + "grad_norm": 0.23071027016549392, + "learning_rate": 3.5209282778922106e-05, + "loss": 0.5524, + "step": 337 + }, + { + "epoch": 0.43264, + "grad_norm": 0.2538902369228176, + "learning_rate": 3.510705497060762e-05, + "loss": 0.5589, + "step": 338 + }, + { + "epoch": 0.43392, + "grad_norm": 0.2396787654237747, + "learning_rate": 3.500462474415584e-05, + "loss": 0.5583, + "step": 339 + }, + { + "epoch": 0.4352, + "grad_norm": 0.24498876339943568, + "learning_rate": 3.490199415097892e-05, + "loss": 0.5586, + "step": 340 + }, + { + "epoch": 0.43648, + "grad_norm": 0.22429707447384664, + "learning_rate": 3.479916524650188e-05, + "loss": 0.5591, + "step": 341 + }, + { + "epoch": 0.43776, + "grad_norm": 0.2488615974495946, + "learning_rate": 3.4696140090121376e-05, + "loss": 0.5716, + "step": 342 + }, + { + "epoch": 0.43904, + "grad_norm": 0.24601068839661924, + "learning_rate": 3.459292074516449e-05, + "loss": 0.572, + "step": 343 + }, + { + "epoch": 0.44032, + "grad_norm": 0.24772654051656678, + "learning_rate": 3.4489509278847414e-05, + "loss": 0.5627, + "step": 344 + }, + { + "epoch": 0.4416, + "grad_norm": 0.24320980786790716, + "learning_rate": 3.4385907762234e-05, + "loss": 0.556, + "step": 345 + }, + { + "epoch": 0.44288, + "grad_norm": 0.2899880667030433, + "learning_rate": 3.428211827019434e-05, + "loss": 0.5625, + "step": 346 + }, + { + "epoch": 0.44416, + "grad_norm": 0.24951619342805165, + "learning_rate": 3.417814288136319e-05, + "loss": 0.5606, + "step": 347 + }, + { + "epoch": 0.44544, + "grad_norm": 0.29645948538693856, + "learning_rate": 3.407398367809832e-05, + "loss": 0.5696, + "step": 348 + }, + { + "epoch": 0.44672, + "grad_norm": 0.2628573817303425, + "learning_rate": 3.3969642746438836e-05, + "loss": 0.5548, + "step": 349 + }, + { + "epoch": 0.448, + "grad_norm": 0.25156314937857505, + "learning_rate": 3.386512217606339e-05, + "loss": 0.5584, + "step": 350 + }, + { + "epoch": 0.44928, + "grad_norm": 0.24416044960818195, + "learning_rate": 3.3760424060248344e-05, + "loss": 0.5501, + "step": 351 + }, + { + "epoch": 0.45056, + "grad_norm": 0.2508322157210064, + "learning_rate": 3.365555049582582e-05, + "loss": 0.5575, + "step": 352 + }, + { + "epoch": 0.45184, + "grad_norm": 0.23612772269109591, + "learning_rate": 3.355050358314172e-05, + "loss": 0.5682, + "step": 353 + }, + { + "epoch": 0.45312, + "grad_norm": 0.24541798896943803, + "learning_rate": 3.3445285426013685e-05, + "loss": 0.551, + "step": 354 + }, + { + "epoch": 0.4544, + "grad_norm": 0.2234016182436274, + "learning_rate": 3.3339898131688914e-05, + "loss": 0.5591, + "step": 355 + }, + { + "epoch": 0.45568, + "grad_norm": 0.24455388410655837, + "learning_rate": 3.323434381080199e-05, + "loss": 0.5542, + "step": 356 + }, + { + "epoch": 0.45696, + "grad_norm": 0.2447461284103842, + "learning_rate": 3.312862457733263e-05, + "loss": 0.5689, + "step": 357 + }, + { + "epoch": 0.45824, + "grad_norm": 0.2611063437561967, + "learning_rate": 3.302274254856329e-05, + "loss": 0.5564, + "step": 358 + }, + { + "epoch": 0.45952, + "grad_norm": 0.2609646712338443, + "learning_rate": 3.2916699845036816e-05, + "loss": 0.5609, + "step": 359 + }, + { + "epoch": 0.4608, + "grad_norm": 0.2668094661495938, + "learning_rate": 3.281049859051394e-05, + "loss": 0.55, + "step": 360 + }, + { + "epoch": 0.46208, + "grad_norm": 0.23869480950176758, + "learning_rate": 3.270414091193077e-05, + "loss": 0.5613, + "step": 361 + }, + { + "epoch": 0.46336, + "grad_norm": 0.24021036759397824, + "learning_rate": 3.2597628939356175e-05, + "loss": 0.5431, + "step": 362 + }, + { + "epoch": 0.46464, + "grad_norm": 0.22964918725760114, + "learning_rate": 3.2490964805949145e-05, + "loss": 0.5568, + "step": 363 + }, + { + "epoch": 0.46592, + "grad_norm": 0.22304779121360874, + "learning_rate": 3.238415064791603e-05, + "loss": 0.561, + "step": 364 + }, + { + "epoch": 0.4672, + "grad_norm": 0.23921210910709523, + "learning_rate": 3.227718860446782e-05, + "loss": 0.5457, + "step": 365 + }, + { + "epoch": 0.46848, + "grad_norm": 0.2209452744046842, + "learning_rate": 3.217008081777726e-05, + "loss": 0.5543, + "step": 366 + }, + { + "epoch": 0.46976, + "grad_norm": 0.24033224570485728, + "learning_rate": 3.206282943293593e-05, + "loss": 0.5491, + "step": 367 + }, + { + "epoch": 0.47104, + "grad_norm": 0.23799116455210376, + "learning_rate": 3.195543659791132e-05, + "loss": 0.5473, + "step": 368 + }, + { + "epoch": 0.47232, + "grad_norm": 0.22503904067880973, + "learning_rate": 3.1847904463503816e-05, + "loss": 0.5633, + "step": 369 + }, + { + "epoch": 0.4736, + "grad_norm": 0.22994992202231845, + "learning_rate": 3.17402351833036e-05, + "loss": 0.56, + "step": 370 + }, + { + "epoch": 0.47488, + "grad_norm": 0.2454490715349298, + "learning_rate": 3.163243091364752e-05, + "loss": 0.5502, + "step": 371 + }, + { + "epoch": 0.47616, + "grad_norm": 0.2567184232746229, + "learning_rate": 3.152449381357593e-05, + "loss": 0.5606, + "step": 372 + }, + { + "epoch": 0.47744, + "grad_norm": 0.22104709877502515, + "learning_rate": 3.141642604478942e-05, + "loss": 0.5452, + "step": 373 + }, + { + "epoch": 0.47872, + "grad_norm": 0.254791340469147, + "learning_rate": 3.130822977160554e-05, + "loss": 0.5575, + "step": 374 + }, + { + "epoch": 0.48, + "grad_norm": 0.2172815217297066, + "learning_rate": 3.119990716091546e-05, + "loss": 0.5592, + "step": 375 + }, + { + "epoch": 0.48128, + "grad_norm": 0.23931030435414483, + "learning_rate": 3.109146038214055e-05, + "loss": 0.5473, + "step": 376 + }, + { + "epoch": 0.48256, + "grad_norm": 0.2290268854716801, + "learning_rate": 3.098289160718895e-05, + "loss": 0.5645, + "step": 377 + }, + { + "epoch": 0.48384, + "grad_norm": 0.23603315462362215, + "learning_rate": 3.087420301041206e-05, + "loss": 0.5408, + "step": 378 + }, + { + "epoch": 0.48512, + "grad_norm": 0.20930260887273594, + "learning_rate": 3.076539676856101e-05, + "loss": 0.5497, + "step": 379 + }, + { + "epoch": 0.4864, + "grad_norm": 0.20115083009307477, + "learning_rate": 3.065647506074306e-05, + "loss": 0.565, + "step": 380 + }, + { + "epoch": 0.48768, + "grad_norm": 0.24482468983205832, + "learning_rate": 3.054744006837794e-05, + "loss": 0.5682, + "step": 381 + }, + { + "epoch": 0.48896, + "grad_norm": 0.21118406075167703, + "learning_rate": 3.0438293975154186e-05, + "loss": 0.5477, + "step": 382 + }, + { + "epoch": 0.49024, + "grad_norm": 0.23616813058785022, + "learning_rate": 3.03290389669854e-05, + "loss": 0.5563, + "step": 383 + }, + { + "epoch": 0.49152, + "grad_norm": 0.21365522272880177, + "learning_rate": 3.021967723196647e-05, + "loss": 0.5507, + "step": 384 + }, + { + "epoch": 0.4928, + "grad_norm": 0.22727170936570085, + "learning_rate": 3.0110210960329725e-05, + "loss": 0.5603, + "step": 385 + }, + { + "epoch": 0.49408, + "grad_norm": 0.20731397878172675, + "learning_rate": 3.0000642344401113e-05, + "loss": 0.5563, + "step": 386 + }, + { + "epoch": 0.49536, + "grad_norm": 0.21475557099958575, + "learning_rate": 2.9890973578556268e-05, + "loss": 0.552, + "step": 387 + }, + { + "epoch": 0.49664, + "grad_norm": 0.26877099196744664, + "learning_rate": 2.978120685917656e-05, + "loss": 0.5409, + "step": 388 + }, + { + "epoch": 0.49792, + "grad_norm": 0.21789486858704812, + "learning_rate": 2.9671344384605127e-05, + "loss": 0.5486, + "step": 389 + }, + { + "epoch": 0.4992, + "grad_norm": 0.23650494006372377, + "learning_rate": 2.956138835510282e-05, + "loss": 0.5454, + "step": 390 + }, + { + "epoch": 0.50048, + "grad_norm": 0.22781069793383732, + "learning_rate": 2.945134097280417e-05, + "loss": 0.5615, + "step": 391 + }, + { + "epoch": 0.50176, + "grad_norm": 0.2365389525743412, + "learning_rate": 2.9341204441673266e-05, + "loss": 0.5544, + "step": 392 + }, + { + "epoch": 0.50304, + "grad_norm": 0.21603949855780968, + "learning_rate": 2.9230980967459593e-05, + "loss": 0.569, + "step": 393 + }, + { + "epoch": 0.50432, + "grad_norm": 0.2037144247370564, + "learning_rate": 2.9120672757653916e-05, + "loss": 0.5497, + "step": 394 + }, + { + "epoch": 0.5056, + "grad_norm": 0.23323915671730888, + "learning_rate": 2.9010282021444008e-05, + "loss": 0.5583, + "step": 395 + }, + { + "epoch": 0.50688, + "grad_norm": 0.21232243650279164, + "learning_rate": 2.8899810969670448e-05, + "loss": 0.5632, + "step": 396 + }, + { + "epoch": 0.50816, + "grad_norm": 0.23946451797290258, + "learning_rate": 2.8789261814782316e-05, + "loss": 0.5593, + "step": 397 + }, + { + "epoch": 0.50944, + "grad_norm": 0.2277720908380823, + "learning_rate": 2.8678636770792906e-05, + "loss": 0.5557, + "step": 398 + }, + { + "epoch": 0.51072, + "grad_norm": 0.2121131449498051, + "learning_rate": 2.856793805323536e-05, + "loss": 0.5494, + "step": 399 + }, + { + "epoch": 0.512, + "grad_norm": 0.2257692000346662, + "learning_rate": 2.845716787911833e-05, + "loss": 0.5669, + "step": 400 + }, + { + "epoch": 0.51328, + "grad_norm": 0.23206067761113425, + "learning_rate": 2.8346328466881545e-05, + "loss": 0.5587, + "step": 401 + }, + { + "epoch": 0.51456, + "grad_norm": 0.22901702627476864, + "learning_rate": 2.8235422036351382e-05, + "loss": 0.5719, + "step": 402 + }, + { + "epoch": 0.51584, + "grad_norm": 0.2284208784515934, + "learning_rate": 2.812445080869646e-05, + "loss": 0.5608, + "step": 403 + }, + { + "epoch": 0.51712, + "grad_norm": 0.21851283140724692, + "learning_rate": 2.8013417006383076e-05, + "loss": 0.5567, + "step": 404 + }, + { + "epoch": 0.5184, + "grad_norm": 0.21275077693373082, + "learning_rate": 2.7902322853130757e-05, + "loss": 0.545, + "step": 405 + }, + { + "epoch": 0.51968, + "grad_norm": 0.2251800050901655, + "learning_rate": 2.77911705738677e-05, + "loss": 0.555, + "step": 406 + }, + { + "epoch": 0.52096, + "grad_norm": 0.21241829251347838, + "learning_rate": 2.7679962394686198e-05, + "loss": 0.5581, + "step": 407 + }, + { + "epoch": 0.52224, + "grad_norm": 0.21703951180685088, + "learning_rate": 2.756870054279811e-05, + "loss": 0.5488, + "step": 408 + }, + { + "epoch": 0.52352, + "grad_norm": 0.22069571037639818, + "learning_rate": 2.745738724649018e-05, + "loss": 0.5535, + "step": 409 + }, + { + "epoch": 0.5248, + "grad_norm": 0.25932488146409416, + "learning_rate": 2.7346024735079486e-05, + "loss": 0.5583, + "step": 410 + }, + { + "epoch": 0.52608, + "grad_norm": 0.20727956981757625, + "learning_rate": 2.7234615238868732e-05, + "loss": 0.5632, + "step": 411 + }, + { + "epoch": 0.52736, + "grad_norm": 0.24066945021398153, + "learning_rate": 2.712316098910162e-05, + "loss": 0.563, + "step": 412 + }, + { + "epoch": 0.52864, + "grad_norm": 0.25008559453154683, + "learning_rate": 2.7011664217918154e-05, + "loss": 0.5488, + "step": 413 + }, + { + "epoch": 0.52992, + "grad_norm": 0.23013581555834695, + "learning_rate": 2.6900127158309903e-05, + "loss": 0.5633, + "step": 414 + }, + { + "epoch": 0.5312, + "grad_norm": 0.22224479213901044, + "learning_rate": 2.6788552044075344e-05, + "loss": 0.549, + "step": 415 + }, + { + "epoch": 0.53248, + "grad_norm": 0.2654591019527864, + "learning_rate": 2.667694110977506e-05, + "loss": 0.5574, + "step": 416 + }, + { + "epoch": 0.53376, + "grad_norm": 0.19545471307044301, + "learning_rate": 2.656529659068705e-05, + "loss": 0.5488, + "step": 417 + }, + { + "epoch": 0.53504, + "grad_norm": 0.20254148474738726, + "learning_rate": 2.6453620722761896e-05, + "loss": 0.5638, + "step": 418 + }, + { + "epoch": 0.53632, + "grad_norm": 0.22685700497041825, + "learning_rate": 2.6341915742578037e-05, + "loss": 0.5414, + "step": 419 + }, + { + "epoch": 0.5376, + "grad_norm": 0.20942540533409634, + "learning_rate": 2.6230183887296955e-05, + "loss": 0.5565, + "step": 420 + }, + { + "epoch": 0.53888, + "grad_norm": 0.21464639120298495, + "learning_rate": 2.6118427394618357e-05, + "loss": 0.5561, + "step": 421 + }, + { + "epoch": 0.54016, + "grad_norm": 0.22217802407368148, + "learning_rate": 2.600664850273538e-05, + "loss": 0.5533, + "step": 422 + }, + { + "epoch": 0.54144, + "grad_norm": 0.18275865926888668, + "learning_rate": 2.5894849450289764e-05, + "loss": 0.5488, + "step": 423 + }, + { + "epoch": 0.54272, + "grad_norm": 0.2167014210634381, + "learning_rate": 2.5783032476327007e-05, + "loss": 0.5461, + "step": 424 + }, + { + "epoch": 0.544, + "grad_norm": 0.2034252786956083, + "learning_rate": 2.5671199820251534e-05, + "loss": 0.5509, + "step": 425 + }, + { + "epoch": 0.54528, + "grad_norm": 0.18569564751312728, + "learning_rate": 2.5559353721781832e-05, + "loss": 0.5478, + "step": 426 + }, + { + "epoch": 0.54656, + "grad_norm": 0.20022896045042796, + "learning_rate": 2.544749642090561e-05, + "loss": 0.5477, + "step": 427 + }, + { + "epoch": 0.54784, + "grad_norm": 0.21037283126869577, + "learning_rate": 2.5335630157834937e-05, + "loss": 0.5539, + "step": 428 + }, + { + "epoch": 0.54912, + "grad_norm": 0.19972461705925887, + "learning_rate": 2.522375717296137e-05, + "loss": 0.5569, + "step": 429 + }, + { + "epoch": 0.5504, + "grad_norm": 0.19005063735053182, + "learning_rate": 2.5111879706811087e-05, + "loss": 0.5457, + "step": 430 + }, + { + "epoch": 0.55168, + "grad_norm": 0.21356314651647584, + "learning_rate": 2.5e-05, + "loss": 0.5459, + "step": 431 + }, + { + "epoch": 0.55296, + "grad_norm": 0.20945885149378243, + "learning_rate": 2.4888120293188916e-05, + "loss": 0.5492, + "step": 432 + }, + { + "epoch": 0.55424, + "grad_norm": 0.2230850015397349, + "learning_rate": 2.4776242827038636e-05, + "loss": 0.5545, + "step": 433 + }, + { + "epoch": 0.55552, + "grad_norm": 0.23589759701529497, + "learning_rate": 2.4664369842165068e-05, + "loss": 0.5412, + "step": 434 + }, + { + "epoch": 0.5568, + "grad_norm": 0.22581333235197817, + "learning_rate": 2.4552503579094397e-05, + "loss": 0.5435, + "step": 435 + }, + { + "epoch": 0.55808, + "grad_norm": 0.24919774658733163, + "learning_rate": 2.4440646278218177e-05, + "loss": 0.545, + "step": 436 + }, + { + "epoch": 0.55936, + "grad_norm": 0.2190295213590674, + "learning_rate": 2.4328800179748475e-05, + "loss": 0.5408, + "step": 437 + }, + { + "epoch": 0.56064, + "grad_norm": 0.2628486805976341, + "learning_rate": 2.4216967523673e-05, + "loss": 0.5655, + "step": 438 + }, + { + "epoch": 0.56192, + "grad_norm": 0.25252172819470975, + "learning_rate": 2.4105150549710238e-05, + "loss": 0.5552, + "step": 439 + }, + { + "epoch": 0.5632, + "grad_norm": 0.20229952269202467, + "learning_rate": 2.399335149726463e-05, + "loss": 0.5429, + "step": 440 + }, + { + "epoch": 0.56448, + "grad_norm": 0.24469450227466819, + "learning_rate": 2.388157260538165e-05, + "loss": 0.5582, + "step": 441 + }, + { + "epoch": 0.56576, + "grad_norm": 0.21164210405032619, + "learning_rate": 2.3769816112703047e-05, + "loss": 0.5506, + "step": 442 + }, + { + "epoch": 0.56704, + "grad_norm": 0.22950726772051805, + "learning_rate": 2.365808425742196e-05, + "loss": 0.5497, + "step": 443 + }, + { + "epoch": 0.56832, + "grad_norm": 0.23336024035401096, + "learning_rate": 2.3546379277238107e-05, + "loss": 0.5415, + "step": 444 + }, + { + "epoch": 0.5696, + "grad_norm": 0.19765357154592897, + "learning_rate": 2.3434703409312954e-05, + "loss": 0.5423, + "step": 445 + }, + { + "epoch": 0.57088, + "grad_norm": 0.20108815980408287, + "learning_rate": 2.3323058890224938e-05, + "loss": 0.5543, + "step": 446 + }, + { + "epoch": 0.57216, + "grad_norm": 0.20801363546967222, + "learning_rate": 2.321144795592467e-05, + "loss": 0.5579, + "step": 447 + }, + { + "epoch": 0.57344, + "grad_norm": 0.19116838604162356, + "learning_rate": 2.3099872841690103e-05, + "loss": 0.5418, + "step": 448 + }, + { + "epoch": 0.57472, + "grad_norm": 0.18998557353906567, + "learning_rate": 2.2988335782081855e-05, + "loss": 0.5575, + "step": 449 + }, + { + "epoch": 0.576, + "grad_norm": 0.20390602126403667, + "learning_rate": 2.2876839010898377e-05, + "loss": 0.5517, + "step": 450 + }, + { + "epoch": 0.57728, + "grad_norm": 0.20273804136737242, + "learning_rate": 2.2765384761131274e-05, + "loss": 0.5512, + "step": 451 + }, + { + "epoch": 0.57856, + "grad_norm": 0.2323345427375102, + "learning_rate": 2.265397526492052e-05, + "loss": 0.5552, + "step": 452 + }, + { + "epoch": 0.57984, + "grad_norm": 0.21618001641053508, + "learning_rate": 2.2542612753509823e-05, + "loss": 0.5497, + "step": 453 + }, + { + "epoch": 0.58112, + "grad_norm": 0.2089693691856067, + "learning_rate": 2.24312994572019e-05, + "loss": 0.5437, + "step": 454 + }, + { + "epoch": 0.5824, + "grad_norm": 0.21321327766142864, + "learning_rate": 2.2320037605313808e-05, + "loss": 0.5457, + "step": 455 + }, + { + "epoch": 0.58368, + "grad_norm": 0.1866665167284803, + "learning_rate": 2.2208829426132307e-05, + "loss": 0.5589, + "step": 456 + }, + { + "epoch": 0.58496, + "grad_norm": 0.19549868239805784, + "learning_rate": 2.2097677146869242e-05, + "loss": 0.5428, + "step": 457 + }, + { + "epoch": 0.58624, + "grad_norm": 0.21465688730228774, + "learning_rate": 2.1986582993616926e-05, + "loss": 0.5409, + "step": 458 + }, + { + "epoch": 0.58752, + "grad_norm": 0.19658929521150698, + "learning_rate": 2.1875549191303545e-05, + "loss": 0.5464, + "step": 459 + }, + { + "epoch": 0.5888, + "grad_norm": 0.198475793396338, + "learning_rate": 2.1764577963648614e-05, + "loss": 0.5468, + "step": 460 + }, + { + "epoch": 0.59008, + "grad_norm": 0.21158010098295732, + "learning_rate": 2.1653671533118468e-05, + "loss": 0.5537, + "step": 461 + }, + { + "epoch": 0.59136, + "grad_norm": 0.21320070068715855, + "learning_rate": 2.154283212088168e-05, + "loss": 0.5486, + "step": 462 + }, + { + "epoch": 0.59264, + "grad_norm": 0.2212774837957101, + "learning_rate": 2.1432061946764644e-05, + "loss": 0.5381, + "step": 463 + }, + { + "epoch": 0.59392, + "grad_norm": 0.19109434465910716, + "learning_rate": 2.1321363229207096e-05, + "loss": 0.5538, + "step": 464 + }, + { + "epoch": 0.5952, + "grad_norm": 0.194950360975785, + "learning_rate": 2.121073818521769e-05, + "loss": 0.5539, + "step": 465 + }, + { + "epoch": 0.59648, + "grad_norm": 0.20628175723560369, + "learning_rate": 2.1100189030329558e-05, + "loss": 0.5414, + "step": 466 + }, + { + "epoch": 0.59776, + "grad_norm": 0.19027258987716833, + "learning_rate": 2.098971797855599e-05, + "loss": 0.5465, + "step": 467 + }, + { + "epoch": 0.59904, + "grad_norm": 0.1968353186636271, + "learning_rate": 2.0879327242346093e-05, + "loss": 0.5429, + "step": 468 + }, + { + "epoch": 0.60032, + "grad_norm": 0.19197267253975747, + "learning_rate": 2.0769019032540416e-05, + "loss": 0.5576, + "step": 469 + }, + { + "epoch": 0.6016, + "grad_norm": 0.1989897674210096, + "learning_rate": 2.0658795558326743e-05, + "loss": 0.5421, + "step": 470 + }, + { + "epoch": 0.60288, + "grad_norm": 0.2081969628582826, + "learning_rate": 2.054865902719584e-05, + "loss": 0.567, + "step": 471 + }, + { + "epoch": 0.60416, + "grad_norm": 0.19431308789862117, + "learning_rate": 2.043861164489719e-05, + "loss": 0.5659, + "step": 472 + }, + { + "epoch": 0.60544, + "grad_norm": 0.21246150041771483, + "learning_rate": 2.0328655615394882e-05, + "loss": 0.5457, + "step": 473 + }, + { + "epoch": 0.60672, + "grad_norm": 0.20631939020361545, + "learning_rate": 2.021879314082344e-05, + "loss": 0.5461, + "step": 474 + }, + { + "epoch": 0.608, + "grad_norm": 0.18725048532643526, + "learning_rate": 2.0109026421443745e-05, + "loss": 0.5451, + "step": 475 + }, + { + "epoch": 0.60928, + "grad_norm": 0.20394496506934465, + "learning_rate": 1.9999357655598893e-05, + "loss": 0.5575, + "step": 476 + }, + { + "epoch": 0.61056, + "grad_norm": 0.22485652668287356, + "learning_rate": 1.9889789039670277e-05, + "loss": 0.553, + "step": 477 + }, + { + "epoch": 0.61184, + "grad_norm": 0.20345540078269608, + "learning_rate": 1.978032276803354e-05, + "loss": 0.55, + "step": 478 + }, + { + "epoch": 0.61312, + "grad_norm": 0.2062068875047177, + "learning_rate": 1.9670961033014605e-05, + "loss": 0.5473, + "step": 479 + }, + { + "epoch": 0.6144, + "grad_norm": 0.18808340518062924, + "learning_rate": 1.956170602484582e-05, + "loss": 0.5481, + "step": 480 + }, + { + "epoch": 0.61568, + "grad_norm": 0.1948648618263434, + "learning_rate": 1.9452559931622067e-05, + "loss": 0.5611, + "step": 481 + }, + { + "epoch": 0.61696, + "grad_norm": 0.21011586889216233, + "learning_rate": 1.934352493925695e-05, + "loss": 0.558, + "step": 482 + }, + { + "epoch": 0.61824, + "grad_norm": 0.21372649563774826, + "learning_rate": 1.9234603231438995e-05, + "loss": 0.5504, + "step": 483 + }, + { + "epoch": 0.61952, + "grad_norm": 0.21202134397012273, + "learning_rate": 1.9125796989587947e-05, + "loss": 0.5423, + "step": 484 + }, + { + "epoch": 0.6208, + "grad_norm": 0.1813013729625519, + "learning_rate": 1.9017108392811065e-05, + "loss": 0.5478, + "step": 485 + }, + { + "epoch": 0.62208, + "grad_norm": 0.19357846387987115, + "learning_rate": 1.8908539617859456e-05, + "loss": 0.5371, + "step": 486 + }, + { + "epoch": 0.62336, + "grad_norm": 0.22921908791193235, + "learning_rate": 1.880009283908454e-05, + "loss": 0.5508, + "step": 487 + }, + { + "epoch": 0.62464, + "grad_norm": 0.17331042850311895, + "learning_rate": 1.8691770228394456e-05, + "loss": 0.5494, + "step": 488 + }, + { + "epoch": 0.62592, + "grad_norm": 0.19359927189150627, + "learning_rate": 1.858357395521058e-05, + "loss": 0.5597, + "step": 489 + }, + { + "epoch": 0.6272, + "grad_norm": 0.18648874954378045, + "learning_rate": 1.8475506186424074e-05, + "loss": 0.5484, + "step": 490 + }, + { + "epoch": 0.62848, + "grad_norm": 0.1886503036851335, + "learning_rate": 1.8367569086352483e-05, + "loss": 0.5654, + "step": 491 + }, + { + "epoch": 0.62976, + "grad_norm": 0.17838832879819544, + "learning_rate": 1.825976481669641e-05, + "loss": 0.5441, + "step": 492 + }, + { + "epoch": 0.63104, + "grad_norm": 0.18783084375563594, + "learning_rate": 1.815209553649619e-05, + "loss": 0.5499, + "step": 493 + }, + { + "epoch": 0.63232, + "grad_norm": 0.17314646438183282, + "learning_rate": 1.8044563402088684e-05, + "loss": 0.5448, + "step": 494 + }, + { + "epoch": 0.6336, + "grad_norm": 0.18733598616464592, + "learning_rate": 1.7937170567064075e-05, + "loss": 0.5664, + "step": 495 + }, + { + "epoch": 0.63488, + "grad_norm": 0.20893112653690207, + "learning_rate": 1.7829919182222752e-05, + "loss": 0.537, + "step": 496 + }, + { + "epoch": 0.63616, + "grad_norm": 0.18275232842737502, + "learning_rate": 1.7722811395532178e-05, + "loss": 0.556, + "step": 497 + }, + { + "epoch": 0.63744, + "grad_norm": 0.1735031563340356, + "learning_rate": 1.7615849352083975e-05, + "loss": 0.5541, + "step": 498 + }, + { + "epoch": 0.63872, + "grad_norm": 0.19136749409935053, + "learning_rate": 1.7509035194050868e-05, + "loss": 0.5394, + "step": 499 + }, + { + "epoch": 0.64, + "grad_norm": 0.18335262719738063, + "learning_rate": 1.740237106064383e-05, + "loss": 0.5491, + "step": 500 + }, + { + "epoch": 0.64128, + "grad_norm": 0.17240458107083004, + "learning_rate": 1.7295859088069234e-05, + "loss": 0.5368, + "step": 501 + }, + { + "epoch": 0.64256, + "grad_norm": 0.17840513795401933, + "learning_rate": 1.7189501409486062e-05, + "loss": 0.5478, + "step": 502 + }, + { + "epoch": 0.64384, + "grad_norm": 0.18720267712459818, + "learning_rate": 1.7083300154963193e-05, + "loss": 0.5616, + "step": 503 + }, + { + "epoch": 0.64512, + "grad_norm": 0.16661138773775544, + "learning_rate": 1.6977257451436712e-05, + "loss": 0.5533, + "step": 504 + }, + { + "epoch": 0.6464, + "grad_norm": 0.181117880126469, + "learning_rate": 1.6871375422667375e-05, + "loss": 0.5517, + "step": 505 + }, + { + "epoch": 0.64768, + "grad_norm": 0.16124311049954437, + "learning_rate": 1.6765656189198013e-05, + "loss": 0.5399, + "step": 506 + }, + { + "epoch": 0.64896, + "grad_norm": 0.1780112598733124, + "learning_rate": 1.6660101868311092e-05, + "loss": 0.5545, + "step": 507 + }, + { + "epoch": 0.65024, + "grad_norm": 0.17321825984782027, + "learning_rate": 1.6554714573986324e-05, + "loss": 0.5506, + "step": 508 + }, + { + "epoch": 0.65152, + "grad_norm": 0.19903568513322475, + "learning_rate": 1.6449496416858284e-05, + "loss": 0.5463, + "step": 509 + }, + { + "epoch": 0.6528, + "grad_norm": 0.16644287745949193, + "learning_rate": 1.6344449504174193e-05, + "loss": 0.547, + "step": 510 + }, + { + "epoch": 0.65408, + "grad_norm": 0.18250795813297635, + "learning_rate": 1.623957593975166e-05, + "loss": 0.5406, + "step": 511 + }, + { + "epoch": 0.65536, + "grad_norm": 0.1858788673421162, + "learning_rate": 1.613487782393661e-05, + "loss": 0.5591, + "step": 512 + }, + { + "epoch": 0.65664, + "grad_norm": 0.17724801933662296, + "learning_rate": 1.6030357253561173e-05, + "loss": 0.5418, + "step": 513 + }, + { + "epoch": 0.65792, + "grad_norm": 0.1708440658079089, + "learning_rate": 1.592601632190169e-05, + "loss": 0.5562, + "step": 514 + }, + { + "epoch": 0.6592, + "grad_norm": 0.1784213729096496, + "learning_rate": 1.582185711863681e-05, + "loss": 0.5394, + "step": 515 + }, + { + "epoch": 0.66048, + "grad_norm": 0.180502730233706, + "learning_rate": 1.571788172980566e-05, + "loss": 0.555, + "step": 516 + }, + { + "epoch": 0.66176, + "grad_norm": 0.16436771129623443, + "learning_rate": 1.5614092237766006e-05, + "loss": 0.5427, + "step": 517 + }, + { + "epoch": 0.66304, + "grad_norm": 0.18053318678607747, + "learning_rate": 1.5510490721152592e-05, + "loss": 0.5492, + "step": 518 + }, + { + "epoch": 0.66432, + "grad_norm": 0.18604940199556919, + "learning_rate": 1.5407079254835506e-05, + "loss": 0.5335, + "step": 519 + }, + { + "epoch": 0.6656, + "grad_norm": 0.19210597805514745, + "learning_rate": 1.530385990987863e-05, + "loss": 0.5442, + "step": 520 + }, + { + "epoch": 0.66688, + "grad_norm": 0.17611283743427303, + "learning_rate": 1.5200834753498128e-05, + "loss": 0.5322, + "step": 521 + }, + { + "epoch": 0.66816, + "grad_norm": 0.22007676229449954, + "learning_rate": 1.509800584902108e-05, + "loss": 0.5584, + "step": 522 + }, + { + "epoch": 0.66944, + "grad_norm": 0.17964636810711826, + "learning_rate": 1.499537525584416e-05, + "loss": 0.546, + "step": 523 + }, + { + "epoch": 0.67072, + "grad_norm": 0.18491688306598894, + "learning_rate": 1.489294502939238e-05, + "loss": 0.5469, + "step": 524 + }, + { + "epoch": 0.672, + "grad_norm": 0.20303860785516076, + "learning_rate": 1.4790717221077898e-05, + "loss": 0.5499, + "step": 525 + }, + { + "epoch": 0.67328, + "grad_norm": 0.18536413393916135, + "learning_rate": 1.4688693878258991e-05, + "loss": 0.5427, + "step": 526 + }, + { + "epoch": 0.67456, + "grad_norm": 0.19288854639543657, + "learning_rate": 1.4586877044199016e-05, + "loss": 0.5576, + "step": 527 + }, + { + "epoch": 0.67584, + "grad_norm": 0.19998872597922154, + "learning_rate": 1.4485268758025466e-05, + "loss": 0.5337, + "step": 528 + }, + { + "epoch": 0.67712, + "grad_norm": 0.2356208291248628, + "learning_rate": 1.4383871054689213e-05, + "loss": 0.5457, + "step": 529 + }, + { + "epoch": 0.6784, + "grad_norm": 0.19146077814425122, + "learning_rate": 1.4282685964923642e-05, + "loss": 0.549, + "step": 530 + }, + { + "epoch": 0.67968, + "grad_norm": 0.23473975201728703, + "learning_rate": 1.4181715515204095e-05, + "loss": 0.5377, + "step": 531 + }, + { + "epoch": 0.68096, + "grad_norm": 0.196745306404929, + "learning_rate": 1.4080961727707184e-05, + "loss": 0.5458, + "step": 532 + }, + { + "epoch": 0.68224, + "grad_norm": 0.1723810980465925, + "learning_rate": 1.398042662027035e-05, + "loss": 0.5463, + "step": 533 + }, + { + "epoch": 0.68352, + "grad_norm": 0.2148768848114498, + "learning_rate": 1.3880112206351475e-05, + "loss": 0.5511, + "step": 534 + }, + { + "epoch": 0.6848, + "grad_norm": 0.19580634261932736, + "learning_rate": 1.3780020494988446e-05, + "loss": 0.5458, + "step": 535 + }, + { + "epoch": 0.68608, + "grad_norm": 0.18528206536801226, + "learning_rate": 1.3680153490759073e-05, + "loss": 0.5513, + "step": 536 + }, + { + "epoch": 0.68736, + "grad_norm": 0.1833258102759667, + "learning_rate": 1.3580513193740818e-05, + "loss": 0.5436, + "step": 537 + }, + { + "epoch": 0.68864, + "grad_norm": 0.18879719857423252, + "learning_rate": 1.3481101599470794e-05, + "loss": 0.5381, + "step": 538 + }, + { + "epoch": 0.68992, + "grad_norm": 0.18597088087253308, + "learning_rate": 1.3381920698905787e-05, + "loss": 0.5519, + "step": 539 + }, + { + "epoch": 0.6912, + "grad_norm": 0.19724256330188028, + "learning_rate": 1.328297247838241e-05, + "loss": 0.5499, + "step": 540 + }, + { + "epoch": 0.69248, + "grad_norm": 0.1740013114346114, + "learning_rate": 1.3184258919577269e-05, + "loss": 0.5413, + "step": 541 + }, + { + "epoch": 0.69376, + "grad_norm": 0.18816042265895747, + "learning_rate": 1.3085781999467303e-05, + "loss": 0.5257, + "step": 542 + }, + { + "epoch": 0.69504, + "grad_norm": 0.18327617156382764, + "learning_rate": 1.2987543690290222e-05, + "loss": 0.5501, + "step": 543 + }, + { + "epoch": 0.69632, + "grad_norm": 0.17743894174718156, + "learning_rate": 1.2889545959504939e-05, + "loss": 0.5423, + "step": 544 + }, + { + "epoch": 0.6976, + "grad_norm": 0.1775283974897721, + "learning_rate": 1.2791790769752232e-05, + "loss": 0.5393, + "step": 545 + }, + { + "epoch": 0.69888, + "grad_norm": 0.16200081974294722, + "learning_rate": 1.2694280078815382e-05, + "loss": 0.5583, + "step": 546 + }, + { + "epoch": 0.70016, + "grad_norm": 0.16722111373810458, + "learning_rate": 1.2597015839581033e-05, + "loss": 0.5417, + "step": 547 + }, + { + "epoch": 0.70144, + "grad_norm": 0.17064698518444713, + "learning_rate": 1.2500000000000006e-05, + "loss": 0.5416, + "step": 548 + }, + { + "epoch": 0.70272, + "grad_norm": 0.18207070316671944, + "learning_rate": 1.2403234503048319e-05, + "loss": 0.5552, + "step": 549 + }, + { + "epoch": 0.704, + "grad_norm": 0.17186248314141456, + "learning_rate": 1.230672128668831e-05, + "loss": 0.5503, + "step": 550 + }, + { + "epoch": 0.70528, + "grad_norm": 0.18897462643935614, + "learning_rate": 1.2210462283829755e-05, + "loss": 0.5523, + "step": 551 + }, + { + "epoch": 0.70656, + "grad_norm": 0.18474441376802078, + "learning_rate": 1.2114459422291205e-05, + "loss": 0.543, + "step": 552 + }, + { + "epoch": 0.70784, + "grad_norm": 0.1726434041865777, + "learning_rate": 1.2018714624761352e-05, + "loss": 0.5342, + "step": 553 + }, + { + "epoch": 0.70912, + "grad_norm": 0.17218934161002325, + "learning_rate": 1.1923229808760564e-05, + "loss": 0.5422, + "step": 554 + }, + { + "epoch": 0.7104, + "grad_norm": 0.18953711573549967, + "learning_rate": 1.1828006886602422e-05, + "loss": 0.5406, + "step": 555 + }, + { + "epoch": 0.71168, + "grad_norm": 0.1880902181302069, + "learning_rate": 1.1733047765355466e-05, + "loss": 0.5462, + "step": 556 + }, + { + "epoch": 0.71296, + "grad_norm": 0.1596287230354233, + "learning_rate": 1.1638354346804971e-05, + "loss": 0.5388, + "step": 557 + }, + { + "epoch": 0.71424, + "grad_norm": 0.17158572664682273, + "learning_rate": 1.154392852741491e-05, + "loss": 0.5527, + "step": 558 + }, + { + "epoch": 0.71552, + "grad_norm": 0.1804729952348338, + "learning_rate": 1.1449772198289905e-05, + "loss": 0.5426, + "step": 559 + }, + { + "epoch": 0.7168, + "grad_norm": 0.1791816734451783, + "learning_rate": 1.1355887245137383e-05, + "loss": 0.5588, + "step": 560 + }, + { + "epoch": 0.71808, + "grad_norm": 0.16584626596374333, + "learning_rate": 1.126227554822985e-05, + "loss": 0.5366, + "step": 561 + }, + { + "epoch": 0.71936, + "grad_norm": 0.1727030219272147, + "learning_rate": 1.116893898236716e-05, + "loss": 0.5422, + "step": 562 + }, + { + "epoch": 0.72064, + "grad_norm": 0.17521577149409487, + "learning_rate": 1.1075879416839023e-05, + "loss": 0.5407, + "step": 563 + }, + { + "epoch": 0.72192, + "grad_norm": 0.27313054071540727, + "learning_rate": 1.0983098715387526e-05, + "loss": 0.5456, + "step": 564 + }, + { + "epoch": 0.7232, + "grad_norm": 0.16784636956770904, + "learning_rate": 1.089059873616988e-05, + "loss": 0.5552, + "step": 565 + }, + { + "epoch": 0.72448, + "grad_norm": 0.1733610583523821, + "learning_rate": 1.0798381331721109e-05, + "loss": 0.5462, + "step": 566 + }, + { + "epoch": 0.72576, + "grad_norm": 0.1840396076908537, + "learning_rate": 1.0706448348917006e-05, + "loss": 0.5358, + "step": 567 + }, + { + "epoch": 0.72704, + "grad_norm": 0.1655608413544908, + "learning_rate": 1.061480162893716e-05, + "loss": 0.5423, + "step": 568 + }, + { + "epoch": 0.72832, + "grad_norm": 0.17123662403250298, + "learning_rate": 1.052344300722803e-05, + "loss": 0.5468, + "step": 569 + }, + { + "epoch": 0.7296, + "grad_norm": 0.17550528875066398, + "learning_rate": 1.043237431346622e-05, + "loss": 0.5454, + "step": 570 + }, + { + "epoch": 0.73088, + "grad_norm": 0.16328379269859897, + "learning_rate": 1.0341597371521825e-05, + "loss": 0.5514, + "step": 571 + }, + { + "epoch": 0.73216, + "grad_norm": 0.16650225308818498, + "learning_rate": 1.0251113999421935e-05, + "loss": 0.5432, + "step": 572 + }, + { + "epoch": 0.73344, + "grad_norm": 0.17350055029708436, + "learning_rate": 1.016092600931414e-05, + "loss": 0.5409, + "step": 573 + }, + { + "epoch": 0.73472, + "grad_norm": 0.1596212141180814, + "learning_rate": 1.0071035207430352e-05, + "loss": 0.5367, + "step": 574 + }, + { + "epoch": 0.736, + "grad_norm": 0.2658852675551339, + "learning_rate": 9.981443394050525e-06, + "loss": 0.5595, + "step": 575 + }, + { + "epoch": 0.73728, + "grad_norm": 0.159704372500189, + "learning_rate": 9.892152363466691e-06, + "loss": 0.5357, + "step": 576 + }, + { + "epoch": 0.73856, + "grad_norm": 0.1651078900711754, + "learning_rate": 9.803163903946951e-06, + "loss": 0.5517, + "step": 577 + }, + { + "epoch": 0.73984, + "grad_norm": 0.1787435238987854, + "learning_rate": 9.714479797699694e-06, + "loss": 0.548, + "step": 578 + }, + { + "epoch": 0.74112, + "grad_norm": 0.1652971919910291, + "learning_rate": 9.626101820837927e-06, + "loss": 0.5539, + "step": 579 + }, + { + "epoch": 0.7424, + "grad_norm": 0.1639694735505629, + "learning_rate": 9.538031743343628e-06, + "loss": 0.5451, + "step": 580 + }, + { + "epoch": 0.74368, + "grad_norm": 0.1647232239012586, + "learning_rate": 9.450271329032404e-06, + "loss": 0.5505, + "step": 581 + }, + { + "epoch": 0.74496, + "grad_norm": 0.16460370416547013, + "learning_rate": 9.362822335518063e-06, + "loss": 0.553, + "step": 582 + }, + { + "epoch": 0.74624, + "grad_norm": 0.1697639854685921, + "learning_rate": 9.275686514177507e-06, + "loss": 0.5502, + "step": 583 + }, + { + "epoch": 0.74752, + "grad_norm": 0.16820172586994564, + "learning_rate": 9.18886561011557e-06, + "loss": 0.5444, + "step": 584 + }, + { + "epoch": 0.7488, + "grad_norm": 0.16239199386354092, + "learning_rate": 9.102361362130132e-06, + "loss": 0.5386, + "step": 585 + }, + { + "epoch": 0.75008, + "grad_norm": 0.17069749707133253, + "learning_rate": 9.01617550267726e-06, + "loss": 0.5407, + "step": 586 + }, + { + "epoch": 0.75136, + "grad_norm": 0.17402297905951, + "learning_rate": 8.930309757836517e-06, + "loss": 0.5508, + "step": 587 + }, + { + "epoch": 0.75264, + "grad_norm": 0.16830124871078894, + "learning_rate": 8.844765847276432e-06, + "loss": 0.5258, + "step": 588 + }, + { + "epoch": 0.75392, + "grad_norm": 0.15842747930020387, + "learning_rate": 8.759545484219984e-06, + "loss": 0.5448, + "step": 589 + }, + { + "epoch": 0.7552, + "grad_norm": 0.16546914193605752, + "learning_rate": 8.67465037541038e-06, + "loss": 0.5377, + "step": 590 + }, + { + "epoch": 0.75648, + "grad_norm": 0.1606695724745096, + "learning_rate": 8.590082221076765e-06, + "loss": 0.5221, + "step": 591 + }, + { + "epoch": 0.75776, + "grad_norm": 0.1632549785956385, + "learning_rate": 8.505842714900297e-06, + "loss": 0.5423, + "step": 592 + }, + { + "epoch": 0.75904, + "grad_norm": 0.16168391888394712, + "learning_rate": 8.421933543980126e-06, + "loss": 0.5458, + "step": 593 + }, + { + "epoch": 0.76032, + "grad_norm": 0.1756546275289762, + "learning_rate": 8.338356388799637e-06, + "loss": 0.5442, + "step": 594 + }, + { + "epoch": 0.7616, + "grad_norm": 0.16963428171763828, + "learning_rate": 8.255112923192835e-06, + "loss": 0.5459, + "step": 595 + }, + { + "epoch": 0.76288, + "grad_norm": 0.15718915668817968, + "learning_rate": 8.172204814310742e-06, + "loss": 0.5449, + "step": 596 + }, + { + "epoch": 0.76416, + "grad_norm": 0.16536550501030137, + "learning_rate": 8.089633722588103e-06, + "loss": 0.5387, + "step": 597 + }, + { + "epoch": 0.76544, + "grad_norm": 0.18318586900512912, + "learning_rate": 8.007401301710022e-06, + "loss": 0.5512, + "step": 598 + }, + { + "epoch": 0.76672, + "grad_norm": 0.16502774685748536, + "learning_rate": 7.92550919857896e-06, + "loss": 0.5369, + "step": 599 + }, + { + "epoch": 0.768, + "grad_norm": 0.15963079625942453, + "learning_rate": 7.843959053281663e-06, + "loss": 0.5445, + "step": 600 + }, + { + "epoch": 0.76928, + "grad_norm": 0.18142281673636804, + "learning_rate": 7.762752499056358e-06, + "loss": 0.5521, + "step": 601 + }, + { + "epoch": 0.77056, + "grad_norm": 0.17003382425447292, + "learning_rate": 7.681891162260015e-06, + "loss": 0.5481, + "step": 602 + }, + { + "epoch": 0.77184, + "grad_norm": 0.15682351369009967, + "learning_rate": 7.60137666233583e-06, + "loss": 0.5427, + "step": 603 + }, + { + "epoch": 0.77312, + "grad_norm": 0.16900483827947543, + "learning_rate": 7.521210611780716e-06, + "loss": 0.5491, + "step": 604 + }, + { + "epoch": 0.7744, + "grad_norm": 0.16933634809215875, + "learning_rate": 7.441394616113062e-06, + "loss": 0.5321, + "step": 605 + }, + { + "epoch": 0.77568, + "grad_norm": 0.15523375999123107, + "learning_rate": 7.361930273840581e-06, + "loss": 0.5359, + "step": 606 + }, + { + "epoch": 0.77696, + "grad_norm": 0.16476118124777908, + "learning_rate": 7.2828191764282515e-06, + "loss": 0.5405, + "step": 607 + }, + { + "epoch": 0.77824, + "grad_norm": 0.15880451278636304, + "learning_rate": 7.20406290826649e-06, + "loss": 0.5341, + "step": 608 + }, + { + "epoch": 0.77952, + "grad_norm": 0.16601175165765483, + "learning_rate": 7.125663046639386e-06, + "loss": 0.5354, + "step": 609 + }, + { + "epoch": 0.7808, + "grad_norm": 0.1594428201948071, + "learning_rate": 7.047621161693152e-06, + "loss": 0.5414, + "step": 610 + }, + { + "epoch": 0.78208, + "grad_norm": 0.15903842435489843, + "learning_rate": 6.969938816404639e-06, + "loss": 0.537, + "step": 611 + }, + { + "epoch": 0.78336, + "grad_norm": 0.16373860391653333, + "learning_rate": 6.892617566550044e-06, + "loss": 0.5324, + "step": 612 + }, + { + "epoch": 0.78464, + "grad_norm": 0.14806851457758538, + "learning_rate": 6.815658960673782e-06, + "loss": 0.5423, + "step": 613 + }, + { + "epoch": 0.78592, + "grad_norm": 0.16162392173499782, + "learning_rate": 6.739064540057424e-06, + "loss": 0.5379, + "step": 614 + }, + { + "epoch": 0.7872, + "grad_norm": 0.1556598212425785, + "learning_rate": 6.662835838688864e-06, + "loss": 0.5409, + "step": 615 + }, + { + "epoch": 0.78848, + "grad_norm": 0.1545780708161842, + "learning_rate": 6.586974383231573e-06, + "loss": 0.5371, + "step": 616 + }, + { + "epoch": 0.78976, + "grad_norm": 0.15481486085975593, + "learning_rate": 6.511481692994076e-06, + "loss": 0.5514, + "step": 617 + }, + { + "epoch": 0.79104, + "grad_norm": 0.15187060933009017, + "learning_rate": 6.436359279899426e-06, + "loss": 0.5352, + "step": 618 + }, + { + "epoch": 0.79232, + "grad_norm": 0.1627175375732009, + "learning_rate": 6.361608648455039e-06, + "loss": 0.54, + "step": 619 + }, + { + "epoch": 0.7936, + "grad_norm": 0.1550038050293079, + "learning_rate": 6.28723129572247e-06, + "loss": 0.5436, + "step": 620 + }, + { + "epoch": 0.79488, + "grad_norm": 0.1605066763261186, + "learning_rate": 6.2132287112875e-06, + "loss": 0.5401, + "step": 621 + }, + { + "epoch": 0.79616, + "grad_norm": 0.15630165231383042, + "learning_rate": 6.1396023772302465e-06, + "loss": 0.5353, + "step": 622 + }, + { + "epoch": 0.79744, + "grad_norm": 0.14818753466561113, + "learning_rate": 6.066353768095504e-06, + "loss": 0.551, + "step": 623 + }, + { + "epoch": 0.79872, + "grad_norm": 0.17322071941695266, + "learning_rate": 5.993484350863246e-06, + "loss": 0.5372, + "step": 624 + }, + { + "epoch": 0.8, + "grad_norm": 0.17299311811164592, + "learning_rate": 5.92099558491917e-06, + "loss": 0.5491, + "step": 625 + }, + { + "epoch": 0.80128, + "grad_norm": 0.15432256972090858, + "learning_rate": 5.848888922025553e-06, + "loss": 0.5451, + "step": 626 + }, + { + "epoch": 0.80256, + "grad_norm": 0.1796991664902234, + "learning_rate": 5.777165806292109e-06, + "loss": 0.542, + "step": 627 + }, + { + "epoch": 0.80384, + "grad_norm": 0.15716216593694202, + "learning_rate": 5.7058276741471236e-06, + "loss": 0.5359, + "step": 628 + } + ], + "logging_steps": 1, + "max_steps": 781, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 157, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 780227321331712.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}