{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.20096, "eval_steps": 500, "global_step": 157, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00128, "grad_norm": 2.8870697066158506, "learning_rate": 0.0, "loss": 0.8422, "step": 1 }, { "epoch": 0.00256, "grad_norm": 2.88484389829891, "learning_rate": 6.329113924050633e-07, "loss": 0.8541, "step": 2 }, { "epoch": 0.00384, "grad_norm": 2.858151965789657, "learning_rate": 1.2658227848101265e-06, "loss": 0.8376, "step": 3 }, { "epoch": 0.00512, "grad_norm": 2.759628117182127, "learning_rate": 1.8987341772151901e-06, "loss": 0.8334, "step": 4 }, { "epoch": 0.0064, "grad_norm": 2.796990062811218, "learning_rate": 2.531645569620253e-06, "loss": 0.8256, "step": 5 }, { "epoch": 0.00768, "grad_norm": 2.5779298795445023, "learning_rate": 3.1645569620253167e-06, "loss": 0.8301, "step": 6 }, { "epoch": 0.00896, "grad_norm": 2.182261607936066, "learning_rate": 3.7974683544303802e-06, "loss": 0.8156, "step": 7 }, { "epoch": 0.01024, "grad_norm": 1.9615896152651355, "learning_rate": 4.430379746835443e-06, "loss": 0.7982, "step": 8 }, { "epoch": 0.01152, "grad_norm": 1.452541644948315, "learning_rate": 5.063291139240506e-06, "loss": 0.7819, "step": 9 }, { "epoch": 0.0128, "grad_norm": 1.4723286808630864, "learning_rate": 5.69620253164557e-06, "loss": 0.7906, "step": 10 }, { "epoch": 0.01408, "grad_norm": 1.3529636617858944, "learning_rate": 6.329113924050633e-06, "loss": 0.7724, "step": 11 }, { "epoch": 0.01536, "grad_norm": 1.960737179905222, "learning_rate": 6.9620253164556965e-06, "loss": 0.7495, "step": 12 }, { "epoch": 0.01664, "grad_norm": 2.2349101055406337, "learning_rate": 7.5949367088607605e-06, "loss": 0.7581, "step": 13 }, { "epoch": 0.01792, "grad_norm": 2.0897577150322477, "learning_rate": 8.227848101265822e-06, "loss": 0.7404, "step": 14 }, { "epoch": 0.0192, "grad_norm": 1.8227218322635887, "learning_rate": 8.860759493670886e-06, "loss": 0.7382, "step": 15 }, { "epoch": 0.02048, "grad_norm": 1.2099951464458898, "learning_rate": 9.49367088607595e-06, "loss": 0.7231, "step": 16 }, { "epoch": 0.02176, "grad_norm": 1.2177037129914572, "learning_rate": 1.0126582278481012e-05, "loss": 0.7259, "step": 17 }, { "epoch": 0.02304, "grad_norm": 1.1031346132830708, "learning_rate": 1.0759493670886076e-05, "loss": 0.7059, "step": 18 }, { "epoch": 0.02432, "grad_norm": 0.9194779600801882, "learning_rate": 1.139240506329114e-05, "loss": 0.7137, "step": 19 }, { "epoch": 0.0256, "grad_norm": 0.8679468005972053, "learning_rate": 1.2025316455696203e-05, "loss": 0.7036, "step": 20 }, { "epoch": 0.02688, "grad_norm": 0.7227287276969042, "learning_rate": 1.2658227848101267e-05, "loss": 0.696, "step": 21 }, { "epoch": 0.02816, "grad_norm": 0.7425882516811844, "learning_rate": 1.3291139240506329e-05, "loss": 0.6888, "step": 22 }, { "epoch": 0.02944, "grad_norm": 0.7093793012252196, "learning_rate": 1.3924050632911393e-05, "loss": 0.6791, "step": 23 }, { "epoch": 0.03072, "grad_norm": 0.6018215463147907, "learning_rate": 1.4556962025316457e-05, "loss": 0.6783, "step": 24 }, { "epoch": 0.032, "grad_norm": 0.5846346732378257, "learning_rate": 1.5189873417721521e-05, "loss": 0.6811, "step": 25 }, { "epoch": 0.03328, "grad_norm": 0.5855419788452784, "learning_rate": 1.5822784810126583e-05, "loss": 0.683, "step": 26 }, { "epoch": 0.03456, "grad_norm": 0.5096689891724868, "learning_rate": 1.6455696202531644e-05, "loss": 0.6589, "step": 27 }, { "epoch": 0.03584, "grad_norm": 0.4871170504081146, "learning_rate": 1.7088607594936708e-05, "loss": 0.6582, "step": 28 }, { "epoch": 0.03712, "grad_norm": 0.4949600697144217, "learning_rate": 1.7721518987341772e-05, "loss": 0.669, "step": 29 }, { "epoch": 0.0384, "grad_norm": 0.5082926031630941, "learning_rate": 1.8354430379746836e-05, "loss": 0.666, "step": 30 }, { "epoch": 0.03968, "grad_norm": 0.49381475380567175, "learning_rate": 1.89873417721519e-05, "loss": 0.6556, "step": 31 }, { "epoch": 0.04096, "grad_norm": 0.4265624784331274, "learning_rate": 1.962025316455696e-05, "loss": 0.646, "step": 32 }, { "epoch": 0.04224, "grad_norm": 0.39190416547723717, "learning_rate": 2.0253164556962025e-05, "loss": 0.6473, "step": 33 }, { "epoch": 0.04352, "grad_norm": 0.4631353399929371, "learning_rate": 2.088607594936709e-05, "loss": 0.6441, "step": 34 }, { "epoch": 0.0448, "grad_norm": 0.3928335126997034, "learning_rate": 2.1518987341772153e-05, "loss": 0.6352, "step": 35 }, { "epoch": 0.04608, "grad_norm": 0.36295027582313966, "learning_rate": 2.2151898734177217e-05, "loss": 0.6333, "step": 36 }, { "epoch": 0.04736, "grad_norm": 0.35026852064181846, "learning_rate": 2.278481012658228e-05, "loss": 0.6399, "step": 37 }, { "epoch": 0.04864, "grad_norm": 0.39778614916835536, "learning_rate": 2.341772151898734e-05, "loss": 0.6298, "step": 38 }, { "epoch": 0.04992, "grad_norm": 0.33278348666417684, "learning_rate": 2.4050632911392405e-05, "loss": 0.6301, "step": 39 }, { "epoch": 0.0512, "grad_norm": 0.31444068712551376, "learning_rate": 2.468354430379747e-05, "loss": 0.6263, "step": 40 }, { "epoch": 0.05248, "grad_norm": 0.36059728676958264, "learning_rate": 2.5316455696202533e-05, "loss": 0.6458, "step": 41 }, { "epoch": 0.05376, "grad_norm": 0.3916144552301749, "learning_rate": 2.5949367088607597e-05, "loss": 0.6331, "step": 42 }, { "epoch": 0.05504, "grad_norm": 0.32338566356420756, "learning_rate": 2.6582278481012658e-05, "loss": 0.6332, "step": 43 }, { "epoch": 0.05632, "grad_norm": 0.33704233729853356, "learning_rate": 2.7215189873417722e-05, "loss": 0.6348, "step": 44 }, { "epoch": 0.0576, "grad_norm": 0.36015399213900634, "learning_rate": 2.7848101265822786e-05, "loss": 0.6392, "step": 45 }, { "epoch": 0.05888, "grad_norm": 0.31471331803021757, "learning_rate": 2.848101265822785e-05, "loss": 0.6272, "step": 46 }, { "epoch": 0.06016, "grad_norm": 0.3225170654156012, "learning_rate": 2.9113924050632914e-05, "loss": 0.635, "step": 47 }, { "epoch": 0.06144, "grad_norm": 0.3064473735810606, "learning_rate": 2.9746835443037974e-05, "loss": 0.6284, "step": 48 }, { "epoch": 0.06272, "grad_norm": 0.3038289969291092, "learning_rate": 3.0379746835443042e-05, "loss": 0.6149, "step": 49 }, { "epoch": 0.064, "grad_norm": 0.3226803690164346, "learning_rate": 3.10126582278481e-05, "loss": 0.626, "step": 50 }, { "epoch": 0.06528, "grad_norm": 0.3096398144524693, "learning_rate": 3.1645569620253167e-05, "loss": 0.621, "step": 51 }, { "epoch": 0.06656, "grad_norm": 0.2754757429130796, "learning_rate": 3.227848101265823e-05, "loss": 0.6185, "step": 52 }, { "epoch": 0.06784, "grad_norm": 0.3262507218160328, "learning_rate": 3.291139240506329e-05, "loss": 0.6171, "step": 53 }, { "epoch": 0.06912, "grad_norm": 0.34971068352090656, "learning_rate": 3.354430379746836e-05, "loss": 0.616, "step": 54 }, { "epoch": 0.0704, "grad_norm": 0.2841621281043231, "learning_rate": 3.4177215189873416e-05, "loss": 0.5995, "step": 55 }, { "epoch": 0.07168, "grad_norm": 0.4003223636484448, "learning_rate": 3.4810126582278487e-05, "loss": 0.6169, "step": 56 }, { "epoch": 0.07296, "grad_norm": 0.31868860231705426, "learning_rate": 3.5443037974683544e-05, "loss": 0.6077, "step": 57 }, { "epoch": 0.07424, "grad_norm": 0.3960425782005289, "learning_rate": 3.607594936708861e-05, "loss": 0.6164, "step": 58 }, { "epoch": 0.07552, "grad_norm": 0.363865574596696, "learning_rate": 3.670886075949367e-05, "loss": 0.6118, "step": 59 }, { "epoch": 0.0768, "grad_norm": 0.33961478774466697, "learning_rate": 3.7341772151898736e-05, "loss": 0.6137, "step": 60 }, { "epoch": 0.07808, "grad_norm": 0.4212164741206082, "learning_rate": 3.79746835443038e-05, "loss": 0.6275, "step": 61 }, { "epoch": 0.07936, "grad_norm": 0.29878729710395663, "learning_rate": 3.8607594936708864e-05, "loss": 0.6084, "step": 62 }, { "epoch": 0.08064, "grad_norm": 0.36745026817379894, "learning_rate": 3.924050632911392e-05, "loss": 0.607, "step": 63 }, { "epoch": 0.08192, "grad_norm": 0.38983571508393644, "learning_rate": 3.987341772151899e-05, "loss": 0.6176, "step": 64 }, { "epoch": 0.0832, "grad_norm": 0.37337392917475115, "learning_rate": 4.050632911392405e-05, "loss": 0.6184, "step": 65 }, { "epoch": 0.08448, "grad_norm": 0.3668068115925863, "learning_rate": 4.113924050632912e-05, "loss": 0.6194, "step": 66 }, { "epoch": 0.08576, "grad_norm": 0.36138503055306903, "learning_rate": 4.177215189873418e-05, "loss": 0.6077, "step": 67 }, { "epoch": 0.08704, "grad_norm": 0.43361127462043814, "learning_rate": 4.240506329113924e-05, "loss": 0.6147, "step": 68 }, { "epoch": 0.08832, "grad_norm": 0.33520423726109644, "learning_rate": 4.3037974683544305e-05, "loss": 0.6118, "step": 69 }, { "epoch": 0.0896, "grad_norm": 0.4381154362148859, "learning_rate": 4.367088607594937e-05, "loss": 0.6031, "step": 70 }, { "epoch": 0.09088, "grad_norm": 0.3717345864324632, "learning_rate": 4.430379746835443e-05, "loss": 0.6031, "step": 71 }, { "epoch": 0.09216, "grad_norm": 0.4861728465398392, "learning_rate": 4.49367088607595e-05, "loss": 0.6006, "step": 72 }, { "epoch": 0.09344, "grad_norm": 0.3264992939190504, "learning_rate": 4.556962025316456e-05, "loss": 0.6151, "step": 73 }, { "epoch": 0.09472, "grad_norm": 0.4319794925001871, "learning_rate": 4.6202531645569625e-05, "loss": 0.6058, "step": 74 }, { "epoch": 0.096, "grad_norm": 0.4616345840492333, "learning_rate": 4.683544303797468e-05, "loss": 0.5967, "step": 75 }, { "epoch": 0.09728, "grad_norm": 0.4405721152587957, "learning_rate": 4.7468354430379746e-05, "loss": 0.6002, "step": 76 }, { "epoch": 0.09856, "grad_norm": 0.5122605377853799, "learning_rate": 4.810126582278481e-05, "loss": 0.6076, "step": 77 }, { "epoch": 0.09984, "grad_norm": 0.45313870340097556, "learning_rate": 4.8734177215189874e-05, "loss": 0.6074, "step": 78 }, { "epoch": 0.10112, "grad_norm": 0.4340044755876676, "learning_rate": 4.936708860759494e-05, "loss": 0.606, "step": 79 }, { "epoch": 0.1024, "grad_norm": 0.4987172862476422, "learning_rate": 5e-05, "loss": 0.6158, "step": 80 }, { "epoch": 0.10368, "grad_norm": 0.6226880208665108, "learning_rate": 4.999974965737065e-05, "loss": 0.621, "step": 81 }, { "epoch": 0.10496, "grad_norm": 0.5448293131914782, "learning_rate": 4.999899863449631e-05, "loss": 0.6014, "step": 82 }, { "epoch": 0.10624, "grad_norm": 0.3427022601926917, "learning_rate": 4.999774694641803e-05, "loss": 0.6198, "step": 83 }, { "epoch": 0.10752, "grad_norm": 0.5005152113593655, "learning_rate": 4.999599461820387e-05, "loss": 0.6054, "step": 84 }, { "epoch": 0.1088, "grad_norm": 0.5702968806820528, "learning_rate": 4.999374168494844e-05, "loss": 0.6069, "step": 85 }, { "epoch": 0.11008, "grad_norm": 0.4671310661706222, "learning_rate": 4.999098819177214e-05, "loss": 0.6017, "step": 86 }, { "epoch": 0.11136, "grad_norm": 0.46081768174689064, "learning_rate": 4.9987734193820324e-05, "loss": 0.5988, "step": 87 }, { "epoch": 0.11264, "grad_norm": 0.5448729856183013, "learning_rate": 4.9983979756262136e-05, "loss": 0.6181, "step": 88 }, { "epoch": 0.11392, "grad_norm": 0.5095775592779056, "learning_rate": 4.9979724954289244e-05, "loss": 0.608, "step": 89 }, { "epoch": 0.1152, "grad_norm": 0.41119162739543413, "learning_rate": 4.997496987311431e-05, "loss": 0.5979, "step": 90 }, { "epoch": 0.11648, "grad_norm": 0.45501958535738946, "learning_rate": 4.996971460796929e-05, "loss": 0.6019, "step": 91 }, { "epoch": 0.11776, "grad_norm": 0.4287172104360816, "learning_rate": 4.9963959264103544e-05, "loss": 0.5955, "step": 92 }, { "epoch": 0.11904, "grad_norm": 0.409872269342458, "learning_rate": 4.995770395678171e-05, "loss": 0.5927, "step": 93 }, { "epoch": 0.12032, "grad_norm": 0.4304173966206036, "learning_rate": 4.995094881128138e-05, "loss": 0.5967, "step": 94 }, { "epoch": 0.1216, "grad_norm": 0.4229799776298517, "learning_rate": 4.994369396289063e-05, "loss": 0.6084, "step": 95 }, { "epoch": 0.12288, "grad_norm": 0.4509596954971553, "learning_rate": 4.9935939556905295e-05, "loss": 0.6134, "step": 96 }, { "epoch": 0.12416, "grad_norm": 0.467661146414229, "learning_rate": 4.992768574862603e-05, "loss": 0.5986, "step": 97 }, { "epoch": 0.12544, "grad_norm": 0.42432875998240194, "learning_rate": 4.9918932703355256e-05, "loss": 0.6028, "step": 98 }, { "epoch": 0.12672, "grad_norm": 0.43479377184835605, "learning_rate": 4.990968059639379e-05, "loss": 0.5942, "step": 99 }, { "epoch": 0.128, "grad_norm": 0.3680676685801686, "learning_rate": 4.989992961303738e-05, "loss": 0.5994, "step": 100 }, { "epoch": 0.12928, "grad_norm": 0.3956815409903461, "learning_rate": 4.9889679948572974e-05, "loss": 0.5871, "step": 101 }, { "epoch": 0.13056, "grad_norm": 0.34354949934586104, "learning_rate": 4.98789318082748e-05, "loss": 0.5873, "step": 102 }, { "epoch": 0.13184, "grad_norm": 0.3608260963951222, "learning_rate": 4.986768540740028e-05, "loss": 0.5883, "step": 103 }, { "epoch": 0.13312, "grad_norm": 0.3937004101078116, "learning_rate": 4.98559409711857e-05, "loss": 0.6029, "step": 104 }, { "epoch": 0.1344, "grad_norm": 0.3401718481532899, "learning_rate": 4.9843698734841705e-05, "loss": 0.5983, "step": 105 }, { "epoch": 0.13568, "grad_norm": 0.4371868869288284, "learning_rate": 4.983095894354858e-05, "loss": 0.5866, "step": 106 }, { "epoch": 0.13696, "grad_norm": 0.3722813571279646, "learning_rate": 4.981772185245135e-05, "loss": 0.5954, "step": 107 }, { "epoch": 0.13824, "grad_norm": 0.36493596395606354, "learning_rate": 4.980398772665468e-05, "loss": 0.5806, "step": 108 }, { "epoch": 0.13952, "grad_norm": 0.43678937522389644, "learning_rate": 4.9789756841217546e-05, "loss": 0.595, "step": 109 }, { "epoch": 0.1408, "grad_norm": 0.34968596729530604, "learning_rate": 4.977502948114772e-05, "loss": 0.5999, "step": 110 }, { "epoch": 0.14208, "grad_norm": 0.4035249077012057, "learning_rate": 4.9759805941396075e-05, "loss": 0.582, "step": 111 }, { "epoch": 0.14336, "grad_norm": 0.3396387531525401, "learning_rate": 4.974408652685072e-05, "loss": 0.5912, "step": 112 }, { "epoch": 0.14464, "grad_norm": 0.3888124435581031, "learning_rate": 4.9727871552330794e-05, "loss": 0.5994, "step": 113 }, { "epoch": 0.14592, "grad_norm": 0.3487289265208422, "learning_rate": 4.971116134258025e-05, "loss": 0.598, "step": 114 }, { "epoch": 0.1472, "grad_norm": 0.34084258932596606, "learning_rate": 4.969395623226133e-05, "loss": 0.5965, "step": 115 }, { "epoch": 0.14848, "grad_norm": 0.33211872605390524, "learning_rate": 4.967625656594782e-05, "loss": 0.5984, "step": 116 }, { "epoch": 0.14976, "grad_norm": 0.31055192632357626, "learning_rate": 4.9658062698118213e-05, "loss": 0.593, "step": 117 }, { "epoch": 0.15104, "grad_norm": 0.35790400007793166, "learning_rate": 4.963937499314857e-05, "loss": 0.6035, "step": 118 }, { "epoch": 0.15232, "grad_norm": 0.31118450185510343, "learning_rate": 4.962019382530521e-05, "loss": 0.5811, "step": 119 }, { "epoch": 0.1536, "grad_norm": 0.3326176465041298, "learning_rate": 4.960051957873725e-05, "loss": 0.581, "step": 120 }, { "epoch": 0.15488, "grad_norm": 0.30210249377153575, "learning_rate": 4.958035264746893e-05, "loss": 0.5837, "step": 121 }, { "epoch": 0.15616, "grad_norm": 0.3480385124671555, "learning_rate": 4.955969343539162e-05, "loss": 0.5768, "step": 122 }, { "epoch": 0.15744, "grad_norm": 0.3003392569743352, "learning_rate": 4.9538542356255866e-05, "loss": 0.5938, "step": 123 }, { "epoch": 0.15872, "grad_norm": 0.32082565179488104, "learning_rate": 4.9516899833663e-05, "loss": 0.5948, "step": 124 }, { "epoch": 0.16, "grad_norm": 0.3564349708048278, "learning_rate": 4.949476630105669e-05, "loss": 0.595, "step": 125 }, { "epoch": 0.16128, "grad_norm": 0.32049541972124757, "learning_rate": 4.94721422017143e-05, "loss": 0.5838, "step": 126 }, { "epoch": 0.16256, "grad_norm": 0.3317680882353993, "learning_rate": 4.944902798873794e-05, "loss": 0.5952, "step": 127 }, { "epoch": 0.16384, "grad_norm": 0.3381465061198974, "learning_rate": 4.942542412504543e-05, "loss": 0.6004, "step": 128 }, { "epoch": 0.16512, "grad_norm": 0.38351657127693595, "learning_rate": 4.940133108336105e-05, "loss": 0.6014, "step": 129 }, { "epoch": 0.1664, "grad_norm": 0.3276142738951724, "learning_rate": 4.9376749346206006e-05, "loss": 0.5853, "step": 130 }, { "epoch": 0.16768, "grad_norm": 0.37146400882939534, "learning_rate": 4.935167940588887e-05, "loss": 0.5995, "step": 131 }, { "epoch": 0.16896, "grad_norm": 0.32804274509201087, "learning_rate": 4.9326121764495596e-05, "loss": 0.5955, "step": 132 }, { "epoch": 0.17024, "grad_norm": 0.3344845806030499, "learning_rate": 4.9300076933879574e-05, "loss": 0.5818, "step": 133 }, { "epoch": 0.17152, "grad_norm": 0.3479572078392269, "learning_rate": 4.92735454356513e-05, "loss": 0.5941, "step": 134 }, { "epoch": 0.1728, "grad_norm": 0.34868252062960353, "learning_rate": 4.924652780116799e-05, "loss": 0.5898, "step": 135 }, { "epoch": 0.17408, "grad_norm": 0.35674279058993497, "learning_rate": 4.921902457152289e-05, "loss": 0.5899, "step": 136 }, { "epoch": 0.17536, "grad_norm": 0.3672614416380493, "learning_rate": 4.9191036297534454e-05, "loss": 0.585, "step": 137 }, { "epoch": 0.17664, "grad_norm": 0.4039478601084677, "learning_rate": 4.916256353973535e-05, "loss": 0.5994, "step": 138 }, { "epoch": 0.17792, "grad_norm": 0.3428958061155067, "learning_rate": 4.913360686836117e-05, "loss": 0.575, "step": 139 }, { "epoch": 0.1792, "grad_norm": 0.4024960602256603, "learning_rate": 4.910416686333906e-05, "loss": 0.5913, "step": 140 }, { "epoch": 0.18048, "grad_norm": 0.31040065034832104, "learning_rate": 4.907424411427608e-05, "loss": 0.5761, "step": 141 }, { "epoch": 0.18176, "grad_norm": 0.359237099401051, "learning_rate": 4.90438392204474e-05, "loss": 0.5885, "step": 142 }, { "epoch": 0.18304, "grad_norm": 0.3357545415879296, "learning_rate": 4.901295279078431e-05, "loss": 0.5907, "step": 143 }, { "epoch": 0.18432, "grad_norm": 0.2846403022642179, "learning_rate": 4.898158544386201e-05, "loss": 0.5886, "step": 144 }, { "epoch": 0.1856, "grad_norm": 0.3636245125193307, "learning_rate": 4.894973780788722e-05, "loss": 0.5816, "step": 145 }, { "epoch": 0.18688, "grad_norm": 0.25440894793562924, "learning_rate": 4.8917410520685635e-05, "loss": 0.576, "step": 146 }, { "epoch": 0.18816, "grad_norm": 0.3380189678855273, "learning_rate": 4.888460422968908e-05, "loss": 0.5931, "step": 147 }, { "epoch": 0.18944, "grad_norm": 0.3096794617975588, "learning_rate": 4.885131959192262e-05, "loss": 0.5829, "step": 148 }, { "epoch": 0.19072, "grad_norm": 0.280174710159943, "learning_rate": 4.881755727399134e-05, "loss": 0.5794, "step": 149 }, { "epoch": 0.192, "grad_norm": 0.31769340776297994, "learning_rate": 4.878331795206705e-05, "loss": 0.5729, "step": 150 }, { "epoch": 0.19328, "grad_norm": 0.31671973855902796, "learning_rate": 4.8748602311874694e-05, "loss": 0.5905, "step": 151 }, { "epoch": 0.19456, "grad_norm": 0.32614211009906474, "learning_rate": 4.8713411048678635e-05, "loss": 0.5855, "step": 152 }, { "epoch": 0.19584, "grad_norm": 0.29921149443441614, "learning_rate": 4.8677744867268764e-05, "loss": 0.5779, "step": 153 }, { "epoch": 0.19712, "grad_norm": 0.3558339409344647, "learning_rate": 4.8641604481946314e-05, "loss": 0.5892, "step": 154 }, { "epoch": 0.1984, "grad_norm": 0.285079025062, "learning_rate": 4.8604990616509616e-05, "loss": 0.5912, "step": 155 }, { "epoch": 0.19968, "grad_norm": 0.32189736402098207, "learning_rate": 4.856790400423958e-05, "loss": 0.5881, "step": 156 }, { "epoch": 0.20096, "grad_norm": 0.3293153125716864, "learning_rate": 4.8530345387885004e-05, "loss": 0.5679, "step": 157 } ], "logging_steps": 1, "max_steps": 781, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 157, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 194598775488512.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }