|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.20096, |
|
"eval_steps": 500, |
|
"global_step": 157, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00128, |
|
"grad_norm": 2.8870697066158506, |
|
"learning_rate": 0.0, |
|
"loss": 0.8422, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00256, |
|
"grad_norm": 2.88484389829891, |
|
"learning_rate": 6.329113924050633e-07, |
|
"loss": 0.8541, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.00384, |
|
"grad_norm": 2.858151965789657, |
|
"learning_rate": 1.2658227848101265e-06, |
|
"loss": 0.8376, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.00512, |
|
"grad_norm": 2.759628117182127, |
|
"learning_rate": 1.8987341772151901e-06, |
|
"loss": 0.8334, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0064, |
|
"grad_norm": 2.796990062811218, |
|
"learning_rate": 2.531645569620253e-06, |
|
"loss": 0.8256, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00768, |
|
"grad_norm": 2.5779298795445023, |
|
"learning_rate": 3.1645569620253167e-06, |
|
"loss": 0.8301, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.00896, |
|
"grad_norm": 2.182261607936066, |
|
"learning_rate": 3.7974683544303802e-06, |
|
"loss": 0.8156, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.01024, |
|
"grad_norm": 1.9615896152651355, |
|
"learning_rate": 4.430379746835443e-06, |
|
"loss": 0.7982, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.01152, |
|
"grad_norm": 1.452541644948315, |
|
"learning_rate": 5.063291139240506e-06, |
|
"loss": 0.7819, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0128, |
|
"grad_norm": 1.4723286808630864, |
|
"learning_rate": 5.69620253164557e-06, |
|
"loss": 0.7906, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01408, |
|
"grad_norm": 1.3529636617858944, |
|
"learning_rate": 6.329113924050633e-06, |
|
"loss": 0.7724, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.01536, |
|
"grad_norm": 1.960737179905222, |
|
"learning_rate": 6.9620253164556965e-06, |
|
"loss": 0.7495, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.01664, |
|
"grad_norm": 2.2349101055406337, |
|
"learning_rate": 7.5949367088607605e-06, |
|
"loss": 0.7581, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.01792, |
|
"grad_norm": 2.0897577150322477, |
|
"learning_rate": 8.227848101265822e-06, |
|
"loss": 0.7404, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0192, |
|
"grad_norm": 1.8227218322635887, |
|
"learning_rate": 8.860759493670886e-06, |
|
"loss": 0.7382, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02048, |
|
"grad_norm": 1.2099951464458898, |
|
"learning_rate": 9.49367088607595e-06, |
|
"loss": 0.7231, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.02176, |
|
"grad_norm": 1.2177037129914572, |
|
"learning_rate": 1.0126582278481012e-05, |
|
"loss": 0.7259, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.02304, |
|
"grad_norm": 1.1031346132830708, |
|
"learning_rate": 1.0759493670886076e-05, |
|
"loss": 0.7059, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.02432, |
|
"grad_norm": 0.9194779600801882, |
|
"learning_rate": 1.139240506329114e-05, |
|
"loss": 0.7137, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.0256, |
|
"grad_norm": 0.8679468005972053, |
|
"learning_rate": 1.2025316455696203e-05, |
|
"loss": 0.7036, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02688, |
|
"grad_norm": 0.7227287276969042, |
|
"learning_rate": 1.2658227848101267e-05, |
|
"loss": 0.696, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.02816, |
|
"grad_norm": 0.7425882516811844, |
|
"learning_rate": 1.3291139240506329e-05, |
|
"loss": 0.6888, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.02944, |
|
"grad_norm": 0.7093793012252196, |
|
"learning_rate": 1.3924050632911393e-05, |
|
"loss": 0.6791, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.03072, |
|
"grad_norm": 0.6018215463147907, |
|
"learning_rate": 1.4556962025316457e-05, |
|
"loss": 0.6783, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 0.5846346732378257, |
|
"learning_rate": 1.5189873417721521e-05, |
|
"loss": 0.6811, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.03328, |
|
"grad_norm": 0.5855419788452784, |
|
"learning_rate": 1.5822784810126583e-05, |
|
"loss": 0.683, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.03456, |
|
"grad_norm": 0.5096689891724868, |
|
"learning_rate": 1.6455696202531644e-05, |
|
"loss": 0.6589, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.03584, |
|
"grad_norm": 0.4871170504081146, |
|
"learning_rate": 1.7088607594936708e-05, |
|
"loss": 0.6582, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.03712, |
|
"grad_norm": 0.4949600697144217, |
|
"learning_rate": 1.7721518987341772e-05, |
|
"loss": 0.669, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0384, |
|
"grad_norm": 0.5082926031630941, |
|
"learning_rate": 1.8354430379746836e-05, |
|
"loss": 0.666, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03968, |
|
"grad_norm": 0.49381475380567175, |
|
"learning_rate": 1.89873417721519e-05, |
|
"loss": 0.6556, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.04096, |
|
"grad_norm": 0.4265624784331274, |
|
"learning_rate": 1.962025316455696e-05, |
|
"loss": 0.646, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.04224, |
|
"grad_norm": 0.39190416547723717, |
|
"learning_rate": 2.0253164556962025e-05, |
|
"loss": 0.6473, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.04352, |
|
"grad_norm": 0.4631353399929371, |
|
"learning_rate": 2.088607594936709e-05, |
|
"loss": 0.6441, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0448, |
|
"grad_norm": 0.3928335126997034, |
|
"learning_rate": 2.1518987341772153e-05, |
|
"loss": 0.6352, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.04608, |
|
"grad_norm": 0.36295027582313966, |
|
"learning_rate": 2.2151898734177217e-05, |
|
"loss": 0.6333, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.04736, |
|
"grad_norm": 0.35026852064181846, |
|
"learning_rate": 2.278481012658228e-05, |
|
"loss": 0.6399, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.04864, |
|
"grad_norm": 0.39778614916835536, |
|
"learning_rate": 2.341772151898734e-05, |
|
"loss": 0.6298, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.04992, |
|
"grad_norm": 0.33278348666417684, |
|
"learning_rate": 2.4050632911392405e-05, |
|
"loss": 0.6301, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0512, |
|
"grad_norm": 0.31444068712551376, |
|
"learning_rate": 2.468354430379747e-05, |
|
"loss": 0.6263, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05248, |
|
"grad_norm": 0.36059728676958264, |
|
"learning_rate": 2.5316455696202533e-05, |
|
"loss": 0.6458, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.05376, |
|
"grad_norm": 0.3916144552301749, |
|
"learning_rate": 2.5949367088607597e-05, |
|
"loss": 0.6331, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.05504, |
|
"grad_norm": 0.32338566356420756, |
|
"learning_rate": 2.6582278481012658e-05, |
|
"loss": 0.6332, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.05632, |
|
"grad_norm": 0.33704233729853356, |
|
"learning_rate": 2.7215189873417722e-05, |
|
"loss": 0.6348, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.0576, |
|
"grad_norm": 0.36015399213900634, |
|
"learning_rate": 2.7848101265822786e-05, |
|
"loss": 0.6392, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.05888, |
|
"grad_norm": 0.31471331803021757, |
|
"learning_rate": 2.848101265822785e-05, |
|
"loss": 0.6272, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.06016, |
|
"grad_norm": 0.3225170654156012, |
|
"learning_rate": 2.9113924050632914e-05, |
|
"loss": 0.635, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.06144, |
|
"grad_norm": 0.3064473735810606, |
|
"learning_rate": 2.9746835443037974e-05, |
|
"loss": 0.6284, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.06272, |
|
"grad_norm": 0.3038289969291092, |
|
"learning_rate": 3.0379746835443042e-05, |
|
"loss": 0.6149, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 0.3226803690164346, |
|
"learning_rate": 3.10126582278481e-05, |
|
"loss": 0.626, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06528, |
|
"grad_norm": 0.3096398144524693, |
|
"learning_rate": 3.1645569620253167e-05, |
|
"loss": 0.621, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.06656, |
|
"grad_norm": 0.2754757429130796, |
|
"learning_rate": 3.227848101265823e-05, |
|
"loss": 0.6185, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.06784, |
|
"grad_norm": 0.3262507218160328, |
|
"learning_rate": 3.291139240506329e-05, |
|
"loss": 0.6171, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.06912, |
|
"grad_norm": 0.34971068352090656, |
|
"learning_rate": 3.354430379746836e-05, |
|
"loss": 0.616, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.0704, |
|
"grad_norm": 0.2841621281043231, |
|
"learning_rate": 3.4177215189873416e-05, |
|
"loss": 0.5995, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.07168, |
|
"grad_norm": 0.4003223636484448, |
|
"learning_rate": 3.4810126582278487e-05, |
|
"loss": 0.6169, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.07296, |
|
"grad_norm": 0.31868860231705426, |
|
"learning_rate": 3.5443037974683544e-05, |
|
"loss": 0.6077, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.07424, |
|
"grad_norm": 0.3960425782005289, |
|
"learning_rate": 3.607594936708861e-05, |
|
"loss": 0.6164, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.07552, |
|
"grad_norm": 0.363865574596696, |
|
"learning_rate": 3.670886075949367e-05, |
|
"loss": 0.6118, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.0768, |
|
"grad_norm": 0.33961478774466697, |
|
"learning_rate": 3.7341772151898736e-05, |
|
"loss": 0.6137, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07808, |
|
"grad_norm": 0.4212164741206082, |
|
"learning_rate": 3.79746835443038e-05, |
|
"loss": 0.6275, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.07936, |
|
"grad_norm": 0.29878729710395663, |
|
"learning_rate": 3.8607594936708864e-05, |
|
"loss": 0.6084, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.08064, |
|
"grad_norm": 0.36745026817379894, |
|
"learning_rate": 3.924050632911392e-05, |
|
"loss": 0.607, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.08192, |
|
"grad_norm": 0.38983571508393644, |
|
"learning_rate": 3.987341772151899e-05, |
|
"loss": 0.6176, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.0832, |
|
"grad_norm": 0.37337392917475115, |
|
"learning_rate": 4.050632911392405e-05, |
|
"loss": 0.6184, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.08448, |
|
"grad_norm": 0.3668068115925863, |
|
"learning_rate": 4.113924050632912e-05, |
|
"loss": 0.6194, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.08576, |
|
"grad_norm": 0.36138503055306903, |
|
"learning_rate": 4.177215189873418e-05, |
|
"loss": 0.6077, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.08704, |
|
"grad_norm": 0.43361127462043814, |
|
"learning_rate": 4.240506329113924e-05, |
|
"loss": 0.6147, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.08832, |
|
"grad_norm": 0.33520423726109644, |
|
"learning_rate": 4.3037974683544305e-05, |
|
"loss": 0.6118, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.0896, |
|
"grad_norm": 0.4381154362148859, |
|
"learning_rate": 4.367088607594937e-05, |
|
"loss": 0.6031, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.09088, |
|
"grad_norm": 0.3717345864324632, |
|
"learning_rate": 4.430379746835443e-05, |
|
"loss": 0.6031, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.09216, |
|
"grad_norm": 0.4861728465398392, |
|
"learning_rate": 4.49367088607595e-05, |
|
"loss": 0.6006, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.09344, |
|
"grad_norm": 0.3264992939190504, |
|
"learning_rate": 4.556962025316456e-05, |
|
"loss": 0.6151, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.09472, |
|
"grad_norm": 0.4319794925001871, |
|
"learning_rate": 4.6202531645569625e-05, |
|
"loss": 0.6058, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 0.4616345840492333, |
|
"learning_rate": 4.683544303797468e-05, |
|
"loss": 0.5967, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.09728, |
|
"grad_norm": 0.4405721152587957, |
|
"learning_rate": 4.7468354430379746e-05, |
|
"loss": 0.6002, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.09856, |
|
"grad_norm": 0.5122605377853799, |
|
"learning_rate": 4.810126582278481e-05, |
|
"loss": 0.6076, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.09984, |
|
"grad_norm": 0.45313870340097556, |
|
"learning_rate": 4.8734177215189874e-05, |
|
"loss": 0.6074, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.10112, |
|
"grad_norm": 0.4340044755876676, |
|
"learning_rate": 4.936708860759494e-05, |
|
"loss": 0.606, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.1024, |
|
"grad_norm": 0.4987172862476422, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6158, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.10368, |
|
"grad_norm": 0.6226880208665108, |
|
"learning_rate": 4.999974965737065e-05, |
|
"loss": 0.621, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.10496, |
|
"grad_norm": 0.5448293131914782, |
|
"learning_rate": 4.999899863449631e-05, |
|
"loss": 0.6014, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.10624, |
|
"grad_norm": 0.3427022601926917, |
|
"learning_rate": 4.999774694641803e-05, |
|
"loss": 0.6198, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.10752, |
|
"grad_norm": 0.5005152113593655, |
|
"learning_rate": 4.999599461820387e-05, |
|
"loss": 0.6054, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.1088, |
|
"grad_norm": 0.5702968806820528, |
|
"learning_rate": 4.999374168494844e-05, |
|
"loss": 0.6069, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.11008, |
|
"grad_norm": 0.4671310661706222, |
|
"learning_rate": 4.999098819177214e-05, |
|
"loss": 0.6017, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.11136, |
|
"grad_norm": 0.46081768174689064, |
|
"learning_rate": 4.9987734193820324e-05, |
|
"loss": 0.5988, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.11264, |
|
"grad_norm": 0.5448729856183013, |
|
"learning_rate": 4.9983979756262136e-05, |
|
"loss": 0.6181, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.11392, |
|
"grad_norm": 0.5095775592779056, |
|
"learning_rate": 4.9979724954289244e-05, |
|
"loss": 0.608, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.1152, |
|
"grad_norm": 0.41119162739543413, |
|
"learning_rate": 4.997496987311431e-05, |
|
"loss": 0.5979, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.11648, |
|
"grad_norm": 0.45501958535738946, |
|
"learning_rate": 4.996971460796929e-05, |
|
"loss": 0.6019, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.11776, |
|
"grad_norm": 0.4287172104360816, |
|
"learning_rate": 4.9963959264103544e-05, |
|
"loss": 0.5955, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.11904, |
|
"grad_norm": 0.409872269342458, |
|
"learning_rate": 4.995770395678171e-05, |
|
"loss": 0.5927, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.12032, |
|
"grad_norm": 0.4304173966206036, |
|
"learning_rate": 4.995094881128138e-05, |
|
"loss": 0.5967, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.1216, |
|
"grad_norm": 0.4229799776298517, |
|
"learning_rate": 4.994369396289063e-05, |
|
"loss": 0.6084, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.12288, |
|
"grad_norm": 0.4509596954971553, |
|
"learning_rate": 4.9935939556905295e-05, |
|
"loss": 0.6134, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.12416, |
|
"grad_norm": 0.467661146414229, |
|
"learning_rate": 4.992768574862603e-05, |
|
"loss": 0.5986, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.12544, |
|
"grad_norm": 0.42432875998240194, |
|
"learning_rate": 4.9918932703355256e-05, |
|
"loss": 0.6028, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.12672, |
|
"grad_norm": 0.43479377184835605, |
|
"learning_rate": 4.990968059639379e-05, |
|
"loss": 0.5942, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.3680676685801686, |
|
"learning_rate": 4.989992961303738e-05, |
|
"loss": 0.5994, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.12928, |
|
"grad_norm": 0.3956815409903461, |
|
"learning_rate": 4.9889679948572974e-05, |
|
"loss": 0.5871, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.13056, |
|
"grad_norm": 0.34354949934586104, |
|
"learning_rate": 4.98789318082748e-05, |
|
"loss": 0.5873, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.13184, |
|
"grad_norm": 0.3608260963951222, |
|
"learning_rate": 4.986768540740028e-05, |
|
"loss": 0.5883, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.13312, |
|
"grad_norm": 0.3937004101078116, |
|
"learning_rate": 4.98559409711857e-05, |
|
"loss": 0.6029, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.1344, |
|
"grad_norm": 0.3401718481532899, |
|
"learning_rate": 4.9843698734841705e-05, |
|
"loss": 0.5983, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.13568, |
|
"grad_norm": 0.4371868869288284, |
|
"learning_rate": 4.983095894354858e-05, |
|
"loss": 0.5866, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.13696, |
|
"grad_norm": 0.3722813571279646, |
|
"learning_rate": 4.981772185245135e-05, |
|
"loss": 0.5954, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.13824, |
|
"grad_norm": 0.36493596395606354, |
|
"learning_rate": 4.980398772665468e-05, |
|
"loss": 0.5806, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.13952, |
|
"grad_norm": 0.43678937522389644, |
|
"learning_rate": 4.9789756841217546e-05, |
|
"loss": 0.595, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.1408, |
|
"grad_norm": 0.34968596729530604, |
|
"learning_rate": 4.977502948114772e-05, |
|
"loss": 0.5999, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.14208, |
|
"grad_norm": 0.4035249077012057, |
|
"learning_rate": 4.9759805941396075e-05, |
|
"loss": 0.582, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.14336, |
|
"grad_norm": 0.3396387531525401, |
|
"learning_rate": 4.974408652685072e-05, |
|
"loss": 0.5912, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.14464, |
|
"grad_norm": 0.3888124435581031, |
|
"learning_rate": 4.9727871552330794e-05, |
|
"loss": 0.5994, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.14592, |
|
"grad_norm": 0.3487289265208422, |
|
"learning_rate": 4.971116134258025e-05, |
|
"loss": 0.598, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.1472, |
|
"grad_norm": 0.34084258932596606, |
|
"learning_rate": 4.969395623226133e-05, |
|
"loss": 0.5965, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.14848, |
|
"grad_norm": 0.33211872605390524, |
|
"learning_rate": 4.967625656594782e-05, |
|
"loss": 0.5984, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.14976, |
|
"grad_norm": 0.31055192632357626, |
|
"learning_rate": 4.9658062698118213e-05, |
|
"loss": 0.593, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.15104, |
|
"grad_norm": 0.35790400007793166, |
|
"learning_rate": 4.963937499314857e-05, |
|
"loss": 0.6035, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.15232, |
|
"grad_norm": 0.31118450185510343, |
|
"learning_rate": 4.962019382530521e-05, |
|
"loss": 0.5811, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.1536, |
|
"grad_norm": 0.3326176465041298, |
|
"learning_rate": 4.960051957873725e-05, |
|
"loss": 0.581, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.15488, |
|
"grad_norm": 0.30210249377153575, |
|
"learning_rate": 4.958035264746893e-05, |
|
"loss": 0.5837, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.15616, |
|
"grad_norm": 0.3480385124671555, |
|
"learning_rate": 4.955969343539162e-05, |
|
"loss": 0.5768, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.15744, |
|
"grad_norm": 0.3003392569743352, |
|
"learning_rate": 4.9538542356255866e-05, |
|
"loss": 0.5938, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.15872, |
|
"grad_norm": 0.32082565179488104, |
|
"learning_rate": 4.9516899833663e-05, |
|
"loss": 0.5948, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.3564349708048278, |
|
"learning_rate": 4.949476630105669e-05, |
|
"loss": 0.595, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.16128, |
|
"grad_norm": 0.32049541972124757, |
|
"learning_rate": 4.94721422017143e-05, |
|
"loss": 0.5838, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.16256, |
|
"grad_norm": 0.3317680882353993, |
|
"learning_rate": 4.944902798873794e-05, |
|
"loss": 0.5952, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.16384, |
|
"grad_norm": 0.3381465061198974, |
|
"learning_rate": 4.942542412504543e-05, |
|
"loss": 0.6004, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.16512, |
|
"grad_norm": 0.38351657127693595, |
|
"learning_rate": 4.940133108336105e-05, |
|
"loss": 0.6014, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.1664, |
|
"grad_norm": 0.3276142738951724, |
|
"learning_rate": 4.9376749346206006e-05, |
|
"loss": 0.5853, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.16768, |
|
"grad_norm": 0.37146400882939534, |
|
"learning_rate": 4.935167940588887e-05, |
|
"loss": 0.5995, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.16896, |
|
"grad_norm": 0.32804274509201087, |
|
"learning_rate": 4.9326121764495596e-05, |
|
"loss": 0.5955, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.17024, |
|
"grad_norm": 0.3344845806030499, |
|
"learning_rate": 4.9300076933879574e-05, |
|
"loss": 0.5818, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.17152, |
|
"grad_norm": 0.3479572078392269, |
|
"learning_rate": 4.92735454356513e-05, |
|
"loss": 0.5941, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.1728, |
|
"grad_norm": 0.34868252062960353, |
|
"learning_rate": 4.924652780116799e-05, |
|
"loss": 0.5898, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.17408, |
|
"grad_norm": 0.35674279058993497, |
|
"learning_rate": 4.921902457152289e-05, |
|
"loss": 0.5899, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.17536, |
|
"grad_norm": 0.3672614416380493, |
|
"learning_rate": 4.9191036297534454e-05, |
|
"loss": 0.585, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.17664, |
|
"grad_norm": 0.4039478601084677, |
|
"learning_rate": 4.916256353973535e-05, |
|
"loss": 0.5994, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.17792, |
|
"grad_norm": 0.3428958061155067, |
|
"learning_rate": 4.913360686836117e-05, |
|
"loss": 0.575, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.1792, |
|
"grad_norm": 0.4024960602256603, |
|
"learning_rate": 4.910416686333906e-05, |
|
"loss": 0.5913, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.18048, |
|
"grad_norm": 0.31040065034832104, |
|
"learning_rate": 4.907424411427608e-05, |
|
"loss": 0.5761, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.18176, |
|
"grad_norm": 0.359237099401051, |
|
"learning_rate": 4.90438392204474e-05, |
|
"loss": 0.5885, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.18304, |
|
"grad_norm": 0.3357545415879296, |
|
"learning_rate": 4.901295279078431e-05, |
|
"loss": 0.5907, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.18432, |
|
"grad_norm": 0.2846403022642179, |
|
"learning_rate": 4.898158544386201e-05, |
|
"loss": 0.5886, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.1856, |
|
"grad_norm": 0.3636245125193307, |
|
"learning_rate": 4.894973780788722e-05, |
|
"loss": 0.5816, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.18688, |
|
"grad_norm": 0.25440894793562924, |
|
"learning_rate": 4.8917410520685635e-05, |
|
"loss": 0.576, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.18816, |
|
"grad_norm": 0.3380189678855273, |
|
"learning_rate": 4.888460422968908e-05, |
|
"loss": 0.5931, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.18944, |
|
"grad_norm": 0.3096794617975588, |
|
"learning_rate": 4.885131959192262e-05, |
|
"loss": 0.5829, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.19072, |
|
"grad_norm": 0.280174710159943, |
|
"learning_rate": 4.881755727399134e-05, |
|
"loss": 0.5794, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.31769340776297994, |
|
"learning_rate": 4.878331795206705e-05, |
|
"loss": 0.5729, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.19328, |
|
"grad_norm": 0.31671973855902796, |
|
"learning_rate": 4.8748602311874694e-05, |
|
"loss": 0.5905, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.19456, |
|
"grad_norm": 0.32614211009906474, |
|
"learning_rate": 4.8713411048678635e-05, |
|
"loss": 0.5855, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.19584, |
|
"grad_norm": 0.29921149443441614, |
|
"learning_rate": 4.8677744867268764e-05, |
|
"loss": 0.5779, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.19712, |
|
"grad_norm": 0.3558339409344647, |
|
"learning_rate": 4.8641604481946314e-05, |
|
"loss": 0.5892, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.1984, |
|
"grad_norm": 0.285079025062, |
|
"learning_rate": 4.8604990616509616e-05, |
|
"loss": 0.5912, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.19968, |
|
"grad_norm": 0.32189736402098207, |
|
"learning_rate": 4.856790400423958e-05, |
|
"loss": 0.5881, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.20096, |
|
"grad_norm": 0.3293153125716864, |
|
"learning_rate": 4.8530345387885004e-05, |
|
"loss": 0.5679, |
|
"step": 157 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 781, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 157, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 194598775488512.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|