|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 297, |
|
"global_step": 297, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.003367003367003367, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 1e-05, |
|
"loss": 2.5758, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.006734006734006734, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 9.966329966329968e-06, |
|
"loss": 2.5078, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.010101010101010102, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 9.932659932659933e-06, |
|
"loss": 2.5049, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.013468013468013467, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 9.8989898989899e-06, |
|
"loss": 2.5518, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.016835016835016835, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 9.865319865319866e-06, |
|
"loss": 2.5427, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.020202020202020204, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 9.831649831649833e-06, |
|
"loss": 2.3623, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.02356902356902357, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 9.797979797979798e-06, |
|
"loss": 2.4237, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.026936026936026935, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 9.764309764309765e-06, |
|
"loss": 2.4176, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.030303030303030304, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 9.730639730639732e-06, |
|
"loss": 2.3253, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.03367003367003367, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 9.696969696969698e-06, |
|
"loss": 2.3064, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.037037037037037035, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 9.663299663299665e-06, |
|
"loss": 2.2647, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.04040404040404041, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 9.62962962962963e-06, |
|
"loss": 2.2657, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.04377104377104377, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 9.595959595959597e-06, |
|
"loss": 2.2032, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.04713804713804714, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 9.562289562289562e-06, |
|
"loss": 2.1713, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.050505050505050504, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 9.52861952861953e-06, |
|
"loss": 2.2073, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.05387205387205387, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 9.494949494949497e-06, |
|
"loss": 2.1354, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.05723905723905724, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 9.461279461279462e-06, |
|
"loss": 2.1304, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.06060606060606061, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 9.427609427609429e-06, |
|
"loss": 2.0846, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.06397306397306397, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 9.393939393939396e-06, |
|
"loss": 2.0422, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.06734006734006734, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 9.360269360269361e-06, |
|
"loss": 2.0434, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0707070707070707, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 9.326599326599327e-06, |
|
"loss": 2.0235, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.07407407407407407, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 9.292929292929294e-06, |
|
"loss": 1.9988, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.07744107744107744, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 9.25925925925926e-06, |
|
"loss": 1.8975, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.08080808080808081, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 9.225589225589226e-06, |
|
"loss": 1.9702, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.08417508417508418, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 9.191919191919193e-06, |
|
"loss": 1.9406, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.08754208754208755, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 9.15824915824916e-06, |
|
"loss": 1.8887, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.09090909090909091, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 9.124579124579126e-06, |
|
"loss": 1.8769, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.09427609427609428, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 1.8423, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.09764309764309764, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 9.057239057239058e-06, |
|
"loss": 1.8612, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.10101010101010101, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 9.023569023569025e-06, |
|
"loss": 1.8174, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10437710437710437, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 8.98989898989899e-06, |
|
"loss": 1.8132, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.10774410774410774, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 8.956228956228958e-06, |
|
"loss": 1.7676, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.1111111111111111, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 8.922558922558923e-06, |
|
"loss": 1.7846, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.11447811447811448, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 8.888888888888888e-06, |
|
"loss": 1.7733, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.11784511784511785, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 8.855218855218855e-06, |
|
"loss": 1.739, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.12121212121212122, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 8.821548821548822e-06, |
|
"loss": 1.725, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.12457912457912458, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 8.787878787878788e-06, |
|
"loss": 1.6985, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.12794612794612795, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 8.754208754208755e-06, |
|
"loss": 1.6951, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.13131313131313133, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 8.720538720538722e-06, |
|
"loss": 1.6596, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.13468013468013468, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 8.686868686868687e-06, |
|
"loss": 1.6469, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.13804713804713806, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 8.653198653198653e-06, |
|
"loss": 1.6273, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.1414141414141414, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 8.61952861952862e-06, |
|
"loss": 1.6243, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.1447811447811448, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 8.585858585858587e-06, |
|
"loss": 1.5959, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.14814814814814814, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 8.552188552188552e-06, |
|
"loss": 1.6236, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.15151515151515152, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 8.518518518518519e-06, |
|
"loss": 1.6213, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.15488215488215487, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 8.484848484848486e-06, |
|
"loss": 1.5851, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.15824915824915825, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 8.451178451178452e-06, |
|
"loss": 1.5851, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.16161616161616163, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 8.417508417508419e-06, |
|
"loss": 1.5747, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.16498316498316498, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 8.383838383838384e-06, |
|
"loss": 1.5184, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.16835016835016836, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 8.350168350168351e-06, |
|
"loss": 1.5904, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1717171717171717, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 8.316498316498316e-06, |
|
"loss": 1.5713, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.1750841750841751, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 8.282828282828283e-06, |
|
"loss": 1.5267, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.17845117845117844, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 8.24915824915825e-06, |
|
"loss": 1.5543, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.18181818181818182, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 8.215488215488216e-06, |
|
"loss": 1.5648, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.18518518518518517, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 8.181818181818183e-06, |
|
"loss": 1.4896, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.18855218855218855, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 8.148148148148148e-06, |
|
"loss": 1.5295, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.1919191919191919, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 8.114478114478115e-06, |
|
"loss": 1.5177, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.19528619528619529, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 8.08080808080808e-06, |
|
"loss": 1.4765, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.19865319865319866, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 8.047138047138048e-06, |
|
"loss": 1.4731, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.20202020202020202, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 8.013468013468015e-06, |
|
"loss": 1.5113, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2053872053872054, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 7.97979797979798e-06, |
|
"loss": 1.4836, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.20875420875420875, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 7.946127946127947e-06, |
|
"loss": 1.4953, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.21212121212121213, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 7.912457912457913e-06, |
|
"loss": 1.5195, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.21548821548821548, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 7.87878787878788e-06, |
|
"loss": 1.4541, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.21885521885521886, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 7.845117845117845e-06, |
|
"loss": 1.4226, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 7.811447811447812e-06, |
|
"loss": 1.4715, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.2255892255892256, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 7.77777777777778e-06, |
|
"loss": 1.4595, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.22895622895622897, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 7.744107744107745e-06, |
|
"loss": 1.4243, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.23232323232323232, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 7.710437710437712e-06, |
|
"loss": 1.4426, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.2356902356902357, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 7.676767676767677e-06, |
|
"loss": 1.4317, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.23905723905723905, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 7.643097643097644e-06, |
|
"loss": 1.4313, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.24242424242424243, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 7.60942760942761e-06, |
|
"loss": 1.479, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.24579124579124578, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 7.5757575757575764e-06, |
|
"loss": 1.4102, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.24915824915824916, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 7.542087542087543e-06, |
|
"loss": 1.4277, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.25252525252525254, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 7.508417508417509e-06, |
|
"loss": 1.4199, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2558922558922559, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 7.474747474747476e-06, |
|
"loss": 1.4312, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.25925925925925924, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 7.441077441077442e-06, |
|
"loss": 1.4414, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.26262626262626265, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 7.4074074074074075e-06, |
|
"loss": 1.4225, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.265993265993266, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 7.373737373737374e-06, |
|
"loss": 1.4135, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.26936026936026936, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 7.340067340067341e-06, |
|
"loss": 1.4481, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2727272727272727, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 7.306397306397307e-06, |
|
"loss": 1.3645, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.2760942760942761, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 7.272727272727273e-06, |
|
"loss": 1.4644, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.27946127946127947, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 7.23905723905724e-06, |
|
"loss": 1.3992, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.2828282828282828, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 7.2053872053872064e-06, |
|
"loss": 1.4122, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.28619528619528617, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 7.171717171717172e-06, |
|
"loss": 1.3658, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.2895622895622896, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 7.138047138047138e-06, |
|
"loss": 1.4178, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.29292929292929293, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 7.104377104377105e-06, |
|
"loss": 1.3861, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.2962962962962963, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 7.070707070707071e-06, |
|
"loss": 1.3704, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.2996632996632997, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 7.0370370370370375e-06, |
|
"loss": 1.41, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.30303030303030304, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 7.0033670033670045e-06, |
|
"loss": 1.373, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3063973063973064, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 6.969696969696971e-06, |
|
"loss": 1.3598, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.30976430976430974, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 6.936026936026936e-06, |
|
"loss": 1.3426, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.31313131313131315, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 6.902356902356902e-06, |
|
"loss": 1.3619, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.3164983164983165, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 6.868686868686869e-06, |
|
"loss": 1.3631, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.31986531986531985, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 6.835016835016836e-06, |
|
"loss": 1.4074, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.32323232323232326, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 6.801346801346802e-06, |
|
"loss": 1.3986, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.3265993265993266, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 6.767676767676769e-06, |
|
"loss": 1.3347, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.32996632996632996, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 6.734006734006735e-06, |
|
"loss": 1.3661, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 6.7003367003367004e-06, |
|
"loss": 1.3536, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.3367003367003367, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 1.3298, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3400673400673401, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 6.632996632996634e-06, |
|
"loss": 1.3568, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.3434343434343434, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 6.5993265993266e-06, |
|
"loss": 1.3967, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.3468013468013468, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 6.565656565656566e-06, |
|
"loss": 1.3748, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.3501683501683502, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 6.531986531986533e-06, |
|
"loss": 1.3578, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.35353535353535354, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 6.498316498316499e-06, |
|
"loss": 1.3561, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.3569023569023569, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 6.464646464646466e-06, |
|
"loss": 1.335, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.3602693602693603, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 6.430976430976431e-06, |
|
"loss": 1.3539, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.36363636363636365, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 6.397306397306397e-06, |
|
"loss": 1.3218, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.367003367003367, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 6.363636363636364e-06, |
|
"loss": 1.3466, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.37037037037037035, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 6.3299663299663304e-06, |
|
"loss": 1.3496, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.37373737373737376, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 6.296296296296297e-06, |
|
"loss": 1.3861, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.3771043771043771, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 6.262626262626264e-06, |
|
"loss": 1.3067, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.38047138047138046, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 6.22895622895623e-06, |
|
"loss": 1.3356, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.3838383838383838, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 6.195286195286195e-06, |
|
"loss": 1.3661, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.3872053872053872, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 6.1616161616161615e-06, |
|
"loss": 1.3279, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.39057239057239057, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 6.1279461279461286e-06, |
|
"loss": 1.3434, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.3939393939393939, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 6.094276094276095e-06, |
|
"loss": 1.3545, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.39730639730639733, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 6.060606060606061e-06, |
|
"loss": 1.3066, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.4006734006734007, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 6.026936026936028e-06, |
|
"loss": 1.3392, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.40404040404040403, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 5.993265993265994e-06, |
|
"loss": 1.3769, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4074074074074074, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 5.95959595959596e-06, |
|
"loss": 1.324, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.4107744107744108, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 5.925925925925926e-06, |
|
"loss": 1.316, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.41414141414141414, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 5.892255892255893e-06, |
|
"loss": 1.3214, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.4175084175084175, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 5.858585858585859e-06, |
|
"loss": 1.313, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.4208754208754209, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 5.824915824915825e-06, |
|
"loss": 1.2991, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.42424242424242425, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 5.791245791245792e-06, |
|
"loss": 1.3314, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.4276094276094276, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 5.7575757575757586e-06, |
|
"loss": 1.3422, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.43097643097643096, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 5.723905723905724e-06, |
|
"loss": 1.3403, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.43434343434343436, |
|
"grad_norm": 0.1689453125, |
|
"learning_rate": 5.69023569023569e-06, |
|
"loss": 1.3299, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.4377104377104377, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 5.656565656565657e-06, |
|
"loss": 1.2948, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.44107744107744107, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 5.622895622895623e-06, |
|
"loss": 1.3008, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 5.58922558922559e-06, |
|
"loss": 1.3109, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.4478114478114478, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 5.555555555555557e-06, |
|
"loss": 1.3173, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.4511784511784512, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 5.521885521885523e-06, |
|
"loss": 1.3035, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 5.488215488215489e-06, |
|
"loss": 1.3143, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.45791245791245794, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 5.4545454545454545e-06, |
|
"loss": 1.3273, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.4612794612794613, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 5.420875420875421e-06, |
|
"loss": 1.2928, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.46464646464646464, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 5.387205387205388e-06, |
|
"loss": 1.3109, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.468013468013468, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 5.353535353535354e-06, |
|
"loss": 1.3421, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.4713804713804714, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 5.31986531986532e-06, |
|
"loss": 1.3012, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.47474747474747475, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 5.286195286195287e-06, |
|
"loss": 1.2922, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.4781144781144781, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 5.252525252525253e-06, |
|
"loss": 1.2697, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.48148148148148145, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 5.218855218855219e-06, |
|
"loss": 1.3034, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.48484848484848486, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 5.185185185185185e-06, |
|
"loss": 1.3159, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.4882154882154882, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 5.151515151515152e-06, |
|
"loss": 1.3265, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.49158249158249157, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 5.117845117845118e-06, |
|
"loss": 1.3025, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.494949494949495, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 5.0841750841750845e-06, |
|
"loss": 1.292, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.4983164983164983, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 5.0505050505050515e-06, |
|
"loss": 1.2741, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.5016835016835017, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 5.016835016835018e-06, |
|
"loss": 1.2658, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.5050505050505051, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 4.983164983164984e-06, |
|
"loss": 1.2984, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5084175084175084, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 4.94949494949495e-06, |
|
"loss": 1.29, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.5117845117845118, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 4.915824915824916e-06, |
|
"loss": 1.2808, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.5151515151515151, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 4.8821548821548826e-06, |
|
"loss": 1.2975, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.5185185185185185, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 4.848484848484849e-06, |
|
"loss": 1.2837, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.5218855218855218, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 4.814814814814815e-06, |
|
"loss": 1.2542, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.5252525252525253, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 4.781144781144781e-06, |
|
"loss": 1.2679, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.5286195286195287, |
|
"grad_norm": 0.169921875, |
|
"learning_rate": 4.747474747474748e-06, |
|
"loss": 1.2849, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.531986531986532, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 4.7138047138047145e-06, |
|
"loss": 1.2927, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.5353535353535354, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 4.680134680134681e-06, |
|
"loss": 1.2951, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.5387205387205387, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 4.646464646464647e-06, |
|
"loss": 1.2925, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5420875420875421, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 4.612794612794613e-06, |
|
"loss": 1.2964, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.5454545454545454, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 4.57912457912458e-06, |
|
"loss": 1.3198, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.5488215488215489, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 1.2564, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.5521885521885522, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 4.5117845117845126e-06, |
|
"loss": 1.2536, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.5555555555555556, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 4.478114478114479e-06, |
|
"loss": 1.2581, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.5589225589225589, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 4.444444444444444e-06, |
|
"loss": 1.2864, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.5622895622895623, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 4.410774410774411e-06, |
|
"loss": 1.2741, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.5656565656565656, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 4.377104377104377e-06, |
|
"loss": 1.2903, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.569023569023569, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 4.343434343434344e-06, |
|
"loss": 1.2491, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.5723905723905723, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 4.30976430976431e-06, |
|
"loss": 1.2999, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5757575757575758, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 4.276094276094276e-06, |
|
"loss": 1.252, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.5791245791245792, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 4.242424242424243e-06, |
|
"loss": 1.2698, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.5824915824915825, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 4.208754208754209e-06, |
|
"loss": 1.2976, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.5858585858585859, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 4.1750841750841755e-06, |
|
"loss": 1.2809, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.5892255892255892, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 4.141414141414142e-06, |
|
"loss": 1.2793, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.5925925925925926, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 4.107744107744108e-06, |
|
"loss": 1.283, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.5959595959595959, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 4.074074074074074e-06, |
|
"loss": 1.2703, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.5993265993265994, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 4.04040404040404e-06, |
|
"loss": 1.2958, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.6026936026936027, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 4.0067340067340074e-06, |
|
"loss": 1.2795, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.6060606060606061, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 3.973063973063974e-06, |
|
"loss": 1.2627, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6094276094276094, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 3.93939393939394e-06, |
|
"loss": 1.2868, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.6127946127946128, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 3.905723905723906e-06, |
|
"loss": 1.2585, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.6161616161616161, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 3.872053872053872e-06, |
|
"loss": 1.2739, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.6195286195286195, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 3.8383838383838385e-06, |
|
"loss": 1.2952, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.622895622895623, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 3.804713804713805e-06, |
|
"loss": 1.2613, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.6262626262626263, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 3.7710437710437713e-06, |
|
"loss": 1.2619, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.6296296296296297, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 3.737373737373738e-06, |
|
"loss": 1.2612, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.632996632996633, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 3.7037037037037037e-06, |
|
"loss": 1.2619, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.6363636363636364, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 3.6700336700336704e-06, |
|
"loss": 1.2732, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.6397306397306397, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 3.6363636363636366e-06, |
|
"loss": 1.2528, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6430976430976431, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 3.6026936026936032e-06, |
|
"loss": 1.2548, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.6464646464646465, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 3.569023569023569e-06, |
|
"loss": 1.2408, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.6498316498316499, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 3.5353535353535356e-06, |
|
"loss": 1.2915, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.6531986531986532, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 3.5016835016835023e-06, |
|
"loss": 1.2444, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.6565656565656566, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 3.468013468013468e-06, |
|
"loss": 1.2512, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.6599326599326599, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 3.4343434343434347e-06, |
|
"loss": 1.2214, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.6632996632996633, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 3.400673400673401e-06, |
|
"loss": 1.2741, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 3.3670033670033675e-06, |
|
"loss": 1.2408, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.67003367003367, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 1.2188, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.6734006734006734, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 3.2996632996633e-06, |
|
"loss": 1.2488, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6767676767676768, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 3.2659932659932666e-06, |
|
"loss": 1.2758, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.6801346801346801, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 3.232323232323233e-06, |
|
"loss": 1.2792, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.6835016835016835, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 3.1986531986531986e-06, |
|
"loss": 1.2542, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.6868686868686869, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 3.1649831649831652e-06, |
|
"loss": 1.2586, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.6902356902356902, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 3.131313131313132e-06, |
|
"loss": 1.268, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.6936026936026936, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 3.0976430976430976e-06, |
|
"loss": 1.2389, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.696969696969697, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 3.0639730639730643e-06, |
|
"loss": 1.2394, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.7003367003367004, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 3.0303030303030305e-06, |
|
"loss": 1.2688, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.7037037037037037, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 2.996632996632997e-06, |
|
"loss": 1.2535, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.7070707070707071, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 2.962962962962963e-06, |
|
"loss": 1.2406, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.7104377104377104, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 2.9292929292929295e-06, |
|
"loss": 1.2489, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.7138047138047138, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 2.895622895622896e-06, |
|
"loss": 1.2694, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.7171717171717171, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 2.861952861952862e-06, |
|
"loss": 1.2729, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.7205387205387206, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 2.8282828282828286e-06, |
|
"loss": 1.2425, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.7239057239057239, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 2.794612794612795e-06, |
|
"loss": 1.2807, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 2.7609427609427614e-06, |
|
"loss": 1.255, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.7306397306397306, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 2.7272727272727272e-06, |
|
"loss": 1.2519, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.734006734006734, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 2.693602693602694e-06, |
|
"loss": 1.2452, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.7373737373737373, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 2.65993265993266e-06, |
|
"loss": 1.2769, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.7407407407407407, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 2.6262626262626267e-06, |
|
"loss": 1.2567, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7441077441077442, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 2.5925925925925925e-06, |
|
"loss": 1.2641, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.7474747474747475, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 2.558922558922559e-06, |
|
"loss": 1.2451, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.7508417508417509, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 2.5252525252525258e-06, |
|
"loss": 1.2237, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.7542087542087542, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 2.491582491582492e-06, |
|
"loss": 1.2465, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.7575757575757576, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 2.457912457912458e-06, |
|
"loss": 1.3071, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.7609427609427609, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 2.4242424242424244e-06, |
|
"loss": 1.2614, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.7643097643097643, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 2.3905723905723906e-06, |
|
"loss": 1.2847, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.7676767676767676, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 2.3569023569023572e-06, |
|
"loss": 1.2603, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.7710437710437711, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 2.3232323232323234e-06, |
|
"loss": 1.2442, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.7744107744107744, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 2.28956228956229e-06, |
|
"loss": 1.268, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7777777777777778, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 2.2558922558922563e-06, |
|
"loss": 1.238, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.7811447811447811, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 2.222222222222222e-06, |
|
"loss": 1.2514, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.7845117845117845, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 2.1885521885521887e-06, |
|
"loss": 1.2194, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.7878787878787878, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 2.154882154882155e-06, |
|
"loss": 1.244, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.7912457912457912, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 2.1212121212121216e-06, |
|
"loss": 1.225, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.7946127946127947, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 2.0875420875420878e-06, |
|
"loss": 1.2393, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.797979797979798, |
|
"grad_norm": 0.18359375, |
|
"learning_rate": 2.053872053872054e-06, |
|
"loss": 1.2488, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.8013468013468014, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 2.02020202020202e-06, |
|
"loss": 1.2427, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.8047138047138047, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 1.986531986531987e-06, |
|
"loss": 1.2546, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.8080808080808081, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 1.952861952861953e-06, |
|
"loss": 1.2521, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8114478114478114, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 1.9191919191919192e-06, |
|
"loss": 1.2965, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.8148148148148148, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 1.8855218855218857e-06, |
|
"loss": 1.2844, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.8181818181818182, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 1.8518518518518519e-06, |
|
"loss": 1.2445, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.8215488215488216, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 1.8181818181818183e-06, |
|
"loss": 1.244, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.8249158249158249, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 1.7845117845117845e-06, |
|
"loss": 1.2586, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.8282828282828283, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 1.7508417508417511e-06, |
|
"loss": 1.2333, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.8316498316498316, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 1.7171717171717173e-06, |
|
"loss": 1.2431, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.835016835016835, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 1.6835016835016838e-06, |
|
"loss": 1.2606, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.8383838383838383, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 1.64983164983165e-06, |
|
"loss": 1.2771, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.8417508417508418, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 1.6161616161616164e-06, |
|
"loss": 1.2611, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8451178451178452, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 1.5824915824915826e-06, |
|
"loss": 1.2421, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.8484848484848485, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 1.5488215488215488e-06, |
|
"loss": 1.2568, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.8518518518518519, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 1.5151515151515152e-06, |
|
"loss": 1.2583, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.8552188552188552, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 1.4814814814814815e-06, |
|
"loss": 1.2582, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.8585858585858586, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 1.447811447811448e-06, |
|
"loss": 1.2667, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.8619528619528619, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 1.4141414141414143e-06, |
|
"loss": 1.2687, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.8653198653198653, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 1.3804713804713807e-06, |
|
"loss": 1.2761, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.8686868686868687, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 1.346801346801347e-06, |
|
"loss": 1.2392, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.8720538720538721, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 1.3131313131313134e-06, |
|
"loss": 1.2687, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.8754208754208754, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 1.2794612794612796e-06, |
|
"loss": 1.2291, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8787878787878788, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 1.245791245791246e-06, |
|
"loss": 1.239, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.8821548821548821, |
|
"grad_norm": 0.1748046875, |
|
"learning_rate": 1.2121212121212122e-06, |
|
"loss": 1.2599, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.8855218855218855, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 1.1784511784511786e-06, |
|
"loss": 1.2912, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 1.144781144781145e-06, |
|
"loss": 1.2382, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.8922558922558923, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 1.111111111111111e-06, |
|
"loss": 1.2655, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.8956228956228957, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 1.0774410774410775e-06, |
|
"loss": 1.2769, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.898989898989899, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 1.0437710437710439e-06, |
|
"loss": 1.2753, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.9023569023569024, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 1.01010101010101e-06, |
|
"loss": 1.2405, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.9057239057239057, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 9.764309764309765e-07, |
|
"loss": 1.2443, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 9.427609427609428e-07, |
|
"loss": 1.2461, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.9124579124579124, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 9.090909090909091e-07, |
|
"loss": 1.2383, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.9158249158249159, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 8.754208754208756e-07, |
|
"loss": 1.2546, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.9191919191919192, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 8.417508417508419e-07, |
|
"loss": 1.3184, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.9225589225589226, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 8.080808080808082e-07, |
|
"loss": 1.2659, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.9259259259259259, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 7.744107744107744e-07, |
|
"loss": 1.2878, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.9292929292929293, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 7.407407407407407e-07, |
|
"loss": 1.282, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.9326599326599326, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 7.070707070707071e-07, |
|
"loss": 1.2493, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.936026936026936, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 6.734006734006735e-07, |
|
"loss": 1.2419, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.9393939393939394, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 6.397306397306398e-07, |
|
"loss": 1.2396, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.9427609427609428, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 6.060606060606061e-07, |
|
"loss": 1.2458, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.9461279461279462, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 5.723905723905725e-07, |
|
"loss": 1.2557, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.9494949494949495, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 5.387205387205387e-07, |
|
"loss": 1.2546, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.9528619528619529, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 5.05050505050505e-07, |
|
"loss": 1.2439, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.9562289562289562, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 4.713804713804714e-07, |
|
"loss": 1.2948, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.9595959595959596, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 4.377104377104378e-07, |
|
"loss": 1.215, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.9629629629629629, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 4.040404040404041e-07, |
|
"loss": 1.2779, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.9663299663299664, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 3.7037037037037036e-07, |
|
"loss": 1.2564, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.9696969696969697, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 3.3670033670033673e-07, |
|
"loss": 1.2712, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.9730639730639731, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 3.0303030303030305e-07, |
|
"loss": 1.2408, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.9764309764309764, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 2.6936026936026936e-07, |
|
"loss": 1.2375, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9797979797979798, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 2.356902356902357e-07, |
|
"loss": 1.2509, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.9831649831649831, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 2.0202020202020205e-07, |
|
"loss": 1.2597, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.9865319865319865, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 1.6835016835016837e-07, |
|
"loss": 1.2576, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.98989898989899, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 1.3468013468013468e-07, |
|
"loss": 1.2312, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.9932659932659933, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 1.0101010101010103e-07, |
|
"loss": 1.2391, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.9966329966329966, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 6.734006734006734e-08, |
|
"loss": 1.2403, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 3.367003367003367e-08, |
|
"loss": 1.2241, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.2526977062225342, |
|
"eval_runtime": 10.0634, |
|
"eval_samples_per_second": 2.782, |
|
"eval_steps_per_second": 0.397, |
|
"step": 297 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 297, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.429499898169917e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|