{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 500,
  "global_step": 188,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02127659574468085,
      "grad_norm": 0.76953125,
      "learning_rate": 2e-05,
      "loss": 3.872,
      "step": 1
    },
    {
      "epoch": 0.0425531914893617,
      "grad_norm": 0.8125,
      "learning_rate": 4e-05,
      "loss": 3.9714,
      "step": 2
    },
    {
      "epoch": 0.06382978723404255,
      "grad_norm": 0.80859375,
      "learning_rate": 6e-05,
      "loss": 3.9502,
      "step": 3
    },
    {
      "epoch": 0.0851063829787234,
      "grad_norm": 0.9296875,
      "learning_rate": 8e-05,
      "loss": 4.0793,
      "step": 4
    },
    {
      "epoch": 0.10638297872340426,
      "grad_norm": 0.93359375,
      "learning_rate": 0.0001,
      "loss": 3.9545,
      "step": 5
    },
    {
      "epoch": 0.1276595744680851,
      "grad_norm": 0.66015625,
      "learning_rate": 0.00012,
      "loss": 3.803,
      "step": 6
    },
    {
      "epoch": 0.14893617021276595,
      "grad_norm": 1.2109375,
      "learning_rate": 0.00014,
      "loss": 3.5993,
      "step": 7
    },
    {
      "epoch": 0.1702127659574468,
      "grad_norm": 0.9609375,
      "learning_rate": 0.00016,
      "loss": 3.8622,
      "step": 8
    },
    {
      "epoch": 0.19148936170212766,
      "grad_norm": 0.640625,
      "learning_rate": 0.00018,
      "loss": 3.6551,
      "step": 9
    },
    {
      "epoch": 0.2127659574468085,
      "grad_norm": 0.34765625,
      "learning_rate": 0.0002,
      "loss": 3.6643,
      "step": 10
    },
    {
      "epoch": 0.23404255319148937,
      "grad_norm": 0.341796875,
      "learning_rate": 0.00019998442534369985,
      "loss": 3.7215,
      "step": 11
    },
    {
      "epoch": 0.2553191489361702,
      "grad_norm": 0.345703125,
      "learning_rate": 0.00019993770622619782,
      "loss": 3.6194,
      "step": 12
    },
    {
      "epoch": 0.2765957446808511,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00019985985720017785,
      "loss": 3.7064,
      "step": 13
    },
    {
      "epoch": 0.2978723404255319,
      "grad_norm": 0.2314453125,
      "learning_rate": 0.00019975090251507638,
      "loss": 3.5715,
      "step": 14
    },
    {
      "epoch": 0.3191489361702128,
      "grad_norm": 0.1865234375,
      "learning_rate": 0.0001996108761095289,
      "loss": 3.5818,
      "step": 15
    },
    {
      "epoch": 0.3404255319148936,
      "grad_norm": 0.279296875,
      "learning_rate": 0.0001994398216007982,
      "loss": 3.5229,
      "step": 16
    },
    {
      "epoch": 0.3617021276595745,
      "grad_norm": 0.265625,
      "learning_rate": 0.0001992377922711879,
      "loss": 3.5894,
      "step": 17
    },
    {
      "epoch": 0.3829787234042553,
      "grad_norm": 0.279296875,
      "learning_rate": 0.00019900485105144543,
      "loss": 3.5212,
      "step": 18
    },
    {
      "epoch": 0.40425531914893614,
      "grad_norm": 0.2138671875,
      "learning_rate": 0.00019874107050115954,
      "loss": 3.5943,
      "step": 19
    },
    {
      "epoch": 0.425531914893617,
      "grad_norm": 0.173828125,
      "learning_rate": 0.00019844653278615833,
      "loss": 3.5288,
      "step": 20
    },
    {
      "epoch": 0.44680851063829785,
      "grad_norm": 0.2119140625,
      "learning_rate": 0.00019812132965291545,
      "loss": 3.6237,
      "step": 21
    },
    {
      "epoch": 0.46808510638297873,
      "grad_norm": 0.23828125,
      "learning_rate": 0.00019776556239997146,
      "loss": 3.5176,
      "step": 22
    },
    {
      "epoch": 0.48936170212765956,
      "grad_norm": 0.2060546875,
      "learning_rate": 0.00019737934184638006,
      "loss": 3.5751,
      "step": 23
    },
    {
      "epoch": 0.5106382978723404,
      "grad_norm": 0.1953125,
      "learning_rate": 0.00019696278829718883,
      "loss": 3.5265,
      "step": 24
    },
    {
      "epoch": 0.5319148936170213,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00019651603150596495,
      "loss": 3.5635,
      "step": 25
    },
    {
      "epoch": 0.5531914893617021,
      "grad_norm": 0.1708984375,
      "learning_rate": 0.00019603921063437793,
      "loss": 3.543,
      "step": 26
    },
    {
      "epoch": 0.574468085106383,
      "grad_norm": 0.197265625,
      "learning_rate": 0.00019553247420885157,
      "loss": 3.573,
      "step": 27
    },
    {
      "epoch": 0.5957446808510638,
      "grad_norm": 0.22265625,
      "learning_rate": 0.0001949959800742991,
      "loss": 3.566,
      "step": 28
    },
    {
      "epoch": 0.6170212765957447,
      "grad_norm": 0.1748046875,
      "learning_rate": 0.00019442989534495557,
      "loss": 3.6022,
      "step": 29
    },
    {
      "epoch": 0.6382978723404256,
      "grad_norm": 0.232421875,
      "learning_rate": 0.00019383439635232294,
      "loss": 3.514,
      "step": 30
    },
    {
      "epoch": 0.6595744680851063,
      "grad_norm": 0.216796875,
      "learning_rate": 0.00019320966859024397,
      "loss": 3.5634,
      "step": 31
    },
    {
      "epoch": 0.6808510638297872,
      "grad_norm": 0.1962890625,
      "learning_rate": 0.00019255590665712214,
      "loss": 3.6299,
      "step": 32
    },
    {
      "epoch": 0.7021276595744681,
      "grad_norm": 0.28515625,
      "learning_rate": 0.0001918733141953056,
      "loss": 3.4891,
      "step": 33
    },
    {
      "epoch": 0.723404255319149,
      "grad_norm": 0.2265625,
      "learning_rate": 0.0001911621038276542,
      "loss": 3.6105,
      "step": 34
    },
    {
      "epoch": 0.7446808510638298,
      "grad_norm": 0.31640625,
      "learning_rate": 0.0001904224970913085,
      "loss": 3.5861,
      "step": 35
    },
    {
      "epoch": 0.7659574468085106,
      "grad_norm": 0.298828125,
      "learning_rate": 0.00018965472436868286,
      "loss": 3.5116,
      "step": 36
    },
    {
      "epoch": 0.7872340425531915,
      "grad_norm": 0.2890625,
      "learning_rate": 0.0001888590248157027,
      "loss": 3.4935,
      "step": 37
    },
    {
      "epoch": 0.8085106382978723,
      "grad_norm": 0.2158203125,
      "learning_rate": 0.00018803564628730915,
      "loss": 3.6211,
      "step": 38
    },
    {
      "epoch": 0.8297872340425532,
      "grad_norm": 0.2236328125,
      "learning_rate": 0.00018718484526025387,
      "loss": 3.634,
      "step": 39
    },
    {
      "epoch": 0.851063829787234,
      "grad_norm": 0.234375,
      "learning_rate": 0.00018630688675320842,
      "loss": 3.6452,
      "step": 40
    },
    {
      "epoch": 0.8723404255319149,
      "grad_norm": 0.201171875,
      "learning_rate": 0.00018540204424421263,
      "loss": 3.5727,
      "step": 41
    },
    {
      "epoch": 0.8936170212765957,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.0001844705995854882,
      "loss": 3.5772,
      "step": 42
    },
    {
      "epoch": 0.9148936170212766,
      "grad_norm": 0.232421875,
      "learning_rate": 0.00018351284291564358,
      "loss": 3.6298,
      "step": 43
    },
    {
      "epoch": 0.9361702127659575,
      "grad_norm": 0.3046875,
      "learning_rate": 0.00018252907256929775,
      "loss": 3.6612,
      "step": 44
    },
    {
      "epoch": 0.9574468085106383,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00018151959498415122,
      "loss": 3.6252,
      "step": 45
    },
    {
      "epoch": 0.9787234042553191,
      "grad_norm": 0.353515625,
      "learning_rate": 0.00018048472460553257,
      "loss": 3.6423,
      "step": 46
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.62109375,
      "learning_rate": 0.0001794247837884511,
      "loss": 3.4633,
      "step": 47
    },
    {
      "epoch": 1.0212765957446808,
      "grad_norm": 1.0859375,
      "learning_rate": 0.00017834010269718526,
      "loss": 3.2725,
      "step": 48
    },
    {
      "epoch": 1.0425531914893618,
      "grad_norm": 0.57421875,
      "learning_rate": 0.0001772310192024389,
      "loss": 3.3142,
      "step": 49
    },
    {
      "epoch": 1.0638297872340425,
      "grad_norm": 0.6953125,
      "learning_rate": 0.0001760978787760968,
      "loss": 3.3171,
      "step": 50
    },
    {
      "epoch": 1.0851063829787233,
      "grad_norm": 1.0078125,
      "learning_rate": 0.0001749410343836125,
      "loss": 3.4249,
      "step": 51
    },
    {
      "epoch": 1.1063829787234043,
      "grad_norm": 0.28125,
      "learning_rate": 0.00017376084637406222,
      "loss": 3.37,
      "step": 52
    },
    {
      "epoch": 1.127659574468085,
      "grad_norm": 0.625,
      "learning_rate": 0.00017255768236789826,
      "loss": 3.3532,
      "step": 53
    },
    {
      "epoch": 1.148936170212766,
      "grad_norm": 0.71484375,
      "learning_rate": 0.00017133191714243805,
      "loss": 3.1741,
      "step": 54
    },
    {
      "epoch": 1.1702127659574468,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00017008393251512332,
      "loss": 3.402,
      "step": 55
    },
    {
      "epoch": 1.1914893617021276,
      "grad_norm": 0.474609375,
      "learning_rate": 0.00016881411722458688,
      "loss": 3.2908,
      "step": 56
    },
    {
      "epoch": 1.2127659574468086,
      "grad_norm": 0.51171875,
      "learning_rate": 0.00016752286680956306,
      "loss": 3.3467,
      "step": 57
    },
    {
      "epoch": 1.2340425531914894,
      "grad_norm": 0.447265625,
      "learning_rate": 0.00016621058348568007,
      "loss": 3.3667,
      "step": 58
    },
    {
      "epoch": 1.2553191489361701,
      "grad_norm": 0.306640625,
      "learning_rate": 0.00016487767602017263,
      "loss": 3.308,
      "step": 59
    },
    {
      "epoch": 1.2765957446808511,
      "grad_norm": 0.6328125,
      "learning_rate": 0.00016352455960455387,
      "loss": 3.4156,
      "step": 60
    },
    {
      "epoch": 1.297872340425532,
      "grad_norm": 0.447265625,
      "learning_rate": 0.00016215165572528597,
      "loss": 3.3147,
      "step": 61
    },
    {
      "epoch": 1.3191489361702127,
      "grad_norm": 0.201171875,
      "learning_rate": 0.0001607593920324899,
      "loss": 3.3296,
      "step": 62
    },
    {
      "epoch": 1.3404255319148937,
      "grad_norm": 0.380859375,
      "learning_rate": 0.00015934820220673564,
      "loss": 3.2744,
      "step": 63
    },
    {
      "epoch": 1.3617021276595744,
      "grad_norm": 0.39453125,
      "learning_rate": 0.00015791852582395334,
      "loss": 3.3446,
      "step": 64
    },
    {
      "epoch": 1.3829787234042552,
      "grad_norm": 0.35546875,
      "learning_rate": 0.0001564708082185087,
      "loss": 3.2568,
      "step": 65
    },
    {
      "epoch": 1.4042553191489362,
      "grad_norm": 0.220703125,
      "learning_rate": 0.00015500550034448413,
      "loss": 3.319,
      "step": 66
    },
    {
      "epoch": 1.425531914893617,
      "grad_norm": 0.3203125,
      "learning_rate": 0.00015352305863520991,
      "loss": 3.2982,
      "step": 67
    },
    {
      "epoch": 1.4468085106382977,
      "grad_norm": 0.36328125,
      "learning_rate": 0.0001520239448610882,
      "loss": 3.3751,
      "step": 68
    },
    {
      "epoch": 1.4680851063829787,
      "grad_norm": 0.25,
      "learning_rate": 0.00015050862598575476,
      "loss": 3.293,
      "step": 69
    },
    {
      "epoch": 1.4893617021276595,
      "grad_norm": 0.1962890625,
      "learning_rate": 0.00014897757402062284,
      "loss": 3.3238,
      "step": 70
    },
    {
      "epoch": 1.5106382978723403,
      "grad_norm": 0.328125,
      "learning_rate": 0.00014743126587785522,
      "loss": 3.2718,
      "step": 71
    },
    {
      "epoch": 1.5319148936170213,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00014587018322180905,
      "loss": 3.3136,
      "step": 72
    },
    {
      "epoch": 1.5531914893617023,
      "grad_norm": 0.232421875,
      "learning_rate": 0.00014429481231900083,
      "loss": 3.3282,
      "step": 73
    },
    {
      "epoch": 1.574468085106383,
      "grad_norm": 0.2578125,
      "learning_rate": 0.0001427056438866376,
      "loss": 3.3317,
      "step": 74
    },
    {
      "epoch": 1.5957446808510638,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00014110317293976218,
      "loss": 3.3342,
      "step": 75
    },
    {
      "epoch": 1.6170212765957448,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.00013948789863705912,
      "loss": 3.351,
      "step": 76
    },
    {
      "epoch": 1.6382978723404256,
      "grad_norm": 0.208984375,
      "learning_rate": 0.00013786032412537035,
      "loss": 3.3016,
      "step": 77
    },
    {
      "epoch": 1.6595744680851063,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.00013622095638296826,
      "loss": 3.3357,
      "step": 78
    },
    {
      "epoch": 1.6808510638297873,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00013457030606163562,
      "loss": 3.3777,
      "step": 79
    },
    {
      "epoch": 1.702127659574468,
      "grad_norm": 0.267578125,
      "learning_rate": 0.000132908887327601,
      "loss": 3.2683,
      "step": 80
    },
    {
      "epoch": 1.7234042553191489,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00013123721770137944,
      "loss": 3.3866,
      "step": 81
    },
    {
      "epoch": 1.7446808510638299,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00012955581789656843,
      "loss": 3.3582,
      "step": 82
    },
    {
      "epoch": 1.7659574468085106,
      "grad_norm": 0.279296875,
      "learning_rate": 0.0001278652116576492,
      "loss": 3.2956,
      "step": 83
    },
    {
      "epoch": 1.7872340425531914,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.0001261659255968441,
      "loss": 3.258,
      "step": 84
    },
    {
      "epoch": 1.8085106382978724,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00012445848903008003,
      "loss": 3.4091,
      "step": 85
    },
    {
      "epoch": 1.8297872340425532,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.00012274343381211066,
      "loss": 3.4071,
      "step": 86
    },
    {
      "epoch": 1.851063829787234,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00012102129417084714,
      "loss": 3.4251,
      "step": 87
    },
    {
      "epoch": 1.872340425531915,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00011929260654094969,
      "loss": 3.3607,
      "step": 88
    },
    {
      "epoch": 1.8936170212765957,
      "grad_norm": 0.271484375,
      "learning_rate": 0.00011755790939673209,
      "loss": 3.3509,
      "step": 89
    },
    {
      "epoch": 1.9148936170212765,
      "grad_norm": 0.279296875,
      "learning_rate": 0.0001158177430844304,
      "loss": 3.4081,
      "step": 90
    },
    {
      "epoch": 1.9361702127659575,
      "grad_norm": 0.328125,
      "learning_rate": 0.00011407264965388906,
      "loss": 3.4399,
      "step": 91
    },
    {
      "epoch": 1.9574468085106385,
      "grad_norm": 0.30859375,
      "learning_rate": 0.00011232317268971585,
      "loss": 3.4059,
      "step": 92
    },
    {
      "epoch": 1.978723404255319,
      "grad_norm": 0.37890625,
      "learning_rate": 0.00011056985714195932,
      "loss": 3.4202,
      "step": 93
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.84375,
      "learning_rate": 0.00010881324915636019,
      "loss": 3.0814,
      "step": 94
    },
    {
      "epoch": 2.021276595744681,
      "grad_norm": 0.44140625,
      "learning_rate": 0.0001070538959042311,
      "loss": 3.1079,
      "step": 95
    },
    {
      "epoch": 2.0425531914893615,
      "grad_norm": 0.625,
      "learning_rate": 0.00010529234541201631,
      "loss": 3.1756,
      "step": 96
    },
    {
      "epoch": 2.0638297872340425,
      "grad_norm": 0.51171875,
      "learning_rate": 0.00010352914639058526,
      "loss": 3.1467,
      "step": 97
    },
    {
      "epoch": 2.0851063829787235,
      "grad_norm": 0.330078125,
      "learning_rate": 0.00010176484806431288,
      "loss": 3.2387,
      "step": 98
    },
    {
      "epoch": 2.106382978723404,
      "grad_norm": 0.421875,
      "learning_rate": 0.0001,
      "loss": 3.2315,
      "step": 99
    },
    {
      "epoch": 2.127659574468085,
      "grad_norm": 0.41015625,
      "learning_rate": 9.823515193568715e-05,
      "loss": 3.2065,
      "step": 100
    },
    {
      "epoch": 2.148936170212766,
      "grad_norm": 0.291015625,
      "learning_rate": 9.647085360941476e-05,
      "loss": 3.0125,
      "step": 101
    },
    {
      "epoch": 2.1702127659574466,
      "grad_norm": 0.30078125,
      "learning_rate": 9.470765458798368e-05,
      "loss": 3.25,
      "step": 102
    },
    {
      "epoch": 2.1914893617021276,
      "grad_norm": 0.30859375,
      "learning_rate": 9.29461040957689e-05,
      "loss": 3.1501,
      "step": 103
    },
    {
      "epoch": 2.2127659574468086,
      "grad_norm": 0.3359375,
      "learning_rate": 9.118675084363986e-05,
      "loss": 3.1963,
      "step": 104
    },
    {
      "epoch": 2.2340425531914896,
      "grad_norm": 0.30078125,
      "learning_rate": 8.943014285804072e-05,
      "loss": 3.205,
      "step": 105
    },
    {
      "epoch": 2.25531914893617,
      "grad_norm": 0.26171875,
      "learning_rate": 8.767682731028415e-05,
      "loss": 3.1699,
      "step": 106
    },
    {
      "epoch": 2.276595744680851,
      "grad_norm": 0.318359375,
      "learning_rate": 8.592735034611097e-05,
      "loss": 3.2659,
      "step": 107
    },
    {
      "epoch": 2.297872340425532,
      "grad_norm": 0.302734375,
      "learning_rate": 8.418225691556962e-05,
      "loss": 3.1723,
      "step": 108
    },
    {
      "epoch": 2.3191489361702127,
      "grad_norm": 0.2578125,
      "learning_rate": 8.244209060326794e-05,
      "loss": 3.1985,
      "step": 109
    },
    {
      "epoch": 2.3404255319148937,
      "grad_norm": 0.279296875,
      "learning_rate": 8.070739345905032e-05,
      "loss": 3.1365,
      "step": 110
    },
    {
      "epoch": 2.3617021276595747,
      "grad_norm": 0.31640625,
      "learning_rate": 7.897870582915288e-05,
      "loss": 3.2121,
      "step": 111
    },
    {
      "epoch": 2.382978723404255,
      "grad_norm": 0.31640625,
      "learning_rate": 7.725656618788937e-05,
      "loss": 3.115,
      "step": 112
    },
    {
      "epoch": 2.404255319148936,
      "grad_norm": 0.296875,
      "learning_rate": 7.554151096992001e-05,
      "loss": 3.1752,
      "step": 113
    },
    {
      "epoch": 2.425531914893617,
      "grad_norm": 0.265625,
      "learning_rate": 7.383407440315596e-05,
      "loss": 3.1718,
      "step": 114
    },
    {
      "epoch": 2.4468085106382977,
      "grad_norm": 0.265625,
      "learning_rate": 7.213478834235079e-05,
      "loss": 3.2311,
      "step": 115
    },
    {
      "epoch": 2.4680851063829787,
      "grad_norm": 0.29296875,
      "learning_rate": 7.04441821034316e-05,
      "loss": 3.167,
      "step": 116
    },
    {
      "epoch": 2.4893617021276597,
      "grad_norm": 0.322265625,
      "learning_rate": 6.87627822986206e-05,
      "loss": 3.1866,
      "step": 117
    },
    {
      "epoch": 2.5106382978723403,
      "grad_norm": 0.33984375,
      "learning_rate": 6.7091112672399e-05,
      "loss": 3.1326,
      "step": 118
    },
    {
      "epoch": 2.5319148936170213,
      "grad_norm": 0.287109375,
      "learning_rate": 6.542969393836436e-05,
      "loss": 3.1708,
      "step": 119
    },
    {
      "epoch": 2.5531914893617023,
      "grad_norm": 0.28125,
      "learning_rate": 6.377904361703178e-05,
      "loss": 3.1963,
      "step": 120
    },
    {
      "epoch": 2.574468085106383,
      "grad_norm": 0.373046875,
      "learning_rate": 6.213967587462968e-05,
      "loss": 3.1986,
      "step": 121
    },
    {
      "epoch": 2.595744680851064,
      "grad_norm": 0.341796875,
      "learning_rate": 6.051210136294089e-05,
      "loss": 3.1945,
      "step": 122
    },
    {
      "epoch": 2.617021276595745,
      "grad_norm": 0.291015625,
      "learning_rate": 5.889682706023783e-05,
      "loss": 3.2107,
      "step": 123
    },
    {
      "epoch": 2.6382978723404253,
      "grad_norm": 0.314453125,
      "learning_rate": 5.729435611336239e-05,
      "loss": 3.1784,
      "step": 124
    },
    {
      "epoch": 2.6595744680851063,
      "grad_norm": 0.33203125,
      "learning_rate": 5.570518768099918e-05,
      "loss": 3.1984,
      "step": 125
    },
    {
      "epoch": 2.6808510638297873,
      "grad_norm": 0.310546875,
      "learning_rate": 5.4129816778190936e-05,
      "loss": 3.2371,
      "step": 126
    },
    {
      "epoch": 2.702127659574468,
      "grad_norm": 0.3046875,
      "learning_rate": 5.2568734122144756e-05,
      "loss": 3.1334,
      "step": 127
    },
    {
      "epoch": 2.723404255319149,
      "grad_norm": 0.318359375,
      "learning_rate": 5.102242597937717e-05,
      "loss": 3.255,
      "step": 128
    },
    {
      "epoch": 2.74468085106383,
      "grad_norm": 0.3203125,
      "learning_rate": 4.949137401424527e-05,
      "loss": 3.2194,
      "step": 129
    },
    {
      "epoch": 2.7659574468085104,
      "grad_norm": 0.3125,
      "learning_rate": 4.797605513891179e-05,
      "loss": 3.1648,
      "step": 130
    },
    {
      "epoch": 2.7872340425531914,
      "grad_norm": 0.3125,
      "learning_rate": 4.6476941364790074e-05,
      "loss": 3.1241,
      "step": 131
    },
    {
      "epoch": 2.8085106382978724,
      "grad_norm": 0.34375,
      "learning_rate": 4.4994499655515865e-05,
      "loss": 3.2732,
      "step": 132
    },
    {
      "epoch": 2.829787234042553,
      "grad_norm": 0.33984375,
      "learning_rate": 4.352919178149132e-05,
      "loss": 3.2714,
      "step": 133
    },
    {
      "epoch": 2.851063829787234,
      "grad_norm": 0.322265625,
      "learning_rate": 4.2081474176046646e-05,
      "loss": 3.2881,
      "step": 134
    },
    {
      "epoch": 2.872340425531915,
      "grad_norm": 0.322265625,
      "learning_rate": 4.0651797793264354e-05,
      "loss": 3.2259,
      "step": 135
    },
    {
      "epoch": 2.8936170212765955,
      "grad_norm": 0.361328125,
      "learning_rate": 3.924060796751012e-05,
      "loss": 3.2168,
      "step": 136
    },
    {
      "epoch": 2.9148936170212765,
      "grad_norm": 0.423828125,
      "learning_rate": 3.784834427471408e-05,
      "loss": 3.2721,
      "step": 137
    },
    {
      "epoch": 2.9361702127659575,
      "grad_norm": 0.35546875,
      "learning_rate": 3.647544039544615e-05,
      "loss": 3.2962,
      "step": 138
    },
    {
      "epoch": 2.9574468085106385,
      "grad_norm": 0.353515625,
      "learning_rate": 3.5122323979827395e-05,
      "loss": 3.2635,
      "step": 139
    },
    {
      "epoch": 2.978723404255319,
      "grad_norm": 0.37109375,
      "learning_rate": 3.378941651431996e-05,
      "loss": 3.2729,
      "step": 140
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.76953125,
      "learning_rate": 3.2477133190436945e-05,
      "loss": 2.838,
      "step": 141
    },
    {
      "epoch": 3.021276595744681,
      "grad_norm": 0.345703125,
      "learning_rate": 3.118588277541312e-05,
      "loss": 3.0105,
      "step": 142
    },
    {
      "epoch": 3.0425531914893615,
      "grad_norm": 0.470703125,
      "learning_rate": 2.99160674848767e-05,
      "loss": 3.0744,
      "step": 143
    },
    {
      "epoch": 3.0638297872340425,
      "grad_norm": 0.51171875,
      "learning_rate": 2.8668082857562005e-05,
      "loss": 3.0395,
      "step": 144
    },
    {
      "epoch": 3.0851063829787235,
      "grad_norm": 0.369140625,
      "learning_rate": 2.7442317632101745e-05,
      "loss": 3.1344,
      "step": 145
    },
    {
      "epoch": 3.106382978723404,
      "grad_norm": 0.353515625,
      "learning_rate": 2.6239153625937784e-05,
      "loss": 3.1349,
      "step": 146
    },
    {
      "epoch": 3.127659574468085,
      "grad_norm": 0.3203125,
      "learning_rate": 2.5058965616387498e-05,
      "loss": 3.1193,
      "step": 147
    },
    {
      "epoch": 3.148936170212766,
      "grad_norm": 0.328125,
      "learning_rate": 2.390212122390323e-05,
      "loss": 2.9173,
      "step": 148
    },
    {
      "epoch": 3.1702127659574466,
      "grad_norm": 0.330078125,
      "learning_rate": 2.2768980797561124e-05,
      "loss": 3.1502,
      "step": 149
    },
    {
      "epoch": 3.1914893617021276,
      "grad_norm": 0.333984375,
      "learning_rate": 2.1659897302814747e-05,
      "loss": 3.065,
      "step": 150
    },
    {
      "epoch": 3.2127659574468086,
      "grad_norm": 0.3359375,
      "learning_rate": 2.0575216211548907e-05,
      "loss": 3.1079,
      "step": 151
    },
    {
      "epoch": 3.2340425531914896,
      "grad_norm": 0.416015625,
      "learning_rate": 1.9515275394467446e-05,
      "loss": 3.1094,
      "step": 152
    },
    {
      "epoch": 3.25531914893617,
      "grad_norm": 0.333984375,
      "learning_rate": 1.8480405015848824e-05,
      "loss": 3.0872,
      "step": 153
    },
    {
      "epoch": 3.276595744680851,
      "grad_norm": 0.33203125,
      "learning_rate": 1.7470927430702277e-05,
      "loss": 3.1752,
      "step": 154
    },
    {
      "epoch": 3.297872340425532,
      "grad_norm": 0.314453125,
      "learning_rate": 1.648715708435645e-05,
      "loss": 3.0836,
      "step": 155
    },
    {
      "epoch": 3.3191489361702127,
      "grad_norm": 0.296875,
      "learning_rate": 1.5529400414511806e-05,
      "loss": 3.1217,
      "step": 156
    },
    {
      "epoch": 3.3404255319148937,
      "grad_norm": 0.318359375,
      "learning_rate": 1.4597955755787373e-05,
      "loss": 3.0535,
      "step": 157
    },
    {
      "epoch": 3.3617021276595747,
      "grad_norm": 0.30859375,
      "learning_rate": 1.3693113246791589e-05,
      "loss": 3.1319,
      "step": 158
    },
    {
      "epoch": 3.382978723404255,
      "grad_norm": 0.326171875,
      "learning_rate": 1.2815154739746138e-05,
      "loss": 3.0385,
      "step": 159
    },
    {
      "epoch": 3.404255319148936,
      "grad_norm": 0.322265625,
      "learning_rate": 1.196435371269089e-05,
      "loss": 3.0892,
      "step": 160
    },
    {
      "epoch": 3.425531914893617,
      "grad_norm": 0.33203125,
      "learning_rate": 1.1140975184297331e-05,
      "loss": 3.1002,
      "step": 161
    },
    {
      "epoch": 3.4468085106382977,
      "grad_norm": 0.337890625,
      "learning_rate": 1.0345275631317163e-05,
      "loss": 3.1541,
      "step": 162
    },
    {
      "epoch": 3.4680851063829787,
      "grad_norm": 0.3203125,
      "learning_rate": 9.577502908691526e-06,
      "loss": 3.0974,
      "step": 163
    },
    {
      "epoch": 3.4893617021276597,
      "grad_norm": 0.3203125,
      "learning_rate": 8.837896172345827e-06,
      "loss": 3.1121,
      "step": 164
    },
    {
      "epoch": 3.5106382978723403,
      "grad_norm": 0.32421875,
      "learning_rate": 8.1266858046944e-06,
      "loss": 3.0568,
      "step": 165
    },
    {
      "epoch": 3.5319148936170213,
      "grad_norm": 0.3359375,
      "learning_rate": 7.4440933428779e-06,
      "loss": 3.0998,
      "step": 166
    },
    {
      "epoch": 3.5531914893617023,
      "grad_norm": 0.302734375,
      "learning_rate": 6.7903314097560454e-06,
      "loss": 3.1263,
      "step": 167
    },
    {
      "epoch": 3.574468085106383,
      "grad_norm": 0.326171875,
      "learning_rate": 6.165603647677054e-06,
      "loss": 3.1272,
      "step": 168
    },
    {
      "epoch": 3.595744680851064,
      "grad_norm": 0.345703125,
      "learning_rate": 5.570104655044428e-06,
      "loss": 3.1268,
      "step": 169
    },
    {
      "epoch": 3.617021276595745,
      "grad_norm": 0.32421875,
      "learning_rate": 5.00401992570092e-06,
      "loss": 3.1398,
      "step": 170
    },
    {
      "epoch": 3.6382978723404253,
      "grad_norm": 0.310546875,
      "learning_rate": 4.467525791148453e-06,
      "loss": 3.1151,
      "step": 171
    },
    {
      "epoch": 3.6595744680851063,
      "grad_norm": 0.322265625,
      "learning_rate": 3.960789365622075e-06,
      "loss": 3.1346,
      "step": 172
    },
    {
      "epoch": 3.6808510638297873,
      "grad_norm": 0.341796875,
      "learning_rate": 3.483968494035039e-06,
      "loss": 3.1692,
      "step": 173
    },
    {
      "epoch": 3.702127659574468,
      "grad_norm": 0.32421875,
      "learning_rate": 3.0372117028111825e-06,
      "loss": 3.0739,
      "step": 174
    },
    {
      "epoch": 3.723404255319149,
      "grad_norm": 0.33203125,
      "learning_rate": 2.6206581536199594e-06,
      "loss": 3.1945,
      "step": 175
    },
    {
      "epoch": 3.74468085106383,
      "grad_norm": 0.34375,
      "learning_rate": 2.2344376000285604e-06,
      "loss": 3.1558,
      "step": 176
    },
    {
      "epoch": 3.7659574468085104,
      "grad_norm": 0.330078125,
      "learning_rate": 1.8786703470845547e-06,
      "loss": 3.1134,
      "step": 177
    },
    {
      "epoch": 3.7872340425531914,
      "grad_norm": 0.3359375,
      "learning_rate": 1.553467213841664e-06,
      "loss": 3.0711,
      "step": 178
    },
    {
      "epoch": 3.8085106382978724,
      "grad_norm": 0.361328125,
      "learning_rate": 1.2589294988404888e-06,
      "loss": 3.2178,
      "step": 179
    },
    {
      "epoch": 3.829787234042553,
      "grad_norm": 0.392578125,
      "learning_rate": 9.951489485545695e-07,
      "loss": 3.2206,
      "step": 180
    },
    {
      "epoch": 3.851063829787234,
      "grad_norm": 0.3984375,
      "learning_rate": 7.622077288121033e-07,
      "loss": 3.24,
      "step": 181
    },
    {
      "epoch": 3.872340425531915,
      "grad_norm": 0.388671875,
      "learning_rate": 5.60178399201805e-07,
      "loss": 3.1806,
      "step": 182
    },
    {
      "epoch": 3.8936170212765955,
      "grad_norm": 0.373046875,
      "learning_rate": 3.8912389047108813e-07,
      "loss": 3.1704,
      "step": 183
    },
    {
      "epoch": 3.9148936170212765,
      "grad_norm": 0.380859375,
      "learning_rate": 2.490974849236216e-07,
      "loss": 3.2246,
      "step": 184
    },
    {
      "epoch": 3.9361702127659575,
      "grad_norm": 0.4296875,
      "learning_rate": 1.401427998221627e-07,
      "loss": 3.2522,
      "step": 185
    },
    {
      "epoch": 3.9574468085106385,
      "grad_norm": 0.400390625,
      "learning_rate": 6.229377380218005e-08,
      "loss": 3.2196,
      "step": 186
    },
    {
      "epoch": 3.978723404255319,
      "grad_norm": 0.408203125,
      "learning_rate": 1.5574656300143542e-08,
      "loss": 3.2318,
      "step": 187
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.8125,
      "learning_rate": 0.0,
      "loss": 2.7607,
      "step": 188
    }
  ],
  "logging_steps": 1,
  "max_steps": 188,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 24,
  "total_flos": 2.79753919561728e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}