{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 750,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.012, "grad_norm": 1.0642749071121216, "learning_rate": 1.8181818181818182e-05, "loss": 1.6085, "step": 3 },
    { "epoch": 0.024, "grad_norm": 1.2326161861419678, "learning_rate": 4.545454545454546e-05, "loss": 1.8001, "step": 6 },
    { "epoch": 0.036, "grad_norm": 0.9848614931106567, "learning_rate": 7.272727272727273e-05, "loss": 1.6124, "step": 9 },
    { "epoch": 0.048, "grad_norm": 1.1129230260849, "learning_rate": 0.0001, "loss": 1.6241, "step": 12 },
    { "epoch": 0.06, "grad_norm": 0.9483588933944702, "learning_rate": 0.00012727272727272728, "loss": 1.5575, "step": 15 },
    { "epoch": 0.072, "grad_norm": 0.7823343276977539, "learning_rate": 0.00015454545454545454, "loss": 1.3941, "step": 18 },
    { "epoch": 0.084, "grad_norm": 1.2419981956481934, "learning_rate": 0.00018181818181818183, "loss": 1.4403, "step": 21 },
    { "epoch": 0.096, "grad_norm": 1.2519845962524414, "learning_rate": 0.00019999906887858078, "loss": 1.527, "step": 24 },
    { "epoch": 0.108, "grad_norm": 1.28350830078125, "learning_rate": 0.00019998510240408496, "loss": 1.2943, "step": 27 },
    { "epoch": 0.12, "grad_norm": 1.3078265190124512, "learning_rate": 0.00019995437844895334, "loss": 1.2846, "step": 30 },
    { "epoch": 0.132, "grad_norm": 1.412528395652771, "learning_rate": 0.00019990690216251396, "loss": 1.0703, "step": 33 },
    { "epoch": 0.144, "grad_norm": 1.135581612586975, "learning_rate": 0.00019984268150178167, "loss": 1.0773, "step": 36 },
    { "epoch": 0.156, "grad_norm": 1.4680742025375366, "learning_rate": 0.0001997617272301248, "loss": 0.8765, "step": 39 },
    { "epoch": 0.168, "grad_norm": 1.1661655902862549, "learning_rate": 0.00019966405291546094, "loss": 1.1527, "step": 42 },
    { "epoch": 0.18, "grad_norm": 1.564658284187317, "learning_rate": 0.00019954967492798333, "loss": 1.0761, "step": 45 },
    { "epoch": 0.192, "grad_norm": 1.2385212182998657, "learning_rate": 0.00019941861243741684, "loss": 1.1971, "step": 48 },
    { "epoch": 0.204, "grad_norm": 1.300706148147583, "learning_rate": 0.0001992708874098054, "loss": 1.1586, "step": 51 },
    { "epoch": 0.216, "grad_norm": 1.4915199279785156, "learning_rate": 0.00019910652460383035, "loss": 1.0416, "step": 54 },
    { "epoch": 0.228, "grad_norm": 1.3301211595535278, "learning_rate": 0.00019892555156666089, "loss": 0.9176, "step": 57 },
    { "epoch": 0.24, "grad_norm": 1.579613447189331, "learning_rate": 0.00019872799862933731, "loss": 0.924, "step": 60 },
    { "epoch": 0.252, "grad_norm": 1.756352424621582, "learning_rate": 0.0001985138989016874, "loss": 1.1, "step": 63 },
    { "epoch": 0.264, "grad_norm": 1.9087395668029785, "learning_rate": 0.00019828328826677726, "loss": 1.3137, "step": 66 },
    { "epoch": 0.276, "grad_norm": 1.5105749368667603, "learning_rate": 0.00019803620537489736, "loss": 0.971, "step": 69 },
    { "epoch": 0.288, "grad_norm": 0.9009131193161011, "learning_rate": 0.00019777269163708468, "loss": 0.918, "step": 72 },
    { "epoch": 0.3, "grad_norm": 1.5716756582260132, "learning_rate": 0.00019749279121818235, "loss": 0.9052, "step": 75 },
    { "epoch": 0.312, "grad_norm": 1.2776657342910767, "learning_rate": 0.00019719655102943753, "loss": 1.0727, "step": 78 },
    { "epoch": 0.324, "grad_norm": 1.7678760290145874, "learning_rate": 0.00019688402072063903, "loss": 1.0161, "step": 81 },
    { "epoch": 0.336, "grad_norm": 1.4070408344268799, "learning_rate": 0.00019655525267179625, "loss": 0.9853, "step": 84 },
    { "epoch": 0.348, "grad_norm": 1.3068615198135376, "learning_rate": 0.00019621030198436006, "loss": 0.8356, "step": 87 },
    { "epoch": 0.36, "grad_norm": 1.5878366231918335, "learning_rate": 0.000195849226471988, "loss": 0.8672, "step": 90 },
    { "epoch": 0.372, "grad_norm": 1.5617555379867554, "learning_rate": 0.00019547208665085457, "loss": 0.9721, "step": 93 },
    { "epoch": 0.384, "grad_norm": 1.2143884897232056, "learning_rate": 0.00019507894572950882, "loss": 0.8779, "step": 96 },
    { "epoch": 0.396, "grad_norm": 1.5331264734268188, "learning_rate": 0.0001946698695982806, "loss": 0.7773, "step": 99 },
    { "epoch": 0.408, "grad_norm": 1.795127272605896, "learning_rate": 0.00019424492681823733, "loss": 1.0404, "step": 102 },
    { "epoch": 0.42, "grad_norm": 2.2372801303863525, "learning_rate": 0.00019380418860969322, "loss": 1.1664, "step": 105 },
    { "epoch": 0.432, "grad_norm": 1.2694289684295654, "learning_rate": 0.00019334772884027266, "loss": 0.7488, "step": 108 },
    { "epoch": 0.444, "grad_norm": 1.663999319076538, "learning_rate": 0.00019287562401253022, "loss": 1.1851, "step": 111 },
    { "epoch": 0.456, "grad_norm": 1.3195521831512451, "learning_rate": 0.0001923879532511287, "loss": 0.9345, "step": 114 },
    { "epoch": 0.468, "grad_norm": 1.3398083448410034, "learning_rate": 0.00019188479828957772, "loss": 0.8728, "step": 117 },
    { "epoch": 0.48, "grad_norm": 1.5292913913726807, "learning_rate": 0.00019136624345653558, "loss": 0.8164, "step": 120 },
    { "epoch": 0.492, "grad_norm": 1.5369495153427124, "learning_rate": 0.0001908323756616754, "loss": 0.9344, "step": 123 },
    { "epoch": 0.504, "grad_norm": 1.511875033378601, "learning_rate": 0.00019028328438111934, "loss": 0.7121, "step": 126 },
    { "epoch": 0.516, "grad_norm": 2.1549644470214844, "learning_rate": 0.00018971906164244232, "loss": 0.9891, "step": 129 },
    { "epoch": 0.528, "grad_norm": 2.0295217037200928, "learning_rate": 0.00018913980200924822, "loss": 1.0895, "step": 132 },
    { "epoch": 0.54, "grad_norm": 1.9123669862747192, "learning_rate": 0.000188545602565321, "loss": 0.798, "step": 135 },
    { "epoch": 0.552, "grad_norm": 1.4052222967147827, "learning_rate": 0.00018793656289835365, "loss": 0.8515, "step": 138 },
    { "epoch": 0.564, "grad_norm": 1.736244797706604, "learning_rate": 0.00018731278508325708, "loss": 1.1771, "step": 141 },
    { "epoch": 0.576, "grad_norm": 1.6685560941696167, "learning_rate": 0.0001866743736650526, "loss": 0.9137, "step": 144 },
    { "epoch": 0.588, "grad_norm": 1.3800030946731567, "learning_rate": 0.0001860214356413501, "loss": 0.8261, "step": 147 },
    { "epoch": 0.6, "grad_norm": 1.19209623336792, "learning_rate": 0.00018535408044441514, "loss": 1.0179, "step": 150 },
    { "epoch": 0.612, "grad_norm": 1.3164235353469849, "learning_rate": 0.00018467241992282843, "loss": 0.7551, "step": 153 },
    { "epoch": 0.624, "grad_norm": 1.8980730772018433, "learning_rate": 0.0001839765683227398, "loss": 0.939, "step": 156 },
    { "epoch": 0.636, "grad_norm": 1.7097257375717163, "learning_rate": 0.00018326664226872065, "loss": 0.854, "step": 159 },
    { "epoch": 0.648, "grad_norm": 1.4715948104858398, "learning_rate": 0.00018254276074421768, "loss": 0.8128, "step": 162 },
    { "epoch": 0.66, "grad_norm": 2.0332729816436768, "learning_rate": 0.0001818050450716113, "loss": 0.9127, "step": 165 },
    { "epoch": 0.672, "grad_norm": 1.5340715646743774, "learning_rate": 0.00018105361889188203, "loss": 0.6944, "step": 168 },
    { "epoch": 0.684, "grad_norm": 1.4474735260009766, "learning_rate": 0.00018028860814388827, "loss": 0.9063, "step": 171 },
    { "epoch": 0.696, "grad_norm": 1.3392386436462402, "learning_rate": 0.00017951014104325904, "loss": 1.1269, "step": 174 },
    { "epoch": 0.708, "grad_norm": 1.6966164112091064, "learning_rate": 0.00017871834806090501, "loss": 0.7627, "step": 177 },
    { "epoch": 0.72, "grad_norm": 1.2823700904846191, "learning_rate": 0.00017791336190115165, "loss": 0.8398, "step": 180 },
    { "epoch": 0.732, "grad_norm": 1.6307556629180908, "learning_rate": 0.00017709531747949796, "loss": 0.931, "step": 183 },
    { "epoch": 0.744, "grad_norm": 1.6843342781066895, "learning_rate": 0.00017626435190000467, "loss": 0.8472, "step": 186 },
    { "epoch": 0.756, "grad_norm": 1.5309902429580688, "learning_rate": 0.00017542060443231572, "loss": 1.0447, "step": 189 },
    { "epoch": 0.768, "grad_norm": 1.315926194190979, "learning_rate": 0.00017456421648831655, "loss": 0.8866, "step": 192 },
    { "epoch": 0.78, "grad_norm": 1.4360320568084717, "learning_rate": 0.00017369533159843369, "loss": 0.8383, "step": 195 },
    { "epoch": 0.792, "grad_norm": 1.3954721689224243, "learning_rate": 0.00017281409538757883, "loss": 0.8642, "step": 198 },
    { "epoch": 0.804, "grad_norm": 1.0596987009048462, "learning_rate": 0.00017192065555074245, "loss": 1.0095, "step": 201 },
    { "epoch": 0.816, "grad_norm": 1.3004777431488037, "learning_rate": 0.00017101516182823986, "loss": 0.8043, "step": 204 },
    { "epoch": 0.828, "grad_norm": 1.6855559349060059, "learning_rate": 0.00017009776598061495, "loss": 0.6924, "step": 207 },
    { "epoch": 0.84, "grad_norm": 2.01497745513916, "learning_rate": 0.00016916862176320508, "loss": 0.9077, "step": 210 },
    { "epoch": 0.852, "grad_norm": 1.7603662014007568, "learning_rate": 0.00016822788490037177, "loss": 0.7555, "step": 213 },
    { "epoch": 0.864, "grad_norm": 1.3321644067764282, "learning_rate": 0.00016727571305940125, "loss": 0.8248, "step": 216 },
    { "epoch": 0.876, "grad_norm": 1.8733290433883667, "learning_rate": 0.00016631226582407952, "loss": 0.9551, "step": 219 },
    { "epoch": 0.888, "grad_norm": 1.51967453956604, "learning_rate": 0.00016533770466794622, "loss": 0.8852, "step": 222 },
    { "epoch": 0.9, "grad_norm": 2.008765935897827, "learning_rate": 0.00016435219292723147, "loss": 0.8557, "step": 225 },
    { "epoch": 0.912, "grad_norm": 1.6430020332336426, "learning_rate": 0.00016335589577348104, "loss": 0.9588, "step": 228 },
    { "epoch": 0.924, "grad_norm": 1.231882929801941, "learning_rate": 0.00016234898018587337, "loss": 0.8806, "step": 231 },
    { "epoch": 0.936, "grad_norm": 1.5018281936645508, "learning_rate": 0.0001613316149232341, "loss": 0.8107, "step": 234 },
    { "epoch": 0.948, "grad_norm": 1.7227574586868286, "learning_rate": 0.00016030397049575203, "loss": 0.8624, "step": 237 },
    { "epoch": 0.96, "grad_norm": 1.324074387550354, "learning_rate": 0.0001592662191364017, "loss": 0.8795, "step": 240 },
    { "epoch": 0.972, "grad_norm": 1.803436279296875, "learning_rate": 0.00015821853477207708, "loss": 0.9413, "step": 243 },
    { "epoch": 0.984, "grad_norm": 1.5749870538711548, "learning_rate": 0.00015716109299444158, "loss": 0.7096, "step": 246 },
    { "epoch": 0.996, "grad_norm": 1.7215006351470947, "learning_rate": 0.00015609407103049896, "loss": 0.8372, "step": 249 },
    { "epoch": 1.008, "grad_norm": 1.418531894683838, "learning_rate": 0.0001550176477128899, "loss": 0.9418, "step": 252 },
    { "epoch": 1.02, "grad_norm": 1.624677062034607, "learning_rate": 0.00015393200344991995, "loss": 0.6523, "step": 255 },
    { "epoch": 1.032, "grad_norm": 1.8510764837265015, "learning_rate": 0.0001528373201953229, "loss": 0.6848, "step": 258 },
    { "epoch": 1.044, "grad_norm": 2.023068904876709, "learning_rate": 0.00015173378141776568, "loss": 0.6711, "step": 261 },
    { "epoch": 1.056, "grad_norm": 1.6575472354888916, "learning_rate": 0.00015062157207009874, "loss": 0.9591, "step": 264 },
    { "epoch": 1.068, "grad_norm": 1.551555871963501, "learning_rate": 0.00014950087855835815, "loss": 0.5599, "step": 267 },
    { "epoch": 1.08, "grad_norm": 1.8778928518295288, "learning_rate": 0.000148371888710524, "loss": 0.8402, "step": 270 },
    { "epoch": 1.092, "grad_norm": 1.701336145401001, "learning_rate": 0.00014723479174504037, "loss": 0.646, "step": 273 },
    { "epoch": 1.104, "grad_norm": 2.068992853164673, "learning_rate": 0.00014608977823910257, "loss": 0.8327, "step": 276 },
    { "epoch": 1.116, "grad_norm": 2.2500243186950684, "learning_rate": 0.00014493704009671613, "loss": 0.8004, "step": 279 },
    { "epoch": 1.1280000000000001, "grad_norm": 1.7683219909667969, "learning_rate": 0.00014377677051653404, "loss": 0.7407, "step": 282 },
    { "epoch": 1.1400000000000001, "grad_norm": 1.5629931688308716, "learning_rate": 0.00014260916395947656, "loss": 0.6372, "step": 285 },
    { "epoch": 1.152, "grad_norm": 1.4379006624221802, "learning_rate": 0.0001414344161161395, "loss": 0.714, "step": 288 },
    { "epoch": 1.164, "grad_norm": 2.0503599643707275, "learning_rate": 0.00014025272387399674, "loss": 0.7416, "step": 291 },
    { "epoch": 1.176, "grad_norm": 1.7053464651107788, "learning_rate": 0.0001390642852844019, "loss": 0.7885, "step": 294 },
    { "epoch": 1.188, "grad_norm": 1.7159677743911743, "learning_rate": 0.00013786929952939477, "loss": 0.7757, "step": 297 },
    { "epoch": 1.2, "grad_norm": 2.182704448699951, "learning_rate": 0.00013666796688831866, "loss": 0.7267, "step": 300 },
    { "epoch": 1.212, "grad_norm": 1.7983808517456055, "learning_rate": 0.00013546048870425356, "loss": 0.8256, "step": 303 },
    { "epoch": 1.224, "grad_norm": 2.5008952617645264, "learning_rate": 0.00013424706735027108, "loss": 0.8931, "step": 306 },
    { "epoch": 1.236, "grad_norm": 2.0425970554351807, "learning_rate": 0.00013302790619551674, "loss": 0.8057, "step": 309 },
    { "epoch": 1.248, "grad_norm": 1.6034411191940308, "learning_rate": 0.00013180320957112537, "loss": 0.5682, "step": 312 },
    { "epoch": 1.26, "grad_norm": 1.7696748971939087, "learning_rate": 0.0001305731827359753, "loss": 0.6621, "step": 315 },
    { "epoch": 1.272, "grad_norm": 1.8495662212371826, "learning_rate": 0.000129338031842287, "loss": 1.0297, "step": 318 },
    { "epoch": 1.284, "grad_norm": 1.9274413585662842, "learning_rate": 0.00012809796390107195, "loss": 0.698, "step": 321 },
    { "epoch": 1.296, "grad_norm": 1.388038158416748, "learning_rate": 0.0001268531867474377, "loss": 0.6074, "step": 324 },
    { "epoch": 1.308, "grad_norm": 1.9666837453842163, "learning_rate": 0.0001256039090057547, "loss": 0.6017, "step": 327 },
    { "epoch": 1.32, "grad_norm": 2.2773425579071045, "learning_rate": 0.00012435034005469107, "loss": 0.7342, "step": 330 },
    { "epoch": 1.332, "grad_norm": 2.059131383895874, "learning_rate": 0.0001230926899921206, "loss": 0.7482, "step": 333 },
    { "epoch": 1.3439999999999999, "grad_norm": 1.4702714681625366, "learning_rate": 0.00012183116959991024, "loss": 0.6535, "step": 336 },
    { "epoch": 1.3559999999999999, "grad_norm": 2.339047908782959, "learning_rate": 0.00012056599030859366, "loss": 0.6876, "step": 339 },
    { "epoch": 1.3679999999999999, "grad_norm": 1.404402732849121, "learning_rate": 0.000119297364161935, "loss": 0.6558, "step": 342 },
    { "epoch": 1.38, "grad_norm": 1.5481153726577759, "learning_rate": 0.0001180255037813906, "loss": 0.8033, "step": 345 },
    { "epoch": 1.392, "grad_norm": 1.8469116687774658, "learning_rate": 0.00011675062233047364, "loss": 0.9062, "step": 348 },
    { "epoch": 1.404, "grad_norm": 2.170661211013794, "learning_rate": 0.00011547293347902812, "loss": 1.0204, "step": 351 },
    { "epoch": 1.416, "grad_norm": 1.6551048755645752, "learning_rate": 0.00011419265136741768, "loss": 0.7657, "step": 354 },
    { "epoch": 1.428, "grad_norm": 1.9395204782485962, "learning_rate": 0.00011290999057063569, "loss": 0.5953, "step": 357 },
    { "epoch": 1.44, "grad_norm": 1.674118995666504, "learning_rate": 0.00011162516606234277, "loss": 0.6788, "step": 360 },
    { "epoch": 1.452, "grad_norm": 2.0402939319610596, "learning_rate": 0.00011033839317883701, "loss": 0.6253, "step": 363 },
    { "epoch": 1.464, "grad_norm": 1.7363512516021729, "learning_rate": 0.0001090498875829638, "loss": 0.7973, "step": 366 },
    { "epoch": 1.476, "grad_norm": 3.42630672454834, "learning_rate": 0.00010775986522797063, "loss": 0.8157, "step": 369 },
    { "epoch": 1.488, "grad_norm": 2.1797847747802734, "learning_rate": 0.0001064685423213136, "loss": 0.6909, "step": 372 },
    { "epoch": 1.5, "grad_norm": 2.1669764518737793, "learning_rate": 0.00010517613528842097, "loss": 0.8111, "step": 375 },
    { "epoch": 1.512, "grad_norm": 1.4340484142303467, "learning_rate": 0.00010388286073642015, "loss": 0.8091, "step": 378 },
    { "epoch": 1.524, "grad_norm": 1.9211022853851318, "learning_rate": 0.00010258893541783476, "loss": 0.7045, "step": 381 },
    { "epoch": 1.536, "grad_norm": 1.3780417442321777, "learning_rate": 0.0001012945761942566, "loss": 0.6279, "step": 384 },
    { "epoch": 1.548, "grad_norm": 2.026918888092041, "learning_rate": 0.0001, "loss": 0.8944, "step": 387 },
    { "epoch": 1.56, "grad_norm": 2.160315990447998, "learning_rate": 9.870542380574341e-05, "loss": 0.8724, "step": 390 },
    { "epoch": 1.572, "grad_norm": 1.6254347562789917, "learning_rate": 9.741106458216528e-05, "loss": 0.787, "step": 393 },
    { "epoch": 1.584, "grad_norm": 1.8320547342300415, "learning_rate": 9.61171392635799e-05, "loss": 0.8769, "step": 396 },
    { "epoch": 1.596, "grad_norm": 1.8300505876541138, "learning_rate": 9.482386471157904e-05, "loss": 0.6325, "step": 399 },
    { "epoch": 1.608, "grad_norm": 1.4335874319076538, "learning_rate": 9.353145767868638e-05, "loss": 0.6019, "step": 402 },
    { "epoch": 1.62, "grad_norm": 2.346412181854248, "learning_rate": 9.224013477202939e-05, "loss": 0.5956, "step": 405 },
    { "epoch": 1.6320000000000001, "grad_norm": 1.917075276374817, "learning_rate": 9.095011241703623e-05, "loss": 0.8203, "step": 408 },
    { "epoch": 1.6440000000000001, "grad_norm": 2.0511739253997803, "learning_rate": 8.9661606821163e-05, "loss": 0.7623, "step": 411 },
    { "epoch": 1.6560000000000001, "grad_norm": 1.5712239742279053, "learning_rate": 8.837483393765724e-05, "loss": 0.7278, "step": 414 },
    { "epoch": 1.6680000000000001, "grad_norm": 2.1062729358673096, "learning_rate": 8.709000942936433e-05, "loss": 0.6844, "step": 417 },
    { "epoch": 1.6800000000000002, "grad_norm": 1.0471903085708618, "learning_rate": 8.580734863258237e-05, "loss": 0.6001, "step": 420 },
    { "epoch": 1.692, "grad_norm": 2.182645797729492, "learning_rate": 8.452706652097186e-05, "loss": 0.773, "step": 423 },
    { "epoch": 1.704, "grad_norm": 1.5720593929290771, "learning_rate": 8.324937766952638e-05, "loss": 0.5371, "step": 426 },
    { "epoch": 1.716, "grad_norm": 2.1978089809417725, "learning_rate": 8.197449621860943e-05, "loss": 0.8579, "step": 429 },
    { "epoch": 1.728, "grad_norm": 2.014761209487915, "learning_rate": 8.070263583806503e-05, "loss": 0.7591, "step": 432 },
    { "epoch": 1.74, "grad_norm": 1.8996471166610718, "learning_rate": 7.943400969140635e-05, "loss": 0.8224, "step": 435 },
    { "epoch": 1.752, "grad_norm": 1.631941556930542, "learning_rate": 7.816883040008978e-05, "loss": 0.5066, "step": 438 },
    { "epoch": 1.764, "grad_norm": 2.4093146324157715, "learning_rate": 7.690731000787948e-05, "loss": 0.7607, "step": 441 },
    { "epoch": 1.776, "grad_norm": 2.3090007305145264, "learning_rate": 7.56496599453089e-05, "loss": 0.7572, "step": 444 },
    { "epoch": 1.788, "grad_norm": 1.954633355140686, "learning_rate": 7.43960909942453e-05, "loss": 0.6932, "step": 447 },
    { "epoch": 1.8, "grad_norm": 2.2641658782958984, "learning_rate": 7.314681325256232e-05, "loss": 0.7305, "step": 450 },
    { "epoch": 1.812, "grad_norm": 1.7580173015594482, "learning_rate": 7.190203609892808e-05, "loss": 0.6618, "step": 453 },
    { "epoch": 1.8239999999999998, "grad_norm": 2.495814085006714, "learning_rate": 7.066196815771302e-05, "loss": 0.6998, "step": 456 },
    { "epoch": 1.8359999999999999, "grad_norm": 1.600066900253296, "learning_rate": 6.942681726402473e-05, "loss": 0.5911, "step": 459 },
    { "epoch": 1.8479999999999999, "grad_norm": 1.9360573291778564, "learning_rate": 6.819679042887468e-05, "loss": 0.9434, "step": 462 },
    { "epoch": 1.8599999999999999, "grad_norm": 2.6737253665924072, "learning_rate": 6.697209380448333e-05, "loss": 0.7037, "step": 465 },
    { "epoch": 1.8719999999999999, "grad_norm": 2.3846089839935303, "learning_rate": 6.575293264972893e-05, "loss": 0.8254, "step": 468 },
    { "epoch": 1.884, "grad_norm": 2.3048312664031982, "learning_rate": 6.453951129574644e-05, "loss": 0.9766, "step": 471 },
    { "epoch": 1.896, "grad_norm": 1.872440218925476, "learning_rate": 6.333203311168135e-05, "loss": 0.8265, "step": 474 },
    { "epoch": 1.908, "grad_norm": 1.8152889013290405, "learning_rate": 6.213070047060524e-05, "loss": 0.8833, "step": 477 },
    { "epoch": 1.92, "grad_norm": 1.7105317115783691, "learning_rate": 6.093571471559811e-05, "loss": 0.6763, "step": 480 },
    { "epoch": 1.932, "grad_norm": 2.321948766708374, "learning_rate": 5.9747276126003257e-05, "loss": 0.6024, "step": 483 },
    { "epoch": 1.944, "grad_norm": 1.6151583194732666, "learning_rate": 5.856558388386055e-05, "loss": 0.5752, "step": 486 },
    { "epoch": 1.956, "grad_norm": 1.6309243440628052, "learning_rate": 5.739083604052351e-05, "loss": 0.4642, "step": 489 },
    { "epoch": 1.968, "grad_norm": 2.6804358959198, "learning_rate": 5.622322948346594e-05, "loss": 0.8228, "step": 492 },
    { "epoch": 1.98, "grad_norm": 2.1613543033599854, "learning_rate": 5.506295990328385e-05, "loss": 0.9623, "step": 495 },
    { "epoch": 1.992, "grad_norm": 1.6900639533996582, "learning_rate": 5.3910221760897464e-05, "loss": 0.7248, "step": 498 },
    { "epoch": 2.004, "grad_norm": 1.9322189092636108, "learning_rate": 5.276520825495963e-05, "loss": 0.719, "step": 501 },
    { "epoch": 2.016, "grad_norm": 1.8491179943084717, "learning_rate": 5.162811128947602e-05, "loss": 0.6374, "step": 504 },
    { "epoch": 2.028, "grad_norm": 1.827660322189331, "learning_rate": 5.0499121441641864e-05, "loss": 0.7009, "step": 507 },
    { "epoch": 2.04, "grad_norm": 1.9986926317214966, "learning_rate": 4.9378427929901306e-05, "loss": 0.693, "step": 510 },
    { "epoch": 2.052, "grad_norm": 2.270709753036499, "learning_rate": 4.826621858223431e-05, "loss": 0.5809, "step": 513 },
    { "epoch": 2.064, "grad_norm": 1.7829395532608032, "learning_rate": 4.7162679804677076e-05, "loss": 0.7653, "step": 516 },
    { "epoch": 2.076, "grad_norm": 1.772600531578064, "learning_rate": 4.606799655008009e-05, "loss": 0.4681, "step": 519 },
    { "epoch": 2.088, "grad_norm": 1.8009684085845947, "learning_rate": 4.498235228711012e-05, "loss": 0.4102, "step": 522 },
    { "epoch": 2.1, "grad_norm": 2.441601276397705, "learning_rate": 4.3905928969501056e-05, "loss": 0.6207, "step": 525 },
    { "epoch": 2.112, "grad_norm": 2.350313663482666, "learning_rate": 4.2838907005558406e-05, "loss": 0.659, "step": 528 },
    { "epoch": 2.124, "grad_norm": 2.4971227645874023, "learning_rate": 4.1781465227922957e-05, "loss": 0.6974, "step": 531 },
    { "epoch": 2.136, "grad_norm": 2.6324408054351807, "learning_rate": 4.0733780863598335e-05, "loss": 0.7108, "step": 534 },
    { "epoch": 2.148, "grad_norm": 2.413882255554199, "learning_rate": 3.9696029504247956e-05, "loss": 0.6228, "step": 537 },
    { "epoch": 2.16, "grad_norm": 2.309478282928467, "learning_rate": 3.866838507676592e-05, "loss": 0.6082, "step": 540 },
    { "epoch": 2.172, "grad_norm": 1.7304553985595703, "learning_rate": 3.7651019814126654e-05, "loss": 0.482, "step": 543 },
    { "epoch": 2.184, "grad_norm": 1.6498790979385376, "learning_rate": 3.664410422651898e-05, "loss": 0.5673, "step": 546 },
    { "epoch": 2.196, "grad_norm": 2.485692262649536, "learning_rate": 3.5647807072768526e-05, "loss": 0.6241, "step": 549 },
    { "epoch": 2.208, "grad_norm": 2.3340401649475098, "learning_rate": 3.466229533205382e-05, "loss": 0.6536, "step": 552 },
    { "epoch": 2.22, "grad_norm": 2.2559235095977783, "learning_rate": 3.36877341759205e-05, "loss": 0.5349, "step": 555 },
    { "epoch": 2.232, "grad_norm": 2.2168805599212646, "learning_rate": 3.272428694059876e-05, "loss": 0.587, "step": 558 },
    { "epoch": 2.2439999999999998, "grad_norm": 2.802232503890991, "learning_rate": 3.177211509962826e-05, "loss": 0.7633, "step": 561 },
    { "epoch": 2.2560000000000002, "grad_norm": 2.523928642272949, "learning_rate": 3.083137823679493e-05, "loss": 0.832, "step": 564 },
    { "epoch": 2.268, "grad_norm": 3.1273038387298584, "learning_rate": 2.9902234019385057e-05, "loss": 0.6549, "step": 567 },
    { "epoch": 2.2800000000000002, "grad_norm": 1.7540401220321655, "learning_rate": 2.8984838171760143e-05, "loss": 0.5879, "step": 570 },
    { "epoch": 2.292, "grad_norm": 1.6333884000778198, "learning_rate": 2.8079344449257572e-05, "loss": 0.4111, "step": 573 },
    { "epoch": 2.304, "grad_norm": 2.337137222290039, "learning_rate": 2.7185904612421176e-05, "loss": 0.6164, "step": 576 },
    { "epoch": 2.316, "grad_norm": 2.0755574703216553, "learning_rate": 2.6304668401566335e-05, "loss": 0.5297, "step": 579 },
    { "epoch": 2.328, "grad_norm": 1.6737080812454224, "learning_rate": 2.5435783511683443e-05, "loss": 0.5128, "step": 582 },
    { "epoch": 2.34, "grad_norm": 1.6794757843017578, "learning_rate": 2.4579395567684283e-05, "loss": 0.5672, "step": 585 },
    { "epoch": 2.352, "grad_norm": 2.5523290634155273, "learning_rate": 2.373564809999532e-05, "loss": 0.604, "step": 588 },
    { "epoch": 2.364, "grad_norm": 2.5056543350219727, "learning_rate": 2.290468252050204e-05, "loss": 0.6708, "step": 591 },
    { "epoch": 2.376, "grad_norm": 2.3490312099456787, "learning_rate": 2.2086638098848356e-05, "loss": 0.5676, "step": 594 },
    { "epoch": 2.388, "grad_norm": 2.346902370452881, "learning_rate": 2.1281651939094992e-05, "loss": 0.4205, "step": 597 },
    { "epoch": 2.4, "grad_norm": 2.412750482559204, "learning_rate": 2.048985895674098e-05, "loss": 0.6508, "step": 600 },
    { "epoch": 2.412, "grad_norm": 1.9428837299346924, "learning_rate": 1.971139185611176e-05, "loss": 0.6276, "step": 603 },
    { "epoch": 2.424, "grad_norm": 2.1207127571105957, "learning_rate": 1.8946381108118006e-05, "loss": 0.5896, "step": 606 },
    { "epoch": 2.436, "grad_norm": 2.845691680908203, "learning_rate": 1.819495492838872e-05, "loss": 0.686, "step": 609 },
    { "epoch": 2.448, "grad_norm": 2.436755418777466, "learning_rate": 1.7457239255782333e-05, "loss": 0.5219, "step": 612 },
    { "epoch": 2.46, "grad_norm": 1.8439298868179321, "learning_rate": 1.6733357731279377e-05, "loss": 0.5294, "step": 615 },
    { "epoch": 2.472, "grad_norm": 2.1576616764068604, "learning_rate": 1.6023431677260214e-05, "loss": 0.642, "step": 618 },
    { "epoch": 2.484, "grad_norm": 2.6464619636535645, "learning_rate": 1.5327580077171587e-05, "loss": 0.72, "step": 621 },
    { "epoch": 2.496, "grad_norm": 2.5440258979797363, "learning_rate": 1.4645919555584885e-05, "loss": 0.721, "step": 624 },
    { "epoch": 2.508, "grad_norm": 2.1442267894744873, "learning_rate": 1.3978564358649927e-05, "loss": 0.6866, "step": 627 },
    { "epoch": 2.52, "grad_norm": 1.863466739654541, "learning_rate": 1.3325626334947383e-05, "loss": 0.6299, "step": 630 },
    { "epoch": 2.532, "grad_norm": 2.593524217605591, "learning_rate": 1.2687214916742918e-05, "loss": 0.8559, "step": 633 },
    { "epoch": 2.544, "grad_norm": 2.2633841037750244, "learning_rate": 1.206343710164638e-05, "loss": 0.7061, "step": 636 },
    { "epoch": 2.556, "grad_norm": 2.067728281021118, "learning_rate": 1.1454397434679021e-05, "loss": 0.5805, "step": 639 },
    { "epoch": 2.568, "grad_norm": 3.0227150917053223, "learning_rate": 1.0860197990751807e-05, "loss": 0.7688, "step": 642 },
    { "epoch": 2.58, "grad_norm": 2.5358314514160156, "learning_rate": 1.028093835755769e-05, "loss": 0.8175, "step": 645 },
    { "epoch": 2.592, "grad_norm": 2.311980962753296, "learning_rate": 9.716715618880668e-06, "loss": 0.7278, "step": 648 },
    { "epoch": 2.604, "grad_norm": 2.0393810272216797, "learning_rate": 9.1676243383246e-06, "loss": 0.5096, "step": 651 },
    { "epoch": 2.616, "grad_norm": 2.303673505783081, "learning_rate": 8.633756543464421e-06, "loss": 0.6962, "step": 654 },
    { "epoch": 2.628, "grad_norm": 2.4138078689575195, "learning_rate": 8.115201710422282e-06, "loss": 0.5913, "step": 657 },
    { "epoch": 2.64, "grad_norm": 2.1802985668182373, "learning_rate": 7.612046748871327e-06, "loss": 0.4857, "step": 660 },
    { "epoch": 2.652, "grad_norm": 2.260392427444458, "learning_rate": 7.124375987469767e-06, "loss": 0.5718, "step": 663 },
    { "epoch": 2.664, "grad_norm": 2.359084129333496, "learning_rate": 6.652271159727352e-06, "loss": 0.572, "step": 666 },
    { "epoch": 2.676, "grad_norm": 2.000070810317993, "learning_rate": 6.195811390306816e-06, "loss": 0.5156, "step": 669 },
    { "epoch": 2.6879999999999997, "grad_norm": 1.6218239068984985, "learning_rate": 5.755073181762671e-06, "loss": 0.5135, "step": 672 },
    { "epoch": 2.7, "grad_norm": 2.580967903137207, "learning_rate": 5.3301304017194135e-06, "loss": 0.5636, "step": 675 },
    { "epoch": 2.7119999999999997, "grad_norm": 2.247042655944824, "learning_rate": 4.921054270491187e-06, "loss": 0.5567, "step": 678 },
    { "epoch": 2.724, "grad_norm": 1.9530930519104004, "learning_rate": 4.527913349145441e-06, "loss": 0.5014, "step": 681 },
    { "epoch": 2.7359999999999998, "grad_norm": 2.009063482284546, "learning_rate": 4.150773528012008e-06, "loss": 0.5435, "step": 684 },
    { "epoch": 2.748, "grad_norm": 2.426750659942627, "learning_rate": 3.789698015639953e-06, "loss": 0.5175, "step": 687 },
    { "epoch": 2.76, "grad_norm": 2.8535356521606445, "learning_rate": 3.444747328203779e-06, "loss": 0.6014, "step": 690 },
    { "epoch": 2.7720000000000002, "grad_norm": 2.2690775394439697, "learning_rate": 3.115979279360992e-06, "loss": 0.5924, "step": 693 },
    { "epoch": 2.784, "grad_norm": 2.555889844894409, "learning_rate": 2.8034489705624944e-06, "loss": 0.5563, "step": 696 },
    { "epoch": 2.7960000000000003, "grad_norm": 2.984099864959717, "learning_rate": 2.5072087818176382e-06, "loss": 0.6043, "step": 699 },
    { "epoch": 2.808, "grad_norm": 2.361802339553833, "learning_rate": 2.2273083629153147e-06, "loss": 0.6905, "step": 702 },
    { "epoch": 2.82, "grad_norm": 2.0989291667938232, "learning_rate": 1.963794625102655e-06, "loss": 0.5107, "step": 705 },
    { "epoch": 2.832, "grad_norm": 1.436015248298645, "learning_rate": 1.7167117332227533e-06, "loss": 0.4399, "step": 708 },
    { "epoch": 2.844, "grad_norm": 2.377495527267456, "learning_rate": 1.48610109831262e-06, "loss": 0.5663, "step": 711 },
    { "epoch": 2.856, "grad_norm": 2.165872097015381, "learning_rate": 1.2720013706627122e-06, "loss": 0.4201, "step": 714 },
    { "epoch": 2.868, "grad_norm": 1.8650953769683838, "learning_rate": 1.0744484333391368e-06, "loss": 0.6054, "step": 717 },
    { "epoch": 2.88, "grad_norm": 2.115549325942993, "learning_rate": 8.93475396169674e-07, "loss": 0.6261, "step": 720 },
    { "epoch": 2.892, "grad_norm": 2.3398571014404297, "learning_rate": 7.291125901946027e-07, "loss": 0.5593, "step": 723 },
    { "epoch": 2.904, "grad_norm": 2.0688586235046387, "learning_rate": 5.813875625831467e-07, "loss": 0.7174, "step": 726 },
    { "epoch": 2.916, "grad_norm": 2.1658966541290283, "learning_rate": 4.503250720166774e-07, "loss": 0.6091, "step": 729 },
    { "epoch": 2.928, "grad_norm": 1.7528609037399292, "learning_rate": 3.3594708453906775e-07, "loss": 0.4243, "step": 732 },
    { "epoch": 2.94, "grad_norm": 2.115555763244629, "learning_rate": 2.382727698752474e-07, "loss": 0.4401, "step": 735 },
    { "epoch": 2.952, "grad_norm": 2.410830020904541, "learning_rate": 1.5731849821833954e-07, "loss": 0.4738, "step": 738 },
    { "epoch": 2.964, "grad_norm": 2.3854079246520996, "learning_rate": 9.309783748606693e-08, "loss": 0.597, "step": 741 },
    { "epoch": 2.976, "grad_norm": 2.455721855163574, "learning_rate": 4.562155104665955e-08, "loss": 0.7317, "step": 744 },
    { "epoch": 2.988, "grad_norm": 2.517000436782837, "learning_rate": 1.4897595915053242e-08, "loss": 0.6519, "step": 747 },
    { "epoch": 3.0, "grad_norm": 2.7324557304382324, "learning_rate": 9.311214192120332e-10, "loss": 0.6816, "step": 750 }
  ],
  "logging_steps": 3,
  "max_steps": 750,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8318132819066880.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}