{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0005646527385659,
  "eval_steps": 222,
  "global_step": 443,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.002258610954263128, "grad_norm": 0.696092426776886, "learning_rate": 3.3333333333333333e-06, "loss": 2.1698, "step": 1},
    {"epoch": 0.002258610954263128, "eval_loss": 2.083223819732666, "eval_runtime": 250.3739, "eval_samples_per_second": 2.98, "eval_steps_per_second": 0.375, "step": 1},
    {"epoch": 0.004517221908526256, "grad_norm": 0.7258424758911133, "learning_rate": 6.666666666666667e-06, "loss": 2.2383, "step": 2},
    {"epoch": 0.006775832862789385, "grad_norm": 0.999031662940979, "learning_rate": 1e-05, "loss": 1.9518, "step": 3},
    {"epoch": 0.009034443817052512, "grad_norm": 0.8558158278465271, "learning_rate": 1.3333333333333333e-05, "loss": 2.3031, "step": 4},
    {"epoch": 0.01129305477131564, "grad_norm": 1.0158196687698364, "learning_rate": 1.6666666666666667e-05, "loss": 2.3383, "step": 5},
    {"epoch": 0.01355166572557877, "grad_norm": 1.3310142755508423, "learning_rate": 2e-05, "loss": 2.3352, "step": 6},
    {"epoch": 0.015810276679841896, "grad_norm": 0.7580591440200806, "learning_rate": 2.3333333333333336e-05, "loss": 1.6814, "step": 7},
    {"epoch": 0.018068887634105024, "grad_norm": 0.7518572211265564, "learning_rate": 2.6666666666666667e-05, "loss": 1.7568, "step": 8},
    {"epoch": 0.020327498588368152, "grad_norm": 2.006964683532715, "learning_rate": 3e-05, "loss": 2.6346, "step": 9},
    {"epoch": 0.02258610954263128, "grad_norm": 0.6915990114212036, "learning_rate": 3.3333333333333335e-05, "loss": 2.3084, "step": 10},
    {"epoch": 0.024844720496894408, "grad_norm": 0.8575783967971802, "learning_rate": 3.6666666666666666e-05, "loss": 2.0648, "step": 11},
    {"epoch": 0.02710333145115754, "grad_norm": 1.1422725915908813, "learning_rate": 4e-05, "loss": 2.1607, "step": 12},
    {"epoch": 0.029361942405420668, "grad_norm": 1.0447956323623657, "learning_rate": 4.3333333333333334e-05, "loss": 1.9577, "step": 13},
    {"epoch": 0.03162055335968379, "grad_norm": 1.0838192701339722, "learning_rate": 4.666666666666667e-05, "loss": 1.8649, "step": 14},
    {"epoch": 0.03387916431394692, "grad_norm": 1.2682744264602661, "learning_rate": 5e-05, "loss": 2.0382, "step": 15},
    {"epoch": 0.03613777526821005, "grad_norm": 1.5851482152938843, "learning_rate": 5.333333333333333e-05, "loss": 1.9882, "step": 16},
    {"epoch": 0.038396386222473176, "grad_norm": 1.1369619369506836, "learning_rate": 5.666666666666667e-05, "loss": 1.609, "step": 17},
    {"epoch": 0.040654997176736304, "grad_norm": 1.489020824432373, "learning_rate": 6e-05, "loss": 1.6515, "step": 18},
    {"epoch": 0.04291360813099943, "grad_norm": 1.0247050523757935, "learning_rate": 6.333333333333333e-05, "loss": 1.7473, "step": 19},
    {"epoch": 0.04517221908526256, "grad_norm": 0.913925051689148, "learning_rate": 6.666666666666667e-05, "loss": 1.7019, "step": 20},
    {"epoch": 0.04743083003952569, "grad_norm": 1.1576547622680664, "learning_rate": 7e-05, "loss": 1.5537, "step": 21},
    {"epoch": 0.049689440993788817, "grad_norm": 1.4580681324005127, "learning_rate": 7.333333333333333e-05, "loss": 1.6705, "step": 22},
    {"epoch": 0.05194805194805195, "grad_norm": 1.6036103963851929, "learning_rate": 7.666666666666667e-05, "loss": 1.6114, "step": 23},
    {"epoch": 0.05420666290231508, "grad_norm": 1.218241572380066, "learning_rate": 8e-05, "loss": 1.8225, "step": 24},
    {"epoch": 0.05646527385657821, "grad_norm": 1.334812879562378, "learning_rate": 8.333333333333334e-05, "loss": 1.6176, "step": 25},
    {"epoch": 0.058723884810841336, "grad_norm": 1.0912553071975708, "learning_rate": 8.666666666666667e-05, "loss": 1.7103, "step": 26},
    {"epoch": 0.060982495765104464, "grad_norm": 0.8983953595161438, "learning_rate": 9e-05, "loss": 1.826, "step": 27},
    {"epoch": 0.06324110671936758, "grad_norm": 1.1796810626983643, "learning_rate": 9.333333333333334e-05, "loss": 1.3892, "step": 28},
    {"epoch": 0.06549971767363072, "grad_norm": 0.9814470410346985, "learning_rate": 9.666666666666667e-05, "loss": 1.4582, "step": 29},
    {"epoch": 0.06775832862789384, "grad_norm": 1.072553038597107, "learning_rate": 0.0001, "loss": 1.4753, "step": 30},
    {"epoch": 0.07001693958215698, "grad_norm": 1.1693260669708252, "learning_rate": 9.999855343632036e-05, "loss": 1.6425, "step": 31},
    {"epoch": 0.0722755505364201, "grad_norm": 0.8432520031929016, "learning_rate": 9.999421382898329e-05, "loss": 1.6913, "step": 32},
    {"epoch": 0.07453416149068323, "grad_norm": 0.6651790738105774, "learning_rate": 9.998698142908953e-05, "loss": 1.2049, "step": 33},
    {"epoch": 0.07679277244494635, "grad_norm": 1.2236829996109009, "learning_rate": 9.997685665512418e-05, "loss": 1.3681, "step": 34},
    {"epoch": 0.07905138339920949, "grad_norm": 0.5604356527328491, "learning_rate": 9.99638400929324e-05, "loss": 1.2572, "step": 35},
    {"epoch": 0.08130999435347261, "grad_norm": 0.5676224827766418, "learning_rate": 9.994793249568569e-05, "loss": 1.3594, "step": 36},
    {"epoch": 0.08356860530773574, "grad_norm": 0.5219910144805908, "learning_rate": 9.99291347838381e-05, "loss": 1.597, "step": 37},
    {"epoch": 0.08582721626199886, "grad_norm": 0.7226600646972656, "learning_rate": 9.990744804507315e-05, "loss": 1.3577, "step": 38},
    {"epoch": 0.088085827216262, "grad_norm": 0.5216960310935974, "learning_rate": 9.988287353424077e-05, "loss": 1.7175, "step": 39},
    {"epoch": 0.09034443817052512, "grad_norm": 0.5965225100517273, "learning_rate": 9.985541267328477e-05, "loss": 1.5252, "step": 40},
    {"epoch": 0.09260304912478826, "grad_norm": 0.5782445669174194, "learning_rate": 9.98250670511605e-05, "loss": 1.9063, "step": 41},
    {"epoch": 0.09486166007905138, "grad_norm": 0.9046345353126526, "learning_rate": 9.979183842374293e-05, "loss": 1.371, "step": 42},
    {"epoch": 0.09712027103331451, "grad_norm": 0.7153818607330322, "learning_rate": 9.975572871372513e-05, "loss": 1.3616, "step": 43},
    {"epoch": 0.09937888198757763, "grad_norm": 1.479604959487915, "learning_rate": 9.971674001050686e-05, "loss": 1.4922, "step": 44},
    {"epoch": 0.10163749294184077, "grad_norm": 0.7710794806480408, "learning_rate": 9.967487457007381e-05, "loss": 1.0812, "step": 45},
    {"epoch": 0.1038961038961039, "grad_norm": 0.792734682559967, "learning_rate": 9.963013481486703e-05, "loss": 1.5412, "step": 46},
    {"epoch": 0.10615471485036702, "grad_norm": 0.6719285249710083, "learning_rate": 9.958252333364267e-05, "loss": 1.1906, "step": 47},
    {"epoch": 0.10841332580463016, "grad_norm": 1.0701615810394287, "learning_rate": 9.953204288132234e-05, "loss": 1.3426, "step": 48},
    {"epoch": 0.11067193675889328, "grad_norm": 1.067221760749817, "learning_rate": 9.947869637883358e-05, "loss": 1.435, "step": 49},
    {"epoch": 0.11293054771315642, "grad_norm": 0.6624462008476257, "learning_rate": 9.942248691294093e-05, "loss": 1.148, "step": 50},
    {"epoch": 0.11518915866741954, "grad_norm": 1.10640549659729, "learning_rate": 9.936341773606723e-05, "loss": 1.4428, "step": 51},
    {"epoch": 0.11744776962168267, "grad_norm": 0.7549408674240112, "learning_rate": 9.930149226610554e-05, "loss": 1.4182, "step": 52},
    {"epoch": 0.11970638057594579, "grad_norm": 0.6992340683937073, "learning_rate": 9.923671408622129e-05, "loss": 1.3721, "step": 53},
    {"epoch": 0.12196499153020893, "grad_norm": 0.6394023895263672, "learning_rate": 9.916908694464492e-05, "loss": 1.5456, "step": 54},
    {"epoch": 0.12422360248447205, "grad_norm": 0.8388894200325012, "learning_rate": 9.909861475445517e-05, "loss": 1.6266, "step": 55},
    {"epoch": 0.12648221343873517, "grad_norm": 0.5841717720031738, "learning_rate": 9.902530159335243e-05, "loss": 1.6483, "step": 56},
    {"epoch": 0.1287408243929983, "grad_norm": 0.635645866394043, "learning_rate": 9.894915170342295e-05, "loss": 1.2798, "step": 57},
    {"epoch": 0.13099943534726144, "grad_norm": 0.8329370617866516, "learning_rate": 9.887016949089333e-05, "loss": 1.5148, "step": 58},
    {"epoch": 0.13325804630152457, "grad_norm": 0.7651770710945129, "learning_rate": 9.878835952587559e-05, "loss": 1.4382, "step": 59},
    {"epoch": 0.13551665725578768, "grad_norm": 0.6698804497718811, "learning_rate": 9.870372654210265e-05, "loss": 1.4922, "step": 60},
    {"epoch": 0.13777526821005082, "grad_norm": 0.807618260383606, "learning_rate": 9.861627543665456e-05, "loss": 1.8907, "step": 61},
    {"epoch": 0.14003387916431395, "grad_norm": 0.610428512096405, "learning_rate": 9.852601126967502e-05, "loss": 1.4187, "step": 62},
    {"epoch": 0.1422924901185771, "grad_norm": 0.5798942446708679, "learning_rate": 9.843293926407866e-05, "loss": 1.4308, "step": 63},
    {"epoch": 0.1445511010728402, "grad_norm": 0.5696601271629333, "learning_rate": 9.833706480524878e-05, "loss": 1.6111, "step": 64},
    {"epoch": 0.14680971202710333, "grad_norm": 0.5774890780448914, "learning_rate": 9.82383934407258e-05, "loss": 1.2417, "step": 65},
    {"epoch": 0.14906832298136646, "grad_norm": 0.6529080271720886, "learning_rate": 9.81369308798862e-05, "loss": 1.3103, "step": 66},
    {"epoch": 0.1513269339356296, "grad_norm": 0.47429534792900085, "learning_rate": 9.803268299361217e-05, "loss": 1.2813, "step": 67},
    {"epoch": 0.1535855448898927, "grad_norm": 0.9300480484962463, "learning_rate": 9.7925655813952e-05, "loss": 1.5553, "step": 68},
    {"epoch": 0.15584415584415584, "grad_norm": 0.6896660327911377, "learning_rate": 9.781585553377085e-05, "loss": 1.3391, "step": 69},
    {"epoch": 0.15810276679841898, "grad_norm": 0.513652503490448, "learning_rate": 9.770328850639268e-05, "loss": 1.2952, "step": 70},
    {"epoch": 0.1603613777526821, "grad_norm": 0.6096103191375732, "learning_rate": 9.758796124523239e-05, "loss": 1.6856, "step": 71},
    {"epoch": 0.16261998870694522, "grad_norm": 0.9341354370117188, "learning_rate": 9.746988042341906e-05, "loss": 1.2378, "step": 72},
    {"epoch": 0.16487859966120835, "grad_norm": 0.616295337677002, "learning_rate": 9.734905287340985e-05, "loss": 1.4577, "step": 73},
    {"epoch": 0.1671372106154715, "grad_norm": 0.5131121277809143, "learning_rate": 9.722548558659457e-05, "loss": 1.6515, "step": 74},
    {"epoch": 0.16939582156973462, "grad_norm": 0.7272030711174011, "learning_rate": 9.709918571289114e-05, "loss": 1.3936, "step": 75},
    {"epoch": 0.17165443252399773, "grad_norm": 0.6986638903617859, "learning_rate": 9.697016056033201e-05, "loss": 1.5857, "step": 76},
    {"epoch": 0.17391304347826086, "grad_norm": 0.5578714609146118, "learning_rate": 9.683841759464113e-05, "loss": 1.4122, "step": 77},
    {"epoch": 0.176171654432524, "grad_norm": 0.49430081248283386, "learning_rate": 9.670396443880208e-05, "loss": 1.5742, "step": 78},
    {"epoch": 0.17843026538678713, "grad_norm": 0.7834444642066956, "learning_rate": 9.656680887261693e-05, "loss": 1.4708, "step": 79},
    {"epoch": 0.18068887634105024, "grad_norm": 0.7123986482620239, "learning_rate": 9.64269588322561e-05, "loss": 1.6196, "step": 80},
    {"epoch": 0.18294748729531338, "grad_norm": 0.7276789546012878, "learning_rate": 9.628442240979916e-05, "loss": 1.38, "step": 81},
    {"epoch": 0.1852060982495765, "grad_norm": 0.580315351486206, "learning_rate": 9.613920785276656e-05, "loss": 1.62, "step": 82},
    {"epoch": 0.18746470920383965, "grad_norm": 0.8134167790412903, "learning_rate": 9.599132356364247e-05, "loss": 1.3458, "step": 83},
    {"epoch": 0.18972332015810275, "grad_norm": 0.9041950702667236, "learning_rate": 9.584077809938855e-05, "loss": 1.3943, "step": 84},
    {"epoch": 0.1919819311123659, "grad_norm": 0.6255788207054138, "learning_rate": 9.568758017094883e-05, "loss": 1.2289, "step": 85},
    {"epoch": 0.19424054206662902, "grad_norm": 0.5172642469406128, "learning_rate": 9.553173864274567e-05, "loss": 1.4261, "step": 86},
    {"epoch": 0.19649915302089216, "grad_norm": 0.4685104489326477, "learning_rate": 9.537326253216685e-05, "loss": 1.4084, "step": 87},
    {"epoch": 0.19875776397515527, "grad_norm": 0.5454607605934143, "learning_rate": 9.521216100904378e-05, "loss": 1.5901, "step": 88},
    {"epoch": 0.2010163749294184, "grad_norm": 0.5021257996559143, "learning_rate": 9.504844339512095e-05, "loss": 1.4189, "step": 89},
    {"epoch": 0.20327498588368154, "grad_norm": 0.6921396851539612, "learning_rate": 9.488211916351656e-05, "loss": 1.3857, "step": 90},
    {"epoch": 0.20553359683794467, "grad_norm": 0.6917213797569275, "learning_rate": 9.471319793817426e-05, "loss": 1.6815, "step": 91},
    {"epoch": 0.2077922077922078, "grad_norm": 1.099507451057434, "learning_rate": 9.454168949330645e-05, "loss": 1.4097, "step": 92},
    {"epoch": 0.2100508187464709, "grad_norm": 0.5121451616287231, "learning_rate": 9.436760375282859e-05, "loss": 1.4267, "step": 93},
    {"epoch": 0.21230942970073405, "grad_norm": 0.5218266248703003, "learning_rate": 9.419095078978506e-05, "loss": 1.2902, "step": 94},
    {"epoch": 0.21456804065499718, "grad_norm": 0.5271095633506775, "learning_rate": 9.40117408257663e-05, "loss": 1.402, "step": 95},
    {"epoch": 0.21682665160926032, "grad_norm": 0.5236210227012634, "learning_rate": 9.382998423031727e-05, "loss": 1.7724, "step": 96},
    {"epoch": 0.21908526256352343, "grad_norm": 0.5875529646873474, "learning_rate": 9.364569152033756e-05, "loss": 1.7372, "step": 97},
    {"epoch": 0.22134387351778656, "grad_norm": 0.6892306208610535, "learning_rate": 9.345887335947281e-05, "loss": 1.6303, "step": 98},
    {"epoch": 0.2236024844720497, "grad_norm": 0.5231241583824158, "learning_rate": 9.326954055749767e-05, "loss": 1.618, "step": 99},
    {"epoch": 0.22586109542631283, "grad_norm": 0.6638787388801575, "learning_rate": 9.30777040696903e-05, "loss": 1.1699, "step": 100},
    {"epoch": 0.22811970638057594, "grad_norm": 0.604011058807373, "learning_rate": 9.288337499619857e-05, "loss": 1.3338, "step": 101},
    {"epoch": 0.23037831733483907, "grad_norm": 0.5968809127807617, "learning_rate": 9.268656458139762e-05, "loss": 1.4609, "step": 102},
    {"epoch": 0.2326369282891022, "grad_norm": 0.5389347672462463, "learning_rate": 9.248728421323941e-05, "loss": 1.5215, "step": 103},
    {"epoch": 0.23489553924336534, "grad_norm": 0.712431013584137, "learning_rate": 9.22855454225936e-05, "loss": 1.0865, "step": 104},
    {"epoch": 0.23715415019762845, "grad_norm": 0.7650911808013916, "learning_rate": 9.208135988258051e-05, "loss": 1.478, "step": 105},
    {"epoch": 0.23941276115189158, "grad_norm": 0.9736595153808594, "learning_rate": 9.187473940789557e-05, "loss": 1.4201, "step": 106},
    {"epoch": 0.24167137210615472, "grad_norm": 0.703115701675415, "learning_rate": 9.166569595412575e-05, "loss": 1.3264, "step": 107},
    {"epoch": 0.24392998306041785, "grad_norm": 0.9081875085830688, "learning_rate": 9.145424161705776e-05, "loss": 1.3541, "step": 108},
    {"epoch": 0.24618859401468096, "grad_norm": 0.5371877551078796, "learning_rate": 9.124038863197818e-05, "loss": 1.4782, "step": 109},
    {"epoch": 0.2484472049689441, "grad_norm": 0.5240530967712402, "learning_rate": 9.10241493729654e-05, "loss": 1.5567, "step": 110},
    {"epoch": 0.25070581592320723, "grad_norm": 0.6024327874183655, "learning_rate": 9.08055363521738e-05, "loss": 1.565, "step": 111},
    {"epoch": 0.25296442687747034, "grad_norm": 0.6612941026687622, "learning_rate": 9.058456221910956e-05, "loss": 1.489, "step": 112},
    {"epoch": 0.2552230378317335, "grad_norm": 0.6889235377311707, "learning_rate": 9.036123975989892e-05, "loss": 1.4708, "step": 113},
    {"epoch": 0.2574816487859966, "grad_norm": 0.611516535282135, "learning_rate": 9.013558189654819e-05, "loss": 1.4802, "step": 114},
    {"epoch": 0.2597402597402597, "grad_norm": 0.5951672196388245, "learning_rate": 8.990760168619615e-05, "loss": 1.2703, "step": 115},
    {"epoch": 0.2619988706945229, "grad_norm": 0.670804500579834, "learning_rate": 8.967731232035847e-05, "loss": 1.8547, "step": 116},
    {"epoch": 0.264257481648786, "grad_norm": 0.4999150037765503, "learning_rate": 8.944472712416447e-05, "loss": 1.44, "step": 117},
    {"epoch": 0.26651609260304915, "grad_norm": 0.5740485191345215, "learning_rate": 8.9209859555586e-05, "loss": 1.2195, "step": 118},
    {"epoch": 0.26877470355731226, "grad_norm": 0.5336365103721619, "learning_rate": 8.897272320465887e-05, "loss": 1.4808, "step": 119},
    {"epoch": 0.27103331451157536, "grad_norm": 0.5398093461990356, "learning_rate": 8.873333179269635e-05, "loss": 1.4662, "step": 120},
    {"epoch": 0.2732919254658385, "grad_norm": 0.5843866467475891, "learning_rate": 8.849169917149531e-05, "loss": 1.6004, "step": 121},
    {"epoch": 0.27555053642010163, "grad_norm": 0.5099990963935852, "learning_rate": 8.82478393225347e-05, "loss": 1.6265, "step": 122},
    {"epoch": 0.27780914737436474, "grad_norm": 0.6847774386405945, "learning_rate": 8.800176635616657e-05, "loss": 1.7536, "step": 123},
    {"epoch": 0.2800677583286279, "grad_norm": 0.6712337732315063, "learning_rate": 8.775349451079948e-05, "loss": 1.3252, "step": 124},
    {"epoch": 0.282326369282891, "grad_norm": 0.7772756814956665, "learning_rate": 8.750303815207486e-05, "loss": 1.6567, "step": 125},
    {"epoch": 0.2845849802371542, "grad_norm": 0.6570346355438232, "learning_rate": 8.725041177203554e-05, "loss": 1.2232, "step": 126},
    {"epoch": 0.2868435911914173, "grad_norm": 0.6152611374855042, "learning_rate": 8.699562998828738e-05, "loss": 1.42, "step": 127},
    {"epoch": 0.2891022021456804, "grad_norm": 0.5262449979782104, "learning_rate": 8.673870754315336e-05, "loss": 1.3769, "step": 128},
    {"epoch": 0.29136081309994355, "grad_norm": 0.644555926322937, "learning_rate": 8.647965930282059e-05, "loss": 1.4069, "step": 129},
    {"epoch": 0.29361942405420666, "grad_norm": 0.74615079164505, "learning_rate": 8.621850025648009e-05, "loss": 1.2758, "step": 130},
    {"epoch": 0.29587803500846976, "grad_norm": 1.079114317893982, "learning_rate": 8.59552455154595e-05, "loss": 1.7415, "step": 131},
    {"epoch": 0.2981366459627329, "grad_norm": 0.522380530834198, "learning_rate": 8.56899103123487e-05, "loss": 1.3716, "step": 132},
    {"epoch": 0.30039525691699603, "grad_norm": 0.6177389621734619, "learning_rate": 8.54225100001184e-05, "loss": 1.2745, "step": 133},
    {"epoch": 0.3026538678712592, "grad_norm": 0.5246083736419678, "learning_rate": 8.51530600512318e-05, "loss": 1.4917, "step": 134},
    {"epoch": 0.3049124788255223, "grad_norm": 1.4917463064193726, "learning_rate": 8.488157605674925e-05, "loss": 1.2877, "step": 135},
    {"epoch": 0.3071710897797854, "grad_norm": 0.5848981738090515, "learning_rate": 8.460807372542618e-05, "loss": 1.2985, "step": 136},
    {"epoch": 0.3094297007340486, "grad_norm": 0.8174607157707214, "learning_rate": 8.43325688828042e-05, "loss": 1.183, "step": 137},
    {"epoch": 0.3116883116883117, "grad_norm": 0.467540442943573, "learning_rate": 8.405507747029523e-05, "loss": 1.3237, "step": 138},
    {"epoch": 0.31394692264257484, "grad_norm": 0.43935471773147583, "learning_rate": 8.377561554425922e-05, "loss": 1.1914, "step": 139},
    {"epoch": 0.31620553359683795, "grad_norm": 0.5460615754127502, "learning_rate": 8.349419927507505e-05, "loss": 1.474, "step": 140},
    {"epoch": 0.31846414455110106, "grad_norm": 0.5589008331298828, "learning_rate": 8.321084494620488e-05, "loss": 1.5669, "step": 141},
    {"epoch": 0.3207227555053642, "grad_norm": 0.5428789854049683, "learning_rate": 8.292556895325194e-05, "loss": 1.2317, "step": 142},
    {"epoch": 0.32298136645962733, "grad_norm": 0.5516197085380554, "learning_rate": 8.263838780301182e-05, "loss": 1.374, "step": 143},
    {"epoch": 0.32523997741389044, "grad_norm": 0.6941014528274536, "learning_rate": 8.234931811251739e-05, "loss": 1.1839, "step": 144},
    {"epoch": 0.3274985883681536, "grad_norm": 0.7662340402603149, "learning_rate": 8.205837660807725e-05, "loss": 1.6056, "step": 145},
    {"epoch": 0.3297571993224167, "grad_norm": 0.6064298748970032, "learning_rate": 8.176558012430791e-05, "loss": 1.2736, "step": 146},
    {"epoch": 0.33201581027667987, "grad_norm": 0.7128156423568726, "learning_rate": 8.147094560315977e-05, "loss": 1.156, "step": 147},
    {"epoch": 0.334274421230943, "grad_norm": 0.5706272721290588, "learning_rate": 8.117449009293668e-05, "loss": 1.3529, "step": 148},
    {"epoch": 0.3365330321852061, "grad_norm": 0.4535754919052124, "learning_rate": 8.08762307473096e-05, "loss": 1.2878, "step": 149},
    {"epoch": 0.33879164313946925, "grad_norm": 0.5142524242401123, "learning_rate": 8.057618482432399e-05, "loss": 1.4953, "step": 150},
    {"epoch": 0.34105025409373235, "grad_norm": 0.4920322895050049, "learning_rate": 8.027436968540123e-05, "loss": 1.4318, "step": 151},
    {"epoch": 0.34330886504799546, "grad_norm": 0.5119665861129761, "learning_rate": 7.997080279433402e-05, "loss": 1.2697, "step": 152},
    {"epoch": 0.3455674760022586, "grad_norm": 0.6550598740577698, "learning_rate": 7.966550171627592e-05, "loss": 1.3713, "step": 153},
    {"epoch": 0.34782608695652173, "grad_norm": 0.47998011112213135, "learning_rate": 7.9358484116725e-05, "loss": 1.3111, "step": 154},
    {"epoch": 0.3500846979107849, "grad_norm": 0.535916268825531, "learning_rate": 7.904976776050156e-05, "loss": 1.5861, "step": 155},
    {"epoch": 0.352343308865048, "grad_norm": 0.4098435640335083, "learning_rate": 7.873937051072035e-05, "loss": 1.1488, "step": 156},
    {"epoch": 0.3546019198193111, "grad_norm": 0.6115076541900635, "learning_rate": 7.842731032775687e-05, "loss": 1.4248, "step": 157},
    {"epoch": 0.35686053077357427, "grad_norm": 0.6232655644416809, "learning_rate": 7.81136052682082e-05, "loss": 1.3005, "step": 158},
    {"epoch": 0.3591191417278374, "grad_norm": 0.9366422295570374, "learning_rate": 7.779827348384813e-05, "loss": 1.6395, "step": 159},
    {"epoch": 0.3613777526821005, "grad_norm": 0.709642231464386, "learning_rate": 7.748133322057693e-05, "loss": 1.5304, "step": 160},
    {"epoch": 0.36363636363636365, "grad_norm": 0.5911598205566406, "learning_rate": 7.716280281736551e-05, "loss": 1.6141, "step": 161},
    {"epoch": 0.36589497459062675, "grad_norm": 0.45050862431526184, "learning_rate": 7.68427007051944e-05, "loss": 1.3448, "step": 162},
    {"epoch": 0.3681535855448899, "grad_norm": 0.5454220771789551, "learning_rate": 7.652104540598712e-05, "loss": 1.2553, "step": 163},
    {"epoch": 0.370412196499153, "grad_norm": 0.5132555365562439, "learning_rate": 7.619785553153864e-05, "loss": 1.4322, "step": 164},
    {"epoch": 0.37267080745341613, "grad_norm": 0.48642510175704956, "learning_rate": 7.58731497824383e-05, "loss": 1.2141, "step": 165},
    {"epoch": 0.3749294184076793, "grad_norm": 0.6442435383796692, "learning_rate": 7.554694694698784e-05, "loss": 1.3274, "step": 166},
    {"epoch": 0.3771880293619424, "grad_norm": 0.6116953492164612, "learning_rate": 7.521926590011418e-05, "loss": 1.5006, "step": 167},
    {"epoch": 0.3794466403162055, "grad_norm": 3.9714138507843018, "learning_rate": 7.489012560227742e-05, "loss": 1.193, "step": 168},
    {"epoch": 0.38170525127046867, "grad_norm": 0.5512414574623108, "learning_rate": 7.455954509837352e-05, "loss": 1.4702, "step": 169},
    {"epoch": 0.3839638622247318, "grad_norm": 0.5597122311592102, "learning_rate": 7.422754351663252e-05, "loss": 1.4636, "step": 170},
    {"epoch": 0.38622247317899494, "grad_norm": 0.4637336730957031, "learning_rate": 7.389414006751158e-05, "loss": 1.4661, "step": 171},
    {"epoch": 0.38848108413325805, "grad_norm": 1.08772873878479, "learning_rate": 7.355935404258354e-05, "loss": 1.4758, "step": 172},
    {"epoch": 0.39073969508752115, "grad_norm": 0.537563681602478, "learning_rate": 7.322320481342054e-05, "loss": 1.4603, "step": 173},
    {"epoch": 0.3929983060417843, "grad_norm": 0.6120302677154541, "learning_rate": 7.288571183047322e-05, "loss": 1.1336, "step": 174},
    {"epoch": 0.3952569169960474, "grad_norm": 0.5834792852401733, "learning_rate": 7.254689462194522e-05, "loss": 1.1359, "step": 175},
    {"epoch": 0.39751552795031053, "grad_norm": 0.5482034683227539, "learning_rate": 7.220677279266327e-05, "loss": 1.416, "step": 176},
    {"epoch": 0.3997741389045737, "grad_norm": 0.6085582375526428, "learning_rate": 7.186536602294278e-05, "loss": 1.5104, "step": 177},
    {"epoch": 0.4020327498588368, "grad_norm": 0.5022709369659424, "learning_rate": 7.152269406744903e-05, "loss": 1.8106, "step": 178},
    {"epoch": 0.40429136081309996, "grad_norm": 0.6673846244812012, "learning_rate": 7.117877675405427e-05, "loss": 1.4334, "step": 179},
    {"epoch": 0.40654997176736307, "grad_norm": 0.6298004388809204, "learning_rate": 7.083363398269022e-05, "loss": 1.4593, "step": 180},
    {"epoch": 0.4088085827216262, "grad_norm": 0.5097036361694336, "learning_rate": 7.04872857241968e-05, "loss": 1.3969, "step": 181},
    {"epoch": 0.41106719367588934, "grad_norm": 0.5180112719535828, "learning_rate": 7.013975201916648e-05, "loss": 1.7094, "step": 182},
    {"epoch": 0.41332580463015245, "grad_norm": 0.803783655166626, "learning_rate": 6.979105297678462e-05, "loss": 1.4688, "step": 183},
    {"epoch": 0.4155844155844156, "grad_norm": 0.5442876219749451, "learning_rate": 6.944120877366604e-05, "loss": 1.5643, "step": 184},
    {"epoch": 0.4178430265386787, "grad_norm": 0.6098489165306091, "learning_rate": 6.909023965268746e-05, "loss": 1.3152, "step": 185},
    {"epoch": 0.4201016374929418, "grad_norm": 0.8846858143806458, "learning_rate": 6.873816592181617e-05, "loss": 1.1858, "step": 186},
    {"epoch": 0.422360248447205, "grad_norm": 0.5681151747703552, "learning_rate": 6.838500795293505e-05, "loss": 1.3645, "step": 187},
    {"epoch": 0.4246188594014681, "grad_norm": 0.5397717356681824, "learning_rate": 6.803078618066378e-05, "loss": 1.4444, "step": 188},
    {"epoch": 0.4268774703557312, "grad_norm": 0.5300354361534119, "learning_rate": 6.767552110117631e-05, "loss": 1.4324, "step": 189},
    {"epoch": 0.42913608130999437, "grad_norm": 0.4856337010860443, "learning_rate": 6.73192332710151e-05, "loss": 1.3462, "step": 190},
    {"epoch": 0.4313946922642575, "grad_norm": 0.6010419130325317, "learning_rate": 6.696194330590151e-05, "loss": 1.6966, "step": 191},
    {"epoch": 0.43365330321852064, "grad_norm": 0.5775107145309448, "learning_rate": 6.660367187954304e-05, "loss": 1.4229, "step": 192},
    {"epoch": 0.43591191417278374, "grad_norm": 0.6146912574768066, "learning_rate": 6.624443972243698e-05, "loss": 1.2737, "step": 193},
    {"epoch": 0.43817052512704685, "grad_norm": 1.4060486555099487, "learning_rate": 6.5884267620671e-05, "loss": 1.2221, "step": 194},
    {"epoch": 0.44042913608131, "grad_norm": 1.0087642669677734, "learning_rate": 6.552317641472026e-05, "loss": 1.504, "step": 195},
    {"epoch": 0.4426877470355731, "grad_norm": 0.4778720438480377, "learning_rate": 6.516118699824178e-05, "loss": 1.2763, "step": 196},
    {"epoch": 0.4449463579898362, "grad_norm": 0.5498337745666504, "learning_rate": 6.479832031686521e-05, "loss": 1.4852, "step": 197},
    {"epoch": 0.4472049689440994, "grad_norm": 0.5609453916549683, "learning_rate": 6.443459736698105e-05, "loss": 1.6017, "step": 198},
    {"epoch": 0.4494635798983625, "grad_norm": 0.5456724166870117, "learning_rate": 6.407003919452564e-05, "loss": 1.2885, "step": 199},
    {"epoch": 0.45172219085262566, "grad_norm": 0.7034818530082703, "learning_rate": 6.370466689376342e-05, "loss": 1.4856, "step": 200},
    {"epoch": 0.45398080180688877, "grad_norm": 0.5647463202476501, "learning_rate": 6.33385016060664e-05, "loss": 1.745, "step": 201},
    {"epoch": 0.4562394127611519, "grad_norm": 0.5447326302528381, "learning_rate": 6.297156451869082e-05, "loss": 1.407, "step": 202},
    {"epoch": 0.45849802371541504, "grad_norm": 0.5139984488487244, "learning_rate": 6.260387686355121e-05, "loss": 1.3284, "step": 203},
    {"epoch": 0.46075663466967814, "grad_norm": 0.44981297850608826, "learning_rate": 6.223545991599184e-05, "loss": 1.4727, "step": 204},
    {"epoch": 0.46301524562394125, "grad_norm": 0.508630096912384, "learning_rate": 6.186633499355576e-05, "loss": 1.6505, "step": 205},
    {"epoch": 0.4652738565782044, "grad_norm": 0.5300900340080261, "learning_rate": 6.149652345475118e-05, "loss": 1.3005, "step": 206},
    {"epoch": 0.4675324675324675, "grad_norm": 0.8527065515518188, "learning_rate": 6.112604669781572e-05, "loss": 1.4615, "step": 207},
    {"epoch": 0.4697910784867307, "grad_norm": 0.6106131672859192, "learning_rate": 6.075492615947823e-05, "loss": 1.3441, "step": 208},
    {"epoch": 0.4720496894409938, "grad_norm": 0.4431338608264923, "learning_rate": 6.038318331371836e-05, "loss": 1.5369, "step": 209},
    {"epoch": 0.4743083003952569, "grad_norm": 0.6845422983169556, "learning_rate": 6.001083967052408e-05, "loss": 1.6926, "step": 210},
    {"epoch": 0.47656691134952006, "grad_norm": 0.48047032952308655, "learning_rate": 5.963791677464696e-05, "loss": 1.7314, "step": 211},
    {"epoch": 0.47882552230378317, "grad_norm": 0.7335491180419922, "learning_rate": 5.9264436204355724e-05, "loss": 1.2773, "step": 212},
    {"epoch": 0.4810841332580463, "grad_norm": 0.7190502285957336, "learning_rate": 5.889041957018745e-05, "loss": 1.2519, "step": 213},
    {"epoch": 0.48334274421230944, "grad_norm": 0.547556459903717, "learning_rate": 5.85158885136973e-05, "loss": 1.1781, "step": 214},
    {"epoch": 0.48560135516657255, "grad_norm": 0.6162766814231873, "learning_rate": 5.81408647062062e-05, "loss": 1.4605, "step": 215},
    {"epoch": 0.4878599661208357, "grad_norm": 0.7362288236618042, "learning_rate": 5.7765369847546916e-05, "loss": 1.5264, "step": 216},
    {"epoch": 0.4901185770750988, "grad_norm": 0.8551615476608276, "learning_rate": 5.7389425664808396e-05, "loss": 1.7111, "step": 217},
    {"epoch": 0.4923771880293619, "grad_norm": 0.5673311948776245, "learning_rate": 5.7013053911078677e-05, "loss": 1.4621, "step": 218},
    {"epoch": 0.4946357989836251, "grad_norm": 0.4883188307285309, "learning_rate": 5.6636276364186105e-05, "loss": 1.4759, "step": 219},
    {"epoch": 0.4968944099378882, "grad_norm": 0.5014625191688538, "learning_rate": 5.6259114825439275e-05, "loss": 1.5336, "step": 220},
    {"epoch": 0.4991530208921513, "grad_norm": 0.5976287722587585, "learning_rate": 5.588159111836553e-05, "loss": 1.3577, "step": 221},
    {"epoch": 0.5014116318464145, "grad_norm": 0.5646213889122009, "learning_rate": 5.550372708744815e-05, "loss": 1.4191, "step": 222},
    {"epoch": 0.5014116318464145, "eval_loss": 1.4101738929748535, "eval_runtime": 96.1988, "eval_samples_per_second": 7.755, "eval_steps_per_second": 0.977, "step": 222},
    {"epoch": 0.5036702428006776, "grad_norm": 0.48547643423080444, "learning_rate": 5.51255445968625e-05, "loss": 1.4118, "step": 223},
    {"epoch": 0.5059288537549407, "grad_norm": 0.7442102432250977, "learning_rate": 5.4747065529210736e-05, "loss": 1.3571, "step": 224},
    {"epoch": 0.5081874647092038, "grad_norm": 0.6088778376579285, "learning_rate": 5.436831178425582e-05, "loss": 1.3432, "step": 225},
    {"epoch": 0.510446075663467, "grad_norm": 0.8041794896125793, "learning_rate": 5.3989305277654156e-05, "loss": 1.4741, "step": 226},
    {"epoch": 0.51270468661773, "grad_norm": 0.47421059012413025, "learning_rate": 5.361006793968764e-05, "loss": 1.2545, "step": 227},
    {"epoch": 0.5149632975719932, "grad_norm": 0.6154189109802246, "learning_rate": 5.32306217139946e-05, "loss": 1.4776, "step": 228},
    {"epoch": 0.5172219085262564, "grad_norm": 0.5990487337112427, "learning_rate": 5.28509885563002e-05, "loss": 1.2825, "step": 229},
    {"epoch": 0.5194805194805194, "grad_norm": 0.6563726663589478, "learning_rate": 5.247119043314592e-05, "loss": 1.4692, "step": 230},
    {"epoch": 0.5217391304347826, "grad_norm": 0.5759412050247192, "learning_rate": 5.209124932061862e-05, "loss": 1.599, "step": 231},
    {"epoch": 0.5239977413890458, "grad_norm": 0.5122321248054504, "learning_rate": 5.1711187203078824e-05, "loss": 1.4495, "step": 232},
    {"epoch": 0.5262563523433089, "grad_norm": 0.5724780559539795, "learning_rate": 5.133102607188874e-05, "loss": 1.4343, "step": 233},
    {"epoch": 0.528514963297572, "grad_norm": 0.6793734431266785, "learning_rate": 5.0950787924139764e-05, "loss": 1.2671, "step": 234},
    {"epoch": 0.5307735742518351, "grad_norm": 0.5128283500671387, "learning_rate": 5.057049476137967e-05, "loss": 1.1848, "step": 235},
    {"epoch": 0.5330321852060983, "grad_norm": 0.5823889970779419, "learning_rate": 5.0190168588339536e-05, "loss": 1.2724, "step": 236},
    {"epoch": 0.5352907961603613, "grad_norm": 0.41200748085975647, "learning_rate": 4.9809831411660476e-05, "loss": 1.3234, "step": 237},
    {"epoch": 0.5375494071146245, "grad_norm": 0.5784906148910522, "learning_rate": 4.942950523862033e-05, "loss": 1.4, "step": 238},
    {"epoch": 0.5398080180688877, "grad_norm": 0.9113706946372986, "learning_rate": 4.904921207586024e-05, "loss": 1.3569, "step": 239},
    {"epoch": 0.5420666290231507, "grad_norm": 0.5041924118995667, "learning_rate": 4.866897392811126e-05, "loss": 1.1772, "step": 240},
    {"epoch": 0.5443252399774139, "grad_norm": 0.7096860408782959, "learning_rate": 4.828881279692119e-05, "loss": 1.6993, "step": 241},
    {"epoch": 0.546583850931677, "grad_norm": 0.4959520101547241, "learning_rate": 4.7908750679381384e-05, "loss": 1.6176, "step": 242},
    {"epoch": 0.5488424618859401, "grad_norm": 0.5700163841247559, "learning_rate": 4.752880956685407e-05, "loss": 1.0817, "step": 243},
    {"epoch": 0.5511010728402033, "grad_norm": 0.5470147132873535, "learning_rate": 4.7149011443699814e-05, "loss": 1.4854, "step": 244},
    {"epoch": 0.5533596837944664, "grad_norm": 0.7049497961997986, "learning_rate": 4.676937828600542e-05, "loss": 1.4135, "step": 245},
    {"epoch": 0.5556182947487295, "grad_norm": 0.7252426743507385, "learning_rate": 4.638993206031237e-05, "loss": 1.4154, "step": 246},
    {"epoch": 0.5578769057029926, "grad_norm": 0.575930118560791, "learning_rate": 4.601069472234584e-05, "loss": 1.0767, "step": 247},
    {"epoch": 0.5601355166572558, "grad_norm": 1.1477127075195312, "learning_rate": 4.56316882157442e-05, "loss": 1.332, "step": 248},
    {"epoch": 0.562394127611519, "grad_norm": 0.5191376209259033, "learning_rate": 4.525293447078927e-05, "loss": 1.7038, "step": 249},
    {"epoch": 0.564652738565782, "grad_norm": 0.554302990436554, "learning_rate": 4.4874455403137514e-05, "loss": 1.7356, "step": 250},
    {"epoch": 0.5669113495200452, "grad_norm": 1.1342941522598267, "learning_rate": 4.449627291255184e-05, "loss": 1.6412, "step": 251},
    {"epoch": 0.5691699604743083, "grad_norm": 0.5467344522476196, "learning_rate": 4.411840888163449e-05, "loss": 1.3031, "step": 252},
    {"epoch": 0.5714285714285714, "grad_norm": 0.4619009494781494, "learning_rate": 4.3740885174560736e-05, "loss": 1.4958, "step": 253},
    {"epoch": 0.5736871823828346, "grad_norm": 0.6281998157501221, "learning_rate": 4.336372363581391e-05, "loss": 1.4351, "step": 254},
    {"epoch": 0.5759457933370977, "grad_norm": 0.6288211345672607, "learning_rate": 4.298694608892134e-05, "loss": 1.7786, "step": 255},
    {"epoch": 0.5782044042913608, "grad_norm": 1.0841307640075684, "learning_rate": 4.2610574335191615e-05, "loss": 1.4286, "step": 256},
    {"epoch": 0.5804630152456239, "grad_norm": 0.7780132293701172, "learning_rate": 4.2234630152453116e-05, "loss": 1.385, "step": 257},
    {"epoch": 0.5827216261998871, "grad_norm": 0.49743711948394775, "learning_rate": 4.185913529379381e-05, "loss": 1.1707, "step": 258},
    {"epoch": 0.5849802371541502, "grad_norm": 0.7499518394470215, "learning_rate": 4.1484111486302704e-05, "loss": 1.2754, "step": 259},
    {"epoch": 0.5872388481084133, "grad_norm": 0.5472561717033386, "learning_rate": 4.110958042981255e-05, "loss": 1.4632, "step": 260},
    {"epoch": 0.5894974590626765, "grad_norm": 0.6489593386650085, "learning_rate": 4.0735563795644294e-05, "loss": 1.5454, "step": 261},
    {"epoch": 0.5917560700169395, "grad_norm": 0.5738951563835144, "learning_rate": 4.0362083225353046e-05, "loss": 1.4356, "step": 262},
    {"epoch": 0.5940146809712027, "grad_norm": 0.4810039699077606, "learning_rate": 3.998916032947594e-05, "loss": 1.3287, "step": 263},
    {"epoch": 0.5962732919254659, "grad_norm": 0.6687359809875488, "learning_rate": 3.961681668628164e-05, "loss": 1.2772, "step": 264},
    {"epoch": 0.598531902879729, "grad_norm": 0.5497633814811707, "learning_rate": 3.9245073840521765e-05, "loss": 1.4254, "step": 265},
    {"epoch": 0.6007905138339921, "grad_norm": 0.5534753203392029, "learning_rate": 3.887395330218429e-05, "loss": 1.3866, "step": 266},
    {"epoch": 0.6030491247882552, "grad_norm": 1.2714121341705322, "learning_rate": 3.850347654524883e-05, "loss": 1.4241, "step": 267},
    {"epoch": 0.6053077357425184, "grad_norm": 0.6475468277931213, "learning_rate": 3.8133665006444255e-05, "loss": 1.2423, "step": 268},
    {"epoch": 0.6075663466967814, "grad_norm": 0.6580026745796204, "learning_rate": 3.776454008400816e-05, "loss": 1.7248, "step": 269},
    {"epoch": 0.6098249576510446, "grad_norm": 0.47581014037132263, "learning_rate": 3.7396123136448824e-05, "loss": 1.5206, "step": 270},
    {"epoch": 0.6120835686053078, "grad_norm": 0.6214271783828735, "learning_rate": 3.70284354813092e-05, "loss": 1.3408, "step": 271},
    {"epoch": 0.6143421795595708, "grad_norm": 0.6450148224830627, "learning_rate": 3.666149839393361e-05, "loss": 1.6774, "step": 272},
    {"epoch": 0.616600790513834, "grad_norm": 0.4316481947898865, "learning_rate": 3.629533310623658e-05, "loss": 1.0687, "step": 273},
    {"epoch": 0.6188594014680971, "grad_norm": 0.6752032041549683, "learning_rate": 3.592996080547438e-05, "loss": 1.7115, "step": 274},
    {"epoch": 0.6211180124223602, "grad_norm": 0.6112974882125854, "learning_rate": 3.556540263301896e-05, "loss": 1.5235, "step": 275},
    {"epoch": 0.6233766233766234, "grad_norm": 0.5716705322265625, "learning_rate": 3.520167968313479e-05, "loss": 1.4077, "step": 276},
    {"epoch": 0.6256352343308865, "grad_norm": 0.443218469619751, "learning_rate": 3.483881300175823e-05, "loss": 1.5901, "step": 277},
    {"epoch": 0.6278938452851497, "grad_norm": 0.6423050761222839, "learning_rate": 3.447682358527974e-05, "loss": 1.2472, "step": 278},
    {"epoch": 0.6301524562394127, "grad_norm": 0.6637226939201355, "learning_rate": 3.411573237932904e-05, "loss": 1.416, "step": 279},
    {"epoch": 0.6324110671936759, "grad_norm": 0.6266383528709412, "learning_rate": 3.3755560277563023e-05, "loss": 1.3596, "step": 280},
    {"epoch": 0.6346696781479391, "grad_norm": 0.4592457711696625, "learning_rate": 3.339632812045696e-05, "loss": 1.4775, "step": 281},
    {"epoch": 0.6369282891022021, "grad_norm": 0.4435870945453644, "learning_rate": 3.303805669409848e-05, "loss": 1.2606, "step": 282},
    {"epoch": 0.6391869000564653, "grad_norm": 0.8418070673942566, "learning_rate": 3.268076672898492e-05, "loss": 1.292, "step": 283},
    {"epoch": 0.6414455110107284, "grad_norm": 0.46599528193473816, "learning_rate": 3.2324478898823705e-05, "loss": 1.4636, "step": 284},
    {"epoch": 0.6437041219649915, "grad_norm": 0.5560009479522705, "learning_rate": 3.196921381933624e-05, "loss": 1.3654, "step": 285},
    {"epoch": 0.6459627329192547, "grad_norm": 0.5951889753341675, "learning_rate": 3.1614992047064945e-05, "loss": 1.2783, "step": 286},
    {"epoch": 0.6482213438735178, "grad_norm": 0.5692819952964783, "learning_rate": 3.126183407818384e-05, "loss": 1.3873, "step": 287},
    {"epoch": 0.6504799548277809, "grad_norm": 0.7618367671966553, "learning_rate": 3.090976034731257e-05, "loss": 1.3041, "step": 288},
    {"epoch": 0.652738565782044, "grad_norm": 0.6365513205528259, "learning_rate": 3.055879122633397e-05, "loss": 1.3736, "step": 289},
    {"epoch": 0.6549971767363072, "grad_norm": 0.6634312272071838, "learning_rate": 3.020894702321539e-05, "loss": 1.5358, "step": 290},
    {"epoch": 0.6572557876905702, "grad_norm": 0.4850894510746002, "learning_rate": 2.9860247980833532e-05, "loss": 1.2746, "step": 291},
    {"epoch": 0.6595143986448334, "grad_norm": 0.6311604976654053, "learning_rate": 2.951271427580321e-05, "loss": 1.3607, "step": 292},
    {"epoch": 0.6617730095990966, "grad_norm": 0.6195625066757202, "learning_rate": 2.91663660173098e-05, "loss": 1.3412, "step": 293},
    {"epoch": 0.6640316205533597, "grad_norm": 0.6908369660377502, "learning_rate": 2.882122324594575e-05, "loss": 1.5332, "step": 294},
    {"epoch": 0.6662902315076228, "grad_norm": 0.9951730966567993, "learning_rate": 2.847730593255097e-05, "loss": 1.4562, "step": 295},
    {"epoch": 0.668548842461886, "grad_norm": 0.4664572775363922, "learning_rate": 2.8134633977057235e-05, "loss": 1.608, "step": 296},
    {"epoch": 0.6708074534161491, "grad_norm": 0.5350282788276672, "learning_rate": 2.779322720733673e-05, "loss": 1.6665, "step": 297},
    {"epoch": 0.6730660643704122, "grad_norm": 0.4838089346885681, "learning_rate": 2.745310537805479e-05, "loss": 1.3432, "step": 298},
    {"epoch": 0.6753246753246753, "grad_norm": 0.5749621987342834, "learning_rate": 2.7114288169526793e-05, "loss": 1.5841, "step": 299},
    {"epoch": 0.6775832862789385, "grad_norm": 0.44221848249435425, "learning_rate": 2.6776795186579468e-05, "loss": 1.2465, "step": 300},
    {"epoch": 0.6798418972332015, "grad_norm": 0.5502551794052124, "learning_rate": 2.6440645957416484e-05, "loss": 1.7499, "step": 301},
    {"epoch": 0.6821005081874647, "grad_norm": 0.51673823595047, "learning_rate": 2.610585993248843e-05, "loss": 1.4352, "step": 302},
    {"epoch": 0.6843591191417279, "grad_norm": 0.5909664034843445, "learning_rate": 2.5772456483367497e-05, "loss": 1.2685, "step": 303},
    {"epoch": 0.6866177300959909, "grad_norm": 0.5207455158233643, "learning_rate": 2.5440454901626486e-05, "loss": 1.4422, "step": 304},
    {"epoch": 0.6888763410502541, "grad_norm": 0.6264783143997192, "learning_rate": 2.510987439772261e-05, "loss": 1.1306, "step": 305},
    {"epoch": 0.6911349520045172, "grad_norm": 0.5614155530929565, "learning_rate": 2.4780734099885833e-05, "loss": 1.9378, "step": 306},
    {"epoch": 0.6933935629587803, "grad_norm": 0.5099676251411438, "learning_rate": 2.4453053053012187e-05, "loss": 1.4991, "step": 307},
    {"epoch": 0.6956521739130435, "grad_norm": 0.6968526244163513, "learning_rate": 2.4126850217561698e-05, "loss": 1.5211, "step": 308},
    {"epoch": 0.6979107848673066, "grad_norm": 0.4605090320110321, "learning_rate": 2.3802144468461367e-05, "loss": 1.5865, "step": 309},
    {"epoch": 0.7001693958215698, "grad_norm": 0.8055828809738159, "learning_rate": 2.347895459401288e-05, "loss": 1.4216, "step": 310},
    {"epoch": 0.7024280067758328, "grad_norm": 0.5503789186477661, "learning_rate": 2.3157299294805613e-05, "loss": 1.378, "step": 311},
    {"epoch": 0.704686617730096, "grad_norm": 0.6064462661743164, "learning_rate": 2.2837197182634483e-05, "loss": 1.6254, "step": 312},
    {"epoch": 0.7069452286843592, "grad_norm": 0.7875195741653442, "learning_rate": 2.2518666779423074e-05, "loss": 1.459, "step": 313},
    {"epoch": 0.7092038396386222, "grad_norm": 0.5678854584693909, "learning_rate": 2.2201726516151882e-05, "loss": 1.4943, "step": 314},
    {"epoch": 0.7114624505928854, "grad_norm": 0.6135299801826477, "learning_rate": 2.1886394731791816e-05, "loss": 1.5494, "step": 315},
    {"epoch": 0.7137210615471485, "grad_norm": 0.49695029854774475, "learning_rate": 2.157268967224314e-05, "loss": 1.4126, "step": 316},
    {"epoch": 0.7159796725014116, "grad_norm": 0.5455291271209717, "learning_rate": 2.126062948927966e-05, "loss": 1.5505, "step": 317},
    {"epoch": 0.7182382834556748, "grad_norm": 0.6816684603691101, "learning_rate": 2.0950232239498446e-05, "loss": 1.1783, "step": 318},
    {"epoch": 0.7204968944099379, "grad_norm": 0.548568069934845, "learning_rate": 2.064151588327501e-05, "loss": 1.5532, "step": 319},
    {"epoch": 0.722755505364201, "grad_norm": 0.44020459055900574, "learning_rate": 2.0334498283724078e-05, "loss": 1.7366, "step": 320},
    {"epoch": 0.7250141163184641, "grad_norm": 0.5120103359222412, "learning_rate": 2.002919720566599e-05, "loss": 1.2849, "step": 321},
    {"epoch": 0.7272727272727273, "grad_norm": 1.1308592557907104, "learning_rate": 1.9725630314598782e-05, "loss": 1.3336, "step": 322},
    {"epoch": 0.7295313382269905, "grad_norm": 0.7465949058532715, "learning_rate": 1.9423815175676025e-05, "loss": 1.4263, "step": 323},
    {"epoch": 0.7317899491812535, "grad_norm": 0.6979694962501526, "learning_rate": 1.912376925269041e-05, "loss": 1.2292, "step": 324},
    {"epoch": 0.7340485601355167, "grad_norm": 0.6051463484764099, "learning_rate": 1.8825509907063327e-05, "loss": 1.6008, "step": 325},
    {"epoch": 0.7363071710897798, "grad_norm": 0.5719790458679199, "learning_rate": 1.8529054396840234e-05, "loss": 1.5861, "step": 326},
    {"epoch": 0.7385657820440429, "grad_norm": 0.4965897798538208, "learning_rate": 1.8234419875692105e-05, "loss": 1.9165, "step": 327},
    {"epoch": 0.740824392998306, "grad_norm": 0.6010226011276245, "learning_rate": 1.7941623391922772e-05, "loss": 1.2716, "step": 328},
    {"epoch": 0.7430830039525692, "grad_norm": 0.6560564637184143, "learning_rate": 1.7650681887482628e-05, "loss": 1.5902, "step": 329},
    {"epoch": 0.7453416149068323, "grad_norm": 0.7512021660804749, "learning_rate": 1.7361612196988174e-05, "loss": 1.4901, "step": 330},
    {"epoch": 0.7476002258610954, "grad_norm": 0.5139881372451782, "learning_rate": 1.7074431046748075e-05, "loss": 1.366, "step": 331},
    {"epoch": 0.7498588368153586, "grad_norm": 0.5443575978279114, "learning_rate": 1.678915505379513e-05, "loss": 1.2283, "step": 332},
    {"epoch": 0.7521174477696216, "grad_norm": 0.5868397951126099, "learning_rate": 1.650580072492496e-05, "loss": 1.396, "step": 333},
    {"epoch": 0.7543760587238848, "grad_norm": 0.5796262621879578, "learning_rate": 1.6224384455740788e-05, "loss": 1.522, "step": 334},
    {"epoch": 0.756634669678148, "grad_norm": 0.5410485863685608, "learning_rate": 1.5944922529704777e-05, "loss": 1.3733, "step": 335},
    {"epoch": 0.758893280632411, "grad_norm": 0.5935060977935791, "learning_rate": 1.5667431117195814e-05, "loss": 1.4833, "step": 336},
    {"epoch": 0.7611518915866742, "grad_norm": 0.6086990833282471, "learning_rate": 1.539192627457382e-05, "loss": 1.2748, "step": 337},
    {"epoch": 0.7634105025409373, "grad_norm": 0.9184845089912415, "learning_rate": 1.5118423943250771e-05, "loss": 1.5052, "step": 338},
    {"epoch": 0.7656691134952005, "grad_norm": 0.5554947853088379, "learning_rate": 1.4846939948768218e-05, "loss": 1.8114, "step": 339},
    {"epoch": 0.7679277244494636, "grad_norm": 0.5822046399116516, "learning_rate": 1.45774899998816e-05, "loss": 1.1491, "step": 340},
    {"epoch": 0.7701863354037267, "grad_norm": 0.5082180500030518, "learning_rate": 1.4310089687651301e-05, "loss": 1.2931, "step": 341},
    {"epoch": 0.7724449463579899, "grad_norm": 0.7087529897689819, "learning_rate": 1.40447544845405e-05, "loss": 1.4946, "step": 342},
    {"epoch": 0.7747035573122529, "grad_norm": 0.566061794757843, "learning_rate": 1.378149974351991e-05, "loss": 1.31, "step": 343},
    {"epoch": 0.7769621682665161, "grad_norm": 0.48755723237991333, "learning_rate": 1.3520340697179406e-05, "loss": 1.5299, "step": 344},
    {"epoch": 0.7792207792207793, "grad_norm": 0.6160857677459717, "learning_rate": 1.3261292456846647e-05, "loss": 1.468, "step": 345},
    {"epoch": 0.7814793901750423, "grad_norm": 0.9709001779556274, "learning_rate": 1.3004370011712624e-05, "loss": 1.3858, "step": 346},
    {"epoch": 0.7837380011293055, "grad_norm": 0.5348103046417236, "learning_rate": 1.2749588227964465e-05, "loss": 1.2159, "step": 347},
    {"epoch": 0.7859966120835686, "grad_norm": 0.6455698609352112, "learning_rate": 1.2496961847925153e-05, "loss": 1.4438, "step": 348},
    {"epoch": 0.7882552230378317, "grad_norm": 0.7605583071708679, "learning_rate": 1.2246505489200532e-05, "loss": 1.2351, "step": 349},
    {"epoch": 0.7905138339920948, "grad_norm": 0.6536300182342529, "learning_rate": 1.1998233643833457e-05, "loss": 1.3756, "step": 350},
    {"epoch": 0.792772444946358, "grad_norm": 0.5800183415412903, "learning_rate": 1.1752160677465286e-05, "loss": 1.3849, "step": 351},
    {"epoch": 0.7950310559006211, "grad_norm": 0.5287428498268127, "learning_rate": 1.150830082850468e-05, "loss": 1.4647, "step": 352},
    {"epoch": 0.7972896668548842, "grad_norm": 0.4410305619239807, "learning_rate": 1.126666820730366e-05, "loss": 1.3785, "step": 353},
    {"epoch": 0.7995482778091474, "grad_norm": 0.4867897927761078, "learning_rate": 1.1027276795341135e-05, "loss": 1.4087, "step": 354},
    {"epoch": 0.8018068887634106, "grad_norm": 0.7169702053070068, "learning_rate": 1.0790140444414e-05, "loss": 1.1061, "step": 355},
    {"epoch": 0.8040654997176736, "grad_norm": 1.4617806673049927, "learning_rate": 1.0555272875835537e-05, "loss": 1.3367, "step": 356},
    {"epoch": 0.8063241106719368, "grad_norm": 0.37601158022880554, "learning_rate": 1.0322687679641523e-05, "loss": 1.3613, "step": 357},
    {"epoch": 0.8085827216261999, "grad_norm": 0.6595029830932617, "learning_rate": 1.0092398313803863e-05, "loss": 1.6219, "step": 358},
    {"epoch": 0.810841332580463, "grad_norm": 0.5519759058952332, "learning_rate": 9.864418103451828e-06, "loss": 1.5668, "step": 359},
    {"epoch": 0.8130999435347261, "grad_norm": 0.5467275977134705, "learning_rate": 9.638760240101102e-06, "loss": 1.2981, "step": 360},
    {"epoch": 0.8153585544889893, "grad_norm": 0.5513888597488403, "learning_rate": 9.415437780890451e-06, "loss": 1.319, "step": 361},
    {"epoch": 0.8176171654432524, "grad_norm": 0.6388846039772034, "learning_rate": 9.194463647826223e-06, "loss": 1.6887, "step": 362},
    {"epoch": 0.8198757763975155, "grad_norm": 0.6353502869606018, "learning_rate": 8.975850627034604e-06, "loss": 1.3972, "step": 363},
    {"epoch": 0.8221343873517787, "grad_norm": 0.6129746437072754, "learning_rate": 8.759611368021831e-06, "loss": 1.3317, "step": 364},
    {"epoch": 0.8243929983060417, "grad_norm": 0.6045680046081543, "learning_rate": 8.545758382942232e-06, "loss": 1.2371, "step": 365},
    {"epoch": 0.8266516092603049, "grad_norm": 0.5147614479064941, "learning_rate": 8.334304045874247e-06, "loss": 1.498, "step": 366},
    {"epoch": 0.8289102202145681, "grad_norm": 0.41877540946006775, "learning_rate": 8.125260592104445e-06, "loss": 1.4604, "step": 367},
    {"epoch": 0.8311688311688312, "grad_norm": 0.5674257278442383, "learning_rate": 7.918640117419507e-06, "loss": 1.2845,
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.8334274421230943, |
|
"grad_norm": 0.7951464056968689, |
|
"learning_rate": 7.71445457740641e-06, |
|
"loss": 1.3372, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.8356860530773574, |
|
"grad_norm": 0.6295720934867859, |
|
"learning_rate": 7.512715786760605e-06, |
|
"loss": 1.6711, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.8379446640316206, |
|
"grad_norm": 0.5587788224220276, |
|
"learning_rate": 7.313435418602388e-06, |
|
"loss": 1.4419, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.8402032749858837, |
|
"grad_norm": 0.8322708010673523, |
|
"learning_rate": 7.116625003801436e-06, |
|
"loss": 1.3559, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.8424618859401468, |
|
"grad_norm": 0.6834149956703186, |
|
"learning_rate": 6.922295930309691e-06, |
|
"loss": 1.5552, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.84472049689441, |
|
"grad_norm": 0.7518057823181152, |
|
"learning_rate": 6.730459442502329e-06, |
|
"loss": 1.5805, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.846979107848673, |
|
"grad_norm": 0.5238639712333679, |
|
"learning_rate": 6.541126640527195e-06, |
|
"loss": 1.6161, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.8492377188029362, |
|
"grad_norm": 0.5006189346313477, |
|
"learning_rate": 6.354308479662446e-06, |
|
"loss": 1.5392, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.8514963297571994, |
|
"grad_norm": 0.5511265397071838, |
|
"learning_rate": 6.170015769682741e-06, |
|
"loss": 1.6469, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.8537549407114624, |
|
"grad_norm": 0.5680528283119202, |
|
"learning_rate": 5.988259174233713e-06, |
|
"loss": 1.3312, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.8560135516657256, |
|
"grad_norm": 0.773024320602417, |
|
"learning_rate": 5.80904921021494e-06, |
|
"loss": 1.4814, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.8582721626199887, |
|
"grad_norm": 0.47784173488616943, |
|
"learning_rate": 5.6323962471714286e-06, |
|
"loss": 1.3431, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.8605307735742518, |
|
"grad_norm": 0.5493645668029785, |
|
"learning_rate": 5.458310506693571e-06, |
|
"loss": 1.5127, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.862789384528515, |
|
"grad_norm": 0.5539724826812744, |
|
"learning_rate": 5.286802061825752e-06, |
|
"loss": 1.4401, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.8650479954827781, |
|
"grad_norm": 0.4766329824924469, |
|
"learning_rate": 5.117880836483452e-06, |
|
"loss": 1.5344, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.8673066064370413, |
|
"grad_norm": 0.46693822741508484, |
|
"learning_rate": 4.951556604879048e-06, |
|
"loss": 1.3009, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 0.7873904705047607, |
|
"learning_rate": 4.7878389909562285e-06, |
|
"loss": 1.4413, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.8718238283455675, |
|
"grad_norm": 0.5490197539329529, |
|
"learning_rate": 4.62673746783317e-06, |
|
"loss": 1.2131, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.8740824392998306, |
|
"grad_norm": 0.4545706808567047, |
|
"learning_rate": 4.468261357254339e-06, |
|
"loss": 1.1735, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.8763410502540937, |
|
"grad_norm": 0.5005091428756714, |
|
"learning_rate": 4.312419829051173e-06, |
|
"loss": 1.5943, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.8785996612083569, |
|
"grad_norm": 0.546002984046936, |
|
"learning_rate": 4.15922190061146e-06, |
|
"loss": 1.3671, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.88085827216262, |
|
"grad_norm": 0.6055895686149597, |
|
"learning_rate": 4.008676436357539e-06, |
|
"loss": 1.4737, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.8831168831168831, |
|
"grad_norm": 0.7526850700378418, |
|
"learning_rate": 3.86079214723345e-06, |
|
"loss": 1.6545, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.8853754940711462, |
|
"grad_norm": 0.6517372727394104, |
|
"learning_rate": 3.7155775902008526e-06, |
|
"loss": 1.2377, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.8876341050254094, |
|
"grad_norm": 0.590649425983429, |
|
"learning_rate": 3.5730411677439125e-06, |
|
"loss": 1.5282, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.8898927159796725, |
|
"grad_norm": 0.7884031534194946, |
|
"learning_rate": 3.4331911273830784e-06, |
|
"loss": 1.2815, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.8921513269339356, |
|
"grad_norm": 0.4688451886177063, |
|
"learning_rate": 3.2960355611979245e-06, |
|
"loss": 1.2642, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.8944099378881988, |
|
"grad_norm": 0.5359352231025696, |
|
"learning_rate": 3.161582405358876e-06, |
|
"loss": 1.4675, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.8966685488424618, |
|
"grad_norm": 0.5633996725082397, |
|
"learning_rate": 3.029839439668003e-06, |
|
"loss": 1.5731, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.898927159796725, |
|
"grad_norm": 0.6047767400741577, |
|
"learning_rate": 2.9008142871088663e-06, |
|
"loss": 1.4189, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.9011857707509882, |
|
"grad_norm": 0.47979751229286194, |
|
"learning_rate": 2.7745144134054433e-06, |
|
"loss": 1.3879, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.9034443817052513, |
|
"grad_norm": 0.4920281171798706, |
|
"learning_rate": 2.6509471265901477e-06, |
|
"loss": 1.423, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9057029926595144, |
|
"grad_norm": 0.5504844784736633, |
|
"learning_rate": 2.530119576580936e-06, |
|
"loss": 1.3404, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.9079616036137775, |
|
"grad_norm": 0.4826345145702362, |
|
"learning_rate": 2.412038754767626e-06, |
|
"loss": 1.2597, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.9102202145680407, |
|
"grad_norm": 0.7517184615135193, |
|
"learning_rate": 2.296711493607334e-06, |
|
"loss": 1.3037, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.9124788255223037, |
|
"grad_norm": 0.6532472372055054, |
|
"learning_rate": 2.1841444662291543e-06, |
|
"loss": 1.4865, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.9147374364765669, |
|
"grad_norm": 0.5889557600021362, |
|
"learning_rate": 2.074344186048022e-06, |
|
"loss": 1.2332, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.9169960474308301, |
|
"grad_norm": 0.4623706638813019, |
|
"learning_rate": 1.967317006387831e-06, |
|
"loss": 1.5374, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.9192546583850931, |
|
"grad_norm": 0.6362307071685791, |
|
"learning_rate": 1.863069120113814e-06, |
|
"loss": 1.0415, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.9215132693393563, |
|
"grad_norm": 0.5328913927078247, |
|
"learning_rate": 1.7616065592742038e-06, |
|
"loss": 1.3665, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.9237718802936195, |
|
"grad_norm": 0.61063551902771, |
|
"learning_rate": 1.6629351947512195e-06, |
|
"loss": 1.4575, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.9260304912478825, |
|
"grad_norm": 0.5676048994064331, |
|
"learning_rate": 1.567060735921344e-06, |
|
"loss": 1.5069, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.9282891022021457, |
|
"grad_norm": 0.6710496544837952, |
|
"learning_rate": 1.4739887303249877e-06, |
|
"loss": 1.4816, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.9305477131564088, |
|
"grad_norm": 0.5965588092803955, |
|
"learning_rate": 1.383724563345451e-06, |
|
"loss": 1.6784, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.932806324110672, |
|
"grad_norm": 0.6151901483535767, |
|
"learning_rate": 1.2962734578973568e-06, |
|
"loss": 1.4018, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.935064935064935, |
|
"grad_norm": 0.6532612442970276, |
|
"learning_rate": 1.2116404741244203e-06, |
|
"loss": 1.415, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.9373235460191982, |
|
"grad_norm": 0.5151341557502747, |
|
"learning_rate": 1.1298305091066664e-06, |
|
"loss": 1.3436, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.9395821569734614, |
|
"grad_norm": 0.6843656301498413, |
|
"learning_rate": 1.0508482965770505e-06, |
|
"loss": 1.5971, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.9418407679277244, |
|
"grad_norm": 0.5822469592094421, |
|
"learning_rate": 9.746984066475729e-07, |
|
"loss": 1.3158, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.9440993788819876, |
|
"grad_norm": 0.8204763531684875, |
|
"learning_rate": 9.013852455448335e-07, |
|
"loss": 1.2683, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.9463579898362507, |
|
"grad_norm": 0.9668586254119873, |
|
"learning_rate": 8.309130553550815e-07, |
|
"loss": 1.4966, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.9486166007905138, |
|
"grad_norm": 0.3911222815513611, |
|
"learning_rate": 7.63285913778733e-07, |
|
"loss": 1.3486, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.950875211744777, |
|
"grad_norm": 0.5275930166244507, |
|
"learning_rate": 6.985077338944657e-07, |
|
"loss": 1.044, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.9531338226990401, |
|
"grad_norm": 0.5202364921569824, |
|
"learning_rate": 6.365822639327723e-07, |
|
"loss": 1.262, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.9553924336533032, |
|
"grad_norm": 0.5621113181114197, |
|
"learning_rate": 5.775130870590783e-07, |
|
"loss": 1.3838, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.9576510446075663, |
|
"grad_norm": 0.5562776923179626, |
|
"learning_rate": 5.213036211664191e-07, |
|
"loss": 1.4821, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.9599096555618295, |
|
"grad_norm": 0.5257914066314697, |
|
"learning_rate": 4.6795711867766436e-07, |
|
"loss": 1.6874, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.9621682665160926, |
|
"grad_norm": 0.7182251811027527, |
|
"learning_rate": 4.1747666635733597e-07, |
|
"loss": 1.3114, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.9644268774703557, |
|
"grad_norm": 0.7395084500312805, |
|
"learning_rate": 3.698651851329837e-07, |
|
"loss": 1.4869, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.9666854884246189, |
|
"grad_norm": 0.5292602777481079, |
|
"learning_rate": 3.251254299261874e-07, |
|
"loss": 1.6935, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.968944099378882, |
|
"grad_norm": 0.7703830599784851, |
|
"learning_rate": 2.8325998949314536e-07, |
|
"loss": 1.581, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.9712027103331451, |
|
"grad_norm": 0.4652191400527954, |
|
"learning_rate": 2.442712862748775e-07, |
|
"loss": 1.7907, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.9734613212874083, |
|
"grad_norm": 0.7090250849723816, |
|
"learning_rate": 2.0816157625706545e-07, |
|
"loss": 1.3354, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.9757199322416714, |
|
"grad_norm": 0.6182876825332642, |
|
"learning_rate": 1.749329488395124e-07, |
|
"loss": 1.6426, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.9779785431959345, |
|
"grad_norm": 0.42393407225608826, |
|
"learning_rate": 1.4458732671523977e-07, |
|
"loss": 1.2865, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.9802371541501976, |
|
"grad_norm": 0.7182570695877075, |
|
"learning_rate": 1.1712646575922637e-07, |
|
"loss": 1.3558, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.9824957651044608, |
|
"grad_norm": 0.5384182929992676, |
|
"learning_rate": 9.255195492685609e-08, |
|
"loss": 1.4792, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.9847543760587238, |
|
"grad_norm": 0.6862145662307739, |
|
"learning_rate": 7.086521616190279e-08, |
|
"loss": 1.4509, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.987012987012987, |
|
"grad_norm": 0.9429339170455933, |
|
"learning_rate": 5.2067504314323723e-08, |
|
"loss": 1.2235, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.9892715979672502, |
|
"grad_norm": 0.48626038432121277, |
|
"learning_rate": 3.6159907067601085e-08, |
|
"loss": 1.3711, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.9915302089215132, |
|
"grad_norm": 0.6334356665611267, |
|
"learning_rate": 2.3143344875831142e-08, |
|
"loss": 1.3192, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.9937888198757764, |
|
"grad_norm": 0.49483799934387207, |
|
"learning_rate": 1.3018570910466877e-08, |
|
"loss": 1.1924, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.9960474308300395, |
|
"grad_norm": 0.5049090385437012, |
|
"learning_rate": 5.786171016708419e-09, |
|
"loss": 1.5949, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.9983060417843026, |
|
"grad_norm": 0.6922277808189392, |
|
"learning_rate": 1.446563679641244e-09, |
|
"loss": 1.4329, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.0005646527385659, |
|
"grad_norm": 0.656385064125061, |
|
"learning_rate": 0.0, |
|
"loss": 1.5656, |
|
"step": 443 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 443, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.630732306700042e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|