{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.972522897585345,
  "eval_steps": 500,
  "global_step": 750,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.013322231473771857, "grad_norm": 5.780612066318371, "learning_rate": 1.3333333333333336e-07, "loss": 0.9209, "step": 2},
    {"epoch": 0.026644462947543714, "grad_norm": 5.803797516781435, "learning_rate": 2.666666666666667e-07, "loss": 0.9065, "step": 4},
    {"epoch": 0.03996669442131557, "grad_norm": 5.67259689729213, "learning_rate": 4.0000000000000003e-07, "loss": 0.9182, "step": 6},
    {"epoch": 0.05328892589508743, "grad_norm": 5.354376269147714, "learning_rate": 5.333333333333335e-07, "loss": 0.9233, "step": 8},
    {"epoch": 0.06661115736885928, "grad_norm": 4.851036441351623, "learning_rate": 6.666666666666667e-07, "loss": 0.8906, "step": 10},
    {"epoch": 0.07993338884263114, "grad_norm": 4.566190641845786, "learning_rate": 8.000000000000001e-07, "loss": 0.8719, "step": 12},
    {"epoch": 0.093255620316403, "grad_norm": 4.013324466824848, "learning_rate": 9.333333333333334e-07, "loss": 0.8861, "step": 14},
    {"epoch": 0.10657785179017486, "grad_norm": 3.540718772081854, "learning_rate": 1.066666666666667e-06, "loss": 0.8791, "step": 16},
    {"epoch": 0.11990008326394672, "grad_norm": 2.88258712818169, "learning_rate": 1.2000000000000002e-06, "loss": 0.8532, "step": 18},
    {"epoch": 0.13322231473771856, "grad_norm": 2.6937271254013613, "learning_rate": 1.3333333333333334e-06, "loss": 0.8455, "step": 20},
    {"epoch": 0.14654454621149043, "grad_norm": 2.3220586555688816, "learning_rate": 1.4666666666666669e-06, "loss": 0.8159, "step": 22},
    {"epoch": 0.15986677768526228, "grad_norm": 3.400225929248788, "learning_rate": 1.6000000000000001e-06, "loss": 0.8004, "step": 24},
    {"epoch": 0.17318900915903415, "grad_norm": 3.5504999091076845, "learning_rate": 1.7333333333333336e-06, "loss": 0.8084, "step": 26},
    {"epoch": 0.186511240632806, "grad_norm": 3.2668071978806057, "learning_rate": 1.8666666666666669e-06, "loss": 0.8086, "step": 28},
    {"epoch": 0.19983347210657784, "grad_norm": 2.4897231787044154, "learning_rate": 2.0000000000000003e-06, "loss": 0.7821, "step": 30},
    {"epoch": 0.21315570358034971, "grad_norm": 2.006655051652645, "learning_rate": 2.133333333333334e-06, "loss": 0.7767, "step": 32},
    {"epoch": 0.22647793505412156, "grad_norm": 1.8054118845570075, "learning_rate": 2.266666666666667e-06, "loss": 0.764, "step": 34},
    {"epoch": 0.23980016652789343, "grad_norm": 2.060437415012226, "learning_rate": 2.4000000000000003e-06, "loss": 0.7577, "step": 36},
    {"epoch": 0.2531223980016653, "grad_norm": 1.822445369621608, "learning_rate": 2.5333333333333338e-06, "loss": 0.7441, "step": 38},
    {"epoch": 0.2664446294754371, "grad_norm": 1.702085750109042, "learning_rate": 2.666666666666667e-06, "loss": 0.7364, "step": 40},
    {"epoch": 0.279766860949209, "grad_norm": 1.4905140822613037, "learning_rate": 2.8000000000000003e-06, "loss": 0.7252, "step": 42},
    {"epoch": 0.29308909242298087, "grad_norm": 1.602012507677594, "learning_rate": 2.9333333333333338e-06, "loss": 0.727, "step": 44},
    {"epoch": 0.3064113238967527, "grad_norm": 1.5462649172094083, "learning_rate": 3.066666666666667e-06, "loss": 0.7221, "step": 46},
    {"epoch": 0.31973355537052456, "grad_norm": 1.4593506403426082, "learning_rate": 3.2000000000000003e-06, "loss": 0.6922, "step": 48},
    {"epoch": 0.33305578684429643, "grad_norm": 1.42990322544597, "learning_rate": 3.3333333333333333e-06, "loss": 0.6873, "step": 50},
    {"epoch": 0.3463780183180683, "grad_norm": 1.4803196580352989, "learning_rate": 3.4666666666666672e-06, "loss": 0.6975, "step": 52},
    {"epoch": 0.3597002497918401, "grad_norm": 1.5152423594984497, "learning_rate": 3.6000000000000003e-06, "loss": 0.662, "step": 54},
    {"epoch": 0.373022481265612, "grad_norm": 1.4183767379176757, "learning_rate": 3.7333333333333337e-06, "loss": 0.6578, "step": 56},
    {"epoch": 0.38634471273938387, "grad_norm": 1.5361891229630817, "learning_rate": 3.866666666666667e-06, "loss": 0.6655, "step": 58},
    {"epoch": 0.3996669442131557, "grad_norm": 1.430106096852744, "learning_rate": 4.000000000000001e-06, "loss": 0.648, "step": 60},
    {"epoch": 0.41298917568692756, "grad_norm": 1.4201912298932542, "learning_rate": 4.133333333333333e-06, "loss": 0.6449, "step": 62},
    {"epoch": 0.42631140716069943, "grad_norm": 1.45821665687646, "learning_rate": 4.266666666666668e-06, "loss": 0.636, "step": 64},
    {"epoch": 0.43963363863447125, "grad_norm": 1.351061303304786, "learning_rate": 4.4e-06, "loss": 0.6287, "step": 66},
    {"epoch": 0.4529558701082431, "grad_norm": 1.4213406831821087, "learning_rate": 4.533333333333334e-06, "loss": 0.6162, "step": 68},
    {"epoch": 0.466278101582015, "grad_norm": 1.5920606382997864, "learning_rate": 4.666666666666667e-06, "loss": 0.6269, "step": 70},
    {"epoch": 0.47960033305578686, "grad_norm": 1.4259278183625448, "learning_rate": 4.800000000000001e-06, "loss": 0.628, "step": 72},
    {"epoch": 0.4929225645295587, "grad_norm": 1.4277383814389801, "learning_rate": 4.933333333333334e-06, "loss": 0.6047, "step": 74},
    {"epoch": 0.5062447960033306, "grad_norm": 1.4056841775145905, "learning_rate": 4.999972922944898e-06, "loss": 0.5984, "step": 76},
    {"epoch": 0.5195670274771024, "grad_norm": 1.3201694715565466, "learning_rate": 4.999756310023261e-06, "loss": 0.5954, "step": 78},
    {"epoch": 0.5328892589508742, "grad_norm": 1.3231909628376382, "learning_rate": 4.999323102948655e-06, "loss": 0.5954, "step": 80},
    {"epoch": 0.5462114904246461, "grad_norm": 1.3735284931415068, "learning_rate": 4.998673339256785e-06, "loss": 0.5744, "step": 82},
    {"epoch": 0.559533721898418, "grad_norm": 1.4504463513541146, "learning_rate": 4.997807075247147e-06, "loss": 0.593, "step": 84},
    {"epoch": 0.5728559533721899, "grad_norm": 1.2740174759395542, "learning_rate": 4.996724385978142e-06, "loss": 0.5903, "step": 86},
    {"epoch": 0.5861781848459617, "grad_norm": 1.3594170816449038, "learning_rate": 4.995425365260585e-06, "loss": 0.5748, "step": 88},
    {"epoch": 0.5995004163197336, "grad_norm": 1.3782163690261147, "learning_rate": 4.993910125649561e-06, "loss": 0.5814, "step": 90},
    {"epoch": 0.6128226477935054, "grad_norm": 1.2584130530987572, "learning_rate": 4.992178798434684e-06, "loss": 0.5752, "step": 92},
    {"epoch": 0.6261448792672772, "grad_norm": 1.4216349386698004, "learning_rate": 4.990231533628719e-06, "loss": 0.5757, "step": 94},
    {"epoch": 0.6394671107410491, "grad_norm": 1.3768977749050733, "learning_rate": 4.988068499954578e-06, "loss": 0.5555, "step": 96},
    {"epoch": 0.652789342214821, "grad_norm": 1.463407945745149, "learning_rate": 4.985689884830711e-06, "loss": 0.5591, "step": 98},
    {"epoch": 0.6661115736885929, "grad_norm": 1.3808427236512926, "learning_rate": 4.983095894354858e-06, "loss": 0.5588, "step": 100},
    {"epoch": 0.6794338051623647, "grad_norm": 1.482231013162315, "learning_rate": 4.980286753286196e-06, "loss": 0.5418, "step": 102},
    {"epoch": 0.6927560366361366, "grad_norm": 1.3778109634949367, "learning_rate": 4.97726270502586e-06, "loss": 0.5399, "step": 104},
    {"epoch": 0.7060782681099084, "grad_norm": 1.4002755485164502, "learning_rate": 4.974024011595864e-06, "loss": 0.5533, "step": 106},
    {"epoch": 0.7194004995836802, "grad_norm": 1.3296620938997752, "learning_rate": 4.970570953616383e-06, "loss": 0.5438, "step": 108},
    {"epoch": 0.7327227310574521, "grad_norm": 1.4458203791375825, "learning_rate": 4.966903830281449e-06, "loss": 0.5378, "step": 110},
    {"epoch": 0.746044962531224, "grad_norm": 1.5136526829998074, "learning_rate": 4.9630229593330226e-06, "loss": 0.5348, "step": 112},
    {"epoch": 0.7593671940049959, "grad_norm": 1.4362377777815807, "learning_rate": 4.958928677033465e-06, "loss": 0.5267, "step": 114},
    {"epoch": 0.7726894254787677, "grad_norm": 1.2730640176398647, "learning_rate": 4.954621338136399e-06, "loss": 0.5393, "step": 116},
    {"epoch": 0.7860116569525396, "grad_norm": 1.3685353603260022, "learning_rate": 4.95010131585597e-06, "loss": 0.534, "step": 118},
    {"epoch": 0.7993338884263114, "grad_norm": 1.2683696145515575, "learning_rate": 4.9453690018345144e-06, "loss": 0.527, "step": 120},
    {"epoch": 0.8126561199000832, "grad_norm": 1.323958192575613, "learning_rate": 4.940424806108619e-06, "loss": 0.5267, "step": 122},
    {"epoch": 0.8259783513738551, "grad_norm": 1.2428318596261736, "learning_rate": 4.935269157073597e-06, "loss": 0.5149, "step": 124},
    {"epoch": 0.839300582847627, "grad_norm": 1.27851729445364, "learning_rate": 4.9299025014463665e-06, "loss": 0.5228, "step": 126},
    {"epoch": 0.8526228143213989, "grad_norm": 1.2913119874277892, "learning_rate": 4.924325304226745e-06, "loss": 0.5028, "step": 128},
    {"epoch": 0.8659450457951707, "grad_norm": 1.3471089811240304, "learning_rate": 4.91853804865716e-06, "loss": 0.5402, "step": 130},
    {"epoch": 0.8792672772689425, "grad_norm": 1.3919989303105873, "learning_rate": 4.912541236180779e-06, "loss": 0.5208, "step": 132},
    {"epoch": 0.8925895087427144, "grad_norm": 1.336135856095439, "learning_rate": 4.9063353863980565e-06, "loss": 0.5232, "step": 134},
    {"epoch": 0.9059117402164862, "grad_norm": 1.535058182009125, "learning_rate": 4.899921037021719e-06, "loss": 0.5183, "step": 136},
    {"epoch": 0.9192339716902581, "grad_norm": 1.4366704774523757, "learning_rate": 4.893298743830168e-06, "loss": 0.5152, "step": 138},
    {"epoch": 0.93255620316403, "grad_norm": 1.4306647802429082, "learning_rate": 4.88646908061933e-06, "loss": 0.5241, "step": 140},
    {"epoch": 0.9458784346378019, "grad_norm": 1.3151003083587773, "learning_rate": 4.879432639152935e-06, "loss": 0.518, "step": 142},
    {"epoch": 0.9592006661115737, "grad_norm": 1.3682779135005043, "learning_rate": 4.8721900291112415e-06, "loss": 0.51, "step": 144},
    {"epoch": 0.9725228975853455, "grad_norm": 1.3896990341168534, "learning_rate": 4.864741878038218e-06, "loss": 0.5207, "step": 146},
    {"epoch": 0.9858451290591174, "grad_norm": 1.2929489978661655, "learning_rate": 4.857088831287158e-06, "loss": 0.5121, "step": 148},
    {"epoch": 0.9991673605328892, "grad_norm": 1.3614193317791738, "learning_rate": 4.849231551964771e-06, "loss": 0.5016, "step": 150},
    {"epoch": 1.0066611157368859, "grad_norm": 1.3229981405906006, "learning_rate": 4.841170720873723e-06, "loss": 0.2569, "step": 152},
    {"epoch": 1.0199833472106579, "grad_norm": 1.2274098346213043, "learning_rate": 4.832907036453647e-06, "loss": 0.4662, "step": 154},
    {"epoch": 1.0333055786844296, "grad_norm": 1.3810724651132364, "learning_rate": 4.824441214720629e-06, "loss": 0.4503, "step": 156},
    {"epoch": 1.0466278101582014, "grad_norm": 1.5094355408493076, "learning_rate": 4.815773989205165e-06, "loss": 0.4525, "step": 158},
    {"epoch": 1.0599500416319734, "grad_norm": 1.191750486186588, "learning_rate": 4.806906110888606e-06, "loss": 0.4548, "step": 160},
    {"epoch": 1.0732722731057451, "grad_norm": 1.2840884507072778, "learning_rate": 4.7978383481380865e-06, "loss": 0.4552, "step": 162},
    {"epoch": 1.0865945045795171, "grad_norm": 1.3818002604555029, "learning_rate": 4.788571486639948e-06, "loss": 0.452, "step": 164},
    {"epoch": 1.0999167360532889, "grad_norm": 1.3200111006279347, "learning_rate": 4.779106329331665e-06, "loss": 0.45, "step": 166},
    {"epoch": 1.1132389675270609, "grad_norm": 1.2755161939993753, "learning_rate": 4.769443696332272e-06, "loss": 0.4454, "step": 168},
    {"epoch": 1.1265611990008326, "grad_norm": 1.3421067926153882, "learning_rate": 4.759584424871302e-06, "loss": 0.4429, "step": 170},
    {"epoch": 1.1398834304746046, "grad_norm": 1.2219457405458125, "learning_rate": 4.749529369216246e-06, "loss": 0.4481, "step": 172},
    {"epoch": 1.1532056619483764, "grad_norm": 1.330574738869651, "learning_rate": 4.7392794005985324e-06, "loss": 0.4459, "step": 174},
    {"epoch": 1.1665278934221481, "grad_norm": 1.2042952174150132, "learning_rate": 4.7288354071380415e-06, "loss": 0.4339, "step": 176},
    {"epoch": 1.1798501248959201, "grad_norm": 1.2535265876319093, "learning_rate": 4.7181982937661485e-06, "loss": 0.4364, "step": 178},
    {"epoch": 1.1931723563696919, "grad_norm": 1.1967067502698956, "learning_rate": 4.707368982147318e-06, "loss": 0.4484, "step": 180},
    {"epoch": 1.2064945878434639, "grad_norm": 1.3022379327320546, "learning_rate": 4.696348410599244e-06, "loss": 0.4468, "step": 182},
    {"epoch": 1.2198168193172356, "grad_norm": 1.3137228151962215, "learning_rate": 4.685137534011549e-06, "loss": 0.4492, "step": 184},
    {"epoch": 1.2331390507910074, "grad_norm": 1.3650226627212705, "learning_rate": 4.673737323763048e-06, "loss": 0.4389, "step": 186},
    {"epoch": 1.2464612822647794, "grad_norm": 1.3122923570081069, "learning_rate": 4.662148767637578e-06, "loss": 0.4426, "step": 188},
    {"epoch": 1.2597835137385511, "grad_norm": 1.3191199275346543, "learning_rate": 4.650372869738415e-06, "loss": 0.434, "step": 190},
    {"epoch": 1.2731057452123231, "grad_norm": 1.4425884017899313, "learning_rate": 4.638410650401267e-06, "loss": 0.4382, "step": 192},
    {"epoch": 1.2864279766860949, "grad_norm": 1.4066578837011166, "learning_rate": 4.626263146105875e-06, "loss": 0.4473, "step": 194},
    {"epoch": 1.2997502081598669, "grad_norm": 1.4430824831613096, "learning_rate": 4.613931409386196e-06, "loss": 0.4488, "step": 196},
    {"epoch": 1.3130724396336386, "grad_norm": 1.2217740909502797, "learning_rate": 4.601416508739211e-06, "loss": 0.4395, "step": 198},
    {"epoch": 1.3263946711074106, "grad_norm": 1.474039226711776, "learning_rate": 4.588719528532342e-06, "loss": 0.4381, "step": 200},
    {"epoch": 1.3397169025811824, "grad_norm": 1.2503538717797444, "learning_rate": 4.575841568909494e-06, "loss": 0.4317, "step": 202},
    {"epoch": 1.3530391340549541, "grad_norm": 1.3172152085291207, "learning_rate": 4.562783745695738e-06, "loss": 0.4284, "step": 204},
    {"epoch": 1.3663613655287261, "grad_norm": 1.2950216606489513, "learning_rate": 4.549547190300622e-06, "loss": 0.4372, "step": 206},
    {"epoch": 1.3796835970024979, "grad_norm": 1.2065789326345406, "learning_rate": 4.536133049620143e-06, "loss": 0.4376, "step": 208},
    {"epoch": 1.3930058284762699, "grad_norm": 1.450309483143858, "learning_rate": 4.522542485937369e-06, "loss": 0.4368, "step": 210},
    {"epoch": 1.4063280599500416, "grad_norm": 1.2856432394840618, "learning_rate": 4.508776676821739e-06, "loss": 0.4359, "step": 212},
    {"epoch": 1.4196502914238134, "grad_norm": 1.303392410991855, "learning_rate": 4.494836815027022e-06, "loss": 0.437, "step": 214},
    {"epoch": 1.4329725228975854, "grad_norm": 1.2374383776516957, "learning_rate": 4.4807241083879774e-06, "loss": 0.4277, "step": 216},
    {"epoch": 1.4462947543713571, "grad_norm": 1.1895403487373037, "learning_rate": 4.466439779715696e-06, "loss": 0.4219, "step": 218},
    {"epoch": 1.4596169858451291, "grad_norm": 1.3959427610193165, "learning_rate": 4.451985066691649e-06, "loss": 0.4341, "step": 220},
    {"epoch": 1.4729392173189009, "grad_norm": 1.2421484766590198, "learning_rate": 4.437361221760449e-06, "loss": 0.4162, "step": 222},
    {"epoch": 1.4862614487926726, "grad_norm": 1.287463815955178, "learning_rate": 4.422569512021332e-06, "loss": 0.4282, "step": 224},
    {"epoch": 1.4995836802664446, "grad_norm": 1.4250139528752677, "learning_rate": 4.407611219118363e-06, "loss": 0.421, "step": 226},
    {"epoch": 1.5129059117402166, "grad_norm": 1.239295099017855, "learning_rate": 4.3924876391293915e-06, "loss": 0.427, "step": 228},
    {"epoch": 1.5262281432139884, "grad_norm": 1.3453909852418124, "learning_rate": 4.377200082453748e-06, "loss": 0.4357, "step": 230},
    {"epoch": 1.5395503746877601, "grad_norm": 1.2197270804139342, "learning_rate": 4.361749873698707e-06, "loss": 0.4101, "step": 232},
    {"epoch": 1.552872606161532, "grad_norm": 1.2833857194816787, "learning_rate": 4.346138351564711e-06, "loss": 0.424, "step": 234},
    {"epoch": 1.5661948376353039, "grad_norm": 1.2293200008377447, "learning_rate": 4.330366868729376e-06, "loss": 0.421, "step": 236},
    {"epoch": 1.5795170691090759, "grad_norm": 1.1926560926173428, "learning_rate": 4.3144367917302964e-06, "loss": 0.4142, "step": 238},
    {"epoch": 1.5928393005828476, "grad_norm": 1.1594494067766803, "learning_rate": 4.2983495008466285e-06, "loss": 0.4191, "step": 240},
    {"epoch": 1.6061615320566194, "grad_norm": 1.224729384745418, "learning_rate": 4.2821063899795015e-06, "loss": 0.4128, "step": 242},
    {"epoch": 1.6194837635303914, "grad_norm": 1.1481228567549495, "learning_rate": 4.265708866531238e-06, "loss": 0.4279, "step": 244},
    {"epoch": 1.6328059950041633, "grad_norm": 1.3467092580505746, "learning_rate": 4.249158351283414e-06, "loss": 0.4262, "step": 246},
    {"epoch": 1.646128226477935, "grad_norm": 1.2776898545321251, "learning_rate": 4.232456278273743e-06, "loss": 0.4314, "step": 248},
    {"epoch": 1.6594504579517069, "grad_norm": 1.2719662910087424, "learning_rate": 4.215604094671835e-06, "loss": 0.4108, "step": 250},
    {"epoch": 1.6727726894254786, "grad_norm": 1.1745562871590098, "learning_rate": 4.198603260653792e-06, "loss": 0.4165, "step": 252},
    {"epoch": 1.6860949208992506, "grad_norm": 1.2455715420366917, "learning_rate": 4.181455249275701e-06, "loss": 0.4079, "step": 254},
    {"epoch": 1.6994171523730226, "grad_norm": 1.3896213959063652, "learning_rate": 4.1641615463459926e-06, "loss": 0.417, "step": 256},
    {"epoch": 1.7127393838467944, "grad_norm": 1.2131393445621887, "learning_rate": 4.146723650296701e-06, "loss": 0.4116, "step": 258},
    {"epoch": 1.7260616153205661, "grad_norm": 1.2101597375627524, "learning_rate": 4.129143072053639e-06, "loss": 0.4169, "step": 260},
    {"epoch": 1.739383846794338, "grad_norm": 1.2983597203629458, "learning_rate": 4.111421334905468e-06, "loss": 0.4101, "step": 262},
    {"epoch": 1.7527060782681099, "grad_norm": 1.1756761204986788, "learning_rate": 4.093559974371725e-06, "loss": 0.4023, "step": 264},
    {"epoch": 1.7660283097418819, "grad_norm": 1.296750722093234, "learning_rate": 4.075560538069767e-06, "loss": 0.4037, "step": 266},
    {"epoch": 1.7793505412156536, "grad_norm": 1.2664686153860956, "learning_rate": 4.05742458558068e-06, "loss": 0.4005, "step": 268},
    {"epoch": 1.7926727726894254, "grad_norm": 1.3144115093925024, "learning_rate": 4.039153688314146e-06, "loss": 0.4123, "step": 270},
    {"epoch": 1.8059950041631974, "grad_norm": 1.177870994913812, "learning_rate": 4.020749429372286e-06, "loss": 0.4061, "step": 272},
    {"epoch": 1.8193172356369693, "grad_norm": 1.1211392036639862, "learning_rate": 4.002213403412492e-06, "loss": 0.4207, "step": 274},
    {"epoch": 1.832639467110741, "grad_norm": 1.1967335338983747, "learning_rate": 3.983547216509254e-06, "loss": 0.4037, "step": 276},
    {"epoch": 1.8459616985845129, "grad_norm": 1.163438902600854, "learning_rate": 3.964752486015001e-06, "loss": 0.3983, "step": 278},
    {"epoch": 1.8592839300582846, "grad_norm": 1.3897690758852341, "learning_rate": 3.945830840419966e-06, "loss": 0.406, "step": 280},
    {"epoch": 1.8726061615320566, "grad_norm": 1.2302319797016965, "learning_rate": 3.92678391921108e-06, "loss": 0.4102, "step": 282},
    {"epoch": 1.8859283930058286, "grad_norm": 1.2515743950418428, "learning_rate": 3.907613372729916e-06, "loss": 0.4121, "step": 284},
    {"epoch": 1.8992506244796004, "grad_norm": 1.2250514633864378, "learning_rate": 3.888320862029699e-06, "loss": 0.4135, "step": 286},
    {"epoch": 1.9125728559533721, "grad_norm": 1.1786595929578796, "learning_rate": 3.868908058731376e-06, "loss": 0.3961, "step": 288},
    {"epoch": 1.9258950874271439, "grad_norm": 1.2316483388259516, "learning_rate": 3.849376644878783e-06, "loss": 0.3991, "step": 290},
    {"epoch": 1.9392173189009159, "grad_norm": 1.2218522002215788, "learning_rate": 3.829728312792895e-06, "loss": 0.4068, "step": 292},
    {"epoch": 1.9525395503746878, "grad_norm": 1.218981908305007, "learning_rate": 3.8099647649251984e-06, "loss": 0.4116, "step": 294},
    {"epoch": 1.9658617818484596, "grad_norm": 1.1473329397682062, "learning_rate": 3.790087713710179e-06, "loss": 0.3961, "step": 296},
    {"epoch": 1.9791840133222314, "grad_norm": 1.15330486401059, "learning_rate": 3.770098881416945e-06, "loss": 0.397, "step": 298},
    {"epoch": 1.9925062447960034, "grad_norm": 1.1147439818886564, "learning_rate": 3.7500000000000005e-06, "loss": 0.391, "step": 300},
    {"epoch": 2.0, "grad_norm": 1.1888727583821848, "learning_rate": 3.7297928109491765e-06, "loss": 0.2238, "step": 302},
    {"epoch": 2.0133222314737718, "grad_norm": 1.1682600742115117, "learning_rate": 3.7094790651387414e-06, "loss": 0.3464, "step": 304},
    {"epoch": 2.0266444629475435, "grad_norm": 1.2543709475465634, "learning_rate": 3.689060522675689e-06, "loss": 0.3299, "step": 306},
    {"epoch": 2.0399666944213157, "grad_norm": 1.209782299511866, "learning_rate": 3.668538952747236e-06, "loss": 0.3335, "step": 308},
    {"epoch": 2.0532889258950875, "grad_norm": 1.2314074580378418, "learning_rate": 3.6479161334675294e-06, "loss": 0.3402, "step": 310},
    {"epoch": 2.0666111573688593, "grad_norm": 1.089978871118908, "learning_rate": 3.627193851723577e-06, "loss": 0.3282, "step": 312},
    {"epoch": 2.079933388842631, "grad_norm": 1.1440650029159125, "learning_rate": 3.6063739030204226e-06, "loss": 0.3353, "step": 314},
    {"epoch": 2.0932556203164028, "grad_norm": 1.1412527172991913, "learning_rate": 3.5854580913255706e-06, "loss": 0.3377, "step": 316},
    {"epoch": 2.106577851790175, "grad_norm": 1.1374336855151732, "learning_rate": 3.564448228912682e-06, "loss": 0.3303, "step": 318},
    {"epoch": 2.1199000832639467, "grad_norm": 1.1689768541112975, "learning_rate": 3.543346136204545e-06, "loss": 0.3269, "step": 320},
    {"epoch": 2.1332223147377185, "grad_norm": 1.1613635619803697, "learning_rate": 3.522153641615345e-06, "loss": 0.3447, "step": 322},
    {"epoch": 2.1465445462114903, "grad_norm": 1.0764748235217316, "learning_rate": 3.5008725813922383e-06, "loss": 0.3347, "step": 324},
    {"epoch": 2.1598667776852625, "grad_norm": 1.242351223908071, "learning_rate": 3.4795047994562463e-06, "loss": 0.3337, "step": 326},
    {"epoch": 2.1731890091590342, "grad_norm": 1.1068446291466676, "learning_rate": 3.458052147242494e-06, "loss": 0.3411, "step": 328},
    {"epoch": 2.186511240632806, "grad_norm": 1.16808964966109, "learning_rate": 3.436516483539781e-06, "loss": 0.3376, "step": 330},
    {"epoch": 2.1998334721065778, "grad_norm": 1.1025319948129593, "learning_rate": 3.4148996743295305e-06, "loss": 0.3316, "step": 332},
    {"epoch": 2.2131557035803495, "grad_norm": 1.1758686501416102, "learning_rate": 3.3932035926241103e-06, "loss": 0.3355, "step": 334},
    {"epoch": 2.2264779350541217, "grad_norm": 1.1003768444337116, "learning_rate": 3.3714301183045382e-06, "loss": 0.3357, "step": 336},
    {"epoch": 2.2398001665278935, "grad_norm": 1.0881028666604091, "learning_rate": 3.349581137957604e-06, "loss": 0.3364, "step": 338},
    {"epoch": 2.2531223980016652, "grad_norm": 1.211964671213877, "learning_rate": 3.3276585447123957e-06, "loss": 0.3353, "step": 340},
    {"epoch": 2.266444629475437, "grad_norm": 1.163639286937533, "learning_rate": 3.3056642380762783e-06, "loss": 0.329, "step": 342},
    {"epoch": 2.279766860949209, "grad_norm": 1.1618660863336634, "learning_rate": 3.2836001237702993e-06, "loss": 0.3299, "step": 344},
    {"epoch": 2.293089092422981, "grad_norm": 1.1575282219975258, "learning_rate": 3.2614681135640696e-06, "loss": 0.3297, "step": 346},
    {"epoch": 2.3064113238967527, "grad_norm": 1.1756458412662194, "learning_rate": 3.2392701251101172e-06, "loss": 0.3367, "step": 348},
    {"epoch": 2.3197335553705245, "grad_norm": 1.1830174958146948, "learning_rate": 3.217008081777726e-06, "loss": 0.3319, "step": 350},
    {"epoch": 2.3330557868442963, "grad_norm": 1.1667340496607632, "learning_rate": 3.1946839124862873e-06, "loss": 0.3361, "step": 352},
    {"epoch": 2.3463780183180685, "grad_norm": 1.1105411198444493, "learning_rate": 3.1722995515381644e-06, "loss": 0.3425, "step": 354},
    {"epoch": 2.3597002497918402, "grad_norm": 1.1234133483520614, "learning_rate": 3.149856938451094e-06, "loss": 0.3314, "step": 356},
    {"epoch": 2.373022481265612, "grad_norm": 1.1838235154662082, "learning_rate": 3.127358017790132e-06, "loss": 0.3392, "step": 358},
    {"epoch": 2.3863447127393838, "grad_norm": 1.080453742242657, "learning_rate": 3.1048047389991693e-06, "loss": 0.3336, "step": 360},
    {"epoch": 2.3996669442131555, "grad_norm": 1.1140835000073062, "learning_rate": 3.082199056232015e-06, "loss": 0.3414, "step": 362},
    {"epoch": 2.4129891756869277, "grad_norm": 1.138752925836035, "learning_rate": 3.059542928183079e-06, "loss": 0.3329, "step": 364},
    {"epoch": 2.4263114071606995, "grad_norm": 1.0610831482375092, "learning_rate": 3.0368383179176584e-06, "loss": 0.342, "step": 366},
    {"epoch": 2.4396336386344712, "grad_norm": 1.1718171313930514, "learning_rate": 3.0140871927018466e-06, "loss": 0.3266, "step": 368},
    {"epoch": 2.452955870108243, "grad_norm": 1.2039181830598997, "learning_rate": 2.9912915238320755e-06, "loss": 0.338, "step": 370},
    {"epoch": 2.4662781015820148, "grad_norm": 1.0760682240024106, "learning_rate": 2.9684532864643123e-06, "loss": 0.3277, "step": 372},
    {"epoch": 2.479600333055787, "grad_norm": 1.2378751102400485, "learning_rate": 2.945574459442917e-06, "loss": 0.3398, "step": 374},
    {"epoch": 2.4929225645295587, "grad_norm": 1.171184691228538, "learning_rate": 2.922657025129185e-06, "loss": 0.3313, "step": 376},
    {"epoch": 2.5062447960033305, "grad_norm": 1.179077198361453, "learning_rate": 2.8997029692295875e-06, "loss": 0.3364, "step": 378},
    {"epoch": 2.5195670274771023, "grad_norm": 1.1745776843262559, "learning_rate": 2.876714280623708e-06, "loss": 0.3261, "step": 380},
    {"epoch": 2.532889258950874, "grad_norm": 1.1445296979936388, "learning_rate": 2.8536929511919227e-06, "loss": 0.3352, "step": 382},
    {"epoch": 2.5462114904246462, "grad_norm": 1.2025025630072426, "learning_rate": 2.8306409756428067e-06, "loss": 0.3375, "step": 384},
    {"epoch": 2.559533721898418, "grad_norm": 1.0971352592709565, "learning_rate": 2.807560351340302e-06, "loss": 0.3313, "step": 386},
    {"epoch": 2.5728559533721898, "grad_norm": 1.1249045530287038, "learning_rate": 2.7844530781306544e-06, "loss": 0.3359, "step": 388},
    {"epoch": 2.586178184845962, "grad_norm": 1.1665793984798016, "learning_rate": 2.761321158169134e-06, "loss": 0.3251, "step": 390},
    {"epoch": 2.5995004163197337, "grad_norm": 1.1275088907272068, "learning_rate": 2.738166595746554e-06, "loss": 0.3189, "step": 392},
    {"epoch": 2.6128226477935055, "grad_norm": 1.1697820606518197, "learning_rate": 2.7149913971156105e-06, "loss": 0.3305, "step": 394},
    {"epoch": 2.6261448792672772, "grad_norm": 1.0995774846811734, "learning_rate": 2.6917975703170466e-06, "loss": 0.3323, "step": 396},
    {"epoch": 2.639467110741049, "grad_norm": 1.1471735378793595, "learning_rate": 2.668587125005663e-06, "loss": 0.3348, "step": 398},
    {"epoch": 2.652789342214821, "grad_norm": 1.1043284251546557, "learning_rate": 2.6453620722761897e-06, "loss": 0.3244, "step": 400},
    {"epoch": 2.666111573688593, "grad_norm": 1.2133025722214072, "learning_rate": 2.6221244244890336e-06, "loss": 0.3297, "step": 402},
    {"epoch": 2.6794338051623647, "grad_norm": 1.0759704642431338, "learning_rate": 2.5988761950959133e-06, "loss": 0.3294, "step": 404},
    {"epoch": 2.6927560366361365, "grad_norm": 1.1303123852236616, "learning_rate": 2.575619398465402e-06, "loss": 0.327, "step": 406},
    {"epoch": 2.7060782681099083, "grad_norm": 1.1874408347855483, "learning_rate": 2.5523560497083927e-06, "loss": 0.3297, "step": 408},
    {"epoch": 2.7194004995836805, "grad_norm": 1.0814676034937838, "learning_rate": 2.5290881645034932e-06, "loss": 0.3308, "step": 410},
    {"epoch": 2.7327227310574522, "grad_norm": 1.0821014638265758, "learning_rate": 2.5058177589223766e-06, "loss": 0.3286, "step": 412},
    {"epoch": 2.746044962531224, "grad_norm": 1.1078782647950531, "learning_rate": 2.482546849255096e-06, "loss": 0.3289, "step": 414},
    {"epoch": 2.7593671940049957, "grad_norm": 1.0709928206025467, "learning_rate": 2.4592774518353858e-06, "loss": 0.3349, "step": 416},
    {"epoch": 2.7726894254787675, "grad_norm": 0.9986348268877544, "learning_rate": 2.436011582865945e-06, "loss": 0.3284, "step": 418},
    {"epoch": 2.7860116569525397, "grad_norm": 1.0407659756205825, "learning_rate": 2.4127512582437486e-06, "loss": 0.3255, "step": 420},
    {"epoch": 2.7993338884263115, "grad_norm": 1.1250160278057286, "learning_rate": 2.3894984933853734e-06, "loss": 0.3189, "step": 422},
    {"epoch": 2.8126561199000832, "grad_norm": 1.1080847331634105, "learning_rate": 2.366255303052377e-06, "loss": 0.3286, "step": 424},
    {"epoch": 2.825978351373855, "grad_norm": 1.1096965833381898, "learning_rate": 2.3430237011767166e-06, "loss": 0.3393, "step": 426},
    {"epoch": 2.8393005828476268, "grad_norm": 1.177552126401324, "learning_rate": 2.319805700686257e-06, "loss": 0.323, "step": 428},
    {"epoch": 2.852622814321399, "grad_norm": 1.1531088333635726, "learning_rate": 2.296603313330355e-06, "loss": 0.3275, "step": 430},
    {"epoch": 2.8659450457951707, "grad_norm": 1.1189431785006225, "learning_rate": 2.2734185495055503e-06, "loss": 0.3234, "step": 432},
    {"epoch": 2.8792672772689425, "grad_norm": 1.0872804861007128, "learning_rate": 2.250253418081373e-06, "loss": 0.3304, "step": 434},
    {"epoch": 2.8925895087427143, "grad_norm": 1.100386612123568, "learning_rate": 2.22710992622628e-06, "loss": 0.326, "step": 436},
    {"epoch": 2.905911740216486, "grad_norm": 1.0750519008987303, "learning_rate": 2.2039900792337477e-06, "loss": 0.3161, "step": 438},
    {"epoch": 2.9192339716902582, "grad_norm": 1.0912428298625954, "learning_rate": 2.1808958803485134e-06, "loss": 0.3209, "step": 440},
    {"epoch": 2.93255620316403, "grad_norm": 1.107507049641638, "learning_rate": 2.157829330593008e-06, "loss": 0.3363, "step": 442},
    {"epoch": 2.9458784346378017, "grad_norm": 1.169768928903536, "learning_rate": 2.134792428593971e-06, "loss": 0.3327, "step": 444},
    {"epoch": 2.959200666111574, "grad_norm": 1.1405241904375514, "learning_rate": 2.1117871704092818e-06, "loss": 0.3264, "step": 446},
    {"epoch": 2.9725228975853453, "grad_norm": 1.1001277407179797, "learning_rate": 2.0888155493550027e-06, "loss": 0.3135, "step": 448},
    {"epoch": 2.9858451290591175, "grad_norm": 1.1129546974887563, "learning_rate": 2.0658795558326745e-06, "loss": 0.3234, "step": 450},
    {"epoch": 2.9991673605328892, "grad_norm": 1.1080818714362697, "learning_rate": 2.0429811771568468e-06, "loss": 0.322, "step": 452},
    {"epoch": 3.006661115736886, "grad_norm": 1.1307642011297194, "learning_rate": 2.0201223973828917e-06, "loss": 0.1617, "step": 454},
    {"epoch": 3.019983347210658, "grad_norm": 1.0035953126863488, "learning_rate": 1.997305197135089e-06, "loss": 0.2598, "step": 456},
    {"epoch": 3.0333055786844296, "grad_norm": 1.0534955052337636, "learning_rate": 1.9745315534350157e-06, "loss": 0.2715, "step": 458},
    {"epoch": 3.0466278101582014, "grad_norm": 1.189844390221147, "learning_rate": 1.9518034395302413e-06, "loss": 0.2646, "step": 460},
    {"epoch": 3.059950041631973, "grad_norm": 1.1253150795326456, "learning_rate": 1.9291228247233607e-06, "loss": 0.2701, "step": 462},
    {"epoch": 3.0732722731057454, "grad_norm": 1.1193701526310147, "learning_rate": 1.9064916742013515e-06, "loss": 0.2673, "step": 464},
    {"epoch": 3.086594504579517, "grad_norm": 1.0959015217977324, "learning_rate": 1.883911948865306e-06, "loss": 0.2649, "step": 466},
    {"epoch": 3.099916736053289, "grad_norm": 1.1965240464412776, "learning_rate": 1.8613856051605242e-06, "loss": 0.2629, "step": 468},
    {"epoch": 3.1132389675270606, "grad_norm": 1.0473419859838504, "learning_rate": 1.8389145949069953e-06, "loss": 0.2613, "step": 470},
    {"epoch": 3.126561199000833, "grad_norm": 1.2108832644207754, "learning_rate": 1.816500865130279e-06, "loss": 0.2571, "step": 472},
    {"epoch": 3.1398834304746046, "grad_norm": 1.0441673255917416, "learning_rate": 1.7941463578928088e-06, "loss": 0.2766, "step": 474},
    {"epoch": 3.1532056619483764, "grad_norm": 1.1708679609837331, "learning_rate": 1.7718530101256115e-06, "loss": 0.2718, "step": 476},
    {"epoch": 3.166527893422148, "grad_norm": 1.1284481739249688, "learning_rate": 1.7496227534604859e-06, "loss": 0.2575, "step": 478},
    {"epoch": 3.17985012489592, "grad_norm": 1.0770429901542908, "learning_rate": 1.7274575140626318e-06, "loss": 0.2629, "step": 480},
    {"epoch": 3.193172356369692, "grad_norm": 1.0692152501631402, "learning_rate": 1.7053592124637557e-06, "loss": 0.2694, "step": 482},
    {"epoch": 3.206494587843464, "grad_norm": 1.039094308900864, "learning_rate": 1.6833297633956647e-06, "loss": 0.2687, "step": 484},
    {"epoch": 3.2198168193172356, "grad_norm": 1.1432726083538918, "learning_rate": 1.661371075624363e-06, "loss": 0.2722, "step": 486},
    {"epoch": 3.2331390507910074, "grad_norm": 1.047486598216707, "learning_rate": 1.6394850517846621e-06, "loss": 0.26, "step": 488},
    {"epoch": 3.246461282264779, "grad_norm": 1.1299207627919639, "learning_rate": 1.6176735882153284e-06, "loss": 0.2646, "step": 490},
    {"epoch": 3.2597835137385514, "grad_norm": 1.0456944660867535, "learning_rate": 1.5959385747947697e-06, "loss": 0.2628, "step": 492},
    {"epoch": 3.273105745212323, "grad_norm": 1.0617694211022177, "learning_rate": 1.5742818947772875e-06, "loss": 0.2576, "step": 494},
    {"epoch": 3.286427976686095, "grad_norm": 1.0978333522782833, "learning_rate": 1.552705424629898e-06, "loss": 0.2703, "step": 496},
    {"epoch": 3.2997502081598666, "grad_norm": 1.0865484727411876, "learning_rate": 1.5312110338697427e-06, "loss": 0.2692, "step": 498},
    {"epoch": 3.313072439633639, "grad_norm": 1.0418725249305938, "learning_rate": 1.509800584902108e-06, "loss": 0.2642, "step": 500},
    {"epoch": 3.3263946711074106, "grad_norm": 1.0660556477168224, "learning_rate": 1.4884759328590476e-06, "loss": 0.2633, "step": 502},
    {"epoch": 3.3397169025811824, "grad_norm": 1.0851955569492033, "learning_rate": 1.467238925438646e-06, "loss": 0.2677, "step": 504},
    {"epoch": 3.353039134054954, "grad_norm": 1.0460266601554127, "learning_rate": 1.446091402744923e-06, "loss": 0.2682, "step": 506},
    {"epoch": 3.366361365528726, "grad_norm": 1.0320032665713081, "learning_rate": 1.4250351971283937e-06, "loss": 0.2673, "step": 508},
    {"epoch": 3.379683597002498, "grad_norm": 1.0694000065346523, "learning_rate": 1.4040721330273063e-06, "loss": 0.273, "step": 510},
    {"epoch": 3.39300582847627, "grad_norm": 1.0986183217647922, "learning_rate": 1.3832040268095589e-06, "loss": 0.2615, "step": 512},
    {"epoch": 3.4063280599500416, "grad_norm": 1.063489274495733, "learning_rate": 1.362432686615316e-06, "loss": 0.2763, "step": 514},
    {"epoch": 3.4196502914238134, "grad_norm": 1.0408747367635172, "learning_rate": 1.3417599122003464e-06, "loss": 0.2677, "step": 516},
    {"epoch": 3.432972522897585, "grad_norm": 1.1352124059324844, "learning_rate": 1.3211874947800747e-06, "loss": 0.2614, "step": 518},
    {"epoch": 3.4462947543713573, "grad_norm": 1.0881790246993637, "learning_rate": 1.3007172168743854e-06, "loss": 0.2659, "step": 520},
    {"epoch": 3.459616985845129, "grad_norm": 1.080506653895442, "learning_rate": 1.280350852153168e-06, "loss": 0.2666, "step": 522},
    {"epoch": 3.472939217318901, "grad_norm": 1.0583310544029485, "learning_rate": 1.260090165282645e-06, "loss": 0.2648, "step": 524},
    {"epoch": 3.4862614487926726, "grad_norm": 1.1152914883872809, "learning_rate": 1.2399369117724582e-06, "loss": 0.2704, "step": 526},
    {"epoch": 3.4995836802664444, "grad_norm": 1.0455279885524973, "learning_rate": 1.2198928378235717e-06, "loss": 0.2672, "step": 528},
    {"epoch": 3.5129059117402166, "grad_norm": 1.0576879823812282, "learning_rate": 1.1999596801769617e-06, "loss": 0.264, "step": 530},
    {"epoch": 3.5262281432139884, "grad_norm": 1.1014402939329688, "learning_rate": 1.1801391659631423e-06, "loss": 0.2654, "step": 532},
    {"epoch": 3.53955037468776, "grad_norm": 1.028865585013293, "learning_rate": 1.160433012552508e-06, "loss": 0.2637, "step": 534},
    {"epoch": 3.5528726061615323, "grad_norm": 1.0546829340917359, "learning_rate": 1.1408429274065418e-06, "loss": 0.27, "step": 536},
    {"epoch": 3.5661948376353036, "grad_norm": 1.0417474358737957, "learning_rate": 1.1213706079298566e-06, "loss": 0.2589, "step": 538},
    {"epoch": 3.579517069109076, "grad_norm": 1.0937717215676659, "learning_rate": 1.1020177413231334e-06, "loss": 0.2697, "step": 540},
    {"epoch": 3.5928393005828476, "grad_norm": 1.0736209133266341, "learning_rate": 1.0827860044369226e-06, "loss": 0.2645, "step": 542},
    {"epoch": 3.6061615320566194, "grad_norm": 1.0474112636925237, "learning_rate": 1.06367706362636e-06, "loss": 0.2681, "step": 544},
    {"epoch": 3.6194837635303916, "grad_norm": 1.103432827044926, "learning_rate": 1.0446925746067768e-06, "loss": 0.2695, "step": 546},
    {"epoch": 3.6328059950041633, "grad_norm": 1.0836832730433759, "learning_rate": 1.0258341823102418e-06, "loss": 0.2632, "step": 548},
    {"epoch": 3.646128226477935, "grad_norm": 1.0859645184669795, "learning_rate": 1.0071035207430352e-06, "loss": 0.2669, "step": 550},
    {"epoch": 3.659450457951707, "grad_norm": 1.1090309698734075, "learning_rate": 9.88502212844063e-07, "loss": 0.2598, "step": 552},
    {"epoch": 3.6727726894254786, "grad_norm": 1.040309343372892, "learning_rate": 9.700318703442437e-07, "loss": 0.259, "step": 554},
    {"epoch": 3.686094920899251, "grad_norm": 1.0821866491462884, "learning_rate": 9.516940936268504e-07, "loss": 0.261, "step": 556},
    {"epoch": 3.6994171523730226, "grad_norm": 1.032839245512739, "learning_rate": 9.334904715888496e-07, "loss": 0.2726, "step": 558},
    {"epoch": 3.7127393838467944, "grad_norm": 1.2154127605600453, "learning_rate": 9.154225815032242e-07, "loss": 0.257, "step": 560},
    {"epoch": 3.726061615320566, "grad_norm": 1.0461952582538157, "learning_rate": 8.974919888823164e-07, "loss": 0.255, "step": 562},
    {"epoch": 3.739383846794338, "grad_norm": 1.0817761044485013, "learning_rate": 8.797002473421729e-07, "loss": 0.2672, "step": 564},
    {"epoch": 3.75270607826811, "grad_norm": 1.1088064613565192, "learning_rate": 8.620488984679378e-07, "loss": 0.2701, "step": 566},
    {"epoch": 3.766028309741882, "grad_norm": 1.0905619434743687, "learning_rate": 8.445394716802754e-07, "loss": 0.2699, "step": 568},
    {"epoch": 3.7793505412156536, "grad_norm": 1.1348105280382488, "learning_rate": 8.271734841028553e-07, "loss": 0.2625, "step": 570},
    {"epoch": 3.7926727726894254, "grad_norm": 1.0895135923548163, "learning_rate": 8.099524404308948e-07, "loss": 0.2652, "step": 572},
    {"epoch": 3.805995004163197, "grad_norm": 1.081980856394784, "learning_rate": 7.928778328007918e-07, "loss": 0.2725, "step": 574},
    {"epoch": 3.8193172356369693, "grad_norm": 1.072896110364212, "learning_rate": 7.759511406608255e-07, "loss": 0.2534, "step": 576},
    {"epoch": 3.832639467110741, "grad_norm": 1.0739615029579452, "learning_rate": 7.591738306429769e-07, "loss": 0.2664, "step": 578},
    {"epoch": 3.845961698584513, "grad_norm": 1.050183747712219, "learning_rate": 7.425473564358457e-07, "loss": 0.2644, "step": 580},
    {"epoch": 3.8592839300582846, "grad_norm": 0.9907767398098887, "learning_rate": 7.260731586586983e-07, "loss": 0.2654, "step": 582},
    {"epoch": 3.8726061615320564, "grad_norm": 1.0920216934407247, "learning_rate": 7.097526647366379e-07, "loss": 0.2652, "step": 584},
    {"epoch": 3.8859283930058286, "grad_norm": 1.0675877474822888, "learning_rate": 6.935872887769299e-07, "loss": 0.265, "step": 586},
    {"epoch": 3.8992506244796004, "grad_norm": 1.047365669548006, "learning_rate": 6.775784314464717e-07, "loss": 0.2635, "step": 588},
    {"epoch": 3.912572855953372, "grad_norm": 1.030022751644315, "learning_rate": 6.617274798504286e-07, "loss": 0.2628, "step": 590},
    {"epoch": 3.925895087427144, "grad_norm": 1.044545063593376, "learning_rate": 6.460358074120518e-07, "loss": 0.2647, "step": 592},
    {"epoch": 3.9392173189009156, "grad_norm": 1.0225003684521647, "learning_rate": 6.305047737536707e-07, "loss": 0.2625, "step": 594},
    {"epoch": 3.952539550374688, "grad_norm": 1.0533487294826005, "learning_rate": 6.151357245788917e-07, "loss": 0.2731, "step": 596},
    {"epoch": 3.9658617818484596, "grad_norm": 1.034466643395701, "learning_rate": 5.999299915559956e-07, "loss": 0.2558, "step": 598},
    {"epoch": 3.9791840133222314, "grad_norm": 1.037903752833203, "learning_rate": 5.848888922025553e-07, "loss": 0.2618, "step": 600},
    {"epoch": 3.9925062447960036, "grad_norm": 1.027043640385128, "learning_rate": 5.700137297712749e-07, "loss": 0.2669, "step": 602},
    {"epoch": 4.0, "grad_norm": 1.098155802724837, "learning_rate": 5.553057931370729e-07, "loss": 0.1505, "step": 604},
    {"epoch": 4.013322231473772, "grad_norm": 1.0453208918296613, "learning_rate": 5.407663566854008e-07, "loss": 0.2321, "step": 606},
    {"epoch": 4.0266444629475435, "grad_norm": 1.0336345571966956, "learning_rate": 5.263966802018275e-07, "loss": 0.2359, "step": 608},
    {"epoch": 4.039966694421316, "grad_norm": 1.0051543661966322, "learning_rate": 5.121980087628802e-07, "loss": 0.2286, "step": 610},
    {"epoch": 4.053288925895087, "grad_norm": 1.0291906973127083, "learning_rate": 4.981715726281666e-07, "loss": 0.2322, "step": 612},
    {"epoch": 4.066611157368859, "grad_norm": 1.0535023572236821, "learning_rate": 4.843185871337722e-07, "loss": 0.2402, "step": 614},
    {"epoch": 4.0799333888426315, "grad_norm": 1.1023709895301759, "learning_rate": 4.706402525869633e-07, "loss": 0.2322, "step": 616},
    {"epoch": 4.093255620316403, "grad_norm": 1.0283405302482598, "learning_rate": 4.5713775416217884e-07, "loss": 0.2238, "step": 618},
    {"epoch": 4.106577851790175, "grad_norm": 1.0393264816137988, "learning_rate": 4.438122617983442e-07, "loss": 0.2292, "step": 620},
    {"epoch": 4.119900083263946, "grad_norm": 1.0587543311732102, "learning_rate": 4.3066493009749853e-07, "loss": 0.2293, "step": 622},
    {"epoch": 4.1332223147377185, "grad_norm": 1.0914720689441537, "learning_rate": 4.1769689822475147e-07, "loss": 0.2317, "step": 624},
    {"epoch": 4.146544546211491, "grad_norm": 1.0417273334496886, "learning_rate": 4.049092898095816e-07, "loss": 0.2358, "step": 626},
    {"epoch": 4.159866777685262, "grad_norm": 1.0248130840986234, "learning_rate": 3.9230321284847856e-07, "loss": 0.2355, "step": 628},
    {"epoch": 4.173189009159034, "grad_norm": 1.0589305077753277, "learning_rate": 3.798797596089351e-07, "loss": 0.2331, "step": 630},
    {"epoch": 4.1865112406328056, "grad_norm": 1.0399059506346249, "learning_rate": 3.6764000653481263e-07, "loss": 0.2352, "step": 632},
    {"epoch": 4.199833472106578, "grad_norm": 1.0352919697923912, "learning_rate": 3.555850141530659e-07, "loss": 0.2327, "step": 634},
    {"epoch": 4.21315570358035, "grad_norm": 0.989140712662966, "learning_rate": 3.4371582698185636e-07, "loss": 0.228, "step": 636},
    {"epoch": 4.226477935054121, "grad_norm": 1.0090105290858724, "learning_rate": 3.3203347344004737e-07, "loss": 0.2258, "step": 638},
    {"epoch": 4.2398001665278935, "grad_norm": 0.9991149517617007, "learning_rate": 3.2053896575809426e-07, "loss": 0.2199, "step": 640},
    {"epoch": 4.253122398001666, "grad_norm": 1.014374420272404, "learning_rate": 3.092332998903416e-07, "loss": 0.2261, "step": 642},
    {"epoch": 4.266444629475437, "grad_norm": 1.0250317424256117, "learning_rate": 2.981174554287239e-07, "loss": 0.2381, "step": 644},
    {"epoch": 4.279766860949209, "grad_norm": 1.0442581559998447, "learning_rate": 2.871923955178918e-07, "loss": 0.2315, "step": 646},
    {"epoch": 4.2930890924229805, "grad_norm": 1.0098371613642636, "learning_rate": 2.764590667717562e-07, "loss": 0.2272, "step": 648},
    {"epoch": 4.306411323896753, "grad_norm": 1.0807767731419033, "learning_rate": 2.6591839919146963e-07, "loss": 0.2394, "step": 650},
    {"epoch": 4.319733555370525, "grad_norm": 1.054167521910636, "learning_rate": 2.555713060848433e-07, "loss": 0.2324, "step": 652},
    {"epoch": 4.333055786844296, "grad_norm": 1.107035611645368, "learning_rate": 2.454186839872158e-07, "loss": 0.2357, "step": 654},
    {"epoch": 4.3463780183180685, "grad_norm": 1.0552376707704954, "learning_rate": 2.3546141258376786e-07, "loss": 0.2289, "step": 656},
    {"epoch": 4.35970024979184, "grad_norm": 1.0047757316250936, "learning_rate": 2.257003546333042e-07, "loss": 0.2281, "step": 658},
    {"epoch": 4.373022481265612, "grad_norm": 1.0426529499317703, "learning_rate": 2.1613635589349756e-07, "loss": 0.2351, "step": 660},
    {"epoch": 4.386344712739384, "grad_norm": 1.0168386832947722, "learning_rate": 2.0677024504760752e-07, "loss": 0.2329, "step": 662},
    {"epoch": 4.3996669442131555, "grad_norm": 1.004364247984247, "learning_rate": 1.9760283363267684e-07, "loss": 0.2309, "step": 664},
    {"epoch": 4.412989175686928, "grad_norm": 1.0575692314944383, "learning_rate": 1.8863491596921745e-07, "loss": 0.2338, "step": 666},
    {"epoch": 4.426311407160699, "grad_norm": 1.0256602646785253, "learning_rate": 1.798672690923828e-07, "loss": 0.2286, "step": 668},
    {"epoch": 4.439633638634471, "grad_norm": 0.9903962555666792, "learning_rate": 1.713006526846439e-07, "loss": 0.2299, "step": 670},
    {"epoch": 4.4529558701082435, "grad_norm": 1.006720208531802, "learning_rate": 1.629358090099639e-07, "loss": 0.2308, "step": 672},
    {"epoch": 4.466278101582015, "grad_norm": 1.0131829979414444, "learning_rate": 1.5477346284948292e-07, "loss": 0.2291, "step": 674},
    {"epoch": 4.479600333055787, "grad_norm": 1.0035493986435864, "learning_rate": 1.4681432143872133e-07, "loss": 0.2345, "step": 676},
    {"epoch": 4.492922564529558, "grad_norm": 1.0043750746548528, "learning_rate": 1.3905907440629752e-07, "loss": 0.2293, "step": 678},
    {"epoch": 4.5062447960033305, "grad_norm": 1.041883268646126, "learning_rate": 1.31508393714177e-07, "loss": 0.2228, "step": 680},
    {"epoch": 4.519567027477103, "grad_norm": 1.0405556943028, "learning_rate": 1.241629335994471e-07, "loss": 0.2281, "step": 682},
    {"epoch": 4.532889258950874, "grad_norm": 1.0206604077473356, "learning_rate": 1.1702333051763271e-07, "loss": 0.2223, "step": 684},
    {"epoch": 4.546211490424646, "grad_norm": 1.1168719067709043, "learning_rate": 1.1009020308754587e-07, "loss": 0.2296, "step": 686},
    {"epoch": 4.559533721898418, "grad_norm": 1.061012715283086, "learning_rate": 1.0336415203768962e-07, "loss": 0.2338, "step": 688},
    {"epoch": 4.57285595337219, "grad_norm": 1.0309834474331188, "learning_rate": 9.684576015420277e-08, "loss": 0.2328, "step": 690},
    {"epoch": 4.586178184845962, "grad_norm": 1.0027277768812777, "learning_rate": 9.053559223036746e-08, "loss": 0.2195, "step": 692},
    {"epoch": 4.599500416319733, "grad_norm": 0.993857076400692, "learning_rate": 8.44341950176683e-08, "loss": 0.2256, "step": 694},
    {"epoch": 4.6128226477935055, "grad_norm": 1.007429158342742, "learning_rate": 7.854209717842231e-08, "loss": 0.2319, "step": 696},
    {"epoch": 4.626144879267278, "grad_norm": 1.033726145073033, "learning_rate": 7.285980923996989e-08, "loss": 0.2342, "step": 698},
    {"epoch": 4.639467110741049, "grad_norm": 1.0168527853674483, "learning_rate": 6.738782355044048e-08, "loss": 0.234, "step": 700},
    {"epoch": 4.652789342214821, "grad_norm": 1.0477507076137358, "learning_rate": 6.212661423609184e-08, "loss": 0.2342, "step": 702},
    {"epoch": 4.6661115736885925, "grad_norm": 1.0066144201644125, "learning_rate": 5.707663716023021e-08, "loss": 0.2181, "step": 704},
    {"epoch": 4.679433805162365, "grad_norm": 1.0584682384889934, "learning_rate": 5.22383298837098e-08, "loss": 0.2316, "step": 706},
    {"epoch": 4.692756036636137, "grad_norm": 0.9785329546521053, "learning_rate": 4.761211162702117e-08, "loss": 0.23, "step": 708},
    {"epoch": 4.706078268109908, "grad_norm": 1.0383108228822218, "learning_rate": 4.319838323396691e-08, "loss": 0.2331, "step": 710},
    {"epoch": 4.7194004995836805, "grad_norm": 1.0036079922992014, "learning_rate": 3.8997527136930004e-08, "loss": 0.2255, "step": 712},
    {"epoch": 4.732722731057452, "grad_norm": 1.0350261932080171, "learning_rate": 3.5009907323737826e-08, "loss": 0.241, "step": 714},
    {"epoch": 4.746044962531224, "grad_norm": 1.0546445582741784, "learning_rate": 3.1235869306123766e-08, "loss": 0.2278, "step": 716},
    {"epoch": 4.759367194004996, "grad_norm": 1.002658916833135, "learning_rate": 2.767574008979007e-08, "loss": 0.2263, "step": 718},
    {"epoch": 4.7726894254787675, "grad_norm": 1.0293431900981997, "learning_rate": 2.4329828146074096e-08, "loss": 0.234, "step": 720},
    {"epoch": 4.78601165695254, "grad_norm": 1.048978506558003, "learning_rate": 2.1198423385220822e-08, "loss": 0.2272, "step": 722},
    {"epoch": 4.799333888426311, "grad_norm": 1.0188945259859212, "learning_rate": 1.82817971312621e-08, "loss": 0.2254, "step": 724},
    {"epoch": 4.812656119900083, "grad_norm": 1.0799756942526888, "learning_rate": 1.5580202098509078e-08, "loss": 0.2324, "step": 726},
    {"epoch": 4.8259783513738554, "grad_norm": 1.0033120373906426, "learning_rate": 1.3093872369654148e-08, "loss": 0.2261, "step": 728},
    {"epoch": 4.839300582847627, "grad_norm": 1.0341825743001596, "learning_rate": 1.0823023375489128e-08, "loss": 0.2301, "step": 730},
    {"epoch": 4.852622814321399, "grad_norm": 0.9930386687248629, "learning_rate": 8.767851876239075e-09, "loss": 0.2289, "step": 732},
    {"epoch": 4.86594504579517, "grad_norm": 1.050611973007718, "learning_rate": 6.9285359445145366e-09, "loss": 0.2322, "step": 734},
    {"epoch": 4.8792672772689425, "grad_norm": 1.0238154022229573, "learning_rate": 5.305234949880001e-09, "loss": 0.2314, "step": 736},
    {"epoch": 4.892589508742715, "grad_norm": 1.0359224074295086, "learning_rate": 3.8980895450474455e-09, "loss": 0.2445, "step": 738},
    {"epoch": 4.905911740216486, "grad_norm": 1.0513486639225555, "learning_rate": 2.7072216536885855e-09, "loss": 0.2366, "step": 740},
    {"epoch": 4.919233971690258, "grad_norm": 1.0043522118975665, "learning_rate": 1.7327344598702667e-09, "loss": 0.2373, "step": 742},
    {"epoch": 4.9325562031640295, "grad_norm": 1.0244814544018699, "learning_rate": 9.747123991141193e-10, "loss": 0.2333, "step": 744},
    {"epoch": 4.945878434637802, "grad_norm": 1.0043902802504958, "learning_rate": 4.332211510807427e-10, "loss": 0.2322, "step": 746},
    {"epoch": 4.959200666111574, "grad_norm": 0.9910871784096957, "learning_rate": 1.0830763387897902e-10, "loss": 0.2172, "step": 748},
    {"epoch": 4.972522897585345, "grad_norm": 1.0059895919675135, "learning_rate": 0.0, "loss": 0.2232, "step": 750}
  ],
  "logging_steps": 2,
  "max_steps": 750,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1025260732350464.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}