{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.972522897585345,
"eval_steps": 500,
"global_step": 750,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013322231473771857,
"grad_norm": 5.780612066318371,
"learning_rate": 1.3333333333333336e-07,
"loss": 0.9209,
"step": 2
},
{
"epoch": 0.026644462947543714,
"grad_norm": 5.803797516781435,
"learning_rate": 2.666666666666667e-07,
"loss": 0.9065,
"step": 4
},
{
"epoch": 0.03996669442131557,
"grad_norm": 5.67259689729213,
"learning_rate": 4.0000000000000003e-07,
"loss": 0.9182,
"step": 6
},
{
"epoch": 0.05328892589508743,
"grad_norm": 5.354376269147714,
"learning_rate": 5.333333333333335e-07,
"loss": 0.9233,
"step": 8
},
{
"epoch": 0.06661115736885928,
"grad_norm": 4.851036441351623,
"learning_rate": 6.666666666666667e-07,
"loss": 0.8906,
"step": 10
},
{
"epoch": 0.07993338884263114,
"grad_norm": 4.566190641845786,
"learning_rate": 8.000000000000001e-07,
"loss": 0.8719,
"step": 12
},
{
"epoch": 0.093255620316403,
"grad_norm": 4.013324466824848,
"learning_rate": 9.333333333333334e-07,
"loss": 0.8861,
"step": 14
},
{
"epoch": 0.10657785179017486,
"grad_norm": 3.540718772081854,
"learning_rate": 1.066666666666667e-06,
"loss": 0.8791,
"step": 16
},
{
"epoch": 0.11990008326394672,
"grad_norm": 2.88258712818169,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.8532,
"step": 18
},
{
"epoch": 0.13322231473771856,
"grad_norm": 2.6937271254013613,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.8455,
"step": 20
},
{
"epoch": 0.14654454621149043,
"grad_norm": 2.3220586555688816,
"learning_rate": 1.4666666666666669e-06,
"loss": 0.8159,
"step": 22
},
{
"epoch": 0.15986677768526228,
"grad_norm": 3.400225929248788,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.8004,
"step": 24
},
{
"epoch": 0.17318900915903415,
"grad_norm": 3.5504999091076845,
"learning_rate": 1.7333333333333336e-06,
"loss": 0.8084,
"step": 26
},
{
"epoch": 0.186511240632806,
"grad_norm": 3.2668071978806057,
"learning_rate": 1.8666666666666669e-06,
"loss": 0.8086,
"step": 28
},
{
"epoch": 0.19983347210657784,
"grad_norm": 2.4897231787044154,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.7821,
"step": 30
},
{
"epoch": 0.21315570358034971,
"grad_norm": 2.006655051652645,
"learning_rate": 2.133333333333334e-06,
"loss": 0.7767,
"step": 32
},
{
"epoch": 0.22647793505412156,
"grad_norm": 1.8054118845570075,
"learning_rate": 2.266666666666667e-06,
"loss": 0.764,
"step": 34
},
{
"epoch": 0.23980016652789343,
"grad_norm": 2.060437415012226,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.7577,
"step": 36
},
{
"epoch": 0.2531223980016653,
"grad_norm": 1.822445369621608,
"learning_rate": 2.5333333333333338e-06,
"loss": 0.7441,
"step": 38
},
{
"epoch": 0.2664446294754371,
"grad_norm": 1.702085750109042,
"learning_rate": 2.666666666666667e-06,
"loss": 0.7364,
"step": 40
},
{
"epoch": 0.279766860949209,
"grad_norm": 1.4905140822613037,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.7252,
"step": 42
},
{
"epoch": 0.29308909242298087,
"grad_norm": 1.602012507677594,
"learning_rate": 2.9333333333333338e-06,
"loss": 0.727,
"step": 44
},
{
"epoch": 0.3064113238967527,
"grad_norm": 1.5462649172094083,
"learning_rate": 3.066666666666667e-06,
"loss": 0.7221,
"step": 46
},
{
"epoch": 0.31973355537052456,
"grad_norm": 1.4593506403426082,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.6922,
"step": 48
},
{
"epoch": 0.33305578684429643,
"grad_norm": 1.42990322544597,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.6873,
"step": 50
},
{
"epoch": 0.3463780183180683,
"grad_norm": 1.4803196580352989,
"learning_rate": 3.4666666666666672e-06,
"loss": 0.6975,
"step": 52
},
{
"epoch": 0.3597002497918401,
"grad_norm": 1.5152423594984497,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.662,
"step": 54
},
{
"epoch": 0.373022481265612,
"grad_norm": 1.4183767379176757,
"learning_rate": 3.7333333333333337e-06,
"loss": 0.6578,
"step": 56
},
{
"epoch": 0.38634471273938387,
"grad_norm": 1.5361891229630817,
"learning_rate": 3.866666666666667e-06,
"loss": 0.6655,
"step": 58
},
{
"epoch": 0.3996669442131557,
"grad_norm": 1.430106096852744,
"learning_rate": 4.000000000000001e-06,
"loss": 0.648,
"step": 60
},
{
"epoch": 0.41298917568692756,
"grad_norm": 1.4201912298932542,
"learning_rate": 4.133333333333333e-06,
"loss": 0.6449,
"step": 62
},
{
"epoch": 0.42631140716069943,
"grad_norm": 1.45821665687646,
"learning_rate": 4.266666666666668e-06,
"loss": 0.636,
"step": 64
},
{
"epoch": 0.43963363863447125,
"grad_norm": 1.351061303304786,
"learning_rate": 4.4e-06,
"loss": 0.6287,
"step": 66
},
{
"epoch": 0.4529558701082431,
"grad_norm": 1.4213406831821087,
"learning_rate": 4.533333333333334e-06,
"loss": 0.6162,
"step": 68
},
{
"epoch": 0.466278101582015,
"grad_norm": 1.5920606382997864,
"learning_rate": 4.666666666666667e-06,
"loss": 0.6269,
"step": 70
},
{
"epoch": 0.47960033305578686,
"grad_norm": 1.4259278183625448,
"learning_rate": 4.800000000000001e-06,
"loss": 0.628,
"step": 72
},
{
"epoch": 0.4929225645295587,
"grad_norm": 1.4277383814389801,
"learning_rate": 4.933333333333334e-06,
"loss": 0.6047,
"step": 74
},
{
"epoch": 0.5062447960033306,
"grad_norm": 1.4056841775145905,
"learning_rate": 4.999972922944898e-06,
"loss": 0.5984,
"step": 76
},
{
"epoch": 0.5195670274771024,
"grad_norm": 1.3201694715565466,
"learning_rate": 4.999756310023261e-06,
"loss": 0.5954,
"step": 78
},
{
"epoch": 0.5328892589508742,
"grad_norm": 1.3231909628376382,
"learning_rate": 4.999323102948655e-06,
"loss": 0.5954,
"step": 80
},
{
"epoch": 0.5462114904246461,
"grad_norm": 1.3735284931415068,
"learning_rate": 4.998673339256785e-06,
"loss": 0.5744,
"step": 82
},
{
"epoch": 0.559533721898418,
"grad_norm": 1.4504463513541146,
"learning_rate": 4.997807075247147e-06,
"loss": 0.593,
"step": 84
},
{
"epoch": 0.5728559533721899,
"grad_norm": 1.2740174759395542,
"learning_rate": 4.996724385978142e-06,
"loss": 0.5903,
"step": 86
},
{
"epoch": 0.5861781848459617,
"grad_norm": 1.3594170816449038,
"learning_rate": 4.995425365260585e-06,
"loss": 0.5748,
"step": 88
},
{
"epoch": 0.5995004163197336,
"grad_norm": 1.3782163690261147,
"learning_rate": 4.993910125649561e-06,
"loss": 0.5814,
"step": 90
},
{
"epoch": 0.6128226477935054,
"grad_norm": 1.2584130530987572,
"learning_rate": 4.992178798434684e-06,
"loss": 0.5752,
"step": 92
},
{
"epoch": 0.6261448792672772,
"grad_norm": 1.4216349386698004,
"learning_rate": 4.990231533628719e-06,
"loss": 0.5757,
"step": 94
},
{
"epoch": 0.6394671107410491,
"grad_norm": 1.3768977749050733,
"learning_rate": 4.988068499954578e-06,
"loss": 0.5555,
"step": 96
},
{
"epoch": 0.652789342214821,
"grad_norm": 1.463407945745149,
"learning_rate": 4.985689884830711e-06,
"loss": 0.5591,
"step": 98
},
{
"epoch": 0.6661115736885929,
"grad_norm": 1.3808427236512926,
"learning_rate": 4.983095894354858e-06,
"loss": 0.5588,
"step": 100
},
{
"epoch": 0.6794338051623647,
"grad_norm": 1.482231013162315,
"learning_rate": 4.980286753286196e-06,
"loss": 0.5418,
"step": 102
},
{
"epoch": 0.6927560366361366,
"grad_norm": 1.3778109634949367,
"learning_rate": 4.97726270502586e-06,
"loss": 0.5399,
"step": 104
},
{
"epoch": 0.7060782681099084,
"grad_norm": 1.4002755485164502,
"learning_rate": 4.974024011595864e-06,
"loss": 0.5533,
"step": 106
},
{
"epoch": 0.7194004995836802,
"grad_norm": 1.3296620938997752,
"learning_rate": 4.970570953616383e-06,
"loss": 0.5438,
"step": 108
},
{
"epoch": 0.7327227310574521,
"grad_norm": 1.4458203791375825,
"learning_rate": 4.966903830281449e-06,
"loss": 0.5378,
"step": 110
},
{
"epoch": 0.746044962531224,
"grad_norm": 1.5136526829998074,
"learning_rate": 4.9630229593330226e-06,
"loss": 0.5348,
"step": 112
},
{
"epoch": 0.7593671940049959,
"grad_norm": 1.4362377777815807,
"learning_rate": 4.958928677033465e-06,
"loss": 0.5267,
"step": 114
},
{
"epoch": 0.7726894254787677,
"grad_norm": 1.2730640176398647,
"learning_rate": 4.954621338136399e-06,
"loss": 0.5393,
"step": 116
},
{
"epoch": 0.7860116569525396,
"grad_norm": 1.3685353603260022,
"learning_rate": 4.95010131585597e-06,
"loss": 0.534,
"step": 118
},
{
"epoch": 0.7993338884263114,
"grad_norm": 1.2683696145515575,
"learning_rate": 4.9453690018345144e-06,
"loss": 0.527,
"step": 120
},
{
"epoch": 0.8126561199000832,
"grad_norm": 1.323958192575613,
"learning_rate": 4.940424806108619e-06,
"loss": 0.5267,
"step": 122
},
{
"epoch": 0.8259783513738551,
"grad_norm": 1.2428318596261736,
"learning_rate": 4.935269157073597e-06,
"loss": 0.5149,
"step": 124
},
{
"epoch": 0.839300582847627,
"grad_norm": 1.27851729445364,
"learning_rate": 4.9299025014463665e-06,
"loss": 0.5228,
"step": 126
},
{
"epoch": 0.8526228143213989,
"grad_norm": 1.2913119874277892,
"learning_rate": 4.924325304226745e-06,
"loss": 0.5028,
"step": 128
},
{
"epoch": 0.8659450457951707,
"grad_norm": 1.3471089811240304,
"learning_rate": 4.91853804865716e-06,
"loss": 0.5402,
"step": 130
},
{
"epoch": 0.8792672772689425,
"grad_norm": 1.3919989303105873,
"learning_rate": 4.912541236180779e-06,
"loss": 0.5208,
"step": 132
},
{
"epoch": 0.8925895087427144,
"grad_norm": 1.336135856095439,
"learning_rate": 4.9063353863980565e-06,
"loss": 0.5232,
"step": 134
},
{
"epoch": 0.9059117402164862,
"grad_norm": 1.535058182009125,
"learning_rate": 4.899921037021719e-06,
"loss": 0.5183,
"step": 136
},
{
"epoch": 0.9192339716902581,
"grad_norm": 1.4366704774523757,
"learning_rate": 4.893298743830168e-06,
"loss": 0.5152,
"step": 138
},
{
"epoch": 0.93255620316403,
"grad_norm": 1.4306647802429082,
"learning_rate": 4.88646908061933e-06,
"loss": 0.5241,
"step": 140
},
{
"epoch": 0.9458784346378019,
"grad_norm": 1.3151003083587773,
"learning_rate": 4.879432639152935e-06,
"loss": 0.518,
"step": 142
},
{
"epoch": 0.9592006661115737,
"grad_norm": 1.3682779135005043,
"learning_rate": 4.8721900291112415e-06,
"loss": 0.51,
"step": 144
},
{
"epoch": 0.9725228975853455,
"grad_norm": 1.3896990341168534,
"learning_rate": 4.864741878038218e-06,
"loss": 0.5207,
"step": 146
},
{
"epoch": 0.9858451290591174,
"grad_norm": 1.2929489978661655,
"learning_rate": 4.857088831287158e-06,
"loss": 0.5121,
"step": 148
},
{
"epoch": 0.9991673605328892,
"grad_norm": 1.3614193317791738,
"learning_rate": 4.849231551964771e-06,
"loss": 0.5016,
"step": 150
},
{
"epoch": 1.0066611157368859,
"grad_norm": 1.3229981405906006,
"learning_rate": 4.841170720873723e-06,
"loss": 0.2569,
"step": 152
},
{
"epoch": 1.0199833472106579,
"grad_norm": 1.2274098346213043,
"learning_rate": 4.832907036453647e-06,
"loss": 0.4662,
"step": 154
},
{
"epoch": 1.0333055786844296,
"grad_norm": 1.3810724651132364,
"learning_rate": 4.824441214720629e-06,
"loss": 0.4503,
"step": 156
},
{
"epoch": 1.0466278101582014,
"grad_norm": 1.5094355408493076,
"learning_rate": 4.815773989205165e-06,
"loss": 0.4525,
"step": 158
},
{
"epoch": 1.0599500416319734,
"grad_norm": 1.191750486186588,
"learning_rate": 4.806906110888606e-06,
"loss": 0.4548,
"step": 160
},
{
"epoch": 1.0732722731057451,
"grad_norm": 1.2840884507072778,
"learning_rate": 4.7978383481380865e-06,
"loss": 0.4552,
"step": 162
},
{
"epoch": 1.0865945045795171,
"grad_norm": 1.3818002604555029,
"learning_rate": 4.788571486639948e-06,
"loss": 0.452,
"step": 164
},
{
"epoch": 1.0999167360532889,
"grad_norm": 1.3200111006279347,
"learning_rate": 4.779106329331665e-06,
"loss": 0.45,
"step": 166
},
{
"epoch": 1.1132389675270609,
"grad_norm": 1.2755161939993753,
"learning_rate": 4.769443696332272e-06,
"loss": 0.4454,
"step": 168
},
{
"epoch": 1.1265611990008326,
"grad_norm": 1.3421067926153882,
"learning_rate": 4.759584424871302e-06,
"loss": 0.4429,
"step": 170
},
{
"epoch": 1.1398834304746046,
"grad_norm": 1.2219457405458125,
"learning_rate": 4.749529369216246e-06,
"loss": 0.4481,
"step": 172
},
{
"epoch": 1.1532056619483764,
"grad_norm": 1.330574738869651,
"learning_rate": 4.7392794005985324e-06,
"loss": 0.4459,
"step": 174
},
{
"epoch": 1.1665278934221481,
"grad_norm": 1.2042952174150132,
"learning_rate": 4.7288354071380415e-06,
"loss": 0.4339,
"step": 176
},
{
"epoch": 1.1798501248959201,
"grad_norm": 1.2535265876319093,
"learning_rate": 4.7181982937661485e-06,
"loss": 0.4364,
"step": 178
},
{
"epoch": 1.1931723563696919,
"grad_norm": 1.1967067502698956,
"learning_rate": 4.707368982147318e-06,
"loss": 0.4484,
"step": 180
},
{
"epoch": 1.2064945878434639,
"grad_norm": 1.3022379327320546,
"learning_rate": 4.696348410599244e-06,
"loss": 0.4468,
"step": 182
},
{
"epoch": 1.2198168193172356,
"grad_norm": 1.3137228151962215,
"learning_rate": 4.685137534011549e-06,
"loss": 0.4492,
"step": 184
},
{
"epoch": 1.2331390507910074,
"grad_norm": 1.3650226627212705,
"learning_rate": 4.673737323763048e-06,
"loss": 0.4389,
"step": 186
},
{
"epoch": 1.2464612822647794,
"grad_norm": 1.3122923570081069,
"learning_rate": 4.662148767637578e-06,
"loss": 0.4426,
"step": 188
},
{
"epoch": 1.2597835137385511,
"grad_norm": 1.3191199275346543,
"learning_rate": 4.650372869738415e-06,
"loss": 0.434,
"step": 190
},
{
"epoch": 1.2731057452123231,
"grad_norm": 1.4425884017899313,
"learning_rate": 4.638410650401267e-06,
"loss": 0.4382,
"step": 192
},
{
"epoch": 1.2864279766860949,
"grad_norm": 1.4066578837011166,
"learning_rate": 4.626263146105875e-06,
"loss": 0.4473,
"step": 194
},
{
"epoch": 1.2997502081598669,
"grad_norm": 1.4430824831613096,
"learning_rate": 4.613931409386196e-06,
"loss": 0.4488,
"step": 196
},
{
"epoch": 1.3130724396336386,
"grad_norm": 1.2217740909502797,
"learning_rate": 4.601416508739211e-06,
"loss": 0.4395,
"step": 198
},
{
"epoch": 1.3263946711074106,
"grad_norm": 1.474039226711776,
"learning_rate": 4.588719528532342e-06,
"loss": 0.4381,
"step": 200
},
{
"epoch": 1.3397169025811824,
"grad_norm": 1.2503538717797444,
"learning_rate": 4.575841568909494e-06,
"loss": 0.4317,
"step": 202
},
{
"epoch": 1.3530391340549541,
"grad_norm": 1.3172152085291207,
"learning_rate": 4.562783745695738e-06,
"loss": 0.4284,
"step": 204
},
{
"epoch": 1.3663613655287261,
"grad_norm": 1.2950216606489513,
"learning_rate": 4.549547190300622e-06,
"loss": 0.4372,
"step": 206
},
{
"epoch": 1.3796835970024979,
"grad_norm": 1.2065789326345406,
"learning_rate": 4.536133049620143e-06,
"loss": 0.4376,
"step": 208
},
{
"epoch": 1.3930058284762699,
"grad_norm": 1.450309483143858,
"learning_rate": 4.522542485937369e-06,
"loss": 0.4368,
"step": 210
},
{
"epoch": 1.4063280599500416,
"grad_norm": 1.2856432394840618,
"learning_rate": 4.508776676821739e-06,
"loss": 0.4359,
"step": 212
},
{
"epoch": 1.4196502914238134,
"grad_norm": 1.303392410991855,
"learning_rate": 4.494836815027022e-06,
"loss": 0.437,
"step": 214
},
{
"epoch": 1.4329725228975854,
"grad_norm": 1.2374383776516957,
"learning_rate": 4.4807241083879774e-06,
"loss": 0.4277,
"step": 216
},
{
"epoch": 1.4462947543713571,
"grad_norm": 1.1895403487373037,
"learning_rate": 4.466439779715696e-06,
"loss": 0.4219,
"step": 218
},
{
"epoch": 1.4596169858451291,
"grad_norm": 1.3959427610193165,
"learning_rate": 4.451985066691649e-06,
"loss": 0.4341,
"step": 220
},
{
"epoch": 1.4729392173189009,
"grad_norm": 1.2421484766590198,
"learning_rate": 4.437361221760449e-06,
"loss": 0.4162,
"step": 222
},
{
"epoch": 1.4862614487926726,
"grad_norm": 1.287463815955178,
"learning_rate": 4.422569512021332e-06,
"loss": 0.4282,
"step": 224
},
{
"epoch": 1.4995836802664446,
"grad_norm": 1.4250139528752677,
"learning_rate": 4.407611219118363e-06,
"loss": 0.421,
"step": 226
},
{
"epoch": 1.5129059117402166,
"grad_norm": 1.239295099017855,
"learning_rate": 4.3924876391293915e-06,
"loss": 0.427,
"step": 228
},
{
"epoch": 1.5262281432139884,
"grad_norm": 1.3453909852418124,
"learning_rate": 4.377200082453748e-06,
"loss": 0.4357,
"step": 230
},
{
"epoch": 1.5395503746877601,
"grad_norm": 1.2197270804139342,
"learning_rate": 4.361749873698707e-06,
"loss": 0.4101,
"step": 232
},
{
"epoch": 1.552872606161532,
"grad_norm": 1.2833857194816787,
"learning_rate": 4.346138351564711e-06,
"loss": 0.424,
"step": 234
},
{
"epoch": 1.5661948376353039,
"grad_norm": 1.2293200008377447,
"learning_rate": 4.330366868729376e-06,
"loss": 0.421,
"step": 236
},
{
"epoch": 1.5795170691090759,
"grad_norm": 1.1926560926173428,
"learning_rate": 4.3144367917302964e-06,
"loss": 0.4142,
"step": 238
},
{
"epoch": 1.5928393005828476,
"grad_norm": 1.1594494067766803,
"learning_rate": 4.2983495008466285e-06,
"loss": 0.4191,
"step": 240
},
{
"epoch": 1.6061615320566194,
"grad_norm": 1.224729384745418,
"learning_rate": 4.2821063899795015e-06,
"loss": 0.4128,
"step": 242
},
{
"epoch": 1.6194837635303914,
"grad_norm": 1.1481228567549495,
"learning_rate": 4.265708866531238e-06,
"loss": 0.4279,
"step": 244
},
{
"epoch": 1.6328059950041633,
"grad_norm": 1.3467092580505746,
"learning_rate": 4.249158351283414e-06,
"loss": 0.4262,
"step": 246
},
{
"epoch": 1.646128226477935,
"grad_norm": 1.2776898545321251,
"learning_rate": 4.232456278273743e-06,
"loss": 0.4314,
"step": 248
},
{
"epoch": 1.6594504579517069,
"grad_norm": 1.2719662910087424,
"learning_rate": 4.215604094671835e-06,
"loss": 0.4108,
"step": 250
},
{
"epoch": 1.6727726894254786,
"grad_norm": 1.1745562871590098,
"learning_rate": 4.198603260653792e-06,
"loss": 0.4165,
"step": 252
},
{
"epoch": 1.6860949208992506,
"grad_norm": 1.2455715420366917,
"learning_rate": 4.181455249275701e-06,
"loss": 0.4079,
"step": 254
},
{
"epoch": 1.6994171523730226,
"grad_norm": 1.3896213959063652,
"learning_rate": 4.1641615463459926e-06,
"loss": 0.417,
"step": 256
},
{
"epoch": 1.7127393838467944,
"grad_norm": 1.2131393445621887,
"learning_rate": 4.146723650296701e-06,
"loss": 0.4116,
"step": 258
},
{
"epoch": 1.7260616153205661,
"grad_norm": 1.2101597375627524,
"learning_rate": 4.129143072053639e-06,
"loss": 0.4169,
"step": 260
},
{
"epoch": 1.739383846794338,
"grad_norm": 1.2983597203629458,
"learning_rate": 4.111421334905468e-06,
"loss": 0.4101,
"step": 262
},
{
"epoch": 1.7527060782681099,
"grad_norm": 1.1756761204986788,
"learning_rate": 4.093559974371725e-06,
"loss": 0.4023,
"step": 264
},
{
"epoch": 1.7660283097418819,
"grad_norm": 1.296750722093234,
"learning_rate": 4.075560538069767e-06,
"loss": 0.4037,
"step": 266
},
{
"epoch": 1.7793505412156536,
"grad_norm": 1.2664686153860956,
"learning_rate": 4.05742458558068e-06,
"loss": 0.4005,
"step": 268
},
{
"epoch": 1.7926727726894254,
"grad_norm": 1.3144115093925024,
"learning_rate": 4.039153688314146e-06,
"loss": 0.4123,
"step": 270
},
{
"epoch": 1.8059950041631974,
"grad_norm": 1.177870994913812,
"learning_rate": 4.020749429372286e-06,
"loss": 0.4061,
"step": 272
},
{
"epoch": 1.8193172356369693,
"grad_norm": 1.1211392036639862,
"learning_rate": 4.002213403412492e-06,
"loss": 0.4207,
"step": 274
},
{
"epoch": 1.832639467110741,
"grad_norm": 1.1967335338983747,
"learning_rate": 3.983547216509254e-06,
"loss": 0.4037,
"step": 276
},
{
"epoch": 1.8459616985845129,
"grad_norm": 1.163438902600854,
"learning_rate": 3.964752486015001e-06,
"loss": 0.3983,
"step": 278
},
{
"epoch": 1.8592839300582846,
"grad_norm": 1.3897690758852341,
"learning_rate": 3.945830840419966e-06,
"loss": 0.406,
"step": 280
},
{
"epoch": 1.8726061615320566,
"grad_norm": 1.2302319797016965,
"learning_rate": 3.92678391921108e-06,
"loss": 0.4102,
"step": 282
},
{
"epoch": 1.8859283930058286,
"grad_norm": 1.2515743950418428,
"learning_rate": 3.907613372729916e-06,
"loss": 0.4121,
"step": 284
},
{
"epoch": 1.8992506244796004,
"grad_norm": 1.2250514633864378,
"learning_rate": 3.888320862029699e-06,
"loss": 0.4135,
"step": 286
},
{
"epoch": 1.9125728559533721,
"grad_norm": 1.1786595929578796,
"learning_rate": 3.868908058731376e-06,
"loss": 0.3961,
"step": 288
},
{
"epoch": 1.9258950874271439,
"grad_norm": 1.2316483388259516,
"learning_rate": 3.849376644878783e-06,
"loss": 0.3991,
"step": 290
},
{
"epoch": 1.9392173189009159,
"grad_norm": 1.2218522002215788,
"learning_rate": 3.829728312792895e-06,
"loss": 0.4068,
"step": 292
},
{
"epoch": 1.9525395503746878,
"grad_norm": 1.218981908305007,
"learning_rate": 3.8099647649251984e-06,
"loss": 0.4116,
"step": 294
},
{
"epoch": 1.9658617818484596,
"grad_norm": 1.1473329397682062,
"learning_rate": 3.790087713710179e-06,
"loss": 0.3961,
"step": 296
},
{
"epoch": 1.9791840133222314,
"grad_norm": 1.15330486401059,
"learning_rate": 3.770098881416945e-06,
"loss": 0.397,
"step": 298
},
{
"epoch": 1.9925062447960034,
"grad_norm": 1.1147439818886564,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.391,
"step": 300
},
{
"epoch": 2.0,
"grad_norm": 1.1888727583821848,
"learning_rate": 3.7297928109491765e-06,
"loss": 0.2238,
"step": 302
},
{
"epoch": 2.0133222314737718,
"grad_norm": 1.1682600742115117,
"learning_rate": 3.7094790651387414e-06,
"loss": 0.3464,
"step": 304
},
{
"epoch": 2.0266444629475435,
"grad_norm": 1.2543709475465634,
"learning_rate": 3.689060522675689e-06,
"loss": 0.3299,
"step": 306
},
{
"epoch": 2.0399666944213157,
"grad_norm": 1.209782299511866,
"learning_rate": 3.668538952747236e-06,
"loss": 0.3335,
"step": 308
},
{
"epoch": 2.0532889258950875,
"grad_norm": 1.2314074580378418,
"learning_rate": 3.6479161334675294e-06,
"loss": 0.3402,
"step": 310
},
{
"epoch": 2.0666111573688593,
"grad_norm": 1.089978871118908,
"learning_rate": 3.627193851723577e-06,
"loss": 0.3282,
"step": 312
},
{
"epoch": 2.079933388842631,
"grad_norm": 1.1440650029159125,
"learning_rate": 3.6063739030204226e-06,
"loss": 0.3353,
"step": 314
},
{
"epoch": 2.0932556203164028,
"grad_norm": 1.1412527172991913,
"learning_rate": 3.5854580913255706e-06,
"loss": 0.3377,
"step": 316
},
{
"epoch": 2.106577851790175,
"grad_norm": 1.1374336855151732,
"learning_rate": 3.564448228912682e-06,
"loss": 0.3303,
"step": 318
},
{
"epoch": 2.1199000832639467,
"grad_norm": 1.1689768541112975,
"learning_rate": 3.543346136204545e-06,
"loss": 0.3269,
"step": 320
},
{
"epoch": 2.1332223147377185,
"grad_norm": 1.1613635619803697,
"learning_rate": 3.522153641615345e-06,
"loss": 0.3447,
"step": 322
},
{
"epoch": 2.1465445462114903,
"grad_norm": 1.0764748235217316,
"learning_rate": 3.5008725813922383e-06,
"loss": 0.3347,
"step": 324
},
{
"epoch": 2.1598667776852625,
"grad_norm": 1.242351223908071,
"learning_rate": 3.4795047994562463e-06,
"loss": 0.3337,
"step": 326
},
{
"epoch": 2.1731890091590342,
"grad_norm": 1.1068446291466676,
"learning_rate": 3.458052147242494e-06,
"loss": 0.3411,
"step": 328
},
{
"epoch": 2.186511240632806,
"grad_norm": 1.16808964966109,
"learning_rate": 3.436516483539781e-06,
"loss": 0.3376,
"step": 330
},
{
"epoch": 2.1998334721065778,
"grad_norm": 1.1025319948129593,
"learning_rate": 3.4148996743295305e-06,
"loss": 0.3316,
"step": 332
},
{
"epoch": 2.2131557035803495,
"grad_norm": 1.1758686501416102,
"learning_rate": 3.3932035926241103e-06,
"loss": 0.3355,
"step": 334
},
{
"epoch": 2.2264779350541217,
"grad_norm": 1.1003768444337116,
"learning_rate": 3.3714301183045382e-06,
"loss": 0.3357,
"step": 336
},
{
"epoch": 2.2398001665278935,
"grad_norm": 1.0881028666604091,
"learning_rate": 3.349581137957604e-06,
"loss": 0.3364,
"step": 338
},
{
"epoch": 2.2531223980016652,
"grad_norm": 1.211964671213877,
"learning_rate": 3.3276585447123957e-06,
"loss": 0.3353,
"step": 340
},
{
"epoch": 2.266444629475437,
"grad_norm": 1.163639286937533,
"learning_rate": 3.3056642380762783e-06,
"loss": 0.329,
"step": 342
},
{
"epoch": 2.279766860949209,
"grad_norm": 1.1618660863336634,
"learning_rate": 3.2836001237702993e-06,
"loss": 0.3299,
"step": 344
},
{
"epoch": 2.293089092422981,
"grad_norm": 1.1575282219975258,
"learning_rate": 3.2614681135640696e-06,
"loss": 0.3297,
"step": 346
},
{
"epoch": 2.3064113238967527,
"grad_norm": 1.1756458412662194,
"learning_rate": 3.2392701251101172e-06,
"loss": 0.3367,
"step": 348
},
{
"epoch": 2.3197335553705245,
"grad_norm": 1.1830174958146948,
"learning_rate": 3.217008081777726e-06,
"loss": 0.3319,
"step": 350
},
{
"epoch": 2.3330557868442963,
"grad_norm": 1.1667340496607632,
"learning_rate": 3.1946839124862873e-06,
"loss": 0.3361,
"step": 352
},
{
"epoch": 2.3463780183180685,
"grad_norm": 1.1105411198444493,
"learning_rate": 3.1722995515381644e-06,
"loss": 0.3425,
"step": 354
},
{
"epoch": 2.3597002497918402,
"grad_norm": 1.1234133483520614,
"learning_rate": 3.149856938451094e-06,
"loss": 0.3314,
"step": 356
},
{
"epoch": 2.373022481265612,
"grad_norm": 1.1838235154662082,
"learning_rate": 3.127358017790132e-06,
"loss": 0.3392,
"step": 358
},
{
"epoch": 2.3863447127393838,
"grad_norm": 1.080453742242657,
"learning_rate": 3.1048047389991693e-06,
"loss": 0.3336,
"step": 360
},
{
"epoch": 2.3996669442131555,
"grad_norm": 1.1140835000073062,
"learning_rate": 3.082199056232015e-06,
"loss": 0.3414,
"step": 362
},
{
"epoch": 2.4129891756869277,
"grad_norm": 1.138752925836035,
"learning_rate": 3.059542928183079e-06,
"loss": 0.3329,
"step": 364
},
{
"epoch": 2.4263114071606995,
"grad_norm": 1.0610831482375092,
"learning_rate": 3.0368383179176584e-06,
"loss": 0.342,
"step": 366
},
{
"epoch": 2.4396336386344712,
"grad_norm": 1.1718171313930514,
"learning_rate": 3.0140871927018466e-06,
"loss": 0.3266,
"step": 368
},
{
"epoch": 2.452955870108243,
"grad_norm": 1.2039181830598997,
"learning_rate": 2.9912915238320755e-06,
"loss": 0.338,
"step": 370
},
{
"epoch": 2.4662781015820148,
"grad_norm": 1.0760682240024106,
"learning_rate": 2.9684532864643123e-06,
"loss": 0.3277,
"step": 372
},
{
"epoch": 2.479600333055787,
"grad_norm": 1.2378751102400485,
"learning_rate": 2.945574459442917e-06,
"loss": 0.3398,
"step": 374
},
{
"epoch": 2.4929225645295587,
"grad_norm": 1.171184691228538,
"learning_rate": 2.922657025129185e-06,
"loss": 0.3313,
"step": 376
},
{
"epoch": 2.5062447960033305,
"grad_norm": 1.179077198361453,
"learning_rate": 2.8997029692295875e-06,
"loss": 0.3364,
"step": 378
},
{
"epoch": 2.5195670274771023,
"grad_norm": 1.1745776843262559,
"learning_rate": 2.876714280623708e-06,
"loss": 0.3261,
"step": 380
},
{
"epoch": 2.532889258950874,
"grad_norm": 1.1445296979936388,
"learning_rate": 2.8536929511919227e-06,
"loss": 0.3352,
"step": 382
},
{
"epoch": 2.5462114904246462,
"grad_norm": 1.2025025630072426,
"learning_rate": 2.8306409756428067e-06,
"loss": 0.3375,
"step": 384
},
{
"epoch": 2.559533721898418,
"grad_norm": 1.0971352592709565,
"learning_rate": 2.807560351340302e-06,
"loss": 0.3313,
"step": 386
},
{
"epoch": 2.5728559533721898,
"grad_norm": 1.1249045530287038,
"learning_rate": 2.7844530781306544e-06,
"loss": 0.3359,
"step": 388
},
{
"epoch": 2.586178184845962,
"grad_norm": 1.1665793984798016,
"learning_rate": 2.761321158169134e-06,
"loss": 0.3251,
"step": 390
},
{
"epoch": 2.5995004163197337,
"grad_norm": 1.1275088907272068,
"learning_rate": 2.738166595746554e-06,
"loss": 0.3189,
"step": 392
},
{
"epoch": 2.6128226477935055,
"grad_norm": 1.1697820606518197,
"learning_rate": 2.7149913971156105e-06,
"loss": 0.3305,
"step": 394
},
{
"epoch": 2.6261448792672772,
"grad_norm": 1.0995774846811734,
"learning_rate": 2.6917975703170466e-06,
"loss": 0.3323,
"step": 396
},
{
"epoch": 2.639467110741049,
"grad_norm": 1.1471735378793595,
"learning_rate": 2.668587125005663e-06,
"loss": 0.3348,
"step": 398
},
{
"epoch": 2.652789342214821,
"grad_norm": 1.1043284251546557,
"learning_rate": 2.6453620722761897e-06,
"loss": 0.3244,
"step": 400
},
{
"epoch": 2.666111573688593,
"grad_norm": 1.2133025722214072,
"learning_rate": 2.6221244244890336e-06,
"loss": 0.3297,
"step": 402
},
{
"epoch": 2.6794338051623647,
"grad_norm": 1.0759704642431338,
"learning_rate": 2.5988761950959133e-06,
"loss": 0.3294,
"step": 404
},
{
"epoch": 2.6927560366361365,
"grad_norm": 1.1303123852236616,
"learning_rate": 2.575619398465402e-06,
"loss": 0.327,
"step": 406
},
{
"epoch": 2.7060782681099083,
"grad_norm": 1.1874408347855483,
"learning_rate": 2.5523560497083927e-06,
"loss": 0.3297,
"step": 408
},
{
"epoch": 2.7194004995836805,
"grad_norm": 1.0814676034937838,
"learning_rate": 2.5290881645034932e-06,
"loss": 0.3308,
"step": 410
},
{
"epoch": 2.7327227310574522,
"grad_norm": 1.0821014638265758,
"learning_rate": 2.5058177589223766e-06,
"loss": 0.3286,
"step": 412
},
{
"epoch": 2.746044962531224,
"grad_norm": 1.1078782647950531,
"learning_rate": 2.482546849255096e-06,
"loss": 0.3289,
"step": 414
},
{
"epoch": 2.7593671940049957,
"grad_norm": 1.0709928206025467,
"learning_rate": 2.4592774518353858e-06,
"loss": 0.3349,
"step": 416
},
{
"epoch": 2.7726894254787675,
"grad_norm": 0.9986348268877544,
"learning_rate": 2.436011582865945e-06,
"loss": 0.3284,
"step": 418
},
{
"epoch": 2.7860116569525397,
"grad_norm": 1.0407659756205825,
"learning_rate": 2.4127512582437486e-06,
"loss": 0.3255,
"step": 420
},
{
"epoch": 2.7993338884263115,
"grad_norm": 1.1250160278057286,
"learning_rate": 2.3894984933853734e-06,
"loss": 0.3189,
"step": 422
},
{
"epoch": 2.8126561199000832,
"grad_norm": 1.1080847331634105,
"learning_rate": 2.366255303052377e-06,
"loss": 0.3286,
"step": 424
},
{
"epoch": 2.825978351373855,
"grad_norm": 1.1096965833381898,
"learning_rate": 2.3430237011767166e-06,
"loss": 0.3393,
"step": 426
},
{
"epoch": 2.8393005828476268,
"grad_norm": 1.177552126401324,
"learning_rate": 2.319805700686257e-06,
"loss": 0.323,
"step": 428
},
{
"epoch": 2.852622814321399,
"grad_norm": 1.1531088333635726,
"learning_rate": 2.296603313330355e-06,
"loss": 0.3275,
"step": 430
},
{
"epoch": 2.8659450457951707,
"grad_norm": 1.1189431785006225,
"learning_rate": 2.2734185495055503e-06,
"loss": 0.3234,
"step": 432
},
{
"epoch": 2.8792672772689425,
"grad_norm": 1.0872804861007128,
"learning_rate": 2.250253418081373e-06,
"loss": 0.3304,
"step": 434
},
{
"epoch": 2.8925895087427143,
"grad_norm": 1.100386612123568,
"learning_rate": 2.22710992622628e-06,
"loss": 0.326,
"step": 436
},
{
"epoch": 2.905911740216486,
"grad_norm": 1.0750519008987303,
"learning_rate": 2.2039900792337477e-06,
"loss": 0.3161,
"step": 438
},
{
"epoch": 2.9192339716902582,
"grad_norm": 1.0912428298625954,
"learning_rate": 2.1808958803485134e-06,
"loss": 0.3209,
"step": 440
},
{
"epoch": 2.93255620316403,
"grad_norm": 1.107507049641638,
"learning_rate": 2.157829330593008e-06,
"loss": 0.3363,
"step": 442
},
{
"epoch": 2.9458784346378017,
"grad_norm": 1.169768928903536,
"learning_rate": 2.134792428593971e-06,
"loss": 0.3327,
"step": 444
},
{
"epoch": 2.959200666111574,
"grad_norm": 1.1405241904375514,
"learning_rate": 2.1117871704092818e-06,
"loss": 0.3264,
"step": 446
},
{
"epoch": 2.9725228975853453,
"grad_norm": 1.1001277407179797,
"learning_rate": 2.0888155493550027e-06,
"loss": 0.3135,
"step": 448
},
{
"epoch": 2.9858451290591175,
"grad_norm": 1.1129546974887563,
"learning_rate": 2.0658795558326745e-06,
"loss": 0.3234,
"step": 450
},
{
"epoch": 2.9991673605328892,
"grad_norm": 1.1080818714362697,
"learning_rate": 2.0429811771568468e-06,
"loss": 0.322,
"step": 452
},
{
"epoch": 3.006661115736886,
"grad_norm": 1.1307642011297194,
"learning_rate": 2.0201223973828917e-06,
"loss": 0.1617,
"step": 454
},
{
"epoch": 3.019983347210658,
"grad_norm": 1.0035953126863488,
"learning_rate": 1.997305197135089e-06,
"loss": 0.2598,
"step": 456
},
{
"epoch": 3.0333055786844296,
"grad_norm": 1.0534955052337636,
"learning_rate": 1.9745315534350157e-06,
"loss": 0.2715,
"step": 458
},
{
"epoch": 3.0466278101582014,
"grad_norm": 1.189844390221147,
"learning_rate": 1.9518034395302413e-06,
"loss": 0.2646,
"step": 460
},
{
"epoch": 3.059950041631973,
"grad_norm": 1.1253150795326456,
"learning_rate": 1.9291228247233607e-06,
"loss": 0.2701,
"step": 462
},
{
"epoch": 3.0732722731057454,
"grad_norm": 1.1193701526310147,
"learning_rate": 1.9064916742013515e-06,
"loss": 0.2673,
"step": 464
},
{
"epoch": 3.086594504579517,
"grad_norm": 1.0959015217977324,
"learning_rate": 1.883911948865306e-06,
"loss": 0.2649,
"step": 466
},
{
"epoch": 3.099916736053289,
"grad_norm": 1.1965240464412776,
"learning_rate": 1.8613856051605242e-06,
"loss": 0.2629,
"step": 468
},
{
"epoch": 3.1132389675270606,
"grad_norm": 1.0473419859838504,
"learning_rate": 1.8389145949069953e-06,
"loss": 0.2613,
"step": 470
},
{
"epoch": 3.126561199000833,
"grad_norm": 1.2108832644207754,
"learning_rate": 1.816500865130279e-06,
"loss": 0.2571,
"step": 472
},
{
"epoch": 3.1398834304746046,
"grad_norm": 1.0441673255917416,
"learning_rate": 1.7941463578928088e-06,
"loss": 0.2766,
"step": 474
},
{
"epoch": 3.1532056619483764,
"grad_norm": 1.1708679609837331,
"learning_rate": 1.7718530101256115e-06,
"loss": 0.2718,
"step": 476
},
{
"epoch": 3.166527893422148,
"grad_norm": 1.1284481739249688,
"learning_rate": 1.7496227534604859e-06,
"loss": 0.2575,
"step": 478
},
{
"epoch": 3.17985012489592,
"grad_norm": 1.0770429901542908,
"learning_rate": 1.7274575140626318e-06,
"loss": 0.2629,
"step": 480
},
{
"epoch": 3.193172356369692,
"grad_norm": 1.0692152501631402,
"learning_rate": 1.7053592124637557e-06,
"loss": 0.2694,
"step": 482
},
{
"epoch": 3.206494587843464,
"grad_norm": 1.039094308900864,
"learning_rate": 1.6833297633956647e-06,
"loss": 0.2687,
"step": 484
},
{
"epoch": 3.2198168193172356,
"grad_norm": 1.1432726083538918,
"learning_rate": 1.661371075624363e-06,
"loss": 0.2722,
"step": 486
},
{
"epoch": 3.2331390507910074,
"grad_norm": 1.047486598216707,
"learning_rate": 1.6394850517846621e-06,
"loss": 0.26,
"step": 488
},
{
"epoch": 3.246461282264779,
"grad_norm": 1.1299207627919639,
"learning_rate": 1.6176735882153284e-06,
"loss": 0.2646,
"step": 490
},
{
"epoch": 3.2597835137385514,
"grad_norm": 1.0456944660867535,
"learning_rate": 1.5959385747947697e-06,
"loss": 0.2628,
"step": 492
},
{
"epoch": 3.273105745212323,
"grad_norm": 1.0617694211022177,
"learning_rate": 1.5742818947772875e-06,
"loss": 0.2576,
"step": 494
},
{
"epoch": 3.286427976686095,
"grad_norm": 1.0978333522782833,
"learning_rate": 1.552705424629898e-06,
"loss": 0.2703,
"step": 496
},
{
"epoch": 3.2997502081598666,
"grad_norm": 1.0865484727411876,
"learning_rate": 1.5312110338697427e-06,
"loss": 0.2692,
"step": 498
},
{
"epoch": 3.313072439633639,
"grad_norm": 1.0418725249305938,
"learning_rate": 1.509800584902108e-06,
"loss": 0.2642,
"step": 500
},
{
"epoch": 3.3263946711074106,
"grad_norm": 1.0660556477168224,
"learning_rate": 1.4884759328590476e-06,
"loss": 0.2633,
"step": 502
},
{
"epoch": 3.3397169025811824,
"grad_norm": 1.0851955569492033,
"learning_rate": 1.467238925438646e-06,
"loss": 0.2677,
"step": 504
},
{
"epoch": 3.353039134054954,
"grad_norm": 1.0460266601554127,
"learning_rate": 1.446091402744923e-06,
"loss": 0.2682,
"step": 506
},
{
"epoch": 3.366361365528726,
"grad_norm": 1.0320032665713081,
"learning_rate": 1.4250351971283937e-06,
"loss": 0.2673,
"step": 508
},
{
"epoch": 3.379683597002498,
"grad_norm": 1.0694000065346523,
"learning_rate": 1.4040721330273063e-06,
"loss": 0.273,
"step": 510
},
{
"epoch": 3.39300582847627,
"grad_norm": 1.0986183217647922,
"learning_rate": 1.3832040268095589e-06,
"loss": 0.2615,
"step": 512
},
{
"epoch": 3.4063280599500416,
"grad_norm": 1.063489274495733,
"learning_rate": 1.362432686615316e-06,
"loss": 0.2763,
"step": 514
},
{
"epoch": 3.4196502914238134,
"grad_norm": 1.0408747367635172,
"learning_rate": 1.3417599122003464e-06,
"loss": 0.2677,
"step": 516
},
{
"epoch": 3.432972522897585,
"grad_norm": 1.1352124059324844,
"learning_rate": 1.3211874947800747e-06,
"loss": 0.2614,
"step": 518
},
{
"epoch": 3.4462947543713573,
"grad_norm": 1.0881790246993637,
"learning_rate": 1.3007172168743854e-06,
"loss": 0.2659,
"step": 520
},
{
"epoch": 3.459616985845129,
"grad_norm": 1.080506653895442,
"learning_rate": 1.280350852153168e-06,
"loss": 0.2666,
"step": 522
},
{
"epoch": 3.472939217318901,
"grad_norm": 1.0583310544029485,
"learning_rate": 1.260090165282645e-06,
"loss": 0.2648,
"step": 524
},
{
"epoch": 3.4862614487926726,
"grad_norm": 1.1152914883872809,
"learning_rate": 1.2399369117724582e-06,
"loss": 0.2704,
"step": 526
},
{
"epoch": 3.4995836802664444,
"grad_norm": 1.0455279885524973,
"learning_rate": 1.2198928378235717e-06,
"loss": 0.2672,
"step": 528
},
{
"epoch": 3.5129059117402166,
"grad_norm": 1.0576879823812282,
"learning_rate": 1.1999596801769617e-06,
"loss": 0.264,
"step": 530
},
{
"epoch": 3.5262281432139884,
"grad_norm": 1.1014402939329688,
"learning_rate": 1.1801391659631423e-06,
"loss": 0.2654,
"step": 532
},
{
"epoch": 3.53955037468776,
"grad_norm": 1.028865585013293,
"learning_rate": 1.160433012552508e-06,
"loss": 0.2637,
"step": 534
},
{
"epoch": 3.5528726061615323,
"grad_norm": 1.0546829340917359,
"learning_rate": 1.1408429274065418e-06,
"loss": 0.27,
"step": 536
},
{
"epoch": 3.5661948376353036,
"grad_norm": 1.0417474358737957,
"learning_rate": 1.1213706079298566e-06,
"loss": 0.2589,
"step": 538
},
{
"epoch": 3.579517069109076,
"grad_norm": 1.0937717215676659,
"learning_rate": 1.1020177413231334e-06,
"loss": 0.2697,
"step": 540
},
{
"epoch": 3.5928393005828476,
"grad_norm": 1.0736209133266341,
"learning_rate": 1.0827860044369226e-06,
"loss": 0.2645,
"step": 542
},
{
"epoch": 3.6061615320566194,
"grad_norm": 1.0474112636925237,
"learning_rate": 1.06367706362636e-06,
"loss": 0.2681,
"step": 544
},
{
"epoch": 3.6194837635303916,
"grad_norm": 1.103432827044926,
"learning_rate": 1.0446925746067768e-06,
"loss": 0.2695,
"step": 546
},
{
"epoch": 3.6328059950041633,
"grad_norm": 1.0836832730433759,
"learning_rate": 1.0258341823102418e-06,
"loss": 0.2632,
"step": 548
},
{
"epoch": 3.646128226477935,
"grad_norm": 1.0859645184669795,
"learning_rate": 1.0071035207430352e-06,
"loss": 0.2669,
"step": 550
},
{
"epoch": 3.659450457951707,
"grad_norm": 1.1090309698734075,
"learning_rate": 9.88502212844063e-07,
"loss": 0.2598,
"step": 552
},
{
"epoch": 3.6727726894254786,
"grad_norm": 1.040309343372892,
"learning_rate": 9.700318703442437e-07,
"loss": 0.259,
"step": 554
},
{
"epoch": 3.686094920899251,
"grad_norm": 1.0821866491462884,
"learning_rate": 9.516940936268504e-07,
"loss": 0.261,
"step": 556
},
{
"epoch": 3.6994171523730226,
"grad_norm": 1.032839245512739,
"learning_rate": 9.334904715888496e-07,
"loss": 0.2726,
"step": 558
},
{
"epoch": 3.7127393838467944,
"grad_norm": 1.2154127605600453,
"learning_rate": 9.154225815032242e-07,
"loss": 0.257,
"step": 560
},
{
"epoch": 3.726061615320566,
"grad_norm": 1.0461952582538157,
"learning_rate": 8.974919888823164e-07,
"loss": 0.255,
"step": 562
},
{
"epoch": 3.739383846794338,
"grad_norm": 1.0817761044485013,
"learning_rate": 8.797002473421729e-07,
"loss": 0.2672,
"step": 564
},
{
"epoch": 3.75270607826811,
"grad_norm": 1.1088064613565192,
"learning_rate": 8.620488984679378e-07,
"loss": 0.2701,
"step": 566
},
{
"epoch": 3.766028309741882,
"grad_norm": 1.0905619434743687,
"learning_rate": 8.445394716802754e-07,
"loss": 0.2699,
"step": 568
},
{
"epoch": 3.7793505412156536,
"grad_norm": 1.1348105280382488,
"learning_rate": 8.271734841028553e-07,
"loss": 0.2625,
"step": 570
},
{
"epoch": 3.7926727726894254,
"grad_norm": 1.0895135923548163,
"learning_rate": 8.099524404308948e-07,
"loss": 0.2652,
"step": 572
},
{
"epoch": 3.805995004163197,
"grad_norm": 1.081980856394784,
"learning_rate": 7.928778328007918e-07,
"loss": 0.2725,
"step": 574
},
{
"epoch": 3.8193172356369693,
"grad_norm": 1.072896110364212,
"learning_rate": 7.759511406608255e-07,
"loss": 0.2534,
"step": 576
},
{
"epoch": 3.832639467110741,
"grad_norm": 1.0739615029579452,
"learning_rate": 7.591738306429769e-07,
"loss": 0.2664,
"step": 578
},
{
"epoch": 3.845961698584513,
"grad_norm": 1.050183747712219,
"learning_rate": 7.425473564358457e-07,
"loss": 0.2644,
"step": 580
},
{
"epoch": 3.8592839300582846,
"grad_norm": 0.9907767398098887,
"learning_rate": 7.260731586586983e-07,
"loss": 0.2654,
"step": 582
},
{
"epoch": 3.8726061615320564,
"grad_norm": 1.0920216934407247,
"learning_rate": 7.097526647366379e-07,
"loss": 0.2652,
"step": 584
},
{
"epoch": 3.8859283930058286,
"grad_norm": 1.0675877474822888,
"learning_rate": 6.935872887769299e-07,
"loss": 0.265,
"step": 586
},
{
"epoch": 3.8992506244796004,
"grad_norm": 1.047365669548006,
"learning_rate": 6.775784314464717e-07,
"loss": 0.2635,
"step": 588
},
{
"epoch": 3.912572855953372,
"grad_norm": 1.030022751644315,
"learning_rate": 6.617274798504286e-07,
"loss": 0.2628,
"step": 590
},
{
"epoch": 3.925895087427144,
"grad_norm": 1.044545063593376,
"learning_rate": 6.460358074120518e-07,
"loss": 0.2647,
"step": 592
},
{
"epoch": 3.9392173189009156,
"grad_norm": 1.0225003684521647,
"learning_rate": 6.305047737536707e-07,
"loss": 0.2625,
"step": 594
},
{
"epoch": 3.952539550374688,
"grad_norm": 1.0533487294826005,
"learning_rate": 6.151357245788917e-07,
"loss": 0.2731,
"step": 596
},
{
"epoch": 3.9658617818484596,
"grad_norm": 1.034466643395701,
"learning_rate": 5.999299915559956e-07,
"loss": 0.2558,
"step": 598
},
{
"epoch": 3.9791840133222314,
"grad_norm": 1.037903752833203,
"learning_rate": 5.848888922025553e-07,
"loss": 0.2618,
"step": 600
},
{
"epoch": 3.9925062447960036,
"grad_norm": 1.027043640385128,
"learning_rate": 5.700137297712749e-07,
"loss": 0.2669,
"step": 602
},
{
"epoch": 4.0,
"grad_norm": 1.098155802724837,
"learning_rate": 5.553057931370729e-07,
"loss": 0.1505,
"step": 604
},
{
"epoch": 4.013322231473772,
"grad_norm": 1.0453208918296613,
"learning_rate": 5.407663566854008e-07,
"loss": 0.2321,
"step": 606
},
{
"epoch": 4.0266444629475435,
"grad_norm": 1.0336345571966956,
"learning_rate": 5.263966802018275e-07,
"loss": 0.2359,
"step": 608
},
{
"epoch": 4.039966694421316,
"grad_norm": 1.0051543661966322,
"learning_rate": 5.121980087628802e-07,
"loss": 0.2286,
"step": 610
},
{
"epoch": 4.053288925895087,
"grad_norm": 1.0291906973127083,
"learning_rate": 4.981715726281666e-07,
"loss": 0.2322,
"step": 612
},
{
"epoch": 4.066611157368859,
"grad_norm": 1.0535023572236821,
"learning_rate": 4.843185871337722e-07,
"loss": 0.2402,
"step": 614
},
{
"epoch": 4.0799333888426315,
"grad_norm": 1.1023709895301759,
"learning_rate": 4.706402525869633e-07,
"loss": 0.2322,
"step": 616
},
{
"epoch": 4.093255620316403,
"grad_norm": 1.0283405302482598,
"learning_rate": 4.5713775416217884e-07,
"loss": 0.2238,
"step": 618
},
{
"epoch": 4.106577851790175,
"grad_norm": 1.0393264816137988,
"learning_rate": 4.438122617983442e-07,
"loss": 0.2292,
"step": 620
},
{
"epoch": 4.119900083263946,
"grad_norm": 1.0587543311732102,
"learning_rate": 4.3066493009749853e-07,
"loss": 0.2293,
"step": 622
},
{
"epoch": 4.1332223147377185,
"grad_norm": 1.0914720689441537,
"learning_rate": 4.1769689822475147e-07,
"loss": 0.2317,
"step": 624
},
{
"epoch": 4.146544546211491,
"grad_norm": 1.0417273334496886,
"learning_rate": 4.049092898095816e-07,
"loss": 0.2358,
"step": 626
},
{
"epoch": 4.159866777685262,
"grad_norm": 1.0248130840986234,
"learning_rate": 3.9230321284847856e-07,
"loss": 0.2355,
"step": 628
},
{
"epoch": 4.173189009159034,
"grad_norm": 1.0589305077753277,
"learning_rate": 3.798797596089351e-07,
"loss": 0.2331,
"step": 630
},
{
"epoch": 4.1865112406328056,
"grad_norm": 1.0399059506346249,
"learning_rate": 3.6764000653481263e-07,
"loss": 0.2352,
"step": 632
},
{
"epoch": 4.199833472106578,
"grad_norm": 1.0352919697923912,
"learning_rate": 3.555850141530659e-07,
"loss": 0.2327,
"step": 634
},
{
"epoch": 4.21315570358035,
"grad_norm": 0.989140712662966,
"learning_rate": 3.4371582698185636e-07,
"loss": 0.228,
"step": 636
},
{
"epoch": 4.226477935054121,
"grad_norm": 1.0090105290858724,
"learning_rate": 3.3203347344004737e-07,
"loss": 0.2258,
"step": 638
},
{
"epoch": 4.2398001665278935,
"grad_norm": 0.9991149517617007,
"learning_rate": 3.2053896575809426e-07,
"loss": 0.2199,
"step": 640
},
{
"epoch": 4.253122398001666,
"grad_norm": 1.014374420272404,
"learning_rate": 3.092332998903416e-07,
"loss": 0.2261,
"step": 642
},
{
"epoch": 4.266444629475437,
"grad_norm": 1.0250317424256117,
"learning_rate": 2.981174554287239e-07,
"loss": 0.2381,
"step": 644
},
{
"epoch": 4.279766860949209,
"grad_norm": 1.0442581559998447,
"learning_rate": 2.871923955178918e-07,
"loss": 0.2315,
"step": 646
},
{
"epoch": 4.2930890924229805,
"grad_norm": 1.0098371613642636,
"learning_rate": 2.764590667717562e-07,
"loss": 0.2272,
"step": 648
},
{
"epoch": 4.306411323896753,
"grad_norm": 1.0807767731419033,
"learning_rate": 2.6591839919146963e-07,
"loss": 0.2394,
"step": 650
},
{
"epoch": 4.319733555370525,
"grad_norm": 1.054167521910636,
"learning_rate": 2.555713060848433e-07,
"loss": 0.2324,
"step": 652
},
{
"epoch": 4.333055786844296,
"grad_norm": 1.107035611645368,
"learning_rate": 2.454186839872158e-07,
"loss": 0.2357,
"step": 654
},
{
"epoch": 4.3463780183180685,
"grad_norm": 1.0552376707704954,
"learning_rate": 2.3546141258376786e-07,
"loss": 0.2289,
"step": 656
},
{
"epoch": 4.35970024979184,
"grad_norm": 1.0047757316250936,
"learning_rate": 2.257003546333042e-07,
"loss": 0.2281,
"step": 658
},
{
"epoch": 4.373022481265612,
"grad_norm": 1.0426529499317703,
"learning_rate": 2.1613635589349756e-07,
"loss": 0.2351,
"step": 660
},
{
"epoch": 4.386344712739384,
"grad_norm": 1.0168386832947722,
"learning_rate": 2.0677024504760752e-07,
"loss": 0.2329,
"step": 662
},
{
"epoch": 4.3996669442131555,
"grad_norm": 1.004364247984247,
"learning_rate": 1.9760283363267684e-07,
"loss": 0.2309,
"step": 664
},
{
"epoch": 4.412989175686928,
"grad_norm": 1.0575692314944383,
"learning_rate": 1.8863491596921745e-07,
"loss": 0.2338,
"step": 666
},
{
"epoch": 4.426311407160699,
"grad_norm": 1.0256602646785253,
"learning_rate": 1.798672690923828e-07,
"loss": 0.2286,
"step": 668
},
{
"epoch": 4.439633638634471,
"grad_norm": 0.9903962555666792,
"learning_rate": 1.713006526846439e-07,
"loss": 0.2299,
"step": 670
},
{
"epoch": 4.4529558701082435,
"grad_norm": 1.006720208531802,
"learning_rate": 1.629358090099639e-07,
"loss": 0.2308,
"step": 672
},
{
"epoch": 4.466278101582015,
"grad_norm": 1.0131829979414444,
"learning_rate": 1.5477346284948292e-07,
"loss": 0.2291,
"step": 674
},
{
"epoch": 4.479600333055787,
"grad_norm": 1.0035493986435864,
"learning_rate": 1.4681432143872133e-07,
"loss": 0.2345,
"step": 676
},
{
"epoch": 4.492922564529558,
"grad_norm": 1.0043750746548528,
"learning_rate": 1.3905907440629752e-07,
"loss": 0.2293,
"step": 678
},
{
"epoch": 4.5062447960033305,
"grad_norm": 1.041883268646126,
"learning_rate": 1.31508393714177e-07,
"loss": 0.2228,
"step": 680
},
{
"epoch": 4.519567027477103,
"grad_norm": 1.0405556943028,
"learning_rate": 1.241629335994471e-07,
"loss": 0.2281,
"step": 682
},
{
"epoch": 4.532889258950874,
"grad_norm": 1.0206604077473356,
"learning_rate": 1.1702333051763271e-07,
"loss": 0.2223,
"step": 684
},
{
"epoch": 4.546211490424646,
"grad_norm": 1.1168719067709043,
"learning_rate": 1.1009020308754587e-07,
"loss": 0.2296,
"step": 686
},
{
"epoch": 4.559533721898418,
"grad_norm": 1.061012715283086,
"learning_rate": 1.0336415203768962e-07,
"loss": 0.2338,
"step": 688
},
{
"epoch": 4.57285595337219,
"grad_norm": 1.0309834474331188,
"learning_rate": 9.684576015420277e-08,
"loss": 0.2328,
"step": 690
},
{
"epoch": 4.586178184845962,
"grad_norm": 1.0027277768812777,
"learning_rate": 9.053559223036746e-08,
"loss": 0.2195,
"step": 692
},
{
"epoch": 4.599500416319733,
"grad_norm": 0.993857076400692,
"learning_rate": 8.44341950176683e-08,
"loss": 0.2256,
"step": 694
},
{
"epoch": 4.6128226477935055,
"grad_norm": 1.007429158342742,
"learning_rate": 7.854209717842231e-08,
"loss": 0.2319,
"step": 696
},
{
"epoch": 4.626144879267278,
"grad_norm": 1.033726145073033,
"learning_rate": 7.285980923996989e-08,
"loss": 0.2342,
"step": 698
},
{
"epoch": 4.639467110741049,
"grad_norm": 1.0168527853674483,
"learning_rate": 6.738782355044048e-08,
"loss": 0.234,
"step": 700
},
{
"epoch": 4.652789342214821,
"grad_norm": 1.0477507076137358,
"learning_rate": 6.212661423609184e-08,
"loss": 0.2342,
"step": 702
},
{
"epoch": 4.6661115736885925,
"grad_norm": 1.0066144201644125,
"learning_rate": 5.707663716023021e-08,
"loss": 0.2181,
"step": 704
},
{
"epoch": 4.679433805162365,
"grad_norm": 1.0584682384889934,
"learning_rate": 5.22383298837098e-08,
"loss": 0.2316,
"step": 706
},
{
"epoch": 4.692756036636137,
"grad_norm": 0.9785329546521053,
"learning_rate": 4.761211162702117e-08,
"loss": 0.23,
"step": 708
},
{
"epoch": 4.706078268109908,
"grad_norm": 1.0383108228822218,
"learning_rate": 4.319838323396691e-08,
"loss": 0.2331,
"step": 710
},
{
"epoch": 4.7194004995836805,
"grad_norm": 1.0036079922992014,
"learning_rate": 3.8997527136930004e-08,
"loss": 0.2255,
"step": 712
},
{
"epoch": 4.732722731057452,
"grad_norm": 1.0350261932080171,
"learning_rate": 3.5009907323737826e-08,
"loss": 0.241,
"step": 714
},
{
"epoch": 4.746044962531224,
"grad_norm": 1.0546445582741784,
"learning_rate": 3.1235869306123766e-08,
"loss": 0.2278,
"step": 716
},
{
"epoch": 4.759367194004996,
"grad_norm": 1.002658916833135,
"learning_rate": 2.767574008979007e-08,
"loss": 0.2263,
"step": 718
},
{
"epoch": 4.7726894254787675,
"grad_norm": 1.0293431900981997,
"learning_rate": 2.4329828146074096e-08,
"loss": 0.234,
"step": 720
},
{
"epoch": 4.78601165695254,
"grad_norm": 1.048978506558003,
"learning_rate": 2.1198423385220822e-08,
"loss": 0.2272,
"step": 722
},
{
"epoch": 4.799333888426311,
"grad_norm": 1.0188945259859212,
"learning_rate": 1.82817971312621e-08,
"loss": 0.2254,
"step": 724
},
{
"epoch": 4.812656119900083,
"grad_norm": 1.0799756942526888,
"learning_rate": 1.5580202098509078e-08,
"loss": 0.2324,
"step": 726
},
{
"epoch": 4.8259783513738554,
"grad_norm": 1.0033120373906426,
"learning_rate": 1.3093872369654148e-08,
"loss": 0.2261,
"step": 728
},
{
"epoch": 4.839300582847627,
"grad_norm": 1.0341825743001596,
"learning_rate": 1.0823023375489128e-08,
"loss": 0.2301,
"step": 730
},
{
"epoch": 4.852622814321399,
"grad_norm": 0.9930386687248629,
"learning_rate": 8.767851876239075e-09,
"loss": 0.2289,
"step": 732
},
{
"epoch": 4.86594504579517,
"grad_norm": 1.050611973007718,
"learning_rate": 6.9285359445145366e-09,
"loss": 0.2322,
"step": 734
},
{
"epoch": 4.8792672772689425,
"grad_norm": 1.0238154022229573,
"learning_rate": 5.305234949880001e-09,
"loss": 0.2314,
"step": 736
},
{
"epoch": 4.892589508742715,
"grad_norm": 1.0359224074295086,
"learning_rate": 3.8980895450474455e-09,
"loss": 0.2445,
"step": 738
},
{
"epoch": 4.905911740216486,
"grad_norm": 1.0513486639225555,
"learning_rate": 2.7072216536885855e-09,
"loss": 0.2366,
"step": 740
},
{
"epoch": 4.919233971690258,
"grad_norm": 1.0043522118975665,
"learning_rate": 1.7327344598702667e-09,
"loss": 0.2373,
"step": 742
},
{
"epoch": 4.9325562031640295,
"grad_norm": 1.0244814544018699,
"learning_rate": 9.747123991141193e-10,
"loss": 0.2333,
"step": 744
},
{
"epoch": 4.945878434637802,
"grad_norm": 1.0043902802504958,
"learning_rate": 4.332211510807427e-10,
"loss": 0.2322,
"step": 746
},
{
"epoch": 4.959200666111574,
"grad_norm": 0.9910871784096957,
"learning_rate": 1.0830763387897902e-10,
"loss": 0.2172,
"step": 748
},
{
"epoch": 4.972522897585345,
"grad_norm": 1.0059895919675135,
"learning_rate": 0.0,
"loss": 0.2232,
"step": 750
}
],
"logging_steps": 2,
"max_steps": 750,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1025260732350464.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}