VeriThoughts-Reasoning-7B-Qwen3 / trainer_state.json
wilyub's picture
End of training
5ca721e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 603,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004975124378109453,
"grad_norm": 5.879948830184706,
"learning_rate": 0.0,
"loss": 0.7667,
"step": 1
},
{
"epoch": 0.009950248756218905,
"grad_norm": 5.2235442655986954,
"learning_rate": 1.3114754098360657e-06,
"loss": 0.7672,
"step": 2
},
{
"epoch": 0.014925373134328358,
"grad_norm": 6.05052115788454,
"learning_rate": 2.6229508196721314e-06,
"loss": 0.7488,
"step": 3
},
{
"epoch": 0.01990049751243781,
"grad_norm": 5.533884257629409,
"learning_rate": 3.934426229508197e-06,
"loss": 0.7477,
"step": 4
},
{
"epoch": 0.024875621890547265,
"grad_norm": 2.9452605687276066,
"learning_rate": 5.245901639344263e-06,
"loss": 0.7031,
"step": 5
},
{
"epoch": 0.029850746268656716,
"grad_norm": 3.088496358850151,
"learning_rate": 6.5573770491803276e-06,
"loss": 0.6635,
"step": 6
},
{
"epoch": 0.03482587064676617,
"grad_norm": 1.1483388314243999,
"learning_rate": 7.868852459016394e-06,
"loss": 0.5938,
"step": 7
},
{
"epoch": 0.03980099502487562,
"grad_norm": 1.038360280928093,
"learning_rate": 9.18032786885246e-06,
"loss": 0.6236,
"step": 8
},
{
"epoch": 0.04477611940298507,
"grad_norm": 1.62146931905999,
"learning_rate": 1.0491803278688525e-05,
"loss": 0.6265,
"step": 9
},
{
"epoch": 0.04975124378109453,
"grad_norm": 1.091027063595789,
"learning_rate": 1.1803278688524591e-05,
"loss": 0.5663,
"step": 10
},
{
"epoch": 0.05472636815920398,
"grad_norm": 1.118045200909107,
"learning_rate": 1.3114754098360655e-05,
"loss": 0.5634,
"step": 11
},
{
"epoch": 0.05970149253731343,
"grad_norm": 1.1296501537591321,
"learning_rate": 1.4426229508196722e-05,
"loss": 0.6183,
"step": 12
},
{
"epoch": 0.06467661691542288,
"grad_norm": 1.1267046281368494,
"learning_rate": 1.5737704918032788e-05,
"loss": 0.5825,
"step": 13
},
{
"epoch": 0.06965174129353234,
"grad_norm": 0.8499261863813117,
"learning_rate": 1.7049180327868854e-05,
"loss": 0.5805,
"step": 14
},
{
"epoch": 0.07462686567164178,
"grad_norm": 1.3598777450843793,
"learning_rate": 1.836065573770492e-05,
"loss": 0.5932,
"step": 15
},
{
"epoch": 0.07960199004975124,
"grad_norm": 0.7667369938360317,
"learning_rate": 1.9672131147540985e-05,
"loss": 0.5842,
"step": 16
},
{
"epoch": 0.0845771144278607,
"grad_norm": 0.696963272396721,
"learning_rate": 2.098360655737705e-05,
"loss": 0.5457,
"step": 17
},
{
"epoch": 0.08955223880597014,
"grad_norm": 0.7786007741172544,
"learning_rate": 2.2295081967213113e-05,
"loss": 0.5941,
"step": 18
},
{
"epoch": 0.0945273631840796,
"grad_norm": 0.6350186171919544,
"learning_rate": 2.3606557377049182e-05,
"loss": 0.5547,
"step": 19
},
{
"epoch": 0.09950248756218906,
"grad_norm": 0.7025126334296926,
"learning_rate": 2.4918032786885248e-05,
"loss": 0.571,
"step": 20
},
{
"epoch": 0.1044776119402985,
"grad_norm": 0.809615897517396,
"learning_rate": 2.622950819672131e-05,
"loss": 0.5567,
"step": 21
},
{
"epoch": 0.10945273631840796,
"grad_norm": 0.712052865613253,
"learning_rate": 2.754098360655738e-05,
"loss": 0.5726,
"step": 22
},
{
"epoch": 0.11442786069651742,
"grad_norm": 0.6685647092557887,
"learning_rate": 2.8852459016393445e-05,
"loss": 0.5624,
"step": 23
},
{
"epoch": 0.11940298507462686,
"grad_norm": 0.6788511366076325,
"learning_rate": 3.0163934426229507e-05,
"loss": 0.5436,
"step": 24
},
{
"epoch": 0.12437810945273632,
"grad_norm": 0.7528664788886847,
"learning_rate": 3.1475409836065576e-05,
"loss": 0.5544,
"step": 25
},
{
"epoch": 0.12935323383084577,
"grad_norm": 0.6862044740604918,
"learning_rate": 3.278688524590164e-05,
"loss": 0.5499,
"step": 26
},
{
"epoch": 0.13432835820895522,
"grad_norm": 0.6138161287460158,
"learning_rate": 3.409836065573771e-05,
"loss": 0.5445,
"step": 27
},
{
"epoch": 0.13930348258706468,
"grad_norm": 0.6540655635462894,
"learning_rate": 3.5409836065573773e-05,
"loss": 0.5535,
"step": 28
},
{
"epoch": 0.14427860696517414,
"grad_norm": 0.6213994340053169,
"learning_rate": 3.672131147540984e-05,
"loss": 0.5593,
"step": 29
},
{
"epoch": 0.14925373134328357,
"grad_norm": 0.5255618873475716,
"learning_rate": 3.8032786885245905e-05,
"loss": 0.5306,
"step": 30
},
{
"epoch": 0.15422885572139303,
"grad_norm": 0.5618938647818913,
"learning_rate": 3.934426229508197e-05,
"loss": 0.5481,
"step": 31
},
{
"epoch": 0.15920398009950248,
"grad_norm": 0.5382661117044717,
"learning_rate": 4.0655737704918036e-05,
"loss": 0.5407,
"step": 32
},
{
"epoch": 0.16417910447761194,
"grad_norm": 0.9651676443670698,
"learning_rate": 4.19672131147541e-05,
"loss": 0.5585,
"step": 33
},
{
"epoch": 0.1691542288557214,
"grad_norm": 0.564225092510184,
"learning_rate": 4.3278688524590174e-05,
"loss": 0.5189,
"step": 34
},
{
"epoch": 0.17412935323383086,
"grad_norm": 0.5321277446853472,
"learning_rate": 4.4590163934426226e-05,
"loss": 0.5459,
"step": 35
},
{
"epoch": 0.1791044776119403,
"grad_norm": 0.4991846369713298,
"learning_rate": 4.59016393442623e-05,
"loss": 0.5076,
"step": 36
},
{
"epoch": 0.18407960199004975,
"grad_norm": 0.49071480532725875,
"learning_rate": 4.7213114754098365e-05,
"loss": 0.5024,
"step": 37
},
{
"epoch": 0.1890547263681592,
"grad_norm": 0.5943409512367948,
"learning_rate": 4.852459016393443e-05,
"loss": 0.5315,
"step": 38
},
{
"epoch": 0.19402985074626866,
"grad_norm": 0.5707557126537657,
"learning_rate": 4.9836065573770496e-05,
"loss": 0.538,
"step": 39
},
{
"epoch": 0.19900497512437812,
"grad_norm": 0.7926233950361419,
"learning_rate": 5.114754098360657e-05,
"loss": 0.5621,
"step": 40
},
{
"epoch": 0.20398009950248755,
"grad_norm": 0.5495160602179542,
"learning_rate": 5.245901639344262e-05,
"loss": 0.5414,
"step": 41
},
{
"epoch": 0.208955223880597,
"grad_norm": 0.5640801102119853,
"learning_rate": 5.377049180327869e-05,
"loss": 0.5011,
"step": 42
},
{
"epoch": 0.21393034825870647,
"grad_norm": 0.5415477216078182,
"learning_rate": 5.508196721311476e-05,
"loss": 0.5605,
"step": 43
},
{
"epoch": 0.21890547263681592,
"grad_norm": 0.5588862009803612,
"learning_rate": 5.6393442622950824e-05,
"loss": 0.5437,
"step": 44
},
{
"epoch": 0.22388059701492538,
"grad_norm": 0.5029511965106807,
"learning_rate": 5.770491803278689e-05,
"loss": 0.5473,
"step": 45
},
{
"epoch": 0.22885572139303484,
"grad_norm": 0.5258790964390814,
"learning_rate": 5.9016393442622956e-05,
"loss": 0.5575,
"step": 46
},
{
"epoch": 0.23383084577114427,
"grad_norm": 0.5480016416626384,
"learning_rate": 6.0327868852459015e-05,
"loss": 0.5421,
"step": 47
},
{
"epoch": 0.23880597014925373,
"grad_norm": 0.5588192051221643,
"learning_rate": 6.163934426229509e-05,
"loss": 0.5605,
"step": 48
},
{
"epoch": 0.24378109452736318,
"grad_norm": 0.5600403065536617,
"learning_rate": 6.295081967213115e-05,
"loss": 0.5265,
"step": 49
},
{
"epoch": 0.24875621890547264,
"grad_norm": 0.6060663481562588,
"learning_rate": 6.426229508196722e-05,
"loss": 0.5631,
"step": 50
},
{
"epoch": 0.2537313432835821,
"grad_norm": 0.7022502664226856,
"learning_rate": 6.557377049180328e-05,
"loss": 0.533,
"step": 51
},
{
"epoch": 0.25870646766169153,
"grad_norm": 0.6780424557118481,
"learning_rate": 6.688524590163935e-05,
"loss": 0.5366,
"step": 52
},
{
"epoch": 0.263681592039801,
"grad_norm": 0.5669957328738275,
"learning_rate": 6.819672131147542e-05,
"loss": 0.552,
"step": 53
},
{
"epoch": 0.26865671641791045,
"grad_norm": 0.5914734306512129,
"learning_rate": 6.950819672131148e-05,
"loss": 0.5389,
"step": 54
},
{
"epoch": 0.2736318407960199,
"grad_norm": 0.6149348037977527,
"learning_rate": 7.081967213114755e-05,
"loss": 0.5274,
"step": 55
},
{
"epoch": 0.27860696517412936,
"grad_norm": 0.6245100364124376,
"learning_rate": 7.213114754098361e-05,
"loss": 0.5659,
"step": 56
},
{
"epoch": 0.2835820895522388,
"grad_norm": 0.6004973149821893,
"learning_rate": 7.344262295081968e-05,
"loss": 0.5358,
"step": 57
},
{
"epoch": 0.2885572139303483,
"grad_norm": 0.5997254911753757,
"learning_rate": 7.475409836065574e-05,
"loss": 0.5645,
"step": 58
},
{
"epoch": 0.2935323383084577,
"grad_norm": 0.6246002314997682,
"learning_rate": 7.606557377049181e-05,
"loss": 0.5663,
"step": 59
},
{
"epoch": 0.29850746268656714,
"grad_norm": 0.5406788999040716,
"learning_rate": 7.737704918032788e-05,
"loss": 0.5453,
"step": 60
},
{
"epoch": 0.3034825870646766,
"grad_norm": 0.5816580903983424,
"learning_rate": 7.868852459016394e-05,
"loss": 0.5111,
"step": 61
},
{
"epoch": 0.30845771144278605,
"grad_norm": 0.5816022558938057,
"learning_rate": 8e-05,
"loss": 0.5173,
"step": 62
},
{
"epoch": 0.31343283582089554,
"grad_norm": 0.5504439572021722,
"learning_rate": 7.99993280608401e-05,
"loss": 0.5334,
"step": 63
},
{
"epoch": 0.31840796019900497,
"grad_norm": 0.5838277491059845,
"learning_rate": 7.999731226593547e-05,
"loss": 0.5172,
"step": 64
},
{
"epoch": 0.32338308457711445,
"grad_norm": 0.6579416139688873,
"learning_rate": 7.999395268301069e-05,
"loss": 0.5475,
"step": 65
},
{
"epoch": 0.3283582089552239,
"grad_norm": 0.5320888799910365,
"learning_rate": 7.998924942493754e-05,
"loss": 0.5102,
"step": 66
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.5281701415578218,
"learning_rate": 7.99832026497312e-05,
"loss": 0.5536,
"step": 67
},
{
"epoch": 0.3383084577114428,
"grad_norm": 0.5127833977720212,
"learning_rate": 7.997581256054488e-05,
"loss": 0.5621,
"step": 68
},
{
"epoch": 0.34328358208955223,
"grad_norm": 0.5167985708679069,
"learning_rate": 7.996707940566312e-05,
"loss": 0.5678,
"step": 69
},
{
"epoch": 0.3482587064676617,
"grad_norm": 0.5313575574415697,
"learning_rate": 7.995700347849337e-05,
"loss": 0.5792,
"step": 70
},
{
"epoch": 0.35323383084577115,
"grad_norm": 0.46516112663404313,
"learning_rate": 7.994558511755611e-05,
"loss": 0.5333,
"step": 71
},
{
"epoch": 0.3582089552238806,
"grad_norm": 0.5127434932833409,
"learning_rate": 7.993282470647356e-05,
"loss": 0.5381,
"step": 72
},
{
"epoch": 0.36318407960199006,
"grad_norm": 0.4611747374859348,
"learning_rate": 7.991872267395666e-05,
"loss": 0.5297,
"step": 73
},
{
"epoch": 0.3681592039800995,
"grad_norm": 0.48491223979662357,
"learning_rate": 7.990327949379087e-05,
"loss": 0.5496,
"step": 74
},
{
"epoch": 0.373134328358209,
"grad_norm": 0.44507811144929044,
"learning_rate": 7.988649568482003e-05,
"loss": 0.5613,
"step": 75
},
{
"epoch": 0.3781094527363184,
"grad_norm": 0.5065099749041349,
"learning_rate": 7.986837181092907e-05,
"loss": 0.545,
"step": 76
},
{
"epoch": 0.38308457711442784,
"grad_norm": 0.4496572705412145,
"learning_rate": 7.984890848102501e-05,
"loss": 0.5625,
"step": 77
},
{
"epoch": 0.3880597014925373,
"grad_norm": 0.49181167685096244,
"learning_rate": 7.982810634901654e-05,
"loss": 0.5287,
"step": 78
},
{
"epoch": 0.39303482587064675,
"grad_norm": 0.5095805376917029,
"learning_rate": 7.980596611379202e-05,
"loss": 0.5218,
"step": 79
},
{
"epoch": 0.39800995024875624,
"grad_norm": 0.4524659588566981,
"learning_rate": 7.9782488519196e-05,
"loss": 0.5421,
"step": 80
},
{
"epoch": 0.40298507462686567,
"grad_norm": 0.48114599310151146,
"learning_rate": 7.975767435400424e-05,
"loss": 0.5322,
"step": 81
},
{
"epoch": 0.4079601990049751,
"grad_norm": 0.4455277267645172,
"learning_rate": 7.973152445189719e-05,
"loss": 0.5847,
"step": 82
},
{
"epoch": 0.4129353233830846,
"grad_norm": 0.4788117510507055,
"learning_rate": 7.970403969143203e-05,
"loss": 0.5616,
"step": 83
},
{
"epoch": 0.417910447761194,
"grad_norm": 0.4606436249359568,
"learning_rate": 7.967522099601309e-05,
"loss": 0.5371,
"step": 84
},
{
"epoch": 0.4228855721393035,
"grad_norm": 0.44668898675104635,
"learning_rate": 7.964506933386088e-05,
"loss": 0.5291,
"step": 85
},
{
"epoch": 0.42786069651741293,
"grad_norm": 0.4502604139613404,
"learning_rate": 7.961358571797953e-05,
"loss": 0.5324,
"step": 86
},
{
"epoch": 0.43283582089552236,
"grad_norm": 0.43560491156054876,
"learning_rate": 7.958077120612275e-05,
"loss": 0.5245,
"step": 87
},
{
"epoch": 0.43781094527363185,
"grad_norm": 0.4842024445653769,
"learning_rate": 7.95466269007583e-05,
"loss": 0.5513,
"step": 88
},
{
"epoch": 0.4427860696517413,
"grad_norm": 0.42614132833617197,
"learning_rate": 7.9511153949031e-05,
"loss": 0.5451,
"step": 89
},
{
"epoch": 0.44776119402985076,
"grad_norm": 0.4891312472692003,
"learning_rate": 7.947435354272414e-05,
"loss": 0.5263,
"step": 90
},
{
"epoch": 0.4527363184079602,
"grad_norm": 0.4545172494931765,
"learning_rate": 7.943622691821938e-05,
"loss": 0.5396,
"step": 91
},
{
"epoch": 0.4577114427860697,
"grad_norm": 0.448529204030658,
"learning_rate": 7.939677535645533e-05,
"loss": 0.5264,
"step": 92
},
{
"epoch": 0.4626865671641791,
"grad_norm": 0.4516516546653182,
"learning_rate": 7.935600018288447e-05,
"loss": 0.5456,
"step": 93
},
{
"epoch": 0.46766169154228854,
"grad_norm": 0.46358875853260084,
"learning_rate": 7.931390276742859e-05,
"loss": 0.5075,
"step": 94
},
{
"epoch": 0.472636815920398,
"grad_norm": 0.4388895352432698,
"learning_rate": 7.927048452443279e-05,
"loss": 0.5121,
"step": 95
},
{
"epoch": 0.47761194029850745,
"grad_norm": 0.44989392045392973,
"learning_rate": 7.922574691261794e-05,
"loss": 0.556,
"step": 96
},
{
"epoch": 0.48258706467661694,
"grad_norm": 0.4132769512058111,
"learning_rate": 7.917969143503172e-05,
"loss": 0.5201,
"step": 97
},
{
"epoch": 0.48756218905472637,
"grad_norm": 0.4748955194225827,
"learning_rate": 7.913231963899806e-05,
"loss": 0.5548,
"step": 98
},
{
"epoch": 0.4925373134328358,
"grad_norm": 0.40947989720005395,
"learning_rate": 7.908363311606525e-05,
"loss": 0.5409,
"step": 99
},
{
"epoch": 0.4975124378109453,
"grad_norm": 0.38003180467688796,
"learning_rate": 7.903363350195229e-05,
"loss": 0.5125,
"step": 100
},
{
"epoch": 0.5024875621890548,
"grad_norm": 0.6902176948042787,
"learning_rate": 7.898232247649414e-05,
"loss": 0.5169,
"step": 101
},
{
"epoch": 0.5074626865671642,
"grad_norm": 0.4242062359610916,
"learning_rate": 7.892970176358519e-05,
"loss": 0.5112,
"step": 102
},
{
"epoch": 0.5124378109452736,
"grad_norm": 0.44210587009357805,
"learning_rate": 7.887577313112129e-05,
"loss": 0.5478,
"step": 103
},
{
"epoch": 0.5174129353233831,
"grad_norm": 0.46468109194219087,
"learning_rate": 7.882053839094045e-05,
"loss": 0.5222,
"step": 104
},
{
"epoch": 0.5223880597014925,
"grad_norm": 0.4130922336671526,
"learning_rate": 7.876399939876194e-05,
"loss": 0.5369,
"step": 105
},
{
"epoch": 0.527363184079602,
"grad_norm": 0.42770523171556496,
"learning_rate": 7.870615805412387e-05,
"loss": 0.5249,
"step": 106
},
{
"epoch": 0.5323383084577115,
"grad_norm": 0.42829673493515213,
"learning_rate": 7.864701630031949e-05,
"loss": 0.5256,
"step": 107
},
{
"epoch": 0.5373134328358209,
"grad_norm": 0.4577850551423171,
"learning_rate": 7.858657612433179e-05,
"loss": 0.5017,
"step": 108
},
{
"epoch": 0.5422885572139303,
"grad_norm": 0.4580208652431875,
"learning_rate": 7.852483955676685e-05,
"loss": 0.5385,
"step": 109
},
{
"epoch": 0.5472636815920398,
"grad_norm": 0.4521081619187588,
"learning_rate": 7.846180867178553e-05,
"loss": 0.519,
"step": 110
},
{
"epoch": 0.5522388059701493,
"grad_norm": 0.43746850745160454,
"learning_rate": 7.839748558703383e-05,
"loss": 0.5417,
"step": 111
},
{
"epoch": 0.5572139303482587,
"grad_norm": 0.4438955801038043,
"learning_rate": 7.833187246357172e-05,
"loss": 0.5187,
"step": 112
},
{
"epoch": 0.5621890547263682,
"grad_norm": 0.4286197509144701,
"learning_rate": 7.826497150580055e-05,
"loss": 0.5142,
"step": 113
},
{
"epoch": 0.5671641791044776,
"grad_norm": 0.39145027844847796,
"learning_rate": 7.8196784961389e-05,
"loss": 0.5319,
"step": 114
},
{
"epoch": 0.572139303482587,
"grad_norm": 0.4116110135547263,
"learning_rate": 7.812731512119753e-05,
"loss": 0.5017,
"step": 115
},
{
"epoch": 0.5771144278606966,
"grad_norm": 0.4192891992656687,
"learning_rate": 7.805656431920143e-05,
"loss": 0.5471,
"step": 116
},
{
"epoch": 0.582089552238806,
"grad_norm": 0.41450926014695316,
"learning_rate": 7.798453493241246e-05,
"loss": 0.5198,
"step": 117
},
{
"epoch": 0.5870646766169154,
"grad_norm": 0.4540241153108466,
"learning_rate": 7.791122938079887e-05,
"loss": 0.5848,
"step": 118
},
{
"epoch": 0.5920398009950248,
"grad_norm": 0.39408273059441834,
"learning_rate": 7.783665012720419e-05,
"loss": 0.5457,
"step": 119
},
{
"epoch": 0.5970149253731343,
"grad_norm": 0.43126153424114066,
"learning_rate": 7.77607996772645e-05,
"loss": 0.5576,
"step": 120
},
{
"epoch": 0.6019900497512438,
"grad_norm": 0.4824573509496151,
"learning_rate": 7.768368057932417e-05,
"loss": 0.5844,
"step": 121
},
{
"epoch": 0.6069651741293532,
"grad_norm": 0.4186425292993652,
"learning_rate": 7.760529542435029e-05,
"loss": 0.5697,
"step": 122
},
{
"epoch": 0.6119402985074627,
"grad_norm": 0.4377766818907997,
"learning_rate": 7.752564684584563e-05,
"loss": 0.5069,
"step": 123
},
{
"epoch": 0.6169154228855721,
"grad_norm": 0.6732048439878882,
"learning_rate": 7.744473751976012e-05,
"loss": 0.4964,
"step": 124
},
{
"epoch": 0.6218905472636815,
"grad_norm": 0.5045209563187125,
"learning_rate": 7.7362570164401e-05,
"loss": 0.5183,
"step": 125
},
{
"epoch": 0.6268656716417911,
"grad_norm": 0.41851717230868335,
"learning_rate": 7.727914754034147e-05,
"loss": 0.5332,
"step": 126
},
{
"epoch": 0.6318407960199005,
"grad_norm": 0.47281567082753584,
"learning_rate": 7.719447245032788e-05,
"loss": 0.5531,
"step": 127
},
{
"epoch": 0.6368159203980099,
"grad_norm": 0.46985505736621747,
"learning_rate": 7.710854773918572e-05,
"loss": 0.523,
"step": 128
},
{
"epoch": 0.6417910447761194,
"grad_norm": 0.449788739083477,
"learning_rate": 7.702137629372388e-05,
"loss": 0.5323,
"step": 129
},
{
"epoch": 0.6467661691542289,
"grad_norm": 0.4644321215634913,
"learning_rate": 7.693296104263777e-05,
"loss": 0.5294,
"step": 130
},
{
"epoch": 0.6517412935323383,
"grad_norm": 0.44641643289164196,
"learning_rate": 7.684330495641084e-05,
"loss": 0.5301,
"step": 131
},
{
"epoch": 0.6567164179104478,
"grad_norm": 0.47484679630977794,
"learning_rate": 7.675241104721487e-05,
"loss": 0.5203,
"step": 132
},
{
"epoch": 0.6616915422885572,
"grad_norm": 0.4325510766840271,
"learning_rate": 7.66602823688087e-05,
"loss": 0.5337,
"step": 133
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.6149477632116208,
"learning_rate": 7.656692201643569e-05,
"loss": 0.5014,
"step": 134
},
{
"epoch": 0.6716417910447762,
"grad_norm": 0.42195591915373487,
"learning_rate": 7.647233312671966e-05,
"loss": 0.5124,
"step": 135
},
{
"epoch": 0.6766169154228856,
"grad_norm": 0.39318505190957936,
"learning_rate": 7.637651887755955e-05,
"loss": 0.5356,
"step": 136
},
{
"epoch": 0.681592039800995,
"grad_norm": 0.42944630349072993,
"learning_rate": 7.627948248802269e-05,
"loss": 0.5249,
"step": 137
},
{
"epoch": 0.6865671641791045,
"grad_norm": 0.43007219501882216,
"learning_rate": 7.618122721823656e-05,
"loss": 0.5722,
"step": 138
},
{
"epoch": 0.6915422885572139,
"grad_norm": 0.3612830585663661,
"learning_rate": 7.608175636927936e-05,
"loss": 0.5423,
"step": 139
},
{
"epoch": 0.6965174129353234,
"grad_norm": 0.4504348479973107,
"learning_rate": 7.598107328306902e-05,
"loss": 0.5418,
"step": 140
},
{
"epoch": 0.7014925373134329,
"grad_norm": 0.5246833441288096,
"learning_rate": 7.587918134225092e-05,
"loss": 0.5147,
"step": 141
},
{
"epoch": 0.7064676616915423,
"grad_norm": 0.35037174084972605,
"learning_rate": 7.577608397008436e-05,
"loss": 0.4877,
"step": 142
},
{
"epoch": 0.7114427860696517,
"grad_norm": 0.3908914010701244,
"learning_rate": 7.56717846303274e-05,
"loss": 0.5118,
"step": 143
},
{
"epoch": 0.7164179104477612,
"grad_norm": 0.3581238736307404,
"learning_rate": 7.55662868271206e-05,
"loss": 0.491,
"step": 144
},
{
"epoch": 0.7213930348258707,
"grad_norm": 0.3894480558267158,
"learning_rate": 7.545959410486918e-05,
"loss": 0.5682,
"step": 145
},
{
"epoch": 0.7263681592039801,
"grad_norm": 0.44359588797737537,
"learning_rate": 7.535171004812409e-05,
"loss": 0.5289,
"step": 146
},
{
"epoch": 0.7313432835820896,
"grad_norm": 0.407030891972332,
"learning_rate": 7.524263828146144e-05,
"loss": 0.5101,
"step": 147
},
{
"epoch": 0.736318407960199,
"grad_norm": 0.4063628333515133,
"learning_rate": 7.513238246936077e-05,
"loss": 0.5482,
"step": 148
},
{
"epoch": 0.7412935323383084,
"grad_norm": 0.4274106533585936,
"learning_rate": 7.502094631608201e-05,
"loss": 0.5191,
"step": 149
},
{
"epoch": 0.746268656716418,
"grad_norm": 0.44937783734239284,
"learning_rate": 7.490833356554088e-05,
"loss": 0.5359,
"step": 150
},
{
"epoch": 0.7512437810945274,
"grad_norm": 0.4007251382417533,
"learning_rate": 7.479454800118327e-05,
"loss": 0.5363,
"step": 151
},
{
"epoch": 0.7562189054726368,
"grad_norm": 0.4209673843323663,
"learning_rate": 7.467959344585796e-05,
"loss": 0.4941,
"step": 152
},
{
"epoch": 0.7611940298507462,
"grad_norm": 0.4019802174266669,
"learning_rate": 7.456347376168837e-05,
"loss": 0.5182,
"step": 153
},
{
"epoch": 0.7661691542288557,
"grad_norm": 0.3857327836554883,
"learning_rate": 7.44461928499426e-05,
"loss": 0.4866,
"step": 154
},
{
"epoch": 0.7711442786069652,
"grad_norm": 0.4127925959560194,
"learning_rate": 7.432775465090254e-05,
"loss": 0.5214,
"step": 155
},
{
"epoch": 0.7761194029850746,
"grad_norm": 0.3886111472729733,
"learning_rate": 7.420816314373139e-05,
"loss": 0.4861,
"step": 156
},
{
"epoch": 0.7810945273631841,
"grad_norm": 0.44211955181415347,
"learning_rate": 7.408742234633999e-05,
"loss": 0.516,
"step": 157
},
{
"epoch": 0.7860696517412935,
"grad_norm": 0.4478134256976994,
"learning_rate": 7.396553631525184e-05,
"loss": 0.561,
"step": 158
},
{
"epoch": 0.7910447761194029,
"grad_norm": 0.3705902473377113,
"learning_rate": 7.38425091454668e-05,
"loss": 0.5007,
"step": 159
},
{
"epoch": 0.7960199004975125,
"grad_norm": 0.41437615203351147,
"learning_rate": 7.371834497032353e-05,
"loss": 0.5338,
"step": 160
},
{
"epoch": 0.8009950248756219,
"grad_norm": 0.4054936427228974,
"learning_rate": 7.35930479613606e-05,
"loss": 0.5153,
"step": 161
},
{
"epoch": 0.8059701492537313,
"grad_norm": 0.41271930343767926,
"learning_rate": 7.346662232817638e-05,
"loss": 0.5595,
"step": 162
},
{
"epoch": 0.8109452736318408,
"grad_norm": 0.3944935394388094,
"learning_rate": 7.333907231828755e-05,
"loss": 0.5238,
"step": 163
},
{
"epoch": 0.8159203980099502,
"grad_norm": 0.3744177603013012,
"learning_rate": 7.32104022169864e-05,
"loss": 0.5171,
"step": 164
},
{
"epoch": 0.8208955223880597,
"grad_norm": 0.38249406328112323,
"learning_rate": 7.308061634719695e-05,
"loss": 0.5476,
"step": 165
},
{
"epoch": 0.8258706467661692,
"grad_norm": 0.44714495553689276,
"learning_rate": 7.294971906932963e-05,
"loss": 0.5646,
"step": 166
},
{
"epoch": 0.8308457711442786,
"grad_norm": 0.3799263053283722,
"learning_rate": 7.281771478113474e-05,
"loss": 0.5116,
"step": 167
},
{
"epoch": 0.835820895522388,
"grad_norm": 0.3602781959028979,
"learning_rate": 7.268460791755486e-05,
"loss": 0.5363,
"step": 168
},
{
"epoch": 0.8407960199004975,
"grad_norm": 0.3182978229069629,
"learning_rate": 7.255040295057566e-05,
"loss": 0.4906,
"step": 169
},
{
"epoch": 0.845771144278607,
"grad_norm": 0.3895900491070743,
"learning_rate": 7.241510438907577e-05,
"loss": 0.5105,
"step": 170
},
{
"epoch": 0.8507462686567164,
"grad_norm": 0.40093841905363664,
"learning_rate": 7.227871677867531e-05,
"loss": 0.5228,
"step": 171
},
{
"epoch": 0.8557213930348259,
"grad_norm": 0.39214721861667695,
"learning_rate": 7.214124470158308e-05,
"loss": 0.5165,
"step": 172
},
{
"epoch": 0.8606965174129353,
"grad_norm": 0.3864483741926972,
"learning_rate": 7.200269277644268e-05,
"loss": 0.5512,
"step": 173
},
{
"epoch": 0.8656716417910447,
"grad_norm": 0.35893618552568096,
"learning_rate": 7.186306565817731e-05,
"loss": 0.5306,
"step": 174
},
{
"epoch": 0.8706467661691543,
"grad_norm": 0.3821162188738986,
"learning_rate": 7.172236803783342e-05,
"loss": 0.5095,
"step": 175
},
{
"epoch": 0.8756218905472637,
"grad_norm": 0.385653159510127,
"learning_rate": 7.158060464242303e-05,
"loss": 0.5397,
"step": 176
},
{
"epoch": 0.8805970149253731,
"grad_norm": 0.36865249587374227,
"learning_rate": 7.1437780234765e-05,
"loss": 0.5223,
"step": 177
},
{
"epoch": 0.8855721393034826,
"grad_norm": 0.3695872138907406,
"learning_rate": 7.129389961332492e-05,
"loss": 0.4981,
"step": 178
},
{
"epoch": 0.8905472636815921,
"grad_norm": 0.3943495182262802,
"learning_rate": 7.114896761205404e-05,
"loss": 0.5482,
"step": 179
},
{
"epoch": 0.8955223880597015,
"grad_norm": 0.3852233271771578,
"learning_rate": 7.100298910022669e-05,
"loss": 0.5451,
"step": 180
},
{
"epoch": 0.900497512437811,
"grad_norm": 0.37235191480919677,
"learning_rate": 7.085596898227677e-05,
"loss": 0.511,
"step": 181
},
{
"epoch": 0.9054726368159204,
"grad_norm": 0.4064372568102956,
"learning_rate": 7.070791219763305e-05,
"loss": 0.5293,
"step": 182
},
{
"epoch": 0.9104477611940298,
"grad_norm": 0.6718930483865837,
"learning_rate": 7.055882372055308e-05,
"loss": 0.544,
"step": 183
},
{
"epoch": 0.9154228855721394,
"grad_norm": 0.38336725569872504,
"learning_rate": 7.040870855995619e-05,
"loss": 0.5215,
"step": 184
},
{
"epoch": 0.9203980099502488,
"grad_norm": 0.47174074201885974,
"learning_rate": 7.025757175925508e-05,
"loss": 0.5207,
"step": 185
},
{
"epoch": 0.9253731343283582,
"grad_norm": 0.4130352554049625,
"learning_rate": 7.010541839618655e-05,
"loss": 0.5486,
"step": 186
},
{
"epoch": 0.9303482587064676,
"grad_norm": 0.3523971543268066,
"learning_rate": 6.995225358264071e-05,
"loss": 0.5121,
"step": 187
},
{
"epoch": 0.9353233830845771,
"grad_norm": 0.5715491757640165,
"learning_rate": 6.979808246448938e-05,
"loss": 0.5071,
"step": 188
},
{
"epoch": 0.9402985074626866,
"grad_norm": 0.3470649290950888,
"learning_rate": 6.964291022141313e-05,
"loss": 0.5044,
"step": 189
},
{
"epoch": 0.945273631840796,
"grad_norm": 0.3795430837969946,
"learning_rate": 6.94867420667273e-05,
"loss": 0.5169,
"step": 190
},
{
"epoch": 0.9502487562189055,
"grad_norm": 0.3904017659815798,
"learning_rate": 6.932958324720682e-05,
"loss": 0.5762,
"step": 191
},
{
"epoch": 0.9552238805970149,
"grad_norm": 0.34232421212733316,
"learning_rate": 6.917143904290997e-05,
"loss": 0.4977,
"step": 192
},
{
"epoch": 0.9601990049751243,
"grad_norm": 0.3569370459819242,
"learning_rate": 6.901231476700091e-05,
"loss": 0.5629,
"step": 193
},
{
"epoch": 0.9651741293532339,
"grad_norm": 0.36974334588123947,
"learning_rate": 6.885221576557127e-05,
"loss": 0.5278,
"step": 194
},
{
"epoch": 0.9701492537313433,
"grad_norm": 0.3407897696413556,
"learning_rate": 6.869114741746046e-05,
"loss": 0.5157,
"step": 195
},
{
"epoch": 0.9751243781094527,
"grad_norm": 0.39761535489754035,
"learning_rate": 6.852911513407502e-05,
"loss": 0.5617,
"step": 196
},
{
"epoch": 0.9800995024875622,
"grad_norm": 0.3234218432091812,
"learning_rate": 6.836612435920677e-05,
"loss": 0.5177,
"step": 197
},
{
"epoch": 0.9850746268656716,
"grad_norm": 0.4334707528741285,
"learning_rate": 6.820218056884993e-05,
"loss": 0.5335,
"step": 198
},
{
"epoch": 0.9900497512437811,
"grad_norm": 0.42261946036901893,
"learning_rate": 6.803728927101712e-05,
"loss": 0.483,
"step": 199
},
{
"epoch": 0.9950248756218906,
"grad_norm": 0.4107426334903007,
"learning_rate": 6.787145600555436e-05,
"loss": 0.5131,
"step": 200
},
{
"epoch": 1.0,
"grad_norm": 0.4088302032853929,
"learning_rate": 6.770468634395491e-05,
"loss": 0.5371,
"step": 201
},
{
"epoch": 1.0049751243781095,
"grad_norm": 0.7163194260472774,
"learning_rate": 6.753698588917207e-05,
"loss": 0.4837,
"step": 202
},
{
"epoch": 1.0099502487562189,
"grad_norm": 0.3931710998290164,
"learning_rate": 6.736836027543097e-05,
"loss": 0.4053,
"step": 203
},
{
"epoch": 1.0149253731343284,
"grad_norm": 0.3992517026461951,
"learning_rate": 6.719881516803931e-05,
"loss": 0.388,
"step": 204
},
{
"epoch": 1.0199004975124377,
"grad_norm": 0.5006496476968266,
"learning_rate": 6.70283562631969e-05,
"loss": 0.4471,
"step": 205
},
{
"epoch": 1.0248756218905473,
"grad_norm": 0.457581634312986,
"learning_rate": 6.685698928780442e-05,
"loss": 0.4101,
"step": 206
},
{
"epoch": 1.0298507462686568,
"grad_norm": 0.46154762831732166,
"learning_rate": 6.668471999927097e-05,
"loss": 0.4053,
"step": 207
},
{
"epoch": 1.0348258706467661,
"grad_norm": 0.3541443217870462,
"learning_rate": 6.651155418532055e-05,
"loss": 0.4054,
"step": 208
},
{
"epoch": 1.0398009950248757,
"grad_norm": 0.4195870714919415,
"learning_rate": 6.633749766379778e-05,
"loss": 0.4036,
"step": 209
},
{
"epoch": 1.044776119402985,
"grad_norm": 0.373429921123025,
"learning_rate": 6.616255628247228e-05,
"loss": 0.4015,
"step": 210
},
{
"epoch": 1.0497512437810945,
"grad_norm": 0.4020032320060145,
"learning_rate": 6.59867359188423e-05,
"loss": 0.3879,
"step": 211
},
{
"epoch": 1.054726368159204,
"grad_norm": 0.4512310371194394,
"learning_rate": 6.58100424799372e-05,
"loss": 0.4316,
"step": 212
},
{
"epoch": 1.0597014925373134,
"grad_norm": 0.3962034658409848,
"learning_rate": 6.563248190211905e-05,
"loss": 0.3748,
"step": 213
},
{
"epoch": 1.064676616915423,
"grad_norm": 0.41060872654476754,
"learning_rate": 6.54540601508831e-05,
"loss": 0.3892,
"step": 214
},
{
"epoch": 1.0696517412935322,
"grad_norm": 0.4645504019709147,
"learning_rate": 6.527478322065744e-05,
"loss": 0.4413,
"step": 215
},
{
"epoch": 1.0746268656716418,
"grad_norm": 0.3836482825148122,
"learning_rate": 6.509465713460157e-05,
"loss": 0.4289,
"step": 216
},
{
"epoch": 1.0796019900497513,
"grad_norm": 0.43540736206118597,
"learning_rate": 6.491368794440402e-05,
"loss": 0.4056,
"step": 217
},
{
"epoch": 1.0845771144278606,
"grad_norm": 0.36663141189648585,
"learning_rate": 6.473188173007909e-05,
"loss": 0.4075,
"step": 218
},
{
"epoch": 1.0895522388059702,
"grad_norm": 0.40611590999810565,
"learning_rate": 6.454924459976253e-05,
"loss": 0.4391,
"step": 219
},
{
"epoch": 1.0945273631840795,
"grad_norm": 0.38113220211722076,
"learning_rate": 6.436578268950632e-05,
"loss": 0.3821,
"step": 220
},
{
"epoch": 1.099502487562189,
"grad_norm": 0.3780147945257588,
"learning_rate": 6.418150216307255e-05,
"loss": 0.4245,
"step": 221
},
{
"epoch": 1.1044776119402986,
"grad_norm": 0.3968220883846169,
"learning_rate": 6.399640921172634e-05,
"loss": 0.402,
"step": 222
},
{
"epoch": 1.109452736318408,
"grad_norm": 0.3946496719853137,
"learning_rate": 6.38105100540278e-05,
"loss": 0.4084,
"step": 223
},
{
"epoch": 1.1144278606965174,
"grad_norm": 0.4067068429286891,
"learning_rate": 6.36238109356231e-05,
"loss": 0.4137,
"step": 224
},
{
"epoch": 1.1194029850746268,
"grad_norm": 0.38859406046358386,
"learning_rate": 6.343631812903472e-05,
"loss": 0.3765,
"step": 225
},
{
"epoch": 1.1243781094527363,
"grad_norm": 0.41119260232877347,
"learning_rate": 6.324803793345057e-05,
"loss": 0.3784,
"step": 226
},
{
"epoch": 1.1293532338308458,
"grad_norm": 0.35980531417493244,
"learning_rate": 6.305897667451248e-05,
"loss": 0.4196,
"step": 227
},
{
"epoch": 1.1343283582089552,
"grad_norm": 0.3974649762417786,
"learning_rate": 6.286914070410365e-05,
"loss": 0.4085,
"step": 228
},
{
"epoch": 1.1393034825870647,
"grad_norm": 0.4151287885867595,
"learning_rate": 6.267853640013519e-05,
"loss": 0.4123,
"step": 229
},
{
"epoch": 1.144278606965174,
"grad_norm": 0.5273211889278953,
"learning_rate": 6.248717016633187e-05,
"loss": 0.4351,
"step": 230
},
{
"epoch": 1.1492537313432836,
"grad_norm": 0.40383736067681525,
"learning_rate": 6.229504843201705e-05,
"loss": 0.4259,
"step": 231
},
{
"epoch": 1.154228855721393,
"grad_norm": 0.4312211448910651,
"learning_rate": 6.210217765189653e-05,
"loss": 0.4396,
"step": 232
},
{
"epoch": 1.1592039800995024,
"grad_norm": 0.35782791847054873,
"learning_rate": 6.190856430584185e-05,
"loss": 0.3913,
"step": 233
},
{
"epoch": 1.164179104477612,
"grad_norm": 0.3492891250835248,
"learning_rate": 6.171421489867241e-05,
"loss": 0.3891,
"step": 234
},
{
"epoch": 1.1691542288557213,
"grad_norm": 0.35501231620300494,
"learning_rate": 6.151913595993711e-05,
"loss": 0.394,
"step": 235
},
{
"epoch": 1.1741293532338308,
"grad_norm": 0.45101422446088074,
"learning_rate": 6.132333404369488e-05,
"loss": 0.3787,
"step": 236
},
{
"epoch": 1.1791044776119404,
"grad_norm": 0.3926264666543916,
"learning_rate": 6.112681572829445e-05,
"loss": 0.3693,
"step": 237
},
{
"epoch": 1.1840796019900497,
"grad_norm": 0.4267424524859782,
"learning_rate": 6.092958761615341e-05,
"loss": 0.434,
"step": 238
},
{
"epoch": 1.1890547263681592,
"grad_norm": 0.39553559165797253,
"learning_rate": 6.073165633353636e-05,
"loss": 0.4034,
"step": 239
},
{
"epoch": 1.1940298507462686,
"grad_norm": 0.46046958354454653,
"learning_rate": 6.0533028530332297e-05,
"loss": 0.3684,
"step": 240
},
{
"epoch": 1.199004975124378,
"grad_norm": 0.4665878755951322,
"learning_rate": 6.033371087983117e-05,
"loss": 0.347,
"step": 241
},
{
"epoch": 1.2039800995024876,
"grad_norm": 0.4354664905472878,
"learning_rate": 6.013371007849972e-05,
"loss": 0.4517,
"step": 242
},
{
"epoch": 1.208955223880597,
"grad_norm": 0.3737884656720528,
"learning_rate": 5.993303284575647e-05,
"loss": 0.392,
"step": 243
},
{
"epoch": 1.2139303482587065,
"grad_norm": 0.36266398536729494,
"learning_rate": 5.9731685923745965e-05,
"loss": 0.3795,
"step": 244
},
{
"epoch": 1.2189054726368158,
"grad_norm": 0.4034905088087893,
"learning_rate": 5.95296760771123e-05,
"loss": 0.4424,
"step": 245
},
{
"epoch": 1.2238805970149254,
"grad_norm": 0.3865962830370829,
"learning_rate": 5.9327010092771796e-05,
"loss": 0.4118,
"step": 246
},
{
"epoch": 1.228855721393035,
"grad_norm": 0.3535923688066071,
"learning_rate": 5.912369477968503e-05,
"loss": 0.4322,
"step": 247
},
{
"epoch": 1.2338308457711442,
"grad_norm": 0.3763362566595206,
"learning_rate": 5.891973696862802e-05,
"loss": 0.3917,
"step": 248
},
{
"epoch": 1.2388059701492538,
"grad_norm": 0.40384314508526564,
"learning_rate": 5.8715143511962794e-05,
"loss": 0.4185,
"step": 249
},
{
"epoch": 1.243781094527363,
"grad_norm": 0.3722033433705015,
"learning_rate": 5.85099212834071e-05,
"loss": 0.422,
"step": 250
},
{
"epoch": 1.2487562189054726,
"grad_norm": 0.381393814251326,
"learning_rate": 5.830407717780356e-05,
"loss": 0.3946,
"step": 251
},
{
"epoch": 1.2537313432835822,
"grad_norm": 0.37263652080327836,
"learning_rate": 5.809761811088791e-05,
"loss": 0.4354,
"step": 252
},
{
"epoch": 1.2587064676616915,
"grad_norm": 0.3718606135863343,
"learning_rate": 5.789055101905678e-05,
"loss": 0.4354,
"step": 253
},
{
"epoch": 1.263681592039801,
"grad_norm": 0.3691803219584332,
"learning_rate": 5.768288285913454e-05,
"loss": 0.417,
"step": 254
},
{
"epoch": 1.2686567164179103,
"grad_norm": 0.36900905475761886,
"learning_rate": 5.7474620608139625e-05,
"loss": 0.428,
"step": 255
},
{
"epoch": 1.2736318407960199,
"grad_norm": 0.43160773872025243,
"learning_rate": 5.726577126305017e-05,
"loss": 0.3848,
"step": 256
},
{
"epoch": 1.2786069651741294,
"grad_norm": 0.3828488458628553,
"learning_rate": 5.705634184056881e-05,
"loss": 0.4016,
"step": 257
},
{
"epoch": 1.2835820895522387,
"grad_norm": 0.3704358136407763,
"learning_rate": 5.6846339376887084e-05,
"loss": 0.4076,
"step": 258
},
{
"epoch": 1.2885572139303483,
"grad_norm": 0.37618666448453975,
"learning_rate": 5.6635770927448916e-05,
"loss": 0.4147,
"step": 259
},
{
"epoch": 1.2935323383084576,
"grad_norm": 0.3673375645143347,
"learning_rate": 5.642464356671369e-05,
"loss": 0.3824,
"step": 260
},
{
"epoch": 1.2985074626865671,
"grad_norm": 0.35963122460702884,
"learning_rate": 5.6212964387918444e-05,
"loss": 0.3805,
"step": 261
},
{
"epoch": 1.3034825870646767,
"grad_norm": 0.4027201236656122,
"learning_rate": 5.6000740502839676e-05,
"loss": 0.4666,
"step": 262
},
{
"epoch": 1.308457711442786,
"grad_norm": 0.34921070865720966,
"learning_rate": 5.5787979041554336e-05,
"loss": 0.4029,
"step": 263
},
{
"epoch": 1.3134328358208955,
"grad_norm": 0.4247139167073171,
"learning_rate": 5.5574687152200294e-05,
"loss": 0.43,
"step": 264
},
{
"epoch": 1.3184079601990049,
"grad_norm": 0.3915409782396568,
"learning_rate": 5.536087200073621e-05,
"loss": 0.4001,
"step": 265
},
{
"epoch": 1.3233830845771144,
"grad_norm": 0.3389416749484375,
"learning_rate": 5.514654077070074e-05,
"loss": 0.3931,
"step": 266
},
{
"epoch": 1.328358208955224,
"grad_norm": 0.3744985906903865,
"learning_rate": 5.493170066297122e-05,
"loss": 0.3997,
"step": 267
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.3590802234283344,
"learning_rate": 5.471635889552171e-05,
"loss": 0.3835,
"step": 268
},
{
"epoch": 1.3383084577114428,
"grad_norm": 0.3698426511706334,
"learning_rate": 5.450052270318054e-05,
"loss": 0.4216,
"step": 269
},
{
"epoch": 1.3432835820895521,
"grad_norm": 0.4784044302472385,
"learning_rate": 5.42841993373872e-05,
"loss": 0.3705,
"step": 270
},
{
"epoch": 1.3482587064676617,
"grad_norm": 0.3628223132460973,
"learning_rate": 5.406739606594872e-05,
"loss": 0.4141,
"step": 271
},
{
"epoch": 1.3532338308457712,
"grad_norm": 0.3320557661495897,
"learning_rate": 5.3850120172795496e-05,
"loss": 0.4146,
"step": 272
},
{
"epoch": 1.3582089552238805,
"grad_norm": 0.3626608623601277,
"learning_rate": 5.36323789577366e-05,
"loss": 0.4141,
"step": 273
},
{
"epoch": 1.36318407960199,
"grad_norm": 0.32391201189440577,
"learning_rate": 5.341417973621447e-05,
"loss": 0.3953,
"step": 274
},
{
"epoch": 1.3681592039800994,
"grad_norm": 0.3530711948464647,
"learning_rate": 5.31955298390592e-05,
"loss": 0.4413,
"step": 275
},
{
"epoch": 1.373134328358209,
"grad_norm": 0.3388143208428122,
"learning_rate": 5.29764366122422e-05,
"loss": 0.4158,
"step": 276
},
{
"epoch": 1.3781094527363185,
"grad_norm": 0.37514946524437237,
"learning_rate": 5.275690741662939e-05,
"loss": 0.4158,
"step": 277
},
{
"epoch": 1.3830845771144278,
"grad_norm": 0.33697256732820746,
"learning_rate": 5.253694962773397e-05,
"loss": 0.4047,
"step": 278
},
{
"epoch": 1.3880597014925373,
"grad_norm": 0.31922818792904584,
"learning_rate": 5.2316570635468496e-05,
"loss": 0.3873,
"step": 279
},
{
"epoch": 1.3930348258706466,
"grad_norm": 0.41734702709710425,
"learning_rate": 5.209577784389673e-05,
"loss": 0.4288,
"step": 280
},
{
"epoch": 1.3980099502487562,
"grad_norm": 0.333151239021271,
"learning_rate": 5.1874578670984826e-05,
"loss": 0.3905,
"step": 281
},
{
"epoch": 1.4029850746268657,
"grad_norm": 0.36077988632299796,
"learning_rate": 5.1652980548352095e-05,
"loss": 0.4152,
"step": 282
},
{
"epoch": 1.407960199004975,
"grad_norm": 0.3596032551031165,
"learning_rate": 5.143099092102136e-05,
"loss": 0.4259,
"step": 283
},
{
"epoch": 1.4129353233830846,
"grad_norm": 0.3547815295533313,
"learning_rate": 5.1208617247168784e-05,
"loss": 0.3919,
"step": 284
},
{
"epoch": 1.417910447761194,
"grad_norm": 0.32341852061369397,
"learning_rate": 5.098586699787339e-05,
"loss": 0.3798,
"step": 285
},
{
"epoch": 1.4228855721393034,
"grad_norm": 0.3554342430636339,
"learning_rate": 5.07627476568659e-05,
"loss": 0.4085,
"step": 286
},
{
"epoch": 1.427860696517413,
"grad_norm": 0.41108995658095454,
"learning_rate": 5.053926672027748e-05,
"loss": 0.4191,
"step": 287
},
{
"epoch": 1.4328358208955223,
"grad_norm": 0.3551564879703378,
"learning_rate": 5.031543169638774e-05,
"loss": 0.4069,
"step": 288
},
{
"epoch": 1.4378109452736318,
"grad_norm": 0.34834914531770184,
"learning_rate": 5.0091250105372595e-05,
"loss": 0.3882,
"step": 289
},
{
"epoch": 1.4427860696517412,
"grad_norm": 0.3598765779758618,
"learning_rate": 4.986672947905153e-05,
"loss": 0.3804,
"step": 290
},
{
"epoch": 1.4477611940298507,
"grad_norm": 0.3651869029175629,
"learning_rate": 4.964187736063462e-05,
"loss": 0.4418,
"step": 291
},
{
"epoch": 1.4527363184079602,
"grad_norm": 0.3445888312659597,
"learning_rate": 4.941670130446901e-05,
"loss": 0.4157,
"step": 292
},
{
"epoch": 1.4577114427860698,
"grad_norm": 0.35813427415119026,
"learning_rate": 4.919120887578522e-05,
"loss": 0.3826,
"step": 293
},
{
"epoch": 1.462686567164179,
"grad_norm": 0.360403051462385,
"learning_rate": 4.8965407650442905e-05,
"loss": 0.4299,
"step": 294
},
{
"epoch": 1.4676616915422884,
"grad_norm": 0.3234770381678072,
"learning_rate": 4.8739305214676336e-05,
"loss": 0.3831,
"step": 295
},
{
"epoch": 1.472636815920398,
"grad_norm": 0.3607259148342821,
"learning_rate": 4.851290916483956e-05,
"loss": 0.4261,
"step": 296
},
{
"epoch": 1.4776119402985075,
"grad_norm": 0.3258301573358321,
"learning_rate": 4.828622710715115e-05,
"loss": 0.378,
"step": 297
},
{
"epoch": 1.482587064676617,
"grad_norm": 0.3288851224568668,
"learning_rate": 4.8059266657438686e-05,
"loss": 0.376,
"step": 298
},
{
"epoch": 1.4875621890547264,
"grad_norm": 0.3562719858206355,
"learning_rate": 4.7832035440882846e-05,
"loss": 0.4383,
"step": 299
},
{
"epoch": 1.4925373134328357,
"grad_norm": 0.3426198261946137,
"learning_rate": 4.760454109176128e-05,
"loss": 0.3914,
"step": 300
},
{
"epoch": 1.4975124378109452,
"grad_norm": 0.36962310476561455,
"learning_rate": 4.737679125319207e-05,
"loss": 0.3953,
"step": 301
},
{
"epoch": 1.5024875621890548,
"grad_norm": 0.340170063618827,
"learning_rate": 4.7148793576877e-05,
"loss": 0.4037,
"step": 302
},
{
"epoch": 1.5074626865671643,
"grad_norm": 0.31985106818418335,
"learning_rate": 4.692055572284441e-05,
"loss": 0.3729,
"step": 303
},
{
"epoch": 1.5124378109452736,
"grad_norm": 0.35693636989536703,
"learning_rate": 4.669208535919187e-05,
"loss": 0.3998,
"step": 304
},
{
"epoch": 1.517412935323383,
"grad_norm": 0.3565216917414643,
"learning_rate": 4.6463390161828625e-05,
"loss": 0.4654,
"step": 305
},
{
"epoch": 1.5223880597014925,
"grad_norm": 0.3276618680872065,
"learning_rate": 4.62344778142176e-05,
"loss": 0.3864,
"step": 306
},
{
"epoch": 1.527363184079602,
"grad_norm": 0.3452128607731188,
"learning_rate": 4.600535600711733e-05,
"loss": 0.4184,
"step": 307
},
{
"epoch": 1.5323383084577116,
"grad_norm": 0.3170488222819703,
"learning_rate": 4.5776032438323536e-05,
"loss": 0.4033,
"step": 308
},
{
"epoch": 1.537313432835821,
"grad_norm": 0.34350065215149495,
"learning_rate": 4.5546514812410537e-05,
"loss": 0.4185,
"step": 309
},
{
"epoch": 1.5422885572139302,
"grad_norm": 0.33678222386570505,
"learning_rate": 4.531681084047235e-05,
"loss": 0.3838,
"step": 310
},
{
"epoch": 1.5472636815920398,
"grad_norm": 0.34759111720203656,
"learning_rate": 4.50869282398637e-05,
"loss": 0.3936,
"step": 311
},
{
"epoch": 1.5522388059701493,
"grad_norm": 0.3211207303611902,
"learning_rate": 4.4856874733940635e-05,
"loss": 0.3826,
"step": 312
},
{
"epoch": 1.5572139303482588,
"grad_norm": 0.3612851847355618,
"learning_rate": 4.462665805180115e-05,
"loss": 0.404,
"step": 313
},
{
"epoch": 1.5621890547263682,
"grad_norm": 0.3511587780817123,
"learning_rate": 4.4396285928025444e-05,
"loss": 0.4102,
"step": 314
},
{
"epoch": 1.5671641791044775,
"grad_norm": 0.3584310431284297,
"learning_rate": 4.416576610241606e-05,
"loss": 0.418,
"step": 315
},
{
"epoch": 1.572139303482587,
"grad_norm": 0.3669767063579727,
"learning_rate": 4.393510631973793e-05,
"loss": 0.4267,
"step": 316
},
{
"epoch": 1.5771144278606966,
"grad_norm": 0.3268164381563611,
"learning_rate": 4.370431432945806e-05,
"loss": 0.3941,
"step": 317
},
{
"epoch": 1.582089552238806,
"grad_norm": 0.3508619416534592,
"learning_rate": 4.347339788548526e-05,
"loss": 0.3995,
"step": 318
},
{
"epoch": 1.5870646766169154,
"grad_norm": 0.3196058947918705,
"learning_rate": 4.3242364745909607e-05,
"loss": 0.3958,
"step": 319
},
{
"epoch": 1.5920398009950247,
"grad_norm": 0.34944455034926064,
"learning_rate": 4.301122267274177e-05,
"loss": 0.3747,
"step": 320
},
{
"epoch": 1.5970149253731343,
"grad_norm": 0.3414470311001462,
"learning_rate": 4.277997943165228e-05,
"loss": 0.3828,
"step": 321
},
{
"epoch": 1.6019900497512438,
"grad_norm": 0.34300557110432434,
"learning_rate": 4.2548642791710606e-05,
"loss": 0.4065,
"step": 322
},
{
"epoch": 1.6069651741293534,
"grad_norm": 0.3315879966748181,
"learning_rate": 4.23172205251241e-05,
"loss": 0.3697,
"step": 323
},
{
"epoch": 1.6119402985074627,
"grad_norm": 0.3330694565273453,
"learning_rate": 4.208572040697695e-05,
"loss": 0.3949,
"step": 324
},
{
"epoch": 1.616915422885572,
"grad_norm": 0.3354561985198847,
"learning_rate": 4.18541502149689e-05,
"loss": 0.3752,
"step": 325
},
{
"epoch": 1.6218905472636815,
"grad_norm": 0.3320378742535769,
"learning_rate": 4.162251772915396e-05,
"loss": 0.4328,
"step": 326
},
{
"epoch": 1.626865671641791,
"grad_norm": 0.37984693931888575,
"learning_rate": 4.139083073167902e-05,
"loss": 0.4452,
"step": 327
},
{
"epoch": 1.6318407960199006,
"grad_norm": 0.3477518807148073,
"learning_rate": 4.1159097006522407e-05,
"loss": 0.3884,
"step": 328
},
{
"epoch": 1.63681592039801,
"grad_norm": 0.38331379309387703,
"learning_rate": 4.092732433923236e-05,
"loss": 0.4358,
"step": 329
},
{
"epoch": 1.6417910447761193,
"grad_norm": 0.3231078961450345,
"learning_rate": 4.069552051666543e-05,
"loss": 0.4228,
"step": 330
},
{
"epoch": 1.6467661691542288,
"grad_norm": 0.34345629662186494,
"learning_rate": 4.0463693326724925e-05,
"loss": 0.4084,
"step": 331
},
{
"epoch": 1.6517412935323383,
"grad_norm": 0.32498201471285726,
"learning_rate": 4.0231850558099194e-05,
"loss": 0.4195,
"step": 332
},
{
"epoch": 1.6567164179104479,
"grad_norm": 0.33886498208601185,
"learning_rate": 4e-05,
"loss": 0.3906,
"step": 333
},
{
"epoch": 1.6616915422885572,
"grad_norm": 0.3448618436856829,
"learning_rate": 3.976814944190082e-05,
"loss": 0.3951,
"step": 334
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.422647256268607,
"learning_rate": 3.9536306673275095e-05,
"loss": 0.3553,
"step": 335
},
{
"epoch": 1.671641791044776,
"grad_norm": 0.373540380768812,
"learning_rate": 3.9304479483334576e-05,
"loss": 0.4274,
"step": 336
},
{
"epoch": 1.6766169154228856,
"grad_norm": 0.3479089624609906,
"learning_rate": 3.907267566076765e-05,
"loss": 0.378,
"step": 337
},
{
"epoch": 1.6815920398009951,
"grad_norm": 0.35053695758886855,
"learning_rate": 3.884090299347761e-05,
"loss": 0.424,
"step": 338
},
{
"epoch": 1.6865671641791045,
"grad_norm": 0.45197758226814305,
"learning_rate": 3.8609169268321e-05,
"loss": 0.4028,
"step": 339
},
{
"epoch": 1.6915422885572138,
"grad_norm": 0.3425199121501955,
"learning_rate": 3.837748227084605e-05,
"loss": 0.4012,
"step": 340
},
{
"epoch": 1.6965174129353233,
"grad_norm": 0.3184778916510887,
"learning_rate": 3.814584978503111e-05,
"loss": 0.3941,
"step": 341
},
{
"epoch": 1.7014925373134329,
"grad_norm": 0.3218368519985259,
"learning_rate": 3.791427959302306e-05,
"loss": 0.4073,
"step": 342
},
{
"epoch": 1.7064676616915424,
"grad_norm": 0.35780167550588043,
"learning_rate": 3.768277947487591e-05,
"loss": 0.4112,
"step": 343
},
{
"epoch": 1.7114427860696517,
"grad_norm": 0.33984570498228783,
"learning_rate": 3.7451357208289414e-05,
"loss": 0.3568,
"step": 344
},
{
"epoch": 1.716417910447761,
"grad_norm": 0.3098884240789945,
"learning_rate": 3.722002056834773e-05,
"loss": 0.3805,
"step": 345
},
{
"epoch": 1.7213930348258706,
"grad_norm": 0.32032507227277146,
"learning_rate": 3.6988777327258245e-05,
"loss": 0.4034,
"step": 346
},
{
"epoch": 1.7263681592039801,
"grad_norm": 0.3201489816718851,
"learning_rate": 3.675763525409041e-05,
"loss": 0.4088,
"step": 347
},
{
"epoch": 1.7313432835820897,
"grad_norm": 0.31920979171836683,
"learning_rate": 3.652660211451475e-05,
"loss": 0.3953,
"step": 348
},
{
"epoch": 1.736318407960199,
"grad_norm": 0.32780892777338394,
"learning_rate": 3.629568567054194e-05,
"loss": 0.4089,
"step": 349
},
{
"epoch": 1.7412935323383083,
"grad_norm": 0.3390908050496512,
"learning_rate": 3.6064893680262075e-05,
"loss": 0.3932,
"step": 350
},
{
"epoch": 1.7462686567164178,
"grad_norm": 0.31731531792600687,
"learning_rate": 3.583423389758395e-05,
"loss": 0.3847,
"step": 351
},
{
"epoch": 1.7512437810945274,
"grad_norm": 0.33714467493195,
"learning_rate": 3.5603714071974576e-05,
"loss": 0.3962,
"step": 352
},
{
"epoch": 1.756218905472637,
"grad_norm": 0.32050032403606926,
"learning_rate": 3.537334194819885e-05,
"loss": 0.4219,
"step": 353
},
{
"epoch": 1.7611940298507462,
"grad_norm": 0.34539825894953485,
"learning_rate": 3.5143125266059365e-05,
"loss": 0.4075,
"step": 354
},
{
"epoch": 1.7661691542288556,
"grad_norm": 0.3397347985270042,
"learning_rate": 3.4913071760136315e-05,
"loss": 0.4161,
"step": 355
},
{
"epoch": 1.771144278606965,
"grad_norm": 0.362703206594731,
"learning_rate": 3.468318915952766e-05,
"loss": 0.3847,
"step": 356
},
{
"epoch": 1.7761194029850746,
"grad_norm": 0.37947814193150187,
"learning_rate": 3.4453485187589484e-05,
"loss": 0.3733,
"step": 357
},
{
"epoch": 1.7810945273631842,
"grad_norm": 0.33573825765539345,
"learning_rate": 3.4223967561676464e-05,
"loss": 0.3959,
"step": 358
},
{
"epoch": 1.7860696517412935,
"grad_norm": 0.36863302025947026,
"learning_rate": 3.3994643992882675e-05,
"loss": 0.4115,
"step": 359
},
{
"epoch": 1.7910447761194028,
"grad_norm": 0.3614827656213472,
"learning_rate": 3.3765522185782414e-05,
"loss": 0.4063,
"step": 360
},
{
"epoch": 1.7960199004975124,
"grad_norm": 0.3539141617231257,
"learning_rate": 3.3536609838171395e-05,
"loss": 0.4119,
"step": 361
},
{
"epoch": 1.800995024875622,
"grad_norm": 0.36831769429972055,
"learning_rate": 3.330791464080814e-05,
"loss": 0.4128,
"step": 362
},
{
"epoch": 1.8059701492537314,
"grad_norm": 0.3215591845142348,
"learning_rate": 3.307944427715561e-05,
"loss": 0.3821,
"step": 363
},
{
"epoch": 1.8109452736318408,
"grad_norm": 0.35813676413956724,
"learning_rate": 3.2851206423123015e-05,
"loss": 0.3906,
"step": 364
},
{
"epoch": 1.81592039800995,
"grad_norm": 0.33955308336311896,
"learning_rate": 3.2623208746807935e-05,
"loss": 0.4149,
"step": 365
},
{
"epoch": 1.8208955223880596,
"grad_norm": 0.36772820484067015,
"learning_rate": 3.239545890823874e-05,
"loss": 0.411,
"step": 366
},
{
"epoch": 1.8258706467661692,
"grad_norm": 0.3669275359828165,
"learning_rate": 3.216796455911716e-05,
"loss": 0.3908,
"step": 367
},
{
"epoch": 1.8308457711442787,
"grad_norm": 0.3360940867007729,
"learning_rate": 3.194073334256133e-05,
"loss": 0.3939,
"step": 368
},
{
"epoch": 1.835820895522388,
"grad_norm": 0.3282348755853059,
"learning_rate": 3.171377289284886e-05,
"loss": 0.3987,
"step": 369
},
{
"epoch": 1.8407960199004973,
"grad_norm": 0.3257708000929639,
"learning_rate": 3.148709083516046e-05,
"loss": 0.3971,
"step": 370
},
{
"epoch": 1.845771144278607,
"grad_norm": 0.3079283959346746,
"learning_rate": 3.126069478532368e-05,
"loss": 0.3683,
"step": 371
},
{
"epoch": 1.8507462686567164,
"grad_norm": 0.3413053035869667,
"learning_rate": 3.103459234955711e-05,
"loss": 0.3877,
"step": 372
},
{
"epoch": 1.855721393034826,
"grad_norm": 0.31958058844137144,
"learning_rate": 3.0808791124214784e-05,
"loss": 0.3889,
"step": 373
},
{
"epoch": 1.8606965174129353,
"grad_norm": 0.3177636690558965,
"learning_rate": 3.0583298695531e-05,
"loss": 0.3778,
"step": 374
},
{
"epoch": 1.8656716417910446,
"grad_norm": 0.2970294606462075,
"learning_rate": 3.0358122639365395e-05,
"loss": 0.3879,
"step": 375
},
{
"epoch": 1.8706467661691542,
"grad_norm": 0.4030502322200175,
"learning_rate": 3.0133270520948467e-05,
"loss": 0.3966,
"step": 376
},
{
"epoch": 1.8756218905472637,
"grad_norm": 0.3082201392101219,
"learning_rate": 2.990874989462741e-05,
"loss": 0.3715,
"step": 377
},
{
"epoch": 1.8805970149253732,
"grad_norm": 0.35764449982425817,
"learning_rate": 2.9684568303612268e-05,
"loss": 0.3742,
"step": 378
},
{
"epoch": 1.8855721393034826,
"grad_norm": 0.3159937456034307,
"learning_rate": 2.9460733279722542e-05,
"loss": 0.3824,
"step": 379
},
{
"epoch": 1.890547263681592,
"grad_norm": 0.32331043003587234,
"learning_rate": 2.9237252343134098e-05,
"loss": 0.3891,
"step": 380
},
{
"epoch": 1.8955223880597014,
"grad_norm": 0.37425531357358577,
"learning_rate": 2.9014133002126623e-05,
"loss": 0.4506,
"step": 381
},
{
"epoch": 1.900497512437811,
"grad_norm": 0.33815725777177497,
"learning_rate": 2.879138275283122e-05,
"loss": 0.3743,
"step": 382
},
{
"epoch": 1.9054726368159205,
"grad_norm": 0.31236457607876705,
"learning_rate": 2.856900907897866e-05,
"loss": 0.3901,
"step": 383
},
{
"epoch": 1.9104477611940298,
"grad_norm": 0.2890304424344374,
"learning_rate": 2.834701945164793e-05,
"loss": 0.3644,
"step": 384
},
{
"epoch": 1.9154228855721394,
"grad_norm": 0.3708398866489987,
"learning_rate": 2.812542132901518e-05,
"loss": 0.4168,
"step": 385
},
{
"epoch": 1.9203980099502487,
"grad_norm": 0.380856014045257,
"learning_rate": 2.7904222156103276e-05,
"loss": 0.3956,
"step": 386
},
{
"epoch": 1.9253731343283582,
"grad_norm": 0.3404547166372201,
"learning_rate": 2.768342936453152e-05,
"loss": 0.3701,
"step": 387
},
{
"epoch": 1.9303482587064678,
"grad_norm": 0.3281747118440842,
"learning_rate": 2.7463050372266055e-05,
"loss": 0.3932,
"step": 388
},
{
"epoch": 1.935323383084577,
"grad_norm": 0.33072472038014866,
"learning_rate": 2.7243092583370613e-05,
"loss": 0.4122,
"step": 389
},
{
"epoch": 1.9402985074626866,
"grad_norm": 0.30399878417994697,
"learning_rate": 2.7023563387757814e-05,
"loss": 0.3623,
"step": 390
},
{
"epoch": 1.945273631840796,
"grad_norm": 0.2978112335333305,
"learning_rate": 2.6804470160940816e-05,
"loss": 0.3543,
"step": 391
},
{
"epoch": 1.9502487562189055,
"grad_norm": 0.3160634155222761,
"learning_rate": 2.6585820263785545e-05,
"loss": 0.3783,
"step": 392
},
{
"epoch": 1.955223880597015,
"grad_norm": 0.33157208172517716,
"learning_rate": 2.6367621042263406e-05,
"loss": 0.4121,
"step": 393
},
{
"epoch": 1.9601990049751243,
"grad_norm": 0.4402080931333875,
"learning_rate": 2.6149879827204513e-05,
"loss": 0.4042,
"step": 394
},
{
"epoch": 1.9651741293532339,
"grad_norm": 0.3002330447756732,
"learning_rate": 2.5932603934051296e-05,
"loss": 0.3928,
"step": 395
},
{
"epoch": 1.9701492537313432,
"grad_norm": 0.2943502159806271,
"learning_rate": 2.5715800662612816e-05,
"loss": 0.4044,
"step": 396
},
{
"epoch": 1.9751243781094527,
"grad_norm": 0.33490942595123435,
"learning_rate": 2.5499477296819473e-05,
"loss": 0.3932,
"step": 397
},
{
"epoch": 1.9800995024875623,
"grad_norm": 0.33880593496795247,
"learning_rate": 2.5283641104478304e-05,
"loss": 0.352,
"step": 398
},
{
"epoch": 1.9850746268656716,
"grad_norm": 0.33544144575729795,
"learning_rate": 2.5068299337028795e-05,
"loss": 0.3702,
"step": 399
},
{
"epoch": 1.9900497512437811,
"grad_norm": 0.29325661298374756,
"learning_rate": 2.485345922929927e-05,
"loss": 0.3901,
"step": 400
},
{
"epoch": 1.9950248756218905,
"grad_norm": 0.3214857721910235,
"learning_rate": 2.4639127999263802e-05,
"loss": 0.3943,
"step": 401
},
{
"epoch": 2.0,
"grad_norm": 0.33576678494810924,
"learning_rate": 2.4425312847799713e-05,
"loss": 0.3413,
"step": 402
},
{
"epoch": 2.0049751243781095,
"grad_norm": 0.5918142779151857,
"learning_rate": 2.4212020958445674e-05,
"loss": 0.2562,
"step": 403
},
{
"epoch": 2.009950248756219,
"grad_norm": 0.41209456448663645,
"learning_rate": 2.3999259497160337e-05,
"loss": 0.2559,
"step": 404
},
{
"epoch": 2.014925373134328,
"grad_norm": 0.6187901280392113,
"learning_rate": 2.3787035612081573e-05,
"loss": 0.2831,
"step": 405
},
{
"epoch": 2.0199004975124377,
"grad_norm": 0.48881789876251663,
"learning_rate": 2.3575356433286336e-05,
"loss": 0.2618,
"step": 406
},
{
"epoch": 2.0248756218905473,
"grad_norm": 0.4081982645902415,
"learning_rate": 2.3364229072551084e-05,
"loss": 0.2762,
"step": 407
},
{
"epoch": 2.029850746268657,
"grad_norm": 0.5200105687768247,
"learning_rate": 2.3153660623112922e-05,
"loss": 0.2592,
"step": 408
},
{
"epoch": 2.0348258706467663,
"grad_norm": 0.442494043835259,
"learning_rate": 2.2943658159431195e-05,
"loss": 0.2754,
"step": 409
},
{
"epoch": 2.0398009950248754,
"grad_norm": 0.41218650620030506,
"learning_rate": 2.273422873694984e-05,
"loss": 0.2182,
"step": 410
},
{
"epoch": 2.044776119402985,
"grad_norm": 0.45136550018816846,
"learning_rate": 2.2525379391860378e-05,
"loss": 0.2721,
"step": 411
},
{
"epoch": 2.0497512437810945,
"grad_norm": 0.5992719898523112,
"learning_rate": 2.2317117140865475e-05,
"loss": 0.2294,
"step": 412
},
{
"epoch": 2.054726368159204,
"grad_norm": 0.3645065043740578,
"learning_rate": 2.2109448980943222e-05,
"loss": 0.2439,
"step": 413
},
{
"epoch": 2.0597014925373136,
"grad_norm": 0.3817582428926202,
"learning_rate": 2.1902381889112094e-05,
"loss": 0.255,
"step": 414
},
{
"epoch": 2.0646766169154227,
"grad_norm": 0.3584286645671438,
"learning_rate": 2.1695922822196454e-05,
"loss": 0.2364,
"step": 415
},
{
"epoch": 2.0696517412935322,
"grad_norm": 0.4669254285695078,
"learning_rate": 2.149007871659291e-05,
"loss": 0.2812,
"step": 416
},
{
"epoch": 2.074626865671642,
"grad_norm": 0.7251931898787578,
"learning_rate": 2.1284856488037223e-05,
"loss": 0.1954,
"step": 417
},
{
"epoch": 2.0796019900497513,
"grad_norm": 0.4011018327789735,
"learning_rate": 2.1080263031372e-05,
"loss": 0.2611,
"step": 418
},
{
"epoch": 2.084577114427861,
"grad_norm": 0.3817812517359356,
"learning_rate": 2.0876305220315e-05,
"loss": 0.2573,
"step": 419
},
{
"epoch": 2.08955223880597,
"grad_norm": 0.37743477467908,
"learning_rate": 2.0672989907228214e-05,
"loss": 0.2558,
"step": 420
},
{
"epoch": 2.0945273631840795,
"grad_norm": 0.38153865146108384,
"learning_rate": 2.047032392288772e-05,
"loss": 0.2383,
"step": 421
},
{
"epoch": 2.099502487562189,
"grad_norm": 0.38606128931772565,
"learning_rate": 2.0268314076254055e-05,
"loss": 0.2283,
"step": 422
},
{
"epoch": 2.1044776119402986,
"grad_norm": 0.3446456674752202,
"learning_rate": 2.0066967154243557e-05,
"loss": 0.2512,
"step": 423
},
{
"epoch": 2.109452736318408,
"grad_norm": 0.4013318980024715,
"learning_rate": 1.9866289921500303e-05,
"loss": 0.2622,
"step": 424
},
{
"epoch": 2.1144278606965172,
"grad_norm": 0.3737939416328244,
"learning_rate": 1.966628912016884e-05,
"loss": 0.2497,
"step": 425
},
{
"epoch": 2.1194029850746268,
"grad_norm": 0.4009685350663624,
"learning_rate": 1.946697146966772e-05,
"loss": 0.2609,
"step": 426
},
{
"epoch": 2.1243781094527363,
"grad_norm": 0.3642686072097498,
"learning_rate": 1.9268343666463657e-05,
"loss": 0.2662,
"step": 427
},
{
"epoch": 2.129353233830846,
"grad_norm": 0.3799997362643185,
"learning_rate": 1.907041238384661e-05,
"loss": 0.2543,
"step": 428
},
{
"epoch": 2.1343283582089554,
"grad_norm": 0.3719896858231849,
"learning_rate": 1.887318427170556e-05,
"loss": 0.2665,
"step": 429
},
{
"epoch": 2.1393034825870645,
"grad_norm": 0.3683517131634379,
"learning_rate": 1.8676665956305132e-05,
"loss": 0.2643,
"step": 430
},
{
"epoch": 2.144278606965174,
"grad_norm": 0.3653894139897945,
"learning_rate": 1.84808640400629e-05,
"loss": 0.2441,
"step": 431
},
{
"epoch": 2.1492537313432836,
"grad_norm": 0.355426607510642,
"learning_rate": 1.8285785101327613e-05,
"loss": 0.2506,
"step": 432
},
{
"epoch": 2.154228855721393,
"grad_norm": 0.36584437796341124,
"learning_rate": 1.8091435694158174e-05,
"loss": 0.2539,
"step": 433
},
{
"epoch": 2.1592039800995027,
"grad_norm": 0.37592267739456214,
"learning_rate": 1.789782234810348e-05,
"loss": 0.2493,
"step": 434
},
{
"epoch": 2.1641791044776117,
"grad_norm": 0.35589449226988645,
"learning_rate": 1.7704951567982967e-05,
"loss": 0.2498,
"step": 435
},
{
"epoch": 2.1691542288557213,
"grad_norm": 0.3763201555098831,
"learning_rate": 1.751282983366814e-05,
"loss": 0.2802,
"step": 436
},
{
"epoch": 2.174129353233831,
"grad_norm": 0.3397712514365849,
"learning_rate": 1.7321463599864836e-05,
"loss": 0.2503,
"step": 437
},
{
"epoch": 2.1791044776119404,
"grad_norm": 0.3719290616698573,
"learning_rate": 1.713085929589635e-05,
"loss": 0.2404,
"step": 438
},
{
"epoch": 2.18407960199005,
"grad_norm": 0.3579620248101577,
"learning_rate": 1.6941023325487516e-05,
"loss": 0.2529,
"step": 439
},
{
"epoch": 2.189054726368159,
"grad_norm": 0.3378961286180641,
"learning_rate": 1.6751962066549445e-05,
"loss": 0.2299,
"step": 440
},
{
"epoch": 2.1940298507462686,
"grad_norm": 0.3409511987760758,
"learning_rate": 1.65636818709653e-05,
"loss": 0.2592,
"step": 441
},
{
"epoch": 2.199004975124378,
"grad_norm": 0.34209632040341525,
"learning_rate": 1.63761890643769e-05,
"loss": 0.2501,
"step": 442
},
{
"epoch": 2.2039800995024876,
"grad_norm": 0.3570923790333368,
"learning_rate": 1.6189489945972218e-05,
"loss": 0.2384,
"step": 443
},
{
"epoch": 2.208955223880597,
"grad_norm": 0.36512054797968035,
"learning_rate": 1.6003590788273672e-05,
"loss": 0.2561,
"step": 444
},
{
"epoch": 2.2139303482587063,
"grad_norm": 0.35591678638553415,
"learning_rate": 1.5818497836927464e-05,
"loss": 0.2302,
"step": 445
},
{
"epoch": 2.218905472636816,
"grad_norm": 0.3753879431710149,
"learning_rate": 1.56342173104937e-05,
"loss": 0.2632,
"step": 446
},
{
"epoch": 2.2238805970149254,
"grad_norm": 0.333664977680914,
"learning_rate": 1.545075540023748e-05,
"loss": 0.2477,
"step": 447
},
{
"epoch": 2.228855721393035,
"grad_norm": 0.3403946010339331,
"learning_rate": 1.5268118269920913e-05,
"loss": 0.2723,
"step": 448
},
{
"epoch": 2.2338308457711444,
"grad_norm": 0.4548760797070764,
"learning_rate": 1.5086312055595986e-05,
"loss": 0.2747,
"step": 449
},
{
"epoch": 2.2388059701492535,
"grad_norm": 0.35301398479196083,
"learning_rate": 1.4905342865398447e-05,
"loss": 0.234,
"step": 450
},
{
"epoch": 2.243781094527363,
"grad_norm": 0.34038959592803697,
"learning_rate": 1.4725216779342563e-05,
"loss": 0.2026,
"step": 451
},
{
"epoch": 2.2487562189054726,
"grad_norm": 0.34067462785209285,
"learning_rate": 1.4545939849116905e-05,
"loss": 0.2596,
"step": 452
},
{
"epoch": 2.253731343283582,
"grad_norm": 0.3466734614903421,
"learning_rate": 1.4367518097880959e-05,
"loss": 0.2529,
"step": 453
},
{
"epoch": 2.2587064676616917,
"grad_norm": 0.40268957982538645,
"learning_rate": 1.4189957520062802e-05,
"loss": 0.2562,
"step": 454
},
{
"epoch": 2.2636815920398012,
"grad_norm": 0.7602394175299795,
"learning_rate": 1.4013264081157716e-05,
"loss": 0.2388,
"step": 455
},
{
"epoch": 2.2686567164179103,
"grad_norm": 0.33285450157056085,
"learning_rate": 1.3837443717527723e-05,
"loss": 0.2416,
"step": 456
},
{
"epoch": 2.27363184079602,
"grad_norm": 0.34514962123726534,
"learning_rate": 1.3662502336202227e-05,
"loss": 0.2509,
"step": 457
},
{
"epoch": 2.2786069651741294,
"grad_norm": 0.40102532759600523,
"learning_rate": 1.3488445814679456e-05,
"loss": 0.2497,
"step": 458
},
{
"epoch": 2.283582089552239,
"grad_norm": 0.36204012456455475,
"learning_rate": 1.331528000072905e-05,
"loss": 0.2477,
"step": 459
},
{
"epoch": 2.288557213930348,
"grad_norm": 0.3738732602116117,
"learning_rate": 1.314301071219557e-05,
"loss": 0.2479,
"step": 460
},
{
"epoch": 2.2935323383084576,
"grad_norm": 0.40091238289977654,
"learning_rate": 1.2971643736803099e-05,
"loss": 0.2878,
"step": 461
},
{
"epoch": 2.298507462686567,
"grad_norm": 0.35336164095784656,
"learning_rate": 1.2801184831960697e-05,
"loss": 0.2499,
"step": 462
},
{
"epoch": 2.3034825870646767,
"grad_norm": 0.35065141906546554,
"learning_rate": 1.2631639724569027e-05,
"loss": 0.2625,
"step": 463
},
{
"epoch": 2.308457711442786,
"grad_norm": 0.37049415729179436,
"learning_rate": 1.2463014110827945e-05,
"loss": 0.2576,
"step": 464
},
{
"epoch": 2.3134328358208958,
"grad_norm": 0.40666661148958544,
"learning_rate": 1.2295313656045096e-05,
"loss": 0.2806,
"step": 465
},
{
"epoch": 2.318407960199005,
"grad_norm": 0.3600659090131196,
"learning_rate": 1.2128543994445639e-05,
"loss": 0.2494,
"step": 466
},
{
"epoch": 2.3233830845771144,
"grad_norm": 0.3415014141826654,
"learning_rate": 1.1962710728982882e-05,
"loss": 0.2302,
"step": 467
},
{
"epoch": 2.328358208955224,
"grad_norm": 0.3511085800170491,
"learning_rate": 1.1797819431150078e-05,
"loss": 0.2418,
"step": 468
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.36088577119554327,
"learning_rate": 1.163387564079323e-05,
"loss": 0.2311,
"step": 469
},
{
"epoch": 2.3383084577114426,
"grad_norm": 0.3475970107617478,
"learning_rate": 1.1470884865924986e-05,
"loss": 0.2542,
"step": 470
},
{
"epoch": 2.343283582089552,
"grad_norm": 0.3371040148205862,
"learning_rate": 1.1308852582539549e-05,
"loss": 0.2398,
"step": 471
},
{
"epoch": 2.3482587064676617,
"grad_norm": 0.3559672069242172,
"learning_rate": 1.1147784234428748e-05,
"loss": 0.2471,
"step": 472
},
{
"epoch": 2.353233830845771,
"grad_norm": 0.34758764543619075,
"learning_rate": 1.0987685232999094e-05,
"loss": 0.2567,
"step": 473
},
{
"epoch": 2.3582089552238807,
"grad_norm": 0.3586333567415116,
"learning_rate": 1.082856095709004e-05,
"loss": 0.2271,
"step": 474
},
{
"epoch": 2.3631840796019903,
"grad_norm": 0.3500617888267376,
"learning_rate": 1.0670416752793184e-05,
"loss": 0.2193,
"step": 475
},
{
"epoch": 2.3681592039800994,
"grad_norm": 0.32469351418788767,
"learning_rate": 1.0513257933272713e-05,
"loss": 0.2426,
"step": 476
},
{
"epoch": 2.373134328358209,
"grad_norm": 0.33387986576032047,
"learning_rate": 1.0357089778586892e-05,
"loss": 0.253,
"step": 477
},
{
"epoch": 2.3781094527363185,
"grad_norm": 0.35603804698804176,
"learning_rate": 1.0201917535510634e-05,
"loss": 0.2481,
"step": 478
},
{
"epoch": 2.383084577114428,
"grad_norm": 0.3540990244834932,
"learning_rate": 1.0047746417359306e-05,
"loss": 0.2683,
"step": 479
},
{
"epoch": 2.388059701492537,
"grad_norm": 0.32820898731863224,
"learning_rate": 9.894581603813464e-06,
"loss": 0.2353,
"step": 480
},
{
"epoch": 2.3930348258706466,
"grad_norm": 0.3815756199076165,
"learning_rate": 9.74242824074493e-06,
"loss": 0.2407,
"step": 481
},
{
"epoch": 2.398009950248756,
"grad_norm": 0.3560064670515033,
"learning_rate": 9.591291440043826e-06,
"loss": 0.2551,
"step": 482
},
{
"epoch": 2.4029850746268657,
"grad_norm": 0.36110329445178707,
"learning_rate": 9.441176279446931e-06,
"loss": 0.2602,
"step": 483
},
{
"epoch": 2.4079601990049753,
"grad_norm": 0.3750715483874539,
"learning_rate": 9.292087802366972e-06,
"loss": 0.2626,
"step": 484
},
{
"epoch": 2.412935323383085,
"grad_norm": 0.33138719559693736,
"learning_rate": 9.144031017723249e-06,
"loss": 0.2442,
"step": 485
},
{
"epoch": 2.417910447761194,
"grad_norm": 0.33379541759464765,
"learning_rate": 8.997010899773345e-06,
"loss": 0.2269,
"step": 486
},
{
"epoch": 2.4228855721393034,
"grad_norm": 0.3369139501102586,
"learning_rate": 8.85103238794597e-06,
"loss": 0.2491,
"step": 487
},
{
"epoch": 2.427860696517413,
"grad_norm": 0.35526144153013156,
"learning_rate": 8.706100386675077e-06,
"loss": 0.2565,
"step": 488
},
{
"epoch": 2.4328358208955225,
"grad_norm": 0.3598570097724235,
"learning_rate": 8.562219765235017e-06,
"loss": 0.2354,
"step": 489
},
{
"epoch": 2.4378109452736316,
"grad_norm": 0.33228280679960287,
"learning_rate": 8.419395357576982e-06,
"loss": 0.2396,
"step": 490
},
{
"epoch": 2.442786069651741,
"grad_norm": 0.4418521047066077,
"learning_rate": 8.27763196216659e-06,
"loss": 0.2544,
"step": 491
},
{
"epoch": 2.4477611940298507,
"grad_norm": 0.358004162374617,
"learning_rate": 8.136934341822695e-06,
"loss": 0.2583,
"step": 492
},
{
"epoch": 2.4527363184079602,
"grad_norm": 0.44898354057719714,
"learning_rate": 7.997307223557338e-06,
"loss": 0.26,
"step": 493
},
{
"epoch": 2.45771144278607,
"grad_norm": 0.41898718849713473,
"learning_rate": 7.858755298416936e-06,
"loss": 0.2522,
"step": 494
},
{
"epoch": 2.4626865671641793,
"grad_norm": 0.3683253603248198,
"learning_rate": 7.721283221324705e-06,
"loss": 0.2501,
"step": 495
},
{
"epoch": 2.4676616915422884,
"grad_norm": 0.33790174069212986,
"learning_rate": 7.584895610924232e-06,
"loss": 0.2538,
"step": 496
},
{
"epoch": 2.472636815920398,
"grad_norm": 0.3392175147909702,
"learning_rate": 7.449597049424357e-06,
"loss": 0.2553,
"step": 497
},
{
"epoch": 2.4776119402985075,
"grad_norm": 0.33260011750236457,
"learning_rate": 7.3153920824451516e-06,
"loss": 0.2258,
"step": 498
},
{
"epoch": 2.482587064676617,
"grad_norm": 0.4565479048485011,
"learning_rate": 7.182285218865264e-06,
"loss": 0.2721,
"step": 499
},
{
"epoch": 2.487562189054726,
"grad_norm": 0.3321494451833214,
"learning_rate": 7.050280930670381e-06,
"loss": 0.255,
"step": 500
},
{
"epoch": 2.4925373134328357,
"grad_norm": 0.34763473535410505,
"learning_rate": 6.919383652803051e-06,
"loss": 0.2746,
"step": 501
},
{
"epoch": 2.4975124378109452,
"grad_norm": 0.3135535144069993,
"learning_rate": 6.78959778301361e-06,
"loss": 0.2314,
"step": 502
},
{
"epoch": 2.5024875621890548,
"grad_norm": 0.3516491625832736,
"learning_rate": 6.660927681712475e-06,
"loss": 0.2766,
"step": 503
},
{
"epoch": 2.5074626865671643,
"grad_norm": 0.33588371388636734,
"learning_rate": 6.533377671823631e-06,
"loss": 0.2194,
"step": 504
},
{
"epoch": 2.512437810945274,
"grad_norm": 0.31803554462798533,
"learning_rate": 6.406952038639396e-06,
"loss": 0.2298,
"step": 505
},
{
"epoch": 2.517412935323383,
"grad_norm": 0.3452688943981057,
"learning_rate": 6.281655029676481e-06,
"loss": 0.2482,
"step": 506
},
{
"epoch": 2.5223880597014925,
"grad_norm": 0.3443466901108766,
"learning_rate": 6.157490854533215e-06,
"loss": 0.2385,
"step": 507
},
{
"epoch": 2.527363184079602,
"grad_norm": 0.3318444309920713,
"learning_rate": 6.034463684748178e-06,
"loss": 0.2542,
"step": 508
},
{
"epoch": 2.5323383084577116,
"grad_norm": 0.3546523722653983,
"learning_rate": 5.912577653660019e-06,
"loss": 0.2425,
"step": 509
},
{
"epoch": 2.5373134328358207,
"grad_norm": 0.3359439170821228,
"learning_rate": 5.79183685626862e-06,
"loss": 0.2325,
"step": 510
},
{
"epoch": 2.54228855721393,
"grad_norm": 0.34774707626690227,
"learning_rate": 5.672245349097471e-06,
"loss": 0.2503,
"step": 511
},
{
"epoch": 2.5472636815920398,
"grad_norm": 0.3277173676536131,
"learning_rate": 5.553807150057418e-06,
"loss": 0.2342,
"step": 512
},
{
"epoch": 2.5522388059701493,
"grad_norm": 0.38238411821050516,
"learning_rate": 5.436526238311644e-06,
"loss": 0.23,
"step": 513
},
{
"epoch": 2.557213930348259,
"grad_norm": 0.336557949765213,
"learning_rate": 5.320406554142037e-06,
"loss": 0.242,
"step": 514
},
{
"epoch": 2.5621890547263684,
"grad_norm": 0.3759208097643772,
"learning_rate": 5.2054519988167415e-06,
"loss": 0.2507,
"step": 515
},
{
"epoch": 2.5671641791044775,
"grad_norm": 0.36209335998781095,
"learning_rate": 5.091666434459121e-06,
"loss": 0.2699,
"step": 516
},
{
"epoch": 2.572139303482587,
"grad_norm": 0.35026794234042397,
"learning_rate": 4.979053683918e-06,
"loss": 0.2405,
"step": 517
},
{
"epoch": 2.5771144278606966,
"grad_norm": 0.3680263693984634,
"learning_rate": 4.867617530639224e-06,
"loss": 0.2472,
"step": 518
},
{
"epoch": 2.582089552238806,
"grad_norm": 0.3519030497333725,
"learning_rate": 4.757361718538569e-06,
"loss": 0.2567,
"step": 519
},
{
"epoch": 2.587064676616915,
"grad_norm": 0.3364511323706219,
"learning_rate": 4.648289951875917e-06,
"loss": 0.2349,
"step": 520
},
{
"epoch": 2.5920398009950247,
"grad_norm": 0.3262056797220248,
"learning_rate": 4.540405895130824e-06,
"loss": 0.2561,
"step": 521
},
{
"epoch": 2.5970149253731343,
"grad_norm": 0.3580197035735468,
"learning_rate": 4.433713172879417e-06,
"loss": 0.2366,
"step": 522
},
{
"epoch": 2.601990049751244,
"grad_norm": 0.3304871040650097,
"learning_rate": 4.328215369672606e-06,
"loss": 0.2621,
"step": 523
},
{
"epoch": 2.6069651741293534,
"grad_norm": 0.3299851878213626,
"learning_rate": 4.2239160299156536e-06,
"loss": 0.2361,
"step": 524
},
{
"epoch": 2.611940298507463,
"grad_norm": 0.3537662063365746,
"learning_rate": 4.1208186577490836e-06,
"loss": 0.2704,
"step": 525
},
{
"epoch": 2.616915422885572,
"grad_norm": 0.3618498169436797,
"learning_rate": 4.018926716931e-06,
"loss": 0.2645,
"step": 526
},
{
"epoch": 2.6218905472636815,
"grad_norm": 0.49476078567714166,
"learning_rate": 3.918243630720651e-06,
"loss": 0.2774,
"step": 527
},
{
"epoch": 2.626865671641791,
"grad_norm": 0.3436946389926571,
"learning_rate": 3.818772781763449e-06,
"loss": 0.2311,
"step": 528
},
{
"epoch": 2.6318407960199006,
"grad_norm": 0.31457657442983117,
"learning_rate": 3.7205175119773285e-06,
"loss": 0.231,
"step": 529
},
{
"epoch": 2.6368159203980097,
"grad_norm": 0.33774906407685856,
"learning_rate": 3.6234811224404686e-06,
"loss": 0.2341,
"step": 530
},
{
"epoch": 2.6417910447761193,
"grad_norm": 0.33915196241962076,
"learning_rate": 3.527666873280362e-06,
"loss": 0.2747,
"step": 531
},
{
"epoch": 2.646766169154229,
"grad_norm": 0.7253067903486308,
"learning_rate": 3.4330779835643235e-06,
"loss": 0.2434,
"step": 532
},
{
"epoch": 2.6517412935323383,
"grad_norm": 0.3533314185286548,
"learning_rate": 3.339717631191306e-06,
"loss": 0.2463,
"step": 533
},
{
"epoch": 2.656716417910448,
"grad_norm": 0.3488214812675212,
"learning_rate": 3.2475889527851413e-06,
"loss": 0.2564,
"step": 534
},
{
"epoch": 2.6616915422885574,
"grad_norm": 0.37464147411382354,
"learning_rate": 3.156695043589171e-06,
"loss": 0.2527,
"step": 535
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.31641458687486096,
"learning_rate": 3.0670389573622406e-06,
"loss": 0.2433,
"step": 536
},
{
"epoch": 2.671641791044776,
"grad_norm": 0.33063487766863686,
"learning_rate": 2.9786237062761247e-06,
"loss": 0.2596,
"step": 537
},
{
"epoch": 2.6766169154228856,
"grad_norm": 0.3198962608716895,
"learning_rate": 2.891452260814287e-06,
"loss": 0.2398,
"step": 538
},
{
"epoch": 2.681592039800995,
"grad_norm": 0.33045280401704574,
"learning_rate": 2.805527549672129e-06,
"loss": 0.2354,
"step": 539
},
{
"epoch": 2.6865671641791042,
"grad_norm": 0.32990177471806453,
"learning_rate": 2.7208524596585496e-06,
"loss": 0.255,
"step": 540
},
{
"epoch": 2.691542288557214,
"grad_norm": 0.3064290269911759,
"learning_rate": 2.637429835599008e-06,
"loss": 0.2322,
"step": 541
},
{
"epoch": 2.6965174129353233,
"grad_norm": 0.329629777652285,
"learning_rate": 2.5552624802398905e-06,
"loss": 0.2308,
"step": 542
},
{
"epoch": 2.701492537313433,
"grad_norm": 0.3203930929813302,
"learning_rate": 2.4743531541543807e-06,
"loss": 0.255,
"step": 543
},
{
"epoch": 2.7064676616915424,
"grad_norm": 0.3246337153430093,
"learning_rate": 2.3947045756497157e-06,
"loss": 0.2334,
"step": 544
},
{
"epoch": 2.711442786069652,
"grad_norm": 0.34574042588208914,
"learning_rate": 2.3163194206758365e-06,
"loss": 0.2315,
"step": 545
},
{
"epoch": 2.716417910447761,
"grad_norm": 0.33095949362492455,
"learning_rate": 2.2392003227355064e-06,
"loss": 0.2533,
"step": 546
},
{
"epoch": 2.7213930348258706,
"grad_norm": 0.327348254225279,
"learning_rate": 2.163349872795819e-06,
"loss": 0.228,
"step": 547
},
{
"epoch": 2.72636815920398,
"grad_norm": 0.3310532056353117,
"learning_rate": 2.0887706192011505e-06,
"loss": 0.2532,
"step": 548
},
{
"epoch": 2.7313432835820897,
"grad_norm": 0.3659658179822786,
"learning_rate": 2.015465067587554e-06,
"loss": 0.2526,
"step": 549
},
{
"epoch": 2.7363184079601988,
"grad_norm": 0.31321771077874316,
"learning_rate": 1.943435680798573e-06,
"loss": 0.2317,
"step": 550
},
{
"epoch": 2.7412935323383083,
"grad_norm": 0.35622804831329247,
"learning_rate": 1.872684878802482e-06,
"loss": 0.2778,
"step": 551
},
{
"epoch": 2.746268656716418,
"grad_norm": 0.33560958300477206,
"learning_rate": 1.8032150386110103e-06,
"loss": 0.2313,
"step": 552
},
{
"epoch": 2.7512437810945274,
"grad_norm": 0.32839333475527616,
"learning_rate": 1.735028494199451e-06,
"loss": 0.2675,
"step": 553
},
{
"epoch": 2.756218905472637,
"grad_norm": 0.32930584068523533,
"learning_rate": 1.6681275364282835e-06,
"loss": 0.2356,
"step": 554
},
{
"epoch": 2.7611940298507465,
"grad_norm": 0.343780413896504,
"learning_rate": 1.6025144129661763e-06,
"loss": 0.2342,
"step": 555
},
{
"epoch": 2.7661691542288556,
"grad_norm": 0.3676149014995963,
"learning_rate": 1.5381913282144711e-06,
"loss": 0.257,
"step": 556
},
{
"epoch": 2.771144278606965,
"grad_norm": 0.32741721241637517,
"learning_rate": 1.4751604432331567e-06,
"loss": 0.2393,
"step": 557
},
{
"epoch": 2.7761194029850746,
"grad_norm": 0.5133228461228249,
"learning_rate": 1.4134238756682162e-06,
"loss": 0.2468,
"step": 558
},
{
"epoch": 2.781094527363184,
"grad_norm": 0.6717564368152784,
"learning_rate": 1.3529836996805235e-06,
"loss": 0.2432,
"step": 559
},
{
"epoch": 2.7860696517412933,
"grad_norm": 0.3650345457576272,
"learning_rate": 1.2938419458761398e-06,
"loss": 0.2034,
"step": 560
},
{
"epoch": 2.791044776119403,
"grad_norm": 0.37879362944079575,
"learning_rate": 1.23600060123807e-06,
"loss": 0.2471,
"step": 561
},
{
"epoch": 2.7960199004975124,
"grad_norm": 0.3497471145381787,
"learning_rate": 1.1794616090595422e-06,
"loss": 0.249,
"step": 562
},
{
"epoch": 2.800995024875622,
"grad_norm": 0.3258716514081455,
"learning_rate": 1.124226868878715e-06,
"loss": 0.2184,
"step": 563
},
{
"epoch": 2.8059701492537314,
"grad_norm": 0.335029096301279,
"learning_rate": 1.0702982364148195e-06,
"loss": 0.2233,
"step": 564
},
{
"epoch": 2.810945273631841,
"grad_norm": 0.33089653722712126,
"learning_rate": 1.0176775235058645e-06,
"loss": 0.2127,
"step": 565
},
{
"epoch": 2.81592039800995,
"grad_norm": 0.36553404527889055,
"learning_rate": 9.66366498047724e-07,
"loss": 0.2472,
"step": 566
},
{
"epoch": 2.8208955223880596,
"grad_norm": 0.33513935646205256,
"learning_rate": 9.163668839347672e-07,
"loss": 0.2599,
"step": 567
},
{
"epoch": 2.825870646766169,
"grad_norm": 0.3327266292283009,
"learning_rate": 8.676803610019368e-07,
"loss": 0.243,
"step": 568
},
{
"epoch": 2.8308457711442787,
"grad_norm": 0.32072020619433717,
"learning_rate": 8.203085649682863e-07,
"loss": 0.2248,
"step": 569
},
{
"epoch": 2.835820895522388,
"grad_norm": 0.34180911322997665,
"learning_rate": 7.742530873820686e-07,
"loss": 0.2687,
"step": 570
},
{
"epoch": 2.8407960199004973,
"grad_norm": 0.32625172704997785,
"learning_rate": 7.295154755672196e-07,
"loss": 0.2164,
"step": 571
},
{
"epoch": 2.845771144278607,
"grad_norm": 0.31494320482153554,
"learning_rate": 6.860972325714121e-07,
"loss": 0.2431,
"step": 572
},
{
"epoch": 2.8507462686567164,
"grad_norm": 0.33375409453354005,
"learning_rate": 6.439998171155326e-07,
"loss": 0.2272,
"step": 573
},
{
"epoch": 2.855721393034826,
"grad_norm": 0.3661672121057468,
"learning_rate": 6.032246435446754e-07,
"loss": 0.2661,
"step": 574
},
{
"epoch": 2.8606965174129355,
"grad_norm": 0.32274521306276094,
"learning_rate": 5.637730817806341e-07,
"loss": 0.2219,
"step": 575
},
{
"epoch": 2.8656716417910446,
"grad_norm": 0.31596637325997035,
"learning_rate": 5.256464572758723e-07,
"loss": 0.2135,
"step": 576
},
{
"epoch": 2.870646766169154,
"grad_norm": 0.3083576046383486,
"learning_rate": 4.888460509689941e-07,
"loss": 0.2188,
"step": 577
},
{
"epoch": 2.8756218905472637,
"grad_norm": 0.33292190176115743,
"learning_rate": 4.533730992417029e-07,
"loss": 0.2557,
"step": 578
},
{
"epoch": 2.8805970149253732,
"grad_norm": 0.46548707820421953,
"learning_rate": 4.19228793877271e-07,
"loss": 0.2561,
"step": 579
},
{
"epoch": 2.8855721393034823,
"grad_norm": 0.3202236939710589,
"learning_rate": 3.8641428202048634e-07,
"loss": 0.2295,
"step": 580
},
{
"epoch": 2.890547263681592,
"grad_norm": 0.3250224661241107,
"learning_rate": 3.549306661391283e-07,
"loss": 0.2606,
"step": 581
},
{
"epoch": 2.8955223880597014,
"grad_norm": 0.31518434096820275,
"learning_rate": 3.247790039869214e-07,
"loss": 0.2512,
"step": 582
},
{
"epoch": 2.900497512437811,
"grad_norm": 0.3819340666754849,
"learning_rate": 2.959603085679863e-07,
"loss": 0.2636,
"step": 583
},
{
"epoch": 2.9054726368159205,
"grad_norm": 0.3185432391852622,
"learning_rate": 2.6847554810282226e-07,
"loss": 0.2396,
"step": 584
},
{
"epoch": 2.91044776119403,
"grad_norm": 0.31119028135213145,
"learning_rate": 2.4232564599577347e-07,
"loss": 0.222,
"step": 585
},
{
"epoch": 2.9154228855721396,
"grad_norm": 0.3897279825932753,
"learning_rate": 2.1751148080400464e-07,
"loss": 0.2547,
"step": 586
},
{
"epoch": 2.9203980099502487,
"grad_norm": 0.3318250867618136,
"learning_rate": 1.9403388620798268e-07,
"loss": 0.2316,
"step": 587
},
{
"epoch": 2.925373134328358,
"grad_norm": 0.33435033379588464,
"learning_rate": 1.71893650983459e-07,
"loss": 0.264,
"step": 588
},
{
"epoch": 2.9303482587064678,
"grad_norm": 0.3169245972712117,
"learning_rate": 1.510915189749973e-07,
"loss": 0.2258,
"step": 589
},
{
"epoch": 2.935323383084577,
"grad_norm": 0.31046188325115304,
"learning_rate": 1.3162818907094477e-07,
"loss": 0.2436,
"step": 590
},
{
"epoch": 2.9402985074626864,
"grad_norm": 0.33083406600042586,
"learning_rate": 1.1350431517998416e-07,
"loss": 0.2352,
"step": 591
},
{
"epoch": 2.945273631840796,
"grad_norm": 0.5036448220257013,
"learning_rate": 9.672050620913809e-08,
"loss": 0.2463,
"step": 592
},
{
"epoch": 2.9502487562189055,
"grad_norm": 0.34997368115151717,
"learning_rate": 8.127732604334082e-08,
"loss": 0.2652,
"step": 593
},
{
"epoch": 2.955223880597015,
"grad_norm": 0.3245631331451197,
"learning_rate": 6.717529352645802e-08,
"loss": 0.2604,
"step": 594
},
{
"epoch": 2.9601990049751246,
"grad_norm": 0.35279551675091364,
"learning_rate": 5.44148824438917e-08,
"loss": 0.2469,
"step": 595
},
{
"epoch": 2.965174129353234,
"grad_norm": 0.3236463161348794,
"learning_rate": 4.2996521506637465e-08,
"loss": 0.2162,
"step": 596
},
{
"epoch": 2.970149253731343,
"grad_norm": 0.31533592113778164,
"learning_rate": 3.292059433687822e-08,
"loss": 0.2387,
"step": 597
},
{
"epoch": 2.9751243781094527,
"grad_norm": 0.3171216482921585,
"learning_rate": 2.4187439455127804e-08,
"loss": 0.2287,
"step": 598
},
{
"epoch": 2.9800995024875623,
"grad_norm": 0.3268105010367728,
"learning_rate": 1.679735026881346e-08,
"loss": 0.2278,
"step": 599
},
{
"epoch": 2.9850746268656714,
"grad_norm": 0.3248084583080667,
"learning_rate": 1.0750575062461465e-08,
"loss": 0.229,
"step": 600
},
{
"epoch": 2.990049751243781,
"grad_norm": 0.345052065217444,
"learning_rate": 6.047316989317153e-09,
"loss": 0.1951,
"step": 601
},
{
"epoch": 2.9950248756218905,
"grad_norm": 0.337057099299072,
"learning_rate": 2.6877340645459217e-09,
"loss": 0.2749,
"step": 602
},
{
"epoch": 3.0,
"grad_norm": 0.3925108426294995,
"learning_rate": 6.719391599130376e-10,
"loss": 0.1737,
"step": 603
},
{
"epoch": 3.0,
"step": 603,
"total_flos": 146165521711104.0,
"train_loss": 0.3971822352996513,
"train_runtime": 6153.7064,
"train_samples_per_second": 0.781,
"train_steps_per_second": 0.098
}
],
"logging_steps": 1,
"max_steps": 603,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 146165521711104.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}