{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 939,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010649627263045794,
"grad_norm": 49.5,
"learning_rate": 3e-05,
"loss": 2.4246,
"step": 1
},
{
"epoch": 0.002129925452609159,
"grad_norm": 37.75,
"learning_rate": 2.9999916048314652e-05,
"loss": 2.2391,
"step": 2
},
{
"epoch": 0.003194888178913738,
"grad_norm": 26.75,
"learning_rate": 2.9999664194198307e-05,
"loss": 1.8044,
"step": 3
},
{
"epoch": 0.004259850905218318,
"grad_norm": 22.5,
"learning_rate": 2.9999244440470125e-05,
"loss": 1.5917,
"step": 4
},
{
"epoch": 0.005324813631522897,
"grad_norm": 15.5,
"learning_rate": 2.999865679182864e-05,
"loss": 1.74,
"step": 5
},
{
"epoch": 0.006389776357827476,
"grad_norm": 14.8125,
"learning_rate": 2.999790125485172e-05,
"loss": 1.6569,
"step": 6
},
{
"epoch": 0.007454739084132056,
"grad_norm": 20.375,
"learning_rate": 2.9996977837996533e-05,
"loss": 2.0075,
"step": 7
},
{
"epoch": 0.008519701810436636,
"grad_norm": 14.9375,
"learning_rate": 2.9995886551599382e-05,
"loss": 1.8082,
"step": 8
},
{
"epoch": 0.009584664536741214,
"grad_norm": 14.875,
"learning_rate": 2.9994627407875647e-05,
"loss": 1.4614,
"step": 9
},
{
"epoch": 0.010649627263045794,
"grad_norm": 19.125,
"learning_rate": 2.999320042091963e-05,
"loss": 1.5376,
"step": 10
},
{
"epoch": 0.011714589989350373,
"grad_norm": 20.25,
"learning_rate": 2.999160560670439e-05,
"loss": 1.6331,
"step": 11
},
{
"epoch": 0.012779552715654952,
"grad_norm": 23.5,
"learning_rate": 2.9989842983081574e-05,
"loss": 1.3088,
"step": 12
},
{
"epoch": 0.013844515441959531,
"grad_norm": 25.5,
"learning_rate": 2.9987912569781212e-05,
"loss": 1.368,
"step": 13
},
{
"epoch": 0.014909478168264111,
"grad_norm": 12.5625,
"learning_rate": 2.99858143884115e-05,
"loss": 1.424,
"step": 14
},
{
"epoch": 0.01597444089456869,
"grad_norm": 18.625,
"learning_rate": 2.9983548462458546e-05,
"loss": 1.9811,
"step": 15
},
{
"epoch": 0.01703940362087327,
"grad_norm": 18.375,
"learning_rate": 2.9981114817286128e-05,
"loss": 1.6946,
"step": 16
},
{
"epoch": 0.01810436634717785,
"grad_norm": 17.875,
"learning_rate": 2.9978513480135398e-05,
"loss": 1.5809,
"step": 17
},
{
"epoch": 0.019169329073482427,
"grad_norm": 11.3125,
"learning_rate": 2.9975744480124565e-05,
"loss": 1.607,
"step": 18
},
{
"epoch": 0.02023429179978701,
"grad_norm": 12.5625,
"learning_rate": 2.99728078482486e-05,
"loss": 1.4597,
"step": 19
},
{
"epoch": 0.021299254526091587,
"grad_norm": 11.6875,
"learning_rate": 2.996970361737886e-05,
"loss": 1.4767,
"step": 20
},
{
"epoch": 0.022364217252396165,
"grad_norm": 11.875,
"learning_rate": 2.9966431822262732e-05,
"loss": 1.6979,
"step": 21
},
{
"epoch": 0.023429179978700747,
"grad_norm": 12.375,
"learning_rate": 2.9962992499523246e-05,
"loss": 1.5985,
"step": 22
},
{
"epoch": 0.024494142705005325,
"grad_norm": 15.0625,
"learning_rate": 2.9959385687658655e-05,
"loss": 1.3997,
"step": 23
},
{
"epoch": 0.025559105431309903,
"grad_norm": 14.75,
"learning_rate": 2.9955611427042026e-05,
"loss": 1.5398,
"step": 24
},
{
"epoch": 0.026624068157614485,
"grad_norm": 20.125,
"learning_rate": 2.9951669759920757e-05,
"loss": 1.5154,
"step": 25
},
{
"epoch": 0.027689030883919063,
"grad_norm": 11.25,
"learning_rate": 2.9947560730416133e-05,
"loss": 1.7137,
"step": 26
},
{
"epoch": 0.02875399361022364,
"grad_norm": 17.5,
"learning_rate": 2.9943284384522815e-05,
"loss": 1.5518,
"step": 27
},
{
"epoch": 0.029818956336528223,
"grad_norm": 15.625,
"learning_rate": 2.9938840770108324e-05,
"loss": 0.6355,
"step": 28
},
{
"epoch": 0.0308839190628328,
"grad_norm": 11.0,
"learning_rate": 2.9934229936912516e-05,
"loss": 1.5532,
"step": 29
},
{
"epoch": 0.03194888178913738,
"grad_norm": 12.5,
"learning_rate": 2.992945193654702e-05,
"loss": 1.5355,
"step": 30
},
{
"epoch": 0.03301384451544196,
"grad_norm": 12.25,
"learning_rate": 2.9924506822494668e-05,
"loss": 1.5092,
"step": 31
},
{
"epoch": 0.03407880724174654,
"grad_norm": 15.9375,
"learning_rate": 2.9919394650108877e-05,
"loss": 1.7472,
"step": 32
},
{
"epoch": 0.03514376996805112,
"grad_norm": 12.375,
"learning_rate": 2.9914115476613035e-05,
"loss": 1.5176,
"step": 33
},
{
"epoch": 0.0362087326943557,
"grad_norm": 11.5,
"learning_rate": 2.9908669361099895e-05,
"loss": 1.4667,
"step": 34
},
{
"epoch": 0.03727369542066028,
"grad_norm": 14.4375,
"learning_rate": 2.9903056364530856e-05,
"loss": 1.6553,
"step": 35
},
{
"epoch": 0.038338658146964855,
"grad_norm": 12.875,
"learning_rate": 2.989727654973532e-05,
"loss": 1.7434,
"step": 36
},
{
"epoch": 0.039403620873269436,
"grad_norm": 13.75,
"learning_rate": 2.9891329981409983e-05,
"loss": 1.8034,
"step": 37
},
{
"epoch": 0.04046858359957402,
"grad_norm": 15.875,
"learning_rate": 2.9885216726118107e-05,
"loss": 1.1681,
"step": 38
},
{
"epoch": 0.04153354632587859,
"grad_norm": 13.8125,
"learning_rate": 2.987893685228876e-05,
"loss": 1.6805,
"step": 39
},
{
"epoch": 0.042598509052183174,
"grad_norm": 12.6875,
"learning_rate": 2.987249043021608e-05,
"loss": 1.6899,
"step": 40
},
{
"epoch": 0.043663471778487756,
"grad_norm": 10.1875,
"learning_rate": 2.986587753205847e-05,
"loss": 1.773,
"step": 41
},
{
"epoch": 0.04472843450479233,
"grad_norm": 18.125,
"learning_rate": 2.985909823183778e-05,
"loss": 1.7827,
"step": 42
},
{
"epoch": 0.04579339723109691,
"grad_norm": 11.3125,
"learning_rate": 2.985215260543851e-05,
"loss": 1.4104,
"step": 43
},
{
"epoch": 0.046858359957401494,
"grad_norm": 13.3125,
"learning_rate": 2.9845040730606926e-05,
"loss": 1.6319,
"step": 44
},
{
"epoch": 0.04792332268370607,
"grad_norm": 11.0,
"learning_rate": 2.9837762686950216e-05,
"loss": 1.597,
"step": 45
},
{
"epoch": 0.04898828541001065,
"grad_norm": 16.25,
"learning_rate": 2.9830318555935578e-05,
"loss": 1.4282,
"step": 46
},
{
"epoch": 0.05005324813631523,
"grad_norm": 11.375,
"learning_rate": 2.982270842088933e-05,
"loss": 1.5651,
"step": 47
},
{
"epoch": 0.051118210862619806,
"grad_norm": 10.8125,
"learning_rate": 2.9814932366995963e-05,
"loss": 1.8754,
"step": 48
},
{
"epoch": 0.05218317358892439,
"grad_norm": 11.625,
"learning_rate": 2.980699048129718e-05,
"loss": 1.8102,
"step": 49
},
{
"epoch": 0.05324813631522897,
"grad_norm": 10.8125,
"learning_rate": 2.9798882852690942e-05,
"loss": 1.7686,
"step": 50
},
{
"epoch": 0.054313099041533544,
"grad_norm": 12.3125,
"learning_rate": 2.979060957193047e-05,
"loss": 1.6104,
"step": 51
},
{
"epoch": 0.055378061767838126,
"grad_norm": 15.0,
"learning_rate": 2.9782170731623196e-05,
"loss": 1.6611,
"step": 52
},
{
"epoch": 0.05644302449414271,
"grad_norm": 13.6875,
"learning_rate": 2.977356642622978e-05,
"loss": 1.5407,
"step": 53
},
{
"epoch": 0.05750798722044728,
"grad_norm": 12.0,
"learning_rate": 2.9764796752063013e-05,
"loss": 1.4936,
"step": 54
},
{
"epoch": 0.058572949946751864,
"grad_norm": 11.875,
"learning_rate": 2.9755861807286744e-05,
"loss": 1.7986,
"step": 55
},
{
"epoch": 0.059637912673056445,
"grad_norm": 11.625,
"learning_rate": 2.9746761691914805e-05,
"loss": 1.2507,
"step": 56
},
{
"epoch": 0.06070287539936102,
"grad_norm": 12.8125,
"learning_rate": 2.9737496507809862e-05,
"loss": 1.7383,
"step": 57
},
{
"epoch": 0.0617678381256656,
"grad_norm": 14.0625,
"learning_rate": 2.9728066358682293e-05,
"loss": 1.8381,
"step": 58
},
{
"epoch": 0.06283280085197018,
"grad_norm": 11.125,
"learning_rate": 2.9718471350089018e-05,
"loss": 1.4074,
"step": 59
},
{
"epoch": 0.06389776357827476,
"grad_norm": 11.6875,
"learning_rate": 2.970871158943232e-05,
"loss": 1.4428,
"step": 60
},
{
"epoch": 0.06496272630457935,
"grad_norm": 12.8125,
"learning_rate": 2.9698787185958652e-05,
"loss": 1.6872,
"step": 61
},
{
"epoch": 0.06602768903088392,
"grad_norm": 10.3125,
"learning_rate": 2.9688698250757396e-05,
"loss": 1.5911,
"step": 62
},
{
"epoch": 0.0670926517571885,
"grad_norm": 10.875,
"learning_rate": 2.9678444896759637e-05,
"loss": 1.8574,
"step": 63
},
{
"epoch": 0.06815761448349308,
"grad_norm": 11.5625,
"learning_rate": 2.9668027238736885e-05,
"loss": 1.6105,
"step": 64
},
{
"epoch": 0.06922257720979766,
"grad_norm": 14.1875,
"learning_rate": 2.9657445393299805e-05,
"loss": 1.4482,
"step": 65
},
{
"epoch": 0.07028753993610223,
"grad_norm": 13.0625,
"learning_rate": 2.964669947889689e-05,
"loss": 1.6329,
"step": 66
},
{
"epoch": 0.07135250266240682,
"grad_norm": 11.0,
"learning_rate": 2.963578961581316e-05,
"loss": 1.6117,
"step": 67
},
{
"epoch": 0.0724174653887114,
"grad_norm": 11.0,
"learning_rate": 2.962471592616881e-05,
"loss": 1.6114,
"step": 68
},
{
"epoch": 0.07348242811501597,
"grad_norm": 11.4375,
"learning_rate": 2.9613478533917813e-05,
"loss": 1.352,
"step": 69
},
{
"epoch": 0.07454739084132056,
"grad_norm": 12.6875,
"learning_rate": 2.9602077564846577e-05,
"loss": 1.6438,
"step": 70
},
{
"epoch": 0.07561235356762513,
"grad_norm": 10.375,
"learning_rate": 2.9590513146572512e-05,
"loss": 1.7608,
"step": 71
},
{
"epoch": 0.07667731629392971,
"grad_norm": 10.75,
"learning_rate": 2.957878540854261e-05,
"loss": 1.3708,
"step": 72
},
{
"epoch": 0.0777422790202343,
"grad_norm": 9.25,
"learning_rate": 2.9566894482031983e-05,
"loss": 1.5508,
"step": 73
},
{
"epoch": 0.07880724174653887,
"grad_norm": 10.625,
"learning_rate": 2.95548405001424e-05,
"loss": 1.9396,
"step": 74
},
{
"epoch": 0.07987220447284345,
"grad_norm": 15.4375,
"learning_rate": 2.954262359780082e-05,
"loss": 1.6615,
"step": 75
},
{
"epoch": 0.08093716719914804,
"grad_norm": 16.75,
"learning_rate": 2.9530243911757843e-05,
"loss": 1.7127,
"step": 76
},
{
"epoch": 0.08200212992545261,
"grad_norm": 11.375,
"learning_rate": 2.95177015805862e-05,
"loss": 1.6021,
"step": 77
},
{
"epoch": 0.08306709265175719,
"grad_norm": 12.875,
"learning_rate": 2.950499674467921e-05,
"loss": 1.6266,
"step": 78
},
{
"epoch": 0.08413205537806177,
"grad_norm": 13.875,
"learning_rate": 2.949212954624918e-05,
"loss": 1.3126,
"step": 79
},
{
"epoch": 0.08519701810436635,
"grad_norm": 13.8125,
"learning_rate": 2.9479100129325855e-05,
"loss": 1.3167,
"step": 80
},
{
"epoch": 0.08626198083067092,
"grad_norm": 9.875,
"learning_rate": 2.9465908639754763e-05,
"loss": 1.8433,
"step": 81
},
{
"epoch": 0.08732694355697551,
"grad_norm": 11.625,
"learning_rate": 2.9452555225195608e-05,
"loss": 1.467,
"step": 82
},
{
"epoch": 0.08839190628328009,
"grad_norm": 9.75,
"learning_rate": 2.9439040035120615e-05,
"loss": 1.8683,
"step": 83
},
{
"epoch": 0.08945686900958466,
"grad_norm": 17.5,
"learning_rate": 2.9425363220812843e-05,
"loss": 1.515,
"step": 84
},
{
"epoch": 0.09052183173588925,
"grad_norm": 12.6875,
"learning_rate": 2.941152493536451e-05,
"loss": 1.7103,
"step": 85
},
{
"epoch": 0.09158679446219382,
"grad_norm": 14.125,
"learning_rate": 2.939752533367527e-05,
"loss": 1.7114,
"step": 86
},
{
"epoch": 0.0926517571884984,
"grad_norm": 11.625,
"learning_rate": 2.9383364572450472e-05,
"loss": 1.3458,
"step": 87
},
{
"epoch": 0.09371671991480299,
"grad_norm": 10.875,
"learning_rate": 2.9369042810199422e-05,
"loss": 1.7802,
"step": 88
},
{
"epoch": 0.09478168264110756,
"grad_norm": 10.5,
"learning_rate": 2.9354560207233596e-05,
"loss": 1.4666,
"step": 89
},
{
"epoch": 0.09584664536741214,
"grad_norm": 14.875,
"learning_rate": 2.9339916925664856e-05,
"loss": 1.4141,
"step": 90
},
{
"epoch": 0.09691160809371673,
"grad_norm": 12.1875,
"learning_rate": 2.9325113129403612e-05,
"loss": 1.3912,
"step": 91
},
{
"epoch": 0.0979765708200213,
"grad_norm": 16.0,
"learning_rate": 2.9310148984157028e-05,
"loss": 1.7521,
"step": 92
},
{
"epoch": 0.09904153354632587,
"grad_norm": 9.375,
"learning_rate": 2.9295024657427128e-05,
"loss": 1.0351,
"step": 93
},
{
"epoch": 0.10010649627263046,
"grad_norm": 12.6875,
"learning_rate": 2.927974031850894e-05,
"loss": 1.545,
"step": 94
},
{
"epoch": 0.10117145899893504,
"grad_norm": 11.875,
"learning_rate": 2.9264296138488606e-05,
"loss": 1.8704,
"step": 95
},
{
"epoch": 0.10223642172523961,
"grad_norm": 16.75,
"learning_rate": 2.9248692290241445e-05,
"loss": 1.3282,
"step": 96
},
{
"epoch": 0.1033013844515442,
"grad_norm": 13.625,
"learning_rate": 2.9232928948430037e-05,
"loss": 1.6012,
"step": 97
},
{
"epoch": 0.10436634717784878,
"grad_norm": 10.125,
"learning_rate": 2.9217006289502266e-05,
"loss": 1.783,
"step": 98
},
{
"epoch": 0.10543130990415335,
"grad_norm": 10.8125,
"learning_rate": 2.920092449168934e-05,
"loss": 1.7447,
"step": 99
},
{
"epoch": 0.10649627263045794,
"grad_norm": 11.25,
"learning_rate": 2.91846837350038e-05,
"loss": 1.908,
"step": 100
},
{
"epoch": 0.10756123535676251,
"grad_norm": 10.125,
"learning_rate": 2.9168284201237487e-05,
"loss": 1.5631,
"step": 101
},
{
"epoch": 0.10862619808306709,
"grad_norm": 10.5,
"learning_rate": 2.9151726073959544e-05,
"loss": 1.9487,
"step": 102
},
{
"epoch": 0.10969116080937168,
"grad_norm": 13.3125,
"learning_rate": 2.9135009538514325e-05,
"loss": 1.1878,
"step": 103
},
{
"epoch": 0.11075612353567625,
"grad_norm": 11.0625,
"learning_rate": 2.9118134782019345e-05,
"loss": 1.7323,
"step": 104
},
{
"epoch": 0.11182108626198083,
"grad_norm": 11.1875,
"learning_rate": 2.9101101993363162e-05,
"loss": 1.4423,
"step": 105
},
{
"epoch": 0.11288604898828541,
"grad_norm": 10.9375,
"learning_rate": 2.9083911363203294e-05,
"loss": 1.429,
"step": 106
},
{
"epoch": 0.11395101171458999,
"grad_norm": 10.75,
"learning_rate": 2.9066563083964054e-05,
"loss": 1.3625,
"step": 107
},
{
"epoch": 0.11501597444089456,
"grad_norm": 11.25,
"learning_rate": 2.904905734983441e-05,
"loss": 1.8362,
"step": 108
},
{
"epoch": 0.11608093716719915,
"grad_norm": 11.0625,
"learning_rate": 2.9031394356765817e-05,
"loss": 1.5439,
"step": 109
},
{
"epoch": 0.11714589989350373,
"grad_norm": 9.375,
"learning_rate": 2.901357430247001e-05,
"loss": 1.8833,
"step": 110
},
{
"epoch": 0.1182108626198083,
"grad_norm": 10.5,
"learning_rate": 2.89955973864168e-05,
"loss": 1.6258,
"step": 111
},
{
"epoch": 0.11927582534611289,
"grad_norm": 10.8125,
"learning_rate": 2.8977463809831847e-05,
"loss": 1.8468,
"step": 112
},
{
"epoch": 0.12034078807241747,
"grad_norm": 14.4375,
"learning_rate": 2.8959173775694387e-05,
"loss": 1.501,
"step": 113
},
{
"epoch": 0.12140575079872204,
"grad_norm": 16.125,
"learning_rate": 2.894072748873498e-05,
"loss": 1.4962,
"step": 114
},
{
"epoch": 0.12247071352502663,
"grad_norm": 11.4375,
"learning_rate": 2.892212515543321e-05,
"loss": 1.9837,
"step": 115
},
{
"epoch": 0.1235356762513312,
"grad_norm": 10.8125,
"learning_rate": 2.890336698401538e-05,
"loss": 1.7728,
"step": 116
},
{
"epoch": 0.12460063897763578,
"grad_norm": 14.6875,
"learning_rate": 2.888445318445216e-05,
"loss": 2.0979,
"step": 117
},
{
"epoch": 0.12566560170394037,
"grad_norm": 15.625,
"learning_rate": 2.8865383968456272e-05,
"loss": 1.5003,
"step": 118
},
{
"epoch": 0.12673056443024494,
"grad_norm": 10.625,
"learning_rate": 2.8846159549480088e-05,
"loss": 2.2097,
"step": 119
},
{
"epoch": 0.12779552715654952,
"grad_norm": 10.9375,
"learning_rate": 2.882678014271326e-05,
"loss": 1.8599,
"step": 120
},
{
"epoch": 0.1288604898828541,
"grad_norm": 10.5,
"learning_rate": 2.88072459650803e-05,
"loss": 1.6879,
"step": 121
},
{
"epoch": 0.1299254526091587,
"grad_norm": 9.8125,
"learning_rate": 2.8787557235238167e-05,
"loss": 1.6777,
"step": 122
},
{
"epoch": 0.13099041533546327,
"grad_norm": 9.0,
"learning_rate": 2.876771417357379e-05,
"loss": 1.8459,
"step": 123
},
{
"epoch": 0.13205537806176784,
"grad_norm": 9.0625,
"learning_rate": 2.8747717002201638e-05,
"loss": 1.7704,
"step": 124
},
{
"epoch": 0.13312034078807242,
"grad_norm": 10.125,
"learning_rate": 2.87275659449612e-05,
"loss": 1.8606,
"step": 125
},
{
"epoch": 0.134185303514377,
"grad_norm": 15.9375,
"learning_rate": 2.870726122741452e-05,
"loss": 1.8471,
"step": 126
},
{
"epoch": 0.13525026624068157,
"grad_norm": 10.5625,
"learning_rate": 2.868680307684363e-05,
"loss": 1.7965,
"step": 127
},
{
"epoch": 0.13631522896698617,
"grad_norm": 9.125,
"learning_rate": 2.866619172224802e-05,
"loss": 1.3363,
"step": 128
},
{
"epoch": 0.13738019169329074,
"grad_norm": 9.0625,
"learning_rate": 2.864542739434208e-05,
"loss": 1.3873,
"step": 129
},
{
"epoch": 0.13844515441959532,
"grad_norm": 10.4375,
"learning_rate": 2.862451032555253e-05,
"loss": 1.3849,
"step": 130
},
{
"epoch": 0.1395101171458999,
"grad_norm": 9.75,
"learning_rate": 2.8603440750015786e-05,
"loss": 1.6095,
"step": 131
},
{
"epoch": 0.14057507987220447,
"grad_norm": 10.75,
"learning_rate": 2.858221890357537e-05,
"loss": 1.4683,
"step": 132
},
{
"epoch": 0.14164004259850904,
"grad_norm": 11.5625,
"learning_rate": 2.856084502377925e-05,
"loss": 1.7605,
"step": 133
},
{
"epoch": 0.14270500532481364,
"grad_norm": 12.0,
"learning_rate": 2.853931934987719e-05,
"loss": 1.8109,
"step": 134
},
{
"epoch": 0.14376996805111822,
"grad_norm": 12.3125,
"learning_rate": 2.8517642122818067e-05,
"loss": 1.7257,
"step": 135
},
{
"epoch": 0.1448349307774228,
"grad_norm": 10.5,
"learning_rate": 2.849581358524719e-05,
"loss": 1.649,
"step": 136
},
{
"epoch": 0.14589989350372737,
"grad_norm": 11.0,
"learning_rate": 2.8473833981503553e-05,
"loss": 1.365,
"step": 137
},
{
"epoch": 0.14696485623003194,
"grad_norm": 10.9375,
"learning_rate": 2.8451703557617126e-05,
"loss": 1.4037,
"step": 138
},
{
"epoch": 0.14802981895633652,
"grad_norm": 8.3125,
"learning_rate": 2.84294225613061e-05,
"loss": 1.9803,
"step": 139
},
{
"epoch": 0.14909478168264112,
"grad_norm": 12.5625,
"learning_rate": 2.840699124197409e-05,
"loss": 1.6368,
"step": 140
},
{
"epoch": 0.1501597444089457,
"grad_norm": 10.8125,
"learning_rate": 2.8384409850707383e-05,
"loss": 1.8567,
"step": 141
},
{
"epoch": 0.15122470713525027,
"grad_norm": 18.5,
"learning_rate": 2.8361678640272086e-05,
"loss": 1.8818,
"step": 142
},
{
"epoch": 0.15228966986155484,
"grad_norm": 11.0,
"learning_rate": 2.8338797865111323e-05,
"loss": 1.7529,
"step": 143
},
{
"epoch": 0.15335463258785942,
"grad_norm": 11.1875,
"learning_rate": 2.831576778134238e-05,
"loss": 1.5764,
"step": 144
},
{
"epoch": 0.154419595314164,
"grad_norm": 9.375,
"learning_rate": 2.8292588646753838e-05,
"loss": 1.7057,
"step": 145
},
{
"epoch": 0.1554845580404686,
"grad_norm": 9.0625,
"learning_rate": 2.826926072080268e-05,
"loss": 1.2397,
"step": 146
},
{
"epoch": 0.15654952076677317,
"grad_norm": 12.0625,
"learning_rate": 2.8245784264611408e-05,
"loss": 1.5406,
"step": 147
},
{
"epoch": 0.15761448349307774,
"grad_norm": 12.375,
"learning_rate": 2.822215954096509e-05,
"loss": 1.4422,
"step": 148
},
{
"epoch": 0.15867944621938232,
"grad_norm": 10.25,
"learning_rate": 2.8198386814308442e-05,
"loss": 1.8692,
"step": 149
},
{
"epoch": 0.1597444089456869,
"grad_norm": 13.5,
"learning_rate": 2.8174466350742865e-05,
"loss": 1.7064,
"step": 150
},
{
"epoch": 0.16080937167199147,
"grad_norm": 11.125,
"learning_rate": 2.8150398418023447e-05,
"loss": 1.7838,
"step": 151
},
{
"epoch": 0.16187433439829607,
"grad_norm": 11.875,
"learning_rate": 2.8126183285556e-05,
"loss": 1.9361,
"step": 152
},
{
"epoch": 0.16293929712460065,
"grad_norm": 10.5625,
"learning_rate": 2.810182122439401e-05,
"loss": 1.8139,
"step": 153
},
{
"epoch": 0.16400425985090522,
"grad_norm": 9.875,
"learning_rate": 2.807731250723562e-05,
"loss": 1.5778,
"step": 154
},
{
"epoch": 0.1650692225772098,
"grad_norm": 16.75,
"learning_rate": 2.8052657408420587e-05,
"loss": 1.5036,
"step": 155
},
{
"epoch": 0.16613418530351437,
"grad_norm": 11.75,
"learning_rate": 2.8027856203927183e-05,
"loss": 1.2629,
"step": 156
},
{
"epoch": 0.16719914802981894,
"grad_norm": 12.4375,
"learning_rate": 2.800290917136913e-05,
"loss": 1.7838,
"step": 157
},
{
"epoch": 0.16826411075612355,
"grad_norm": 11.0,
"learning_rate": 2.7977816589992494e-05,
"loss": 1.8775,
"step": 158
},
{
"epoch": 0.16932907348242812,
"grad_norm": 8.9375,
"learning_rate": 2.795257874067253e-05,
"loss": 1.4417,
"step": 159
},
{
"epoch": 0.1703940362087327,
"grad_norm": 11.8125,
"learning_rate": 2.7927195905910576e-05,
"loss": 2.1449,
"step": 160
},
{
"epoch": 0.17145899893503727,
"grad_norm": 9.625,
"learning_rate": 2.790166836983086e-05,
"loss": 1.8285,
"step": 161
},
{
"epoch": 0.17252396166134185,
"grad_norm": 11.6875,
"learning_rate": 2.7875996418177348e-05,
"loss": 1.4369,
"step": 162
},
{
"epoch": 0.17358892438764642,
"grad_norm": 12.625,
"learning_rate": 2.7850180338310517e-05,
"loss": 1.9668,
"step": 163
},
{
"epoch": 0.17465388711395102,
"grad_norm": 9.375,
"learning_rate": 2.782422041920415e-05,
"loss": 1.7926,
"step": 164
},
{
"epoch": 0.1757188498402556,
"grad_norm": 11.9375,
"learning_rate": 2.779811695144212e-05,
"loss": 1.945,
"step": 165
},
{
"epoch": 0.17678381256656017,
"grad_norm": 10.0,
"learning_rate": 2.7771870227215096e-05,
"loss": 1.9434,
"step": 166
},
{
"epoch": 0.17784877529286475,
"grad_norm": 15.3125,
"learning_rate": 2.7745480540317315e-05,
"loss": 1.6811,
"step": 167
},
{
"epoch": 0.17891373801916932,
"grad_norm": 10.75,
"learning_rate": 2.771894818614327e-05,
"loss": 1.8628,
"step": 168
},
{
"epoch": 0.1799787007454739,
"grad_norm": 14.125,
"learning_rate": 2.7692273461684407e-05,
"loss": 1.8834,
"step": 169
},
{
"epoch": 0.1810436634717785,
"grad_norm": 9.375,
"learning_rate": 2.7665456665525805e-05,
"loss": 1.8599,
"step": 170
},
{
"epoch": 0.18210862619808307,
"grad_norm": 10.75,
"learning_rate": 2.7638498097842823e-05,
"loss": 1.5488,
"step": 171
},
{
"epoch": 0.18317358892438765,
"grad_norm": 11.875,
"learning_rate": 2.7611398060397755e-05,
"loss": 1.6002,
"step": 172
},
{
"epoch": 0.18423855165069222,
"grad_norm": 14.3125,
"learning_rate": 2.7584156856536446e-05,
"loss": 1.6349,
"step": 173
},
{
"epoch": 0.1853035143769968,
"grad_norm": 11.4375,
"learning_rate": 2.7556774791184893e-05,
"loss": 1.7368,
"step": 174
},
{
"epoch": 0.18636847710330137,
"grad_norm": 14.4375,
"learning_rate": 2.752925217084583e-05,
"loss": 1.9945,
"step": 175
},
{
"epoch": 0.18743343982960597,
"grad_norm": 9.5,
"learning_rate": 2.7501589303595305e-05,
"loss": 1.1876,
"step": 176
},
{
"epoch": 0.18849840255591055,
"grad_norm": 11.4375,
"learning_rate": 2.7473786499079232e-05,
"loss": 1.8281,
"step": 177
},
{
"epoch": 0.18956336528221512,
"grad_norm": 10.9375,
"learning_rate": 2.744584406850992e-05,
"loss": 1.4349,
"step": 178
},
{
"epoch": 0.1906283280085197,
"grad_norm": 9.8125,
"learning_rate": 2.741776232466258e-05,
"loss": 1.8583,
"step": 179
},
{
"epoch": 0.19169329073482427,
"grad_norm": 9.1875,
"learning_rate": 2.7389541581871843e-05,
"loss": 2.0097,
"step": 180
},
{
"epoch": 0.19275825346112885,
"grad_norm": 12.1875,
"learning_rate": 2.736118215602823e-05,
"loss": 1.8045,
"step": 181
},
{
"epoch": 0.19382321618743345,
"grad_norm": 11.6875,
"learning_rate": 2.7332684364574632e-05,
"loss": 2.2613,
"step": 182
},
{
"epoch": 0.19488817891373802,
"grad_norm": 9.5,
"learning_rate": 2.7304048526502723e-05,
"loss": 1.443,
"step": 183
},
{
"epoch": 0.1959531416400426,
"grad_norm": 9.9375,
"learning_rate": 2.7275274962349417e-05,
"loss": 1.5373,
"step": 184
},
{
"epoch": 0.19701810436634717,
"grad_norm": 10.4375,
"learning_rate": 2.7246363994193276e-05,
"loss": 1.8491,
"step": 185
},
{
"epoch": 0.19808306709265175,
"grad_norm": 8.375,
"learning_rate": 2.721731594565091e-05,
"loss": 1.5917,
"step": 186
},
{
"epoch": 0.19914802981895632,
"grad_norm": 10.0,
"learning_rate": 2.718813114187332e-05,
"loss": 1.7208,
"step": 187
},
{
"epoch": 0.20021299254526093,
"grad_norm": 13.875,
"learning_rate": 2.7158809909542308e-05,
"loss": 1.5774,
"step": 188
},
{
"epoch": 0.2012779552715655,
"grad_norm": 10.4375,
"learning_rate": 2.712935257686679e-05,
"loss": 1.4742,
"step": 189
},
{
"epoch": 0.20234291799787008,
"grad_norm": 7.9375,
"learning_rate": 2.709975947357914e-05,
"loss": 1.7704,
"step": 190
},
{
"epoch": 0.20340788072417465,
"grad_norm": 10.25,
"learning_rate": 2.707003093093146e-05,
"loss": 1.3238,
"step": 191
},
{
"epoch": 0.20447284345047922,
"grad_norm": 9.5625,
"learning_rate": 2.704016728169193e-05,
"loss": 1.8382,
"step": 192
},
{
"epoch": 0.2055378061767838,
"grad_norm": 9.25,
"learning_rate": 2.7010168860141033e-05,
"loss": 1.6611,
"step": 193
},
{
"epoch": 0.2066027689030884,
"grad_norm": 12.0625,
"learning_rate": 2.6980036002067846e-05,
"loss": 1.4489,
"step": 194
},
{
"epoch": 0.20766773162939298,
"grad_norm": 11.0625,
"learning_rate": 2.6949769044766266e-05,
"loss": 1.6431,
"step": 195
},
{
"epoch": 0.20873269435569755,
"grad_norm": 10.6875,
"learning_rate": 2.6919368327031236e-05,
"loss": 1.6825,
"step": 196
},
{
"epoch": 0.20979765708200213,
"grad_norm": 9.625,
"learning_rate": 2.6888834189154955e-05,
"loss": 2.1797,
"step": 197
},
{
"epoch": 0.2108626198083067,
"grad_norm": 11.1875,
"learning_rate": 2.6858166972923063e-05,
"loss": 2.0067,
"step": 198
},
{
"epoch": 0.21192758253461128,
"grad_norm": 11.6875,
"learning_rate": 2.6827367021610832e-05,
"loss": 1.4995,
"step": 199
},
{
"epoch": 0.21299254526091588,
"grad_norm": 18.0,
"learning_rate": 2.67964346799793e-05,
"loss": 1.4391,
"step": 200
},
{
"epoch": 0.21405750798722045,
"grad_norm": 8.5,
"learning_rate": 2.676537029427143e-05,
"loss": 2.0381,
"step": 201
},
{
"epoch": 0.21512247071352503,
"grad_norm": 12.75,
"learning_rate": 2.6734174212208226e-05,
"loss": 1.7726,
"step": 202
},
{
"epoch": 0.2161874334398296,
"grad_norm": 12.5,
"learning_rate": 2.6702846782984846e-05,
"loss": 1.5714,
"step": 203
},
{
"epoch": 0.21725239616613418,
"grad_norm": 11.0625,
"learning_rate": 2.6671388357266687e-05,
"loss": 2.0508,
"step": 204
},
{
"epoch": 0.21831735889243875,
"grad_norm": 8.6875,
"learning_rate": 2.6639799287185456e-05,
"loss": 1.7225,
"step": 205
},
{
"epoch": 0.21938232161874335,
"grad_norm": 10.0625,
"learning_rate": 2.660807992633525e-05,
"loss": 1.433,
"step": 206
},
{
"epoch": 0.22044728434504793,
"grad_norm": 8.9375,
"learning_rate": 2.657623062976858e-05,
"loss": 1.9277,
"step": 207
},
{
"epoch": 0.2215122470713525,
"grad_norm": 9.0,
"learning_rate": 2.6544251753992387e-05,
"loss": 1.5421,
"step": 208
},
{
"epoch": 0.22257720979765708,
"grad_norm": 9.8125,
"learning_rate": 2.6512143656964077e-05,
"loss": 1.8624,
"step": 209
},
{
"epoch": 0.22364217252396165,
"grad_norm": 9.1875,
"learning_rate": 2.6479906698087496e-05,
"loss": 1.7282,
"step": 210
},
{
"epoch": 0.22470713525026625,
"grad_norm": 14.6875,
"learning_rate": 2.6447541238208917e-05,
"loss": 1.993,
"step": 211
},
{
"epoch": 0.22577209797657083,
"grad_norm": 13.875,
"learning_rate": 2.6415047639612992e-05,
"loss": 2.1598,
"step": 212
},
{
"epoch": 0.2268370607028754,
"grad_norm": 12.9375,
"learning_rate": 2.6382426266018704e-05,
"loss": 1.4964,
"step": 213
},
{
"epoch": 0.22790202342917998,
"grad_norm": 9.6875,
"learning_rate": 2.6349677482575297e-05,
"loss": 1.46,
"step": 214
},
{
"epoch": 0.22896698615548455,
"grad_norm": 9.125,
"learning_rate": 2.6316801655858165e-05,
"loss": 1.1583,
"step": 215
},
{
"epoch": 0.23003194888178913,
"grad_norm": 12.1875,
"learning_rate": 2.6283799153864797e-05,
"loss": 1.3867,
"step": 216
},
{
"epoch": 0.23109691160809373,
"grad_norm": 13.375,
"learning_rate": 2.6250670346010608e-05,
"loss": 1.3233,
"step": 217
},
{
"epoch": 0.2321618743343983,
"grad_norm": 12.9375,
"learning_rate": 2.6217415603124835e-05,
"loss": 1.414,
"step": 218
},
{
"epoch": 0.23322683706070288,
"grad_norm": 9.6875,
"learning_rate": 2.618403529744637e-05,
"loss": 1.8825,
"step": 219
},
{
"epoch": 0.23429179978700745,
"grad_norm": 13.6875,
"learning_rate": 2.6150529802619604e-05,
"loss": 1.6701,
"step": 220
},
{
"epoch": 0.23535676251331203,
"grad_norm": 11.875,
"learning_rate": 2.6116899493690237e-05,
"loss": 1.7026,
"step": 221
},
{
"epoch": 0.2364217252396166,
"grad_norm": 10.5,
"learning_rate": 2.6083144747101086e-05,
"loss": 2.3935,
"step": 222
},
{
"epoch": 0.2374866879659212,
"grad_norm": 9.0,
"learning_rate": 2.6049265940687868e-05,
"loss": 1.7679,
"step": 223
},
{
"epoch": 0.23855165069222578,
"grad_norm": 14.0,
"learning_rate": 2.601526345367496e-05,
"loss": 1.5758,
"step": 224
},
{
"epoch": 0.23961661341853036,
"grad_norm": 10.5625,
"learning_rate": 2.5981137666671178e-05,
"loss": 2.0644,
"step": 225
},
{
"epoch": 0.24068157614483493,
"grad_norm": 12.125,
"learning_rate": 2.5946888961665512e-05,
"loss": 1.6724,
"step": 226
},
{
"epoch": 0.2417465388711395,
"grad_norm": 9.375,
"learning_rate": 2.5912517722022817e-05,
"loss": 1.3376,
"step": 227
},
{
"epoch": 0.24281150159744408,
"grad_norm": 11.25,
"learning_rate": 2.587802433247956e-05,
"loss": 1.5454,
"step": 228
},
{
"epoch": 0.24387646432374868,
"grad_norm": 8.6875,
"learning_rate": 2.5843409179139498e-05,
"loss": 1.6634,
"step": 229
},
{
"epoch": 0.24494142705005326,
"grad_norm": 10.4375,
"learning_rate": 2.580867264946936e-05,
"loss": 1.9844,
"step": 230
},
{
"epoch": 0.24600638977635783,
"grad_norm": 9.1875,
"learning_rate": 2.5773815132294517e-05,
"loss": 1.8239,
"step": 231
},
{
"epoch": 0.2470713525026624,
"grad_norm": 9.1875,
"learning_rate": 2.57388370177946e-05,
"loss": 2.0781,
"step": 232
},
{
"epoch": 0.24813631522896698,
"grad_norm": 9.375,
"learning_rate": 2.5703738697499167e-05,
"loss": 2.0183,
"step": 233
},
{
"epoch": 0.24920127795527156,
"grad_norm": 21.5,
"learning_rate": 2.5668520564283305e-05,
"loss": 1.8053,
"step": 234
},
{
"epoch": 0.25026624068157616,
"grad_norm": 8.25,
"learning_rate": 2.5633183012363226e-05,
"loss": 1.4977,
"step": 235
},
{
"epoch": 0.25133120340788073,
"grad_norm": 10.5625,
"learning_rate": 2.559772643729188e-05,
"loss": 1.9078,
"step": 236
},
{
"epoch": 0.2523961661341853,
"grad_norm": 10.375,
"learning_rate": 2.556215123595449e-05,
"loss": 1.8836,
"step": 237
},
{
"epoch": 0.2534611288604899,
"grad_norm": 10.3125,
"learning_rate": 2.5526457806564138e-05,
"loss": 1.4115,
"step": 238
},
{
"epoch": 0.25452609158679446,
"grad_norm": 19.875,
"learning_rate": 2.5490646548657296e-05,
"loss": 1.8534,
"step": 239
},
{
"epoch": 0.25559105431309903,
"grad_norm": 10.125,
"learning_rate": 2.5454717863089367e-05,
"loss": 1.6013,
"step": 240
},
{
"epoch": 0.2566560170394036,
"grad_norm": 9.6875,
"learning_rate": 2.5418672152030174e-05,
"loss": 1.4595,
"step": 241
},
{
"epoch": 0.2577209797657082,
"grad_norm": 9.4375,
"learning_rate": 2.5382509818959468e-05,
"loss": 1.3814,
"step": 242
},
{
"epoch": 0.25878594249201275,
"grad_norm": 12.4375,
"learning_rate": 2.5346231268662435e-05,
"loss": 1.6796,
"step": 243
},
{
"epoch": 0.2598509052183174,
"grad_norm": 9.0,
"learning_rate": 2.5309836907225126e-05,
"loss": 1.5827,
"step": 244
},
{
"epoch": 0.26091586794462196,
"grad_norm": 10.125,
"learning_rate": 2.527332714202994e-05,
"loss": 1.7017,
"step": 245
},
{
"epoch": 0.26198083067092653,
"grad_norm": 14.875,
"learning_rate": 2.523670238175106e-05,
"loss": 1.5484,
"step": 246
},
{
"epoch": 0.2630457933972311,
"grad_norm": 10.375,
"learning_rate": 2.519996303634985e-05,
"loss": 1.7837,
"step": 247
},
{
"epoch": 0.2641107561235357,
"grad_norm": 13.4375,
"learning_rate": 2.5163109517070322e-05,
"loss": 1.6422,
"step": 248
},
{
"epoch": 0.26517571884984026,
"grad_norm": 9.0625,
"learning_rate": 2.512614223643448e-05,
"loss": 2.0394,
"step": 249
},
{
"epoch": 0.26624068157614483,
"grad_norm": 14.375,
"learning_rate": 2.5089061608237717e-05,
"loss": 1.5178,
"step": 250
},
{
"epoch": 0.2673056443024494,
"grad_norm": 11.5625,
"learning_rate": 2.5051868047544206e-05,
"loss": 1.6257,
"step": 251
},
{
"epoch": 0.268370607028754,
"grad_norm": 11.0,
"learning_rate": 2.501456197068222e-05,
"loss": 1.6878,
"step": 252
},
{
"epoch": 0.26943556975505856,
"grad_norm": 11.625,
"learning_rate": 2.4977143795239504e-05,
"loss": 1.7076,
"step": 253
},
{
"epoch": 0.27050053248136313,
"grad_norm": 18.0,
"learning_rate": 2.493961394005857e-05,
"loss": 1.3901,
"step": 254
},
{
"epoch": 0.2715654952076677,
"grad_norm": 10.875,
"learning_rate": 2.4901972825232033e-05,
"loss": 1.901,
"step": 255
},
{
"epoch": 0.27263045793397234,
"grad_norm": 10.5625,
"learning_rate": 2.48642208720979e-05,
"loss": 2.0679,
"step": 256
},
{
"epoch": 0.2736954206602769,
"grad_norm": 14.25,
"learning_rate": 2.482635850323484e-05,
"loss": 1.5161,
"step": 257
},
{
"epoch": 0.2747603833865815,
"grad_norm": 9.25,
"learning_rate": 2.478838614245749e-05,
"loss": 1.6164,
"step": 258
},
{
"epoch": 0.27582534611288606,
"grad_norm": 12.25,
"learning_rate": 2.475030421481167e-05,
"loss": 1.1729,
"step": 259
},
{
"epoch": 0.27689030883919064,
"grad_norm": 9.125,
"learning_rate": 2.4712113146569638e-05,
"loss": 1.6588,
"step": 260
},
{
"epoch": 0.2779552715654952,
"grad_norm": 12.4375,
"learning_rate": 2.4673813365225346e-05,
"loss": 1.404,
"step": 261
},
{
"epoch": 0.2790202342917998,
"grad_norm": 12.0625,
"learning_rate": 2.463540529948961e-05,
"loss": 1.7023,
"step": 262
},
{
"epoch": 0.28008519701810436,
"grad_norm": 9.125,
"learning_rate": 2.4596889379285353e-05,
"loss": 1.7676,
"step": 263
},
{
"epoch": 0.28115015974440893,
"grad_norm": 14.3125,
"learning_rate": 2.455826603574276e-05,
"loss": 1.6981,
"step": 264
},
{
"epoch": 0.2822151224707135,
"grad_norm": 10.9375,
"learning_rate": 2.451953570119446e-05,
"loss": 1.1314,
"step": 265
},
{
"epoch": 0.2832800851970181,
"grad_norm": 9.5625,
"learning_rate": 2.4480698809170716e-05,
"loss": 1.5507,
"step": 266
},
{
"epoch": 0.28434504792332266,
"grad_norm": 9.3125,
"learning_rate": 2.4441755794394522e-05,
"loss": 1.9222,
"step": 267
},
{
"epoch": 0.2854100106496273,
"grad_norm": 9.0625,
"learning_rate": 2.4402707092776778e-05,
"loss": 0.9189,
"step": 268
},
{
"epoch": 0.28647497337593186,
"grad_norm": 8.125,
"learning_rate": 2.436355314141139e-05,
"loss": 1.8083,
"step": 269
},
{
"epoch": 0.28753993610223644,
"grad_norm": 9.5,
"learning_rate": 2.4324294378570385e-05,
"loss": 1.4605,
"step": 270
},
{
"epoch": 0.288604898828541,
"grad_norm": 11.1875,
"learning_rate": 2.428493124369902e-05,
"loss": 1.653,
"step": 271
},
{
"epoch": 0.2896698615548456,
"grad_norm": 11.375,
"learning_rate": 2.4245464177410802e-05,
"loss": 1.3704,
"step": 272
},
{
"epoch": 0.29073482428115016,
"grad_norm": 11.0625,
"learning_rate": 2.4205893621482648e-05,
"loss": 1.3454,
"step": 273
},
{
"epoch": 0.29179978700745474,
"grad_norm": 11.75,
"learning_rate": 2.416622001884987e-05,
"loss": 1.9638,
"step": 274
},
{
"epoch": 0.2928647497337593,
"grad_norm": 15.125,
"learning_rate": 2.4126443813601235e-05,
"loss": 1.3706,
"step": 275
},
{
"epoch": 0.2939297124600639,
"grad_norm": 8.3125,
"learning_rate": 2.408656545097401e-05,
"loss": 1.2448,
"step": 276
},
{
"epoch": 0.29499467518636846,
"grad_norm": 20.25,
"learning_rate": 2.4046585377348963e-05,
"loss": 1.3458,
"step": 277
},
{
"epoch": 0.29605963791267303,
"grad_norm": 11.0625,
"learning_rate": 2.400650404024537e-05,
"loss": 1.7146,
"step": 278
},
{
"epoch": 0.2971246006389776,
"grad_norm": 11.6875,
"learning_rate": 2.3966321888316e-05,
"loss": 1.442,
"step": 279
},
{
"epoch": 0.29818956336528224,
"grad_norm": 14.0625,
"learning_rate": 2.3926039371342105e-05,
"loss": 1.6687,
"step": 280
},
{
"epoch": 0.2992545260915868,
"grad_norm": 11.375,
"learning_rate": 2.3885656940228378e-05,
"loss": 1.8262,
"step": 281
},
{
"epoch": 0.3003194888178914,
"grad_norm": 10.5625,
"learning_rate": 2.3845175046997903e-05,
"loss": 0.883,
"step": 282
},
{
"epoch": 0.30138445154419596,
"grad_norm": 9.9375,
"learning_rate": 2.3804594144787105e-05,
"loss": 1.842,
"step": 283
},
{
"epoch": 0.30244941427050054,
"grad_norm": 10.625,
"learning_rate": 2.3763914687840663e-05,
"loss": 1.7852,
"step": 284
},
{
"epoch": 0.3035143769968051,
"grad_norm": 9.1875,
"learning_rate": 2.3723137131506454e-05,
"loss": 1.6978,
"step": 285
},
{
"epoch": 0.3045793397231097,
"grad_norm": 10.0625,
"learning_rate": 2.3682261932230403e-05,
"loss": 1.9347,
"step": 286
},
{
"epoch": 0.30564430244941426,
"grad_norm": 10.6875,
"learning_rate": 2.364128954755144e-05,
"loss": 2.0528,
"step": 287
},
{
"epoch": 0.30670926517571884,
"grad_norm": 11.375,
"learning_rate": 2.360022043609632e-05,
"loss": 1.6198,
"step": 288
},
{
"epoch": 0.3077742279020234,
"grad_norm": 8.0625,
"learning_rate": 2.3559055057574533e-05,
"loss": 1.8185,
"step": 289
},
{
"epoch": 0.308839190628328,
"grad_norm": 9.5625,
"learning_rate": 2.3517793872773135e-05,
"loss": 1.2761,
"step": 290
},
{
"epoch": 0.30990415335463256,
"grad_norm": 21.5,
"learning_rate": 2.3476437343551585e-05,
"loss": 1.4168,
"step": 291
},
{
"epoch": 0.3109691160809372,
"grad_norm": 8.8125,
"learning_rate": 2.3434985932836603e-05,
"loss": 1.5111,
"step": 292
},
{
"epoch": 0.31203407880724177,
"grad_norm": 10.875,
"learning_rate": 2.3393440104616953e-05,
"loss": 1.2663,
"step": 293
},
{
"epoch": 0.31309904153354634,
"grad_norm": 10.1875,
"learning_rate": 2.335180032393828e-05,
"loss": 1.6099,
"step": 294
},
{
"epoch": 0.3141640042598509,
"grad_norm": 10.25,
"learning_rate": 2.331006705689788e-05,
"loss": 2.0177,
"step": 295
},
{
"epoch": 0.3152289669861555,
"grad_norm": 10.875,
"learning_rate": 2.3268240770639508e-05,
"loss": 1.3157,
"step": 296
},
{
"epoch": 0.31629392971246006,
"grad_norm": 11.75,
"learning_rate": 2.322632193334812e-05,
"loss": 1.5085,
"step": 297
},
{
"epoch": 0.31735889243876464,
"grad_norm": 12.625,
"learning_rate": 2.3184311014244663e-05,
"loss": 1.7265,
"step": 298
},
{
"epoch": 0.3184238551650692,
"grad_norm": 10.3125,
"learning_rate": 2.314220848358079e-05,
"loss": 1.5724,
"step": 299
},
{
"epoch": 0.3194888178913738,
"grad_norm": 9.1875,
"learning_rate": 2.310001481263363e-05,
"loss": 1.6024,
"step": 300
},
{
"epoch": 0.32055378061767836,
"grad_norm": 10.625,
"learning_rate": 2.3057730473700472e-05,
"loss": 1.6951,
"step": 301
},
{
"epoch": 0.32161874334398294,
"grad_norm": 10.875,
"learning_rate": 2.3015355940093544e-05,
"loss": 1.5714,
"step": 302
},
{
"epoch": 0.3226837060702875,
"grad_norm": 10.625,
"learning_rate": 2.2972891686134624e-05,
"loss": 1.6869,
"step": 303
},
{
"epoch": 0.32374866879659214,
"grad_norm": 11.9375,
"learning_rate": 2.2930338187149816e-05,
"loss": 1.9157,
"step": 304
},
{
"epoch": 0.3248136315228967,
"grad_norm": 11.375,
"learning_rate": 2.2887695919464172e-05,
"loss": 1.7153,
"step": 305
},
{
"epoch": 0.3258785942492013,
"grad_norm": 8.5625,
"learning_rate": 2.2844965360396405e-05,
"loss": 1.66,
"step": 306
},
{
"epoch": 0.32694355697550587,
"grad_norm": 8.75,
"learning_rate": 2.2802146988253494e-05,
"loss": 1.3719,
"step": 307
},
{
"epoch": 0.32800851970181044,
"grad_norm": 10.375,
"learning_rate": 2.2759241282325384e-05,
"loss": 1.8534,
"step": 308
},
{
"epoch": 0.329073482428115,
"grad_norm": 11.0,
"learning_rate": 2.2716248722879577e-05,
"loss": 1.6755,
"step": 309
},
{
"epoch": 0.3301384451544196,
"grad_norm": 8.9375,
"learning_rate": 2.2673169791155787e-05,
"loss": 1.713,
"step": 310
},
{
"epoch": 0.33120340788072417,
"grad_norm": 11.5,
"learning_rate": 2.2630004969360534e-05,
"loss": 1.6701,
"step": 311
},
{
"epoch": 0.33226837060702874,
"grad_norm": 16.25,
"learning_rate": 2.2586754740661756e-05,
"loss": 1.6506,
"step": 312
},
{
"epoch": 0.3333333333333333,
"grad_norm": 8.75,
"learning_rate": 2.25434195891834e-05,
"loss": 1.5584,
"step": 313
},
{
"epoch": 0.3343982960596379,
"grad_norm": 11.9375,
"learning_rate": 2.25e-05,
"loss": 1.5541,
"step": 314
},
{
"epoch": 0.3354632587859425,
"grad_norm": 9.0,
"learning_rate": 2.245649645913125e-05,
"loss": 1.6149,
"step": 315
},
{
"epoch": 0.3365282215122471,
"grad_norm": 8.375,
"learning_rate": 2.2412909453536553e-05,
"loss": 1.6719,
"step": 316
},
{
"epoch": 0.33759318423855167,
"grad_norm": 9.9375,
"learning_rate": 2.2369239471109594e-05,
"loss": 1.8576,
"step": 317
},
{
"epoch": 0.33865814696485624,
"grad_norm": 10.75,
"learning_rate": 2.2325487000672855e-05,
"loss": 1.4974,
"step": 318
},
{
"epoch": 0.3397231096911608,
"grad_norm": 9.875,
"learning_rate": 2.2281652531972147e-05,
"loss": 1.7744,
"step": 319
},
{
"epoch": 0.3407880724174654,
"grad_norm": 9.3125,
"learning_rate": 2.223773655567115e-05,
"loss": 0.9289,
"step": 320
},
{
"epoch": 0.34185303514376997,
"grad_norm": 8.875,
"learning_rate": 2.2193739563345886e-05,
"loss": 2.0212,
"step": 321
},
{
"epoch": 0.34291799787007454,
"grad_norm": 11.5,
"learning_rate": 2.214966204747924e-05,
"loss": 1.9651,
"step": 322
},
{
"epoch": 0.3439829605963791,
"grad_norm": 10.3125,
"learning_rate": 2.2105504501455456e-05,
"loss": 1.8089,
"step": 323
},
{
"epoch": 0.3450479233226837,
"grad_norm": 9.875,
"learning_rate": 2.2061267419554577e-05,
"loss": 2.0376,
"step": 324
},
{
"epoch": 0.34611288604898827,
"grad_norm": 10.125,
"learning_rate": 2.2016951296946955e-05,
"loss": 2.1548,
"step": 325
},
{
"epoch": 0.34717784877529284,
"grad_norm": 8.25,
"learning_rate": 2.1972556629687674e-05,
"loss": 1.478,
"step": 326
},
{
"epoch": 0.34824281150159747,
"grad_norm": 10.0625,
"learning_rate": 2.1928083914711023e-05,
"loss": 1.6681,
"step": 327
},
{
"epoch": 0.34930777422790205,
"grad_norm": 8.3125,
"learning_rate": 2.1883533649824922e-05,
"loss": 1.8026,
"step": 328
},
{
"epoch": 0.3503727369542066,
"grad_norm": 9.8125,
"learning_rate": 2.1838906333705338e-05,
"loss": 1.6024,
"step": 329
},
{
"epoch": 0.3514376996805112,
"grad_norm": 9.25,
"learning_rate": 2.1794202465890734e-05,
"loss": 1.3394,
"step": 330
},
{
"epoch": 0.35250266240681577,
"grad_norm": 8.0625,
"learning_rate": 2.1749422546776446e-05,
"loss": 1.7381,
"step": 331
},
{
"epoch": 0.35356762513312034,
"grad_norm": 7.78125,
"learning_rate": 2.170456707760909e-05,
"loss": 2.0204,
"step": 332
},
{
"epoch": 0.3546325878594249,
"grad_norm": 11.6875,
"learning_rate": 2.165963656048098e-05,
"loss": 1.9509,
"step": 333
},
{
"epoch": 0.3556975505857295,
"grad_norm": 10.5,
"learning_rate": 2.1614631498324455e-05,
"loss": 1.915,
"step": 334
},
{
"epoch": 0.35676251331203407,
"grad_norm": 11.375,
"learning_rate": 2.1569552394906292e-05,
"loss": 1.4493,
"step": 335
},
{
"epoch": 0.35782747603833864,
"grad_norm": 11.0,
"learning_rate": 2.152439975482205e-05,
"loss": 1.5415,
"step": 336
},
{
"epoch": 0.3588924387646432,
"grad_norm": 10.6875,
"learning_rate": 2.1479174083490443e-05,
"loss": 1.5783,
"step": 337
},
{
"epoch": 0.3599574014909478,
"grad_norm": 12.375,
"learning_rate": 2.1433875887147628e-05,
"loss": 1.5324,
"step": 338
},
{
"epoch": 0.3610223642172524,
"grad_norm": 7.90625,
"learning_rate": 2.13885056728416e-05,
"loss": 1.5981,
"step": 339
},
{
"epoch": 0.362087326943557,
"grad_norm": 7.90625,
"learning_rate": 2.1343063948426495e-05,
"loss": 1.6717,
"step": 340
},
{
"epoch": 0.36315228966986157,
"grad_norm": 15.6875,
"learning_rate": 2.1297551222556887e-05,
"loss": 1.8055,
"step": 341
},
{
"epoch": 0.36421725239616615,
"grad_norm": 11.3125,
"learning_rate": 2.1251968004682112e-05,
"loss": 1.9067,
"step": 342
},
{
"epoch": 0.3652822151224707,
"grad_norm": 9.5625,
"learning_rate": 2.1206314805040573e-05,
"loss": 1.4649,
"step": 343
},
{
"epoch": 0.3663471778487753,
"grad_norm": 10.75,
"learning_rate": 2.1160592134654e-05,
"loss": 1.5144,
"step": 344
},
{
"epoch": 0.36741214057507987,
"grad_norm": 9.6875,
"learning_rate": 2.111480050532177e-05,
"loss": 1.6677,
"step": 345
},
{
"epoch": 0.36847710330138445,
"grad_norm": 9.4375,
"learning_rate": 2.1068940429615138e-05,
"loss": 1.9054,
"step": 346
},
{
"epoch": 0.369542066027689,
"grad_norm": 12.75,
"learning_rate": 2.102301242087152e-05,
"loss": 1.3985,
"step": 347
},
{
"epoch": 0.3706070287539936,
"grad_norm": 9.9375,
"learning_rate": 2.097701699318875e-05,
"loss": 1.8731,
"step": 348
},
{
"epoch": 0.37167199148029817,
"grad_norm": 13.6875,
"learning_rate": 2.0930954661419325e-05,
"loss": 1.9324,
"step": 349
},
{
"epoch": 0.37273695420660274,
"grad_norm": 10.9375,
"learning_rate": 2.088482594116462e-05,
"loss": 1.3877,
"step": 350
},
{
"epoch": 0.3738019169329074,
"grad_norm": 11.25,
"learning_rate": 2.0838631348769142e-05,
"loss": 1.6139,
"step": 351
},
{
"epoch": 0.37486687965921195,
"grad_norm": 8.6875,
"learning_rate": 2.079237140131475e-05,
"loss": 1.5881,
"step": 352
},
{
"epoch": 0.3759318423855165,
"grad_norm": 10.1875,
"learning_rate": 2.0746046616614846e-05,
"loss": 2.0595,
"step": 353
},
{
"epoch": 0.3769968051118211,
"grad_norm": 8.6875,
"learning_rate": 2.0699657513208603e-05,
"loss": 1.4865,
"step": 354
},
{
"epoch": 0.3780617678381257,
"grad_norm": 10.6875,
"learning_rate": 2.065320461035513e-05,
"loss": 1.3383,
"step": 355
},
{
"epoch": 0.37912673056443025,
"grad_norm": 9.6875,
"learning_rate": 2.0606688428027708e-05,
"loss": 1.683,
"step": 356
},
{
"epoch": 0.3801916932907348,
"grad_norm": 12.0625,
"learning_rate": 2.0560109486907912e-05,
"loss": 1.2894,
"step": 357
},
{
"epoch": 0.3812566560170394,
"grad_norm": 14.375,
"learning_rate": 2.0513468308379826e-05,
"loss": 1.491,
"step": 358
},
{
"epoch": 0.38232161874334397,
"grad_norm": 10.5625,
"learning_rate": 2.046676541452419e-05,
"loss": 1.4899,
"step": 359
},
{
"epoch": 0.38338658146964855,
"grad_norm": 9.0625,
"learning_rate": 2.0420001328112558e-05,
"loss": 1.5506,
"step": 360
},
{
"epoch": 0.3844515441959531,
"grad_norm": 11.6875,
"learning_rate": 2.0373176572601443e-05,
"loss": 1.9362,
"step": 361
},
{
"epoch": 0.3855165069222577,
"grad_norm": 11.0,
"learning_rate": 2.032629167212647e-05,
"loss": 1.6449,
"step": 362
},
{
"epoch": 0.3865814696485623,
"grad_norm": 12.0,
"learning_rate": 2.0279347151496484e-05,
"loss": 1.5654,
"step": 363
},
{
"epoch": 0.3876464323748669,
"grad_norm": 9.375,
"learning_rate": 2.023234353618771e-05,
"loss": 2.1523,
"step": 364
},
{
"epoch": 0.3887113951011715,
"grad_norm": 9.4375,
"learning_rate": 2.0185281352337845e-05,
"loss": 1.4507,
"step": 365
},
{
"epoch": 0.38977635782747605,
"grad_norm": 12.4375,
"learning_rate": 2.0138161126740167e-05,
"loss": 1.7295,
"step": 366
},
{
"epoch": 0.3908413205537806,
"grad_norm": 11.8125,
"learning_rate": 2.0090983386837668e-05,
"loss": 1.4096,
"step": 367
},
{
"epoch": 0.3919062832800852,
"grad_norm": 11.125,
"learning_rate": 2.0043748660717107e-05,
"loss": 1.7948,
"step": 368
},
{
"epoch": 0.3929712460063898,
"grad_norm": 12.1875,
"learning_rate": 1.999645747710314e-05,
"loss": 1.6431,
"step": 369
},
{
"epoch": 0.39403620873269435,
"grad_norm": 9.0,
"learning_rate": 1.9949110365352377e-05,
"loss": 1.8226,
"step": 370
},
{
"epoch": 0.3951011714589989,
"grad_norm": 14.875,
"learning_rate": 1.990170785544745e-05,
"loss": 0.9145,
"step": 371
},
{
"epoch": 0.3961661341853035,
"grad_norm": 8.6875,
"learning_rate": 1.985425047799112e-05,
"loss": 1.7916,
"step": 372
},
{
"epoch": 0.3972310969116081,
"grad_norm": 11.5,
"learning_rate": 1.9806738764200293e-05,
"loss": 1.5905,
"step": 373
},
{
"epoch": 0.39829605963791265,
"grad_norm": 9.5625,
"learning_rate": 1.975917324590009e-05,
"loss": 1.5794,
"step": 374
},
{
"epoch": 0.3993610223642173,
"grad_norm": 8.75,
"learning_rate": 1.97115544555179e-05,
"loss": 1.4684,
"step": 375
},
{
"epoch": 0.40042598509052185,
"grad_norm": 11.375,
"learning_rate": 1.966388292607742e-05,
"loss": 1.4792,
"step": 376
},
{
"epoch": 0.4014909478168264,
"grad_norm": 8.9375,
"learning_rate": 1.961615919119268e-05,
"loss": 2.0911,
"step": 377
},
{
"epoch": 0.402555910543131,
"grad_norm": 10.0625,
"learning_rate": 1.9568383785062086e-05,
"loss": 2.1202,
"step": 378
},
{
"epoch": 0.4036208732694356,
"grad_norm": 11.5,
"learning_rate": 1.9520557242462412e-05,
"loss": 1.7334,
"step": 379
},
{
"epoch": 0.40468583599574015,
"grad_norm": 8.625,
"learning_rate": 1.9472680098742838e-05,
"loss": 1.7019,
"step": 380
},
{
"epoch": 0.4057507987220447,
"grad_norm": 10.125,
"learning_rate": 1.9424752889818956e-05,
"loss": 1.5539,
"step": 381
},
{
"epoch": 0.4068157614483493,
"grad_norm": 10.875,
"learning_rate": 1.9376776152166757e-05,
"loss": 1.2953,
"step": 382
},
{
"epoch": 0.4078807241746539,
"grad_norm": 14.4375,
"learning_rate": 1.932875042281664e-05,
"loss": 1.4313,
"step": 383
},
{
"epoch": 0.40894568690095845,
"grad_norm": 10.25,
"learning_rate": 1.9280676239347392e-05,
"loss": 1.6241,
"step": 384
},
{
"epoch": 0.410010649627263,
"grad_norm": 9.0625,
"learning_rate": 1.923255413988018e-05,
"loss": 1.2076,
"step": 385
},
{
"epoch": 0.4110756123535676,
"grad_norm": 10.6875,
"learning_rate": 1.9184384663072514e-05,
"loss": 1.6646,
"step": 386
},
{
"epoch": 0.41214057507987223,
"grad_norm": 8.75,
"learning_rate": 1.9136168348112236e-05,
"loss": 1.8323,
"step": 387
},
{
"epoch": 0.4132055378061768,
"grad_norm": 9.6875,
"learning_rate": 1.9087905734711457e-05,
"loss": 1.985,
"step": 388
},
{
"epoch": 0.4142705005324814,
"grad_norm": 10.1875,
"learning_rate": 1.9039597363100542e-05,
"loss": 1.2271,
"step": 389
},
{
"epoch": 0.41533546325878595,
"grad_norm": 11.4375,
"learning_rate": 1.8991243774022065e-05,
"loss": 1.7998,
"step": 390
},
{
"epoch": 0.4164004259850905,
"grad_norm": 11.1875,
"learning_rate": 1.894284550872472e-05,
"loss": 1.7686,
"step": 391
},
{
"epoch": 0.4174653887113951,
"grad_norm": 12.0,
"learning_rate": 1.8894403108957305e-05,
"loss": 1.4314,
"step": 392
},
{
"epoch": 0.4185303514376997,
"grad_norm": 10.25,
"learning_rate": 1.884591711696263e-05,
"loss": 1.6033,
"step": 393
},
{
"epoch": 0.41959531416400425,
"grad_norm": 9.375,
"learning_rate": 1.879738807547146e-05,
"loss": 1.613,
"step": 394
},
{
"epoch": 0.4206602768903088,
"grad_norm": 11.8125,
"learning_rate": 1.8748816527696443e-05,
"loss": 1.1745,
"step": 395
},
{
"epoch": 0.4217252396166134,
"grad_norm": 10.375,
"learning_rate": 1.8700203017326017e-05,
"loss": 1.3751,
"step": 396
},
{
"epoch": 0.422790202342918,
"grad_norm": 12.8125,
"learning_rate": 1.8651548088518328e-05,
"loss": 1.3624,
"step": 397
},
{
"epoch": 0.42385516506922255,
"grad_norm": 9.9375,
"learning_rate": 1.8602852285895148e-05,
"loss": 1.8956,
"step": 398
},
{
"epoch": 0.4249201277955272,
"grad_norm": 10.5625,
"learning_rate": 1.8554116154535774e-05,
"loss": 1.5782,
"step": 399
},
{
"epoch": 0.42598509052183176,
"grad_norm": 10.5,
"learning_rate": 1.850534023997092e-05,
"loss": 0.929,
"step": 400
},
{
"epoch": 0.42705005324813633,
"grad_norm": 12.3125,
"learning_rate": 1.8456525088176608e-05,
"loss": 1.9057,
"step": 401
},
{
"epoch": 0.4281150159744409,
"grad_norm": 13.1875,
"learning_rate": 1.8407671245568086e-05,
"loss": 1.5854,
"step": 402
},
{
"epoch": 0.4291799787007455,
"grad_norm": 12.5625,
"learning_rate": 1.8358779258993673e-05,
"loss": 1.5715,
"step": 403
},
{
"epoch": 0.43024494142705005,
"grad_norm": 11.875,
"learning_rate": 1.8309849675728654e-05,
"loss": 1.5288,
"step": 404
},
{
"epoch": 0.43130990415335463,
"grad_norm": 10.0,
"learning_rate": 1.8260883043469165e-05,
"loss": 1.9417,
"step": 405
},
{
"epoch": 0.4323748668796592,
"grad_norm": 10.5,
"learning_rate": 1.8211879910326044e-05,
"loss": 1.8962,
"step": 406
},
{
"epoch": 0.4334398296059638,
"grad_norm": 11.0,
"learning_rate": 1.8162840824818706e-05,
"loss": 1.5027,
"step": 407
},
{
"epoch": 0.43450479233226835,
"grad_norm": 9.625,
"learning_rate": 1.8113766335869004e-05,
"loss": 1.9446,
"step": 408
},
{
"epoch": 0.4355697550585729,
"grad_norm": 12.1875,
"learning_rate": 1.8064656992795076e-05,
"loss": 1.857,
"step": 409
},
{
"epoch": 0.4366347177848775,
"grad_norm": 14.125,
"learning_rate": 1.8015513345305205e-05,
"loss": 1.7554,
"step": 410
},
{
"epoch": 0.43769968051118213,
"grad_norm": 9.25,
"learning_rate": 1.7966335943491664e-05,
"loss": 1.769,
"step": 411
},
{
"epoch": 0.4387646432374867,
"grad_norm": 10.125,
"learning_rate": 1.7917125337824546e-05,
"loss": 1.4604,
"step": 412
},
{
"epoch": 0.4398296059637913,
"grad_norm": 9.875,
"learning_rate": 1.786788207914563e-05,
"loss": 1.7425,
"step": 413
},
{
"epoch": 0.44089456869009586,
"grad_norm": 8.375,
"learning_rate": 1.7818606718662193e-05,
"loss": 1.5833,
"step": 414
},
{
"epoch": 0.44195953141640043,
"grad_norm": 8.5625,
"learning_rate": 1.7769299807940835e-05,
"loss": 1.7705,
"step": 415
},
{
"epoch": 0.443024494142705,
"grad_norm": 10.0625,
"learning_rate": 1.771996189890133e-05,
"loss": 1.8412,
"step": 416
},
{
"epoch": 0.4440894568690096,
"grad_norm": 10.125,
"learning_rate": 1.7670593543810427e-05,
"loss": 1.1687,
"step": 417
},
{
"epoch": 0.44515441959531415,
"grad_norm": 11.75,
"learning_rate": 1.7621195295275668e-05,
"loss": 1.7284,
"step": 418
},
{
"epoch": 0.44621938232161873,
"grad_norm": 10.875,
"learning_rate": 1.757176770623922e-05,
"loss": 1.5151,
"step": 419
},
{
"epoch": 0.4472843450479233,
"grad_norm": 9.4375,
"learning_rate": 1.752231132997167e-05,
"loss": 1.9066,
"step": 420
},
{
"epoch": 0.4483493077742279,
"grad_norm": 11.4375,
"learning_rate": 1.7472826720065833e-05,
"loss": 1.4873,
"step": 421
},
{
"epoch": 0.4494142705005325,
"grad_norm": 11.0,
"learning_rate": 1.7423314430430564e-05,
"loss": 1.6894,
"step": 422
},
{
"epoch": 0.4504792332268371,
"grad_norm": 9.875,
"learning_rate": 1.737377501528455e-05,
"loss": 1.9422,
"step": 423
},
{
"epoch": 0.45154419595314166,
"grad_norm": 10.4375,
"learning_rate": 1.7324209029150118e-05,
"loss": 1.7035,
"step": 424
},
{
"epoch": 0.45260915867944623,
"grad_norm": 11.625,
"learning_rate": 1.7274617026847e-05,
"loss": 1.6268,
"step": 425
},
{
"epoch": 0.4536741214057508,
"grad_norm": 10.0,
"learning_rate": 1.7224999563486163e-05,
"loss": 1.6705,
"step": 426
},
{
"epoch": 0.4547390841320554,
"grad_norm": 10.6875,
"learning_rate": 1.7175357194463556e-05,
"loss": 1.8731,
"step": 427
},
{
"epoch": 0.45580404685835996,
"grad_norm": 9.3125,
"learning_rate": 1.7125690475453915e-05,
"loss": 1.6503,
"step": 428
},
{
"epoch": 0.45686900958466453,
"grad_norm": 8.3125,
"learning_rate": 1.7075999962404548e-05,
"loss": 2.3099,
"step": 429
},
{
"epoch": 0.4579339723109691,
"grad_norm": 8.3125,
"learning_rate": 1.70262862115291e-05,
"loss": 1.5789,
"step": 430
},
{
"epoch": 0.4589989350372737,
"grad_norm": 10.0,
"learning_rate": 1.697654977930132e-05,
"loss": 1.6735,
"step": 431
},
{
"epoch": 0.46006389776357826,
"grad_norm": 11.375,
"learning_rate": 1.6926791222448854e-05,
"loss": 1.5729,
"step": 432
},
{
"epoch": 0.46112886048988283,
"grad_norm": 10.125,
"learning_rate": 1.6877011097946995e-05,
"loss": 1.8057,
"step": 433
},
{
"epoch": 0.46219382321618746,
"grad_norm": 15.0,
"learning_rate": 1.6827209963012454e-05,
"loss": 1.3789,
"step": 434
},
{
"epoch": 0.46325878594249204,
"grad_norm": 10.8125,
"learning_rate": 1.6777388375097133e-05,
"loss": 1.6362,
"step": 435
},
{
"epoch": 0.4643237486687966,
"grad_norm": 12.625,
"learning_rate": 1.6727546891881862e-05,
"loss": 1.7457,
"step": 436
},
{
"epoch": 0.4653887113951012,
"grad_norm": 8.3125,
"learning_rate": 1.6677686071270175e-05,
"loss": 1.6304,
"step": 437
},
{
"epoch": 0.46645367412140576,
"grad_norm": 8.8125,
"learning_rate": 1.6627806471382065e-05,
"loss": 1.6158,
"step": 438
},
{
"epoch": 0.46751863684771033,
"grad_norm": 13.5625,
"learning_rate": 1.6577908650547732e-05,
"loss": 1.5116,
"step": 439
},
{
"epoch": 0.4685835995740149,
"grad_norm": 7.875,
"learning_rate": 1.6527993167301322e-05,
"loss": 1.8969,
"step": 440
},
{
"epoch": 0.4696485623003195,
"grad_norm": 10.5,
"learning_rate": 1.64780605803747e-05,
"loss": 1.4656,
"step": 441
},
{
"epoch": 0.47071352502662406,
"grad_norm": 8.5625,
"learning_rate": 1.6428111448691177e-05,
"loss": 2.104,
"step": 442
},
{
"epoch": 0.47177848775292863,
"grad_norm": 8.875,
"learning_rate": 1.6378146331359252e-05,
"loss": 1.7346,
"step": 443
},
{
"epoch": 0.4728434504792332,
"grad_norm": 12.0,
"learning_rate": 1.6328165787666368e-05,
"loss": 1.3809,
"step": 444
},
{
"epoch": 0.4739084132055378,
"grad_norm": 12.375,
"learning_rate": 1.627817037707265e-05,
"loss": 1.9848,
"step": 445
},
{
"epoch": 0.4749733759318424,
"grad_norm": 9.8125,
"learning_rate": 1.6228160659204623e-05,
"loss": 1.6315,
"step": 446
},
{
"epoch": 0.476038338658147,
"grad_norm": 11.125,
"learning_rate": 1.6178137193848956e-05,
"loss": 1.552,
"step": 447
},
{
"epoch": 0.47710330138445156,
"grad_norm": 13.0625,
"learning_rate": 1.6128100540946227e-05,
"loss": 1.4892,
"step": 448
},
{
"epoch": 0.47816826411075614,
"grad_norm": 9.75,
"learning_rate": 1.607805126058461e-05,
"loss": 0.9454,
"step": 449
},
{
"epoch": 0.4792332268370607,
"grad_norm": 12.8125,
"learning_rate": 1.6027989912993635e-05,
"loss": 1.8403,
"step": 450
},
{
"epoch": 0.4802981895633653,
"grad_norm": 9.125,
"learning_rate": 1.5977917058537893e-05,
"loss": 1.659,
"step": 451
},
{
"epoch": 0.48136315228966986,
"grad_norm": 9.9375,
"learning_rate": 1.592783325771079e-05,
"loss": 1.1204,
"step": 452
},
{
"epoch": 0.48242811501597443,
"grad_norm": 10.75,
"learning_rate": 1.5877739071128266e-05,
"loss": 1.4801,
"step": 453
},
{
"epoch": 0.483493077742279,
"grad_norm": 11.625,
"learning_rate": 1.5827635059522496e-05,
"loss": 1.6075,
"step": 454
},
{
"epoch": 0.4845580404685836,
"grad_norm": 12.125,
"learning_rate": 1.577752178373564e-05,
"loss": 1.7549,
"step": 455
},
{
"epoch": 0.48562300319488816,
"grad_norm": 11.3125,
"learning_rate": 1.572739980471357e-05,
"loss": 1.3698,
"step": 456
},
{
"epoch": 0.48668796592119273,
"grad_norm": 10.375,
"learning_rate": 1.567726968349956e-05,
"loss": 2.1374,
"step": 457
},
{
"epoch": 0.48775292864749736,
"grad_norm": 11.875,
"learning_rate": 1.5627131981228035e-05,
"loss": 1.8397,
"step": 458
},
{
"epoch": 0.48881789137380194,
"grad_norm": 11.125,
"learning_rate": 1.557698725911827e-05,
"loss": 1.791,
"step": 459
},
{
"epoch": 0.4898828541001065,
"grad_norm": 12.375,
"learning_rate": 1.5526836078468133e-05,
"loss": 1.6973,
"step": 460
},
{
"epoch": 0.4909478168264111,
"grad_norm": 10.0625,
"learning_rate": 1.5476679000647777e-05,
"loss": 1.6784,
"step": 461
},
{
"epoch": 0.49201277955271566,
"grad_norm": 12.5,
"learning_rate": 1.5426516587093348e-05,
"loss": 1.6424,
"step": 462
},
{
"epoch": 0.49307774227902024,
"grad_norm": 15.8125,
"learning_rate": 1.5376349399300748e-05,
"loss": 1.3521,
"step": 463
},
{
"epoch": 0.4941427050053248,
"grad_norm": 9.25,
"learning_rate": 1.53261779988193e-05,
"loss": 1.5164,
"step": 464
},
{
"epoch": 0.4952076677316294,
"grad_norm": 15.625,
"learning_rate": 1.5276002947245486e-05,
"loss": 1.602,
"step": 465
},
{
"epoch": 0.49627263045793396,
"grad_norm": 8.25,
"learning_rate": 1.5225824806216662e-05,
"loss": 1.6166,
"step": 466
},
{
"epoch": 0.49733759318423854,
"grad_norm": 10.25,
"learning_rate": 1.5175644137404763e-05,
"loss": 1.9271,
"step": 467
},
{
"epoch": 0.4984025559105431,
"grad_norm": 8.5625,
"learning_rate": 1.5125461502510014e-05,
"loss": 1.8779,
"step": 468
},
{
"epoch": 0.4994675186368477,
"grad_norm": 15.875,
"learning_rate": 1.5075277463254655e-05,
"loss": 1.2732,
"step": 469
},
{
"epoch": 0.5005324813631523,
"grad_norm": 13.0,
"learning_rate": 1.5025092581376643e-05,
"loss": 1.4483,
"step": 470
},
{
"epoch": 0.5015974440894568,
"grad_norm": 11.9375,
"learning_rate": 1.4974907418623361e-05,
"loss": 1.5915,
"step": 471
},
{
"epoch": 0.5026624068157615,
"grad_norm": 13.4375,
"learning_rate": 1.4924722536745351e-05,
"loss": 1.5104,
"step": 472
},
{
"epoch": 0.503727369542066,
"grad_norm": 11.375,
"learning_rate": 1.4874538497489989e-05,
"loss": 1.7744,
"step": 473
},
{
"epoch": 0.5047923322683706,
"grad_norm": 16.625,
"learning_rate": 1.4824355862595245e-05,
"loss": 1.3057,
"step": 474
},
{
"epoch": 0.5058572949946751,
"grad_norm": 9.4375,
"learning_rate": 1.477417519378334e-05,
"loss": 1.3718,
"step": 475
},
{
"epoch": 0.5069222577209798,
"grad_norm": 8.9375,
"learning_rate": 1.472399705275452e-05,
"loss": 2.0112,
"step": 476
},
{
"epoch": 0.5079872204472844,
"grad_norm": 9.75,
"learning_rate": 1.4673822001180703e-05,
"loss": 1.5658,
"step": 477
},
{
"epoch": 0.5090521831735889,
"grad_norm": 10.6875,
"learning_rate": 1.4623650600699254e-05,
"loss": 1.7109,
"step": 478
},
{
"epoch": 0.5101171458998935,
"grad_norm": 8.0625,
"learning_rate": 1.4573483412906653e-05,
"loss": 1.5009,
"step": 479
},
{
"epoch": 0.5111821086261981,
"grad_norm": 9.375,
"learning_rate": 1.4523320999352228e-05,
"loss": 1.5631,
"step": 480
},
{
"epoch": 0.5122470713525027,
"grad_norm": 9.9375,
"learning_rate": 1.4473163921531868e-05,
"loss": 1.5273,
"step": 481
},
{
"epoch": 0.5133120340788072,
"grad_norm": 12.8125,
"learning_rate": 1.4423012740881726e-05,
"loss": 1.4918,
"step": 482
},
{
"epoch": 0.5143769968051118,
"grad_norm": 13.375,
"learning_rate": 1.4372868018771971e-05,
"loss": 1.6232,
"step": 483
},
{
"epoch": 0.5154419595314164,
"grad_norm": 8.125,
"learning_rate": 1.4322730316500444e-05,
"loss": 1.8868,
"step": 484
},
{
"epoch": 0.516506922257721,
"grad_norm": 10.125,
"learning_rate": 1.4272600195286437e-05,
"loss": 0.9475,
"step": 485
},
{
"epoch": 0.5175718849840255,
"grad_norm": 8.875,
"learning_rate": 1.422247821626436e-05,
"loss": 1.9696,
"step": 486
},
{
"epoch": 0.5186368477103301,
"grad_norm": 11.625,
"learning_rate": 1.4172364940477512e-05,
"loss": 1.8617,
"step": 487
},
{
"epoch": 0.5197018104366348,
"grad_norm": 11.6875,
"learning_rate": 1.4122260928871737e-05,
"loss": 1.7146,
"step": 488
},
{
"epoch": 0.5207667731629393,
"grad_norm": 10.1875,
"learning_rate": 1.4072166742289206e-05,
"loss": 1.7215,
"step": 489
},
{
"epoch": 0.5218317358892439,
"grad_norm": 10.1875,
"learning_rate": 1.402208294146211e-05,
"loss": 1.9426,
"step": 490
},
{
"epoch": 0.5228966986155484,
"grad_norm": 13.375,
"learning_rate": 1.3972010087006364e-05,
"loss": 1.5129,
"step": 491
},
{
"epoch": 0.5239616613418531,
"grad_norm": 8.125,
"learning_rate": 1.392194873941539e-05,
"loss": 1.8564,
"step": 492
},
{
"epoch": 0.5250266240681576,
"grad_norm": 9.5,
"learning_rate": 1.3871899459053769e-05,
"loss": 1.674,
"step": 493
},
{
"epoch": 0.5260915867944622,
"grad_norm": 10.125,
"learning_rate": 1.3821862806151046e-05,
"loss": 1.5183,
"step": 494
},
{
"epoch": 0.5271565495207667,
"grad_norm": 9.625,
"learning_rate": 1.3771839340795383e-05,
"loss": 1.9528,
"step": 495
},
{
"epoch": 0.5282215122470714,
"grad_norm": 11.125,
"learning_rate": 1.3721829622927354e-05,
"loss": 1.8296,
"step": 496
},
{
"epoch": 0.5292864749733759,
"grad_norm": 11.375,
"learning_rate": 1.3671834212333633e-05,
"loss": 1.5489,
"step": 497
},
{
"epoch": 0.5303514376996805,
"grad_norm": 9.25,
"learning_rate": 1.362185366864075e-05,
"loss": 1.6517,
"step": 498
},
{
"epoch": 0.531416400425985,
"grad_norm": 17.25,
"learning_rate": 1.3571888551308827e-05,
"loss": 1.4663,
"step": 499
},
{
"epoch": 0.5324813631522897,
"grad_norm": 9.4375,
"learning_rate": 1.3521939419625304e-05,
"loss": 1.661,
"step": 500
},
{
"epoch": 0.5335463258785943,
"grad_norm": 9.0,
"learning_rate": 1.347200683269868e-05,
"loss": 1.3956,
"step": 501
},
{
"epoch": 0.5346112886048988,
"grad_norm": 10.25,
"learning_rate": 1.3422091349452269e-05,
"loss": 1.6099,
"step": 502
},
{
"epoch": 0.5356762513312034,
"grad_norm": 8.375,
"learning_rate": 1.3372193528617936e-05,
"loss": 1.6831,
"step": 503
},
{
"epoch": 0.536741214057508,
"grad_norm": 9.625,
"learning_rate": 1.3322313928729824e-05,
"loss": 1.2729,
"step": 504
},
{
"epoch": 0.5378061767838126,
"grad_norm": 9.0,
"learning_rate": 1.3272453108118142e-05,
"loss": 1.8599,
"step": 505
},
{
"epoch": 0.5388711395101171,
"grad_norm": 15.4375,
"learning_rate": 1.322261162490287e-05,
"loss": 1.7929,
"step": 506
},
{
"epoch": 0.5399361022364217,
"grad_norm": 8.125,
"learning_rate": 1.3172790036987545e-05,
"loss": 1.5644,
"step": 507
},
{
"epoch": 0.5410010649627263,
"grad_norm": 11.1875,
"learning_rate": 1.3122988902053007e-05,
"loss": 1.4376,
"step": 508
},
{
"epoch": 0.5420660276890309,
"grad_norm": 10.5,
"learning_rate": 1.3073208777551152e-05,
"loss": 1.4278,
"step": 509
},
{
"epoch": 0.5431309904153354,
"grad_norm": 9.0,
"learning_rate": 1.3023450220698683e-05,
"loss": 1.926,
"step": 510
},
{
"epoch": 0.54419595314164,
"grad_norm": 9.4375,
"learning_rate": 1.2973713788470907e-05,
"loss": 1.7574,
"step": 511
},
{
"epoch": 0.5452609158679447,
"grad_norm": 9.25,
"learning_rate": 1.2924000037595453e-05,
"loss": 2.1237,
"step": 512
},
{
"epoch": 0.5463258785942492,
"grad_norm": 14.5,
"learning_rate": 1.2874309524546085e-05,
"loss": 1.6285,
"step": 513
},
{
"epoch": 0.5473908413205538,
"grad_norm": 11.75,
"learning_rate": 1.282464280553645e-05,
"loss": 1.606,
"step": 514
},
{
"epoch": 0.5484558040468583,
"grad_norm": 9.4375,
"learning_rate": 1.277500043651384e-05,
"loss": 1.4522,
"step": 515
},
{
"epoch": 0.549520766773163,
"grad_norm": 12.75,
"learning_rate": 1.2725382973153003e-05,
"loss": 1.6253,
"step": 516
},
{
"epoch": 0.5505857294994675,
"grad_norm": 9.875,
"learning_rate": 1.2675790970849885e-05,
"loss": 1.8144,
"step": 517
},
{
"epoch": 0.5516506922257721,
"grad_norm": 11.875,
"learning_rate": 1.2626224984715451e-05,
"loss": 1.6275,
"step": 518
},
{
"epoch": 0.5527156549520766,
"grad_norm": 9.8125,
"learning_rate": 1.2576685569569438e-05,
"loss": 1.7244,
"step": 519
},
{
"epoch": 0.5537806176783813,
"grad_norm": 10.5,
"learning_rate": 1.2527173279934173e-05,
"loss": 1.7188,
"step": 520
},
{
"epoch": 0.5548455804046858,
"grad_norm": 9.5625,
"learning_rate": 1.2477688670028331e-05,
"loss": 1.5563,
"step": 521
},
{
"epoch": 0.5559105431309904,
"grad_norm": 10.6875,
"learning_rate": 1.2428232293760784e-05,
"loss": 1.9121,
"step": 522
},
{
"epoch": 0.556975505857295,
"grad_norm": 8.375,
"learning_rate": 1.2378804704724331e-05,
"loss": 1.4518,
"step": 523
},
{
"epoch": 0.5580404685835996,
"grad_norm": 10.0625,
"learning_rate": 1.2329406456189574e-05,
"loss": 1.3654,
"step": 524
},
{
"epoch": 0.5591054313099042,
"grad_norm": 8.625,
"learning_rate": 1.2280038101098671e-05,
"loss": 1.6945,
"step": 525
},
{
"epoch": 0.5601703940362087,
"grad_norm": 9.8125,
"learning_rate": 1.2230700192059162e-05,
"loss": 1.3019,
"step": 526
},
{
"epoch": 0.5612353567625133,
"grad_norm": 9.9375,
"learning_rate": 1.218139328133781e-05,
"loss": 2.0614,
"step": 527
},
{
"epoch": 0.5623003194888179,
"grad_norm": 12.125,
"learning_rate": 1.213211792085437e-05,
"loss": 1.4044,
"step": 528
},
{
"epoch": 0.5633652822151225,
"grad_norm": 8.6875,
"learning_rate": 1.208287466217546e-05,
"loss": 1.6953,
"step": 529
},
{
"epoch": 0.564430244941427,
"grad_norm": 7.9375,
"learning_rate": 1.203366405650834e-05,
"loss": 1.8574,
"step": 530
},
{
"epoch": 0.5654952076677316,
"grad_norm": 9.125,
"learning_rate": 1.19844866546948e-05,
"loss": 1.4788,
"step": 531
},
{
"epoch": 0.5665601703940362,
"grad_norm": 10.75,
"learning_rate": 1.1935343007204925e-05,
"loss": 1.8245,
"step": 532
},
{
"epoch": 0.5676251331203408,
"grad_norm": 8.625,
"learning_rate": 1.1886233664130999e-05,
"loss": 1.5421,
"step": 533
},
{
"epoch": 0.5686900958466453,
"grad_norm": 11.6875,
"learning_rate": 1.1837159175181296e-05,
"loss": 1.1918,
"step": 534
},
{
"epoch": 0.56975505857295,
"grad_norm": 13.75,
"learning_rate": 1.1788120089673963e-05,
"loss": 1.3899,
"step": 535
},
{
"epoch": 0.5708200212992546,
"grad_norm": 12.0,
"learning_rate": 1.1739116956530839e-05,
"loss": 1.389,
"step": 536
},
{
"epoch": 0.5718849840255591,
"grad_norm": 12.4375,
"learning_rate": 1.1690150324271345e-05,
"loss": 1.7149,
"step": 537
},
{
"epoch": 0.5729499467518637,
"grad_norm": 12.875,
"learning_rate": 1.1641220741006331e-05,
"loss": 1.951,
"step": 538
},
{
"epoch": 0.5740149094781682,
"grad_norm": 11.875,
"learning_rate": 1.1592328754431911e-05,
"loss": 1.2455,
"step": 539
},
{
"epoch": 0.5750798722044729,
"grad_norm": 8.4375,
"learning_rate": 1.1543474911823391e-05,
"loss": 1.3843,
"step": 540
},
{
"epoch": 0.5761448349307774,
"grad_norm": 9.9375,
"learning_rate": 1.1494659760029085e-05,
"loss": 1.5953,
"step": 541
},
{
"epoch": 0.577209797657082,
"grad_norm": 9.4375,
"learning_rate": 1.1445883845464229e-05,
"loss": 1.5503,
"step": 542
},
{
"epoch": 0.5782747603833865,
"grad_norm": 9.3125,
"learning_rate": 1.1397147714104853e-05,
"loss": 2.1053,
"step": 543
},
{
"epoch": 0.5793397231096912,
"grad_norm": 9.375,
"learning_rate": 1.134845191148168e-05,
"loss": 1.7168,
"step": 544
},
{
"epoch": 0.5804046858359957,
"grad_norm": 9.1875,
"learning_rate": 1.1299796982673988e-05,
"loss": 1.7581,
"step": 545
},
{
"epoch": 0.5814696485623003,
"grad_norm": 8.875,
"learning_rate": 1.1251183472303562e-05,
"loss": 1.5051,
"step": 546
},
{
"epoch": 0.582534611288605,
"grad_norm": 9.0,
"learning_rate": 1.120261192452854e-05,
"loss": 1.747,
"step": 547
},
{
"epoch": 0.5835995740149095,
"grad_norm": 8.25,
"learning_rate": 1.1154082883037371e-05,
"loss": 1.3058,
"step": 548
},
{
"epoch": 0.5846645367412141,
"grad_norm": 8.6875,
"learning_rate": 1.1105596891042699e-05,
"loss": 1.606,
"step": 549
},
{
"epoch": 0.5857294994675186,
"grad_norm": 10.3125,
"learning_rate": 1.1057154491275281e-05,
"loss": 1.5261,
"step": 550
},
{
"epoch": 0.5867944621938233,
"grad_norm": 9.6875,
"learning_rate": 1.1008756225977936e-05,
"loss": 1.6968,
"step": 551
},
{
"epoch": 0.5878594249201278,
"grad_norm": 9.5,
"learning_rate": 1.0960402636899457e-05,
"loss": 1.1682,
"step": 552
},
{
"epoch": 0.5889243876464324,
"grad_norm": 10.125,
"learning_rate": 1.091209426528855e-05,
"loss": 1.745,
"step": 553
},
{
"epoch": 0.5899893503727369,
"grad_norm": 13.875,
"learning_rate": 1.0863831651887768e-05,
"loss": 1.3198,
"step": 554
},
{
"epoch": 0.5910543130990416,
"grad_norm": 12.5625,
"learning_rate": 1.081561533692749e-05,
"loss": 1.7697,
"step": 555
},
{
"epoch": 0.5921192758253461,
"grad_norm": 18.5,
"learning_rate": 1.0767445860119822e-05,
"loss": 1.9429,
"step": 556
},
{
"epoch": 0.5931842385516507,
"grad_norm": 10.125,
"learning_rate": 1.0719323760652612e-05,
"loss": 1.7703,
"step": 557
},
{
"epoch": 0.5942492012779552,
"grad_norm": 7.78125,
"learning_rate": 1.0671249577183364e-05,
"loss": 1.782,
"step": 558
},
{
"epoch": 0.5953141640042598,
"grad_norm": 10.3125,
"learning_rate": 1.062322384783325e-05,
"loss": 1.6762,
"step": 559
},
{
"epoch": 0.5963791267305645,
"grad_norm": 9.625,
"learning_rate": 1.0575247110181048e-05,
"loss": 1.4526,
"step": 560
},
{
"epoch": 0.597444089456869,
"grad_norm": 10.9375,
"learning_rate": 1.0527319901257161e-05,
"loss": 1.9204,
"step": 561
},
{
"epoch": 0.5985090521831736,
"grad_norm": 9.125,
"learning_rate": 1.047944275753759e-05,
"loss": 0.9891,
"step": 562
},
{
"epoch": 0.5995740149094781,
"grad_norm": 9.4375,
"learning_rate": 1.0431616214937911e-05,
"loss": 1.7877,
"step": 563
},
{
"epoch": 0.6006389776357828,
"grad_norm": 9.125,
"learning_rate": 1.038384080880732e-05,
"loss": 1.9058,
"step": 564
},
{
"epoch": 0.6017039403620873,
"grad_norm": 12.6875,
"learning_rate": 1.033611707392258e-05,
"loss": 1.8118,
"step": 565
},
{
"epoch": 0.6027689030883919,
"grad_norm": 10.25,
"learning_rate": 1.0288445544482105e-05,
"loss": 1.3733,
"step": 566
},
{
"epoch": 0.6038338658146964,
"grad_norm": 9.6875,
"learning_rate": 1.0240826754099914e-05,
"loss": 1.5815,
"step": 567
},
{
"epoch": 0.6048988285410011,
"grad_norm": 11.125,
"learning_rate": 1.0193261235799713e-05,
"loss": 1.7796,
"step": 568
},
{
"epoch": 0.6059637912673056,
"grad_norm": 18.375,
"learning_rate": 1.0145749522008881e-05,
"loss": 1.5525,
"step": 569
},
{
"epoch": 0.6070287539936102,
"grad_norm": 10.5625,
"learning_rate": 1.009829214455255e-05,
"loss": 1.6614,
"step": 570
},
{
"epoch": 0.6080937167199149,
"grad_norm": 8.3125,
"learning_rate": 1.0050889634647629e-05,
"loss": 1.5261,
"step": 571
},
{
"epoch": 0.6091586794462194,
"grad_norm": 11.875,
"learning_rate": 1.0003542522896859e-05,
"loss": 1.3385,
"step": 572
},
{
"epoch": 0.610223642172524,
"grad_norm": 10.0,
"learning_rate": 9.956251339282895e-06,
"loss": 1.8288,
"step": 573
},
{
"epoch": 0.6112886048988285,
"grad_norm": 13.125,
"learning_rate": 9.909016613162334e-06,
"loss": 2.1504,
"step": 574
},
{
"epoch": 0.6123535676251332,
"grad_norm": 10.4375,
"learning_rate": 9.861838873259835e-06,
"loss": 1.5278,
"step": 575
},
{
"epoch": 0.6134185303514377,
"grad_norm": 10.25,
"learning_rate": 9.814718647662158e-06,
"loss": 1.5204,
"step": 576
},
{
"epoch": 0.6144834930777423,
"grad_norm": 10.9375,
"learning_rate": 9.767656463812292e-06,
"loss": 1.3771,
"step": 577
},
{
"epoch": 0.6155484558040468,
"grad_norm": 9.625,
"learning_rate": 9.720652848503519e-06,
"loss": 1.5326,
"step": 578
},
{
"epoch": 0.6166134185303515,
"grad_norm": 9.3125,
"learning_rate": 9.673708327873535e-06,
"loss": 2.0136,
"step": 579
},
{
"epoch": 0.617678381256656,
"grad_norm": 9.875,
"learning_rate": 9.62682342739856e-06,
"loss": 1.7406,
"step": 580
},
{
"epoch": 0.6187433439829606,
"grad_norm": 9.4375,
"learning_rate": 9.57999867188745e-06,
"loss": 1.8908,
"step": 581
},
{
"epoch": 0.6198083067092651,
"grad_norm": 9.3125,
"learning_rate": 9.533234585475814e-06,
"loss": 1.565,
"step": 582
},
{
"epoch": 0.6208732694355698,
"grad_norm": 16.0,
"learning_rate": 9.486531691620182e-06,
"loss": 1.4218,
"step": 583
},
{
"epoch": 0.6219382321618744,
"grad_norm": 10.375,
"learning_rate": 9.439890513092092e-06,
"loss": 1.4015,
"step": 584
},
{
"epoch": 0.6230031948881789,
"grad_norm": 9.75,
"learning_rate": 9.393311571972293e-06,
"loss": 1.678,
"step": 585
},
{
"epoch": 0.6240681576144835,
"grad_norm": 10.25,
"learning_rate": 9.34679538964487e-06,
"loss": 1.8353,
"step": 586
},
{
"epoch": 0.625133120340788,
"grad_norm": 11.5,
"learning_rate": 9.300342486791401e-06,
"loss": 1.9182,
"step": 587
},
{
"epoch": 0.6261980830670927,
"grad_norm": 11.375,
"learning_rate": 9.253953383385158e-06,
"loss": 1.0814,
"step": 588
},
{
"epoch": 0.6272630457933972,
"grad_norm": 12.0625,
"learning_rate": 9.207628598685253e-06,
"loss": 1.3614,
"step": 589
},
{
"epoch": 0.6283280085197018,
"grad_norm": 11.9375,
"learning_rate": 9.161368651230862e-06,
"loss": 1.6114,
"step": 590
},
{
"epoch": 0.6293929712460063,
"grad_norm": 11.875,
"learning_rate": 9.115174058835386e-06,
"loss": 1.4484,
"step": 591
},
{
"epoch": 0.630457933972311,
"grad_norm": 9.9375,
"learning_rate": 9.069045338580684e-06,
"loss": 1.6065,
"step": 592
},
{
"epoch": 0.6315228966986155,
"grad_norm": 9.125,
"learning_rate": 9.02298300681125e-06,
"loss": 1.6245,
"step": 593
},
{
"epoch": 0.6325878594249201,
"grad_norm": 10.0625,
"learning_rate": 8.976987579128486e-06,
"loss": 1.2913,
"step": 594
},
{
"epoch": 0.6336528221512248,
"grad_norm": 9.625,
"learning_rate": 8.931059570384864e-06,
"loss": 1.6383,
"step": 595
},
{
"epoch": 0.6347177848775293,
"grad_norm": 8.1875,
"learning_rate": 8.88519949467823e-06,
"loss": 1.5182,
"step": 596
},
{
"epoch": 0.6357827476038339,
"grad_norm": 9.9375,
"learning_rate": 8.839407865345999e-06,
"loss": 1.2956,
"step": 597
},
{
"epoch": 0.6368477103301384,
"grad_norm": 11.0625,
"learning_rate": 8.79368519495943e-06,
"loss": 2.0363,
"step": 598
},
{
"epoch": 0.6379126730564431,
"grad_norm": 8.5,
"learning_rate": 8.748031995317887e-06,
"loss": 1.6898,
"step": 599
},
{
"epoch": 0.6389776357827476,
"grad_norm": 10.5625,
"learning_rate": 8.702448777443115e-06,
"loss": 1.6978,
"step": 600
},
{
"epoch": 0.6400425985090522,
"grad_norm": 10.125,
"learning_rate": 8.656936051573505e-06,
"loss": 1.6635,
"step": 601
},
{
"epoch": 0.6411075612353567,
"grad_norm": 11.625,
"learning_rate": 8.611494327158398e-06,
"loss": 1.2385,
"step": 602
},
{
"epoch": 0.6421725239616614,
"grad_norm": 7.5625,
"learning_rate": 8.56612411285238e-06,
"loss": 1.6086,
"step": 603
},
{
"epoch": 0.6432374866879659,
"grad_norm": 9.9375,
"learning_rate": 8.520825916509557e-06,
"loss": 1.5494,
"step": 604
},
{
"epoch": 0.6443024494142705,
"grad_norm": 9.0625,
"learning_rate": 8.475600245177951e-06,
"loss": 1.6019,
"step": 605
},
{
"epoch": 0.645367412140575,
"grad_norm": 9.0,
"learning_rate": 8.430447605093707e-06,
"loss": 1.9869,
"step": 606
},
{
"epoch": 0.6464323748668797,
"grad_norm": 8.0625,
"learning_rate": 8.385368501675551e-06,
"loss": 2.1738,
"step": 607
},
{
"epoch": 0.6474973375931843,
"grad_norm": 8.875,
"learning_rate": 8.340363439519021e-06,
"loss": 1.7293,
"step": 608
},
{
"epoch": 0.6485623003194888,
"grad_norm": 9.125,
"learning_rate": 8.295432922390905e-06,
"loss": 1.8153,
"step": 609
},
{
"epoch": 0.6496272630457934,
"grad_norm": 11.0,
"learning_rate": 8.250577453223561e-06,
"loss": 1.2735,
"step": 610
},
{
"epoch": 0.650692225772098,
"grad_norm": 9.3125,
"learning_rate": 8.205797534109265e-06,
"loss": 1.5776,
"step": 611
},
{
"epoch": 0.6517571884984026,
"grad_norm": 12.6875,
"learning_rate": 8.161093666294664e-06,
"loss": 1.2921,
"step": 612
},
{
"epoch": 0.6528221512247071,
"grad_norm": 8.875,
"learning_rate": 8.116466350175079e-06,
"loss": 1.8095,
"step": 613
},
{
"epoch": 0.6538871139510117,
"grad_norm": 8.6875,
"learning_rate": 8.071916085288981e-06,
"loss": 1.9529,
"step": 614
},
{
"epoch": 0.6549520766773163,
"grad_norm": 9.6875,
"learning_rate": 8.027443370312326e-06,
"loss": 1.6288,
"step": 615
},
{
"epoch": 0.6560170394036209,
"grad_norm": 11.375,
"learning_rate": 7.983048703053055e-06,
"loss": 1.9875,
"step": 616
},
{
"epoch": 0.6570820021299254,
"grad_norm": 10.875,
"learning_rate": 7.938732580445422e-06,
"loss": 1.6334,
"step": 617
},
{
"epoch": 0.65814696485623,
"grad_norm": 10.1875,
"learning_rate": 7.894495498544551e-06,
"loss": 1.6786,
"step": 618
},
{
"epoch": 0.6592119275825347,
"grad_norm": 9.625,
"learning_rate": 7.850337952520763e-06,
"loss": 1.6683,
"step": 619
},
{
"epoch": 0.6602768903088392,
"grad_norm": 12.0625,
"learning_rate": 7.806260436654116e-06,
"loss": 2.3544,
"step": 620
},
{
"epoch": 0.6613418530351438,
"grad_norm": 10.5625,
"learning_rate": 7.762263444328856e-06,
"loss": 1.3748,
"step": 621
},
{
"epoch": 0.6624068157614483,
"grad_norm": 7.59375,
"learning_rate": 7.71834746802785e-06,
"loss": 1.307,
"step": 622
},
{
"epoch": 0.663471778487753,
"grad_norm": 9.1875,
"learning_rate": 7.674512999327149e-06,
"loss": 1.7112,
"step": 623
},
{
"epoch": 0.6645367412140575,
"grad_norm": 9.25,
"learning_rate": 7.630760528890403e-06,
"loss": 1.7411,
"step": 624
},
{
"epoch": 0.6656017039403621,
"grad_norm": 8.625,
"learning_rate": 7.587090546463447e-06,
"loss": 1.5324,
"step": 625
},
{
"epoch": 0.6666666666666666,
"grad_norm": 8.8125,
"learning_rate": 7.5435035408687504e-06,
"loss": 1.5691,
"step": 626
},
{
"epoch": 0.6677316293929713,
"grad_norm": 9.75,
"learning_rate": 7.500000000000004e-06,
"loss": 1.6352,
"step": 627
},
{
"epoch": 0.6687965921192758,
"grad_norm": 9.125,
"learning_rate": 7.456580410816604e-06,
"loss": 1.5628,
"step": 628
},
{
"epoch": 0.6698615548455804,
"grad_norm": 10.125,
"learning_rate": 7.41324525933825e-06,
"loss": 1.5523,
"step": 629
},
{
"epoch": 0.670926517571885,
"grad_norm": 11.0,
"learning_rate": 7.3699950306394715e-06,
"loss": 1.6238,
"step": 630
},
{
"epoch": 0.6719914802981896,
"grad_norm": 11.0,
"learning_rate": 7.3268302088442125e-06,
"loss": 1.9907,
"step": 631
},
{
"epoch": 0.6730564430244942,
"grad_norm": 8.8125,
"learning_rate": 7.283751277120427e-06,
"loss": 1.7915,
"step": 632
},
{
"epoch": 0.6741214057507987,
"grad_norm": 10.0,
"learning_rate": 7.2407587176746146e-06,
"loss": 1.6269,
"step": 633
},
{
"epoch": 0.6751863684771033,
"grad_norm": 10.75,
"learning_rate": 7.197853011746506e-06,
"loss": 1.2853,
"step": 634
},
{
"epoch": 0.6762513312034079,
"grad_norm": 12.5,
"learning_rate": 7.1550346396035975e-06,
"loss": 1.6906,
"step": 635
},
{
"epoch": 0.6773162939297125,
"grad_norm": 9.625,
"learning_rate": 7.112304080535827e-06,
"loss": 1.7357,
"step": 636
},
{
"epoch": 0.678381256656017,
"grad_norm": 10.8125,
"learning_rate": 7.069661812850188e-06,
"loss": 1.548,
"step": 637
},
{
"epoch": 0.6794462193823216,
"grad_norm": 12.375,
"learning_rate": 7.027108313865379e-06,
"loss": 1.9071,
"step": 638
},
{
"epoch": 0.6805111821086262,
"grad_norm": 8.3125,
"learning_rate": 6.984644059906461e-06,
"loss": 1.6673,
"step": 639
},
{
"epoch": 0.6815761448349308,
"grad_norm": 10.6875,
"learning_rate": 6.942269526299527e-06,
"loss": 1.4935,
"step": 640
},
{
"epoch": 0.6826411075612353,
"grad_norm": 12.375,
"learning_rate": 6.899985187366376e-06,
"loss": 2.1448,
"step": 641
},
{
"epoch": 0.6837060702875399,
"grad_norm": 11.3125,
"learning_rate": 6.857791516419212e-06,
"loss": 1.418,
"step": 642
},
{
"epoch": 0.6847710330138446,
"grad_norm": 10.0625,
"learning_rate": 6.815688985755341e-06,
"loss": 1.5875,
"step": 643
},
{
"epoch": 0.6858359957401491,
"grad_norm": 10.1875,
"learning_rate": 6.773678066651881e-06,
"loss": 1.8208,
"step": 644
},
{
"epoch": 0.6869009584664537,
"grad_norm": 12.1875,
"learning_rate": 6.731759229360494e-06,
"loss": 1.4882,
"step": 645
},
{
"epoch": 0.6879659211927582,
"grad_norm": 11.9375,
"learning_rate": 6.6899329431021215e-06,
"loss": 1.7406,
"step": 646
},
{
"epoch": 0.6890308839190629,
"grad_norm": 9.0625,
"learning_rate": 6.648199676061724e-06,
"loss": 1.717,
"step": 647
},
{
"epoch": 0.6900958466453674,
"grad_norm": 8.8125,
"learning_rate": 6.606559895383051e-06,
"loss": 1.633,
"step": 648
},
{
"epoch": 0.691160809371672,
"grad_norm": 10.5,
"learning_rate": 6.5650140671634e-06,
"loss": 1.0872,
"step": 649
},
{
"epoch": 0.6922257720979765,
"grad_norm": 14.0625,
"learning_rate": 6.523562656448417e-06,
"loss": 1.5847,
"step": 650
},
{
"epoch": 0.6932907348242812,
"grad_norm": 9.75,
"learning_rate": 6.4822061272268696e-06,
"loss": 0.9841,
"step": 651
},
{
"epoch": 0.6943556975505857,
"grad_norm": 8.125,
"learning_rate": 6.440944942425469e-06,
"loss": 1.7512,
"step": 652
},
{
"epoch": 0.6954206602768903,
"grad_norm": 12.125,
"learning_rate": 6.399779563903683e-06,
"loss": 1.3146,
"step": 653
},
{
"epoch": 0.6964856230031949,
"grad_norm": 9.375,
"learning_rate": 6.358710452448566e-06,
"loss": 1.6926,
"step": 654
},
{
"epoch": 0.6975505857294995,
"grad_norm": 8.375,
"learning_rate": 6.317738067769599e-06,
"loss": 1.4995,
"step": 655
},
{
"epoch": 0.6986155484558041,
"grad_norm": 10.0625,
"learning_rate": 6.2768628684935496e-06,
"loss": 1.5856,
"step": 656
},
{
"epoch": 0.6996805111821086,
"grad_norm": 8.1875,
"learning_rate": 6.236085312159335e-06,
"loss": 1.4653,
"step": 657
},
{
"epoch": 0.7007454739084132,
"grad_norm": 11.125,
"learning_rate": 6.195405855212896e-06,
"loss": 1.8539,
"step": 658
},
{
"epoch": 0.7018104366347178,
"grad_norm": 10.4375,
"learning_rate": 6.154824953002098e-06,
"loss": 1.4134,
"step": 659
},
{
"epoch": 0.7028753993610224,
"grad_norm": 8.625,
"learning_rate": 6.114343059771625e-06,
"loss": 1.6557,
"step": 660
},
{
"epoch": 0.7039403620873269,
"grad_norm": 10.125,
"learning_rate": 6.073960628657896e-06,
"loss": 1.4981,
"step": 661
},
{
"epoch": 0.7050053248136315,
"grad_norm": 10.1875,
"learning_rate": 6.033678111684001e-06,
"loss": 1.9017,
"step": 662
},
{
"epoch": 0.7060702875399361,
"grad_norm": 10.0,
"learning_rate": 5.9934959597546315e-06,
"loss": 1.7662,
"step": 663
},
{
"epoch": 0.7071352502662407,
"grad_norm": 8.875,
"learning_rate": 5.953414622651037e-06,
"loss": 1.5215,
"step": 664
},
{
"epoch": 0.7082002129925452,
"grad_norm": 11.0,
"learning_rate": 5.913434549025989e-06,
"loss": 1.6729,
"step": 665
},
{
"epoch": 0.7092651757188498,
"grad_norm": 9.3125,
"learning_rate": 5.873556186398771e-06,
"loss": 1.7875,
"step": 666
},
{
"epoch": 0.7103301384451545,
"grad_norm": 9.8125,
"learning_rate": 5.833779981150133e-06,
"loss": 1.8491,
"step": 667
},
{
"epoch": 0.711395101171459,
"grad_norm": 10.375,
"learning_rate": 5.7941063785173535e-06,
"loss": 1.6454,
"step": 668
},
{
"epoch": 0.7124600638977636,
"grad_norm": 12.0,
"learning_rate": 5.754535822589197e-06,
"loss": 1.5219,
"step": 669
},
{
"epoch": 0.7135250266240681,
"grad_norm": 8.1875,
"learning_rate": 5.715068756300985e-06,
"loss": 1.9882,
"step": 670
},
{
"epoch": 0.7145899893503728,
"grad_norm": 11.0,
"learning_rate": 5.675705621429611e-06,
"loss": 1.5543,
"step": 671
},
{
"epoch": 0.7156549520766773,
"grad_norm": 10.1875,
"learning_rate": 5.636446858588611e-06,
"loss": 1.4343,
"step": 672
},
{
"epoch": 0.7167199148029819,
"grad_norm": 8.875,
"learning_rate": 5.597292907223229e-06,
"loss": 1.7156,
"step": 673
},
{
"epoch": 0.7177848775292864,
"grad_norm": 10.375,
"learning_rate": 5.55824420560548e-06,
"loss": 1.8069,
"step": 674
},
{
"epoch": 0.7188498402555911,
"grad_norm": 10.9375,
"learning_rate": 5.51930119082929e-06,
"loss": 2.1811,
"step": 675
},
{
"epoch": 0.7199148029818956,
"grad_norm": 9.8125,
"learning_rate": 5.480464298805539e-06,
"loss": 1.5952,
"step": 676
},
{
"epoch": 0.7209797657082002,
"grad_norm": 11.75,
"learning_rate": 5.441733964257246e-06,
"loss": 1.7754,
"step": 677
},
{
"epoch": 0.7220447284345048,
"grad_norm": 8.625,
"learning_rate": 5.403110620714647e-06,
"loss": 1.4991,
"step": 678
},
{
"epoch": 0.7231096911608094,
"grad_norm": 10.0625,
"learning_rate": 5.3645947005103874e-06,
"loss": 1.5927,
"step": 679
},
{
"epoch": 0.724174653887114,
"grad_norm": 8.1875,
"learning_rate": 5.326186634774654e-06,
"loss": 1.3676,
"step": 680
},
{
"epoch": 0.7252396166134185,
"grad_norm": 13.4375,
"learning_rate": 5.287886853430362e-06,
"loss": 1.5596,
"step": 681
},
{
"epoch": 0.7263045793397231,
"grad_norm": 9.25,
"learning_rate": 5.249695785188338e-06,
"loss": 1.8652,
"step": 682
},
{
"epoch": 0.7273695420660277,
"grad_norm": 9.8125,
"learning_rate": 5.21161385754251e-06,
"loss": 1.7813,
"step": 683
},
{
"epoch": 0.7284345047923323,
"grad_norm": 11.3125,
"learning_rate": 5.173641496765163e-06,
"loss": 1.2195,
"step": 684
},
{
"epoch": 0.7294994675186368,
"grad_norm": 7.4375,
"learning_rate": 5.135779127902103e-06,
"loss": 1.5703,
"step": 685
},
{
"epoch": 0.7305644302449414,
"grad_norm": 10.25,
"learning_rate": 5.098027174767972e-06,
"loss": 1.623,
"step": 686
},
{
"epoch": 0.731629392971246,
"grad_norm": 9.3125,
"learning_rate": 5.0603860599414324e-06,
"loss": 1.9181,
"step": 687
},
{
"epoch": 0.7326943556975506,
"grad_norm": 7.46875,
"learning_rate": 5.022856204760504e-06,
"loss": 1.7919,
"step": 688
},
{
"epoch": 0.7337593184238551,
"grad_norm": 9.375,
"learning_rate": 4.98543802931778e-06,
"loss": 1.8822,
"step": 689
},
{
"epoch": 0.7348242811501597,
"grad_norm": 9.1875,
"learning_rate": 4.948131952455802e-06,
"loss": 1.8503,
"step": 690
},
{
"epoch": 0.7358892438764644,
"grad_norm": 10.25,
"learning_rate": 4.910938391762287e-06,
"loss": 1.9597,
"step": 691
},
{
"epoch": 0.7369542066027689,
"grad_norm": 10.1875,
"learning_rate": 4.873857763565523e-06,
"loss": 1.5532,
"step": 692
},
{
"epoch": 0.7380191693290735,
"grad_norm": 10.8125,
"learning_rate": 4.8368904829296816e-06,
"loss": 1.6786,
"step": 693
},
{
"epoch": 0.739084132055378,
"grad_norm": 12.0,
"learning_rate": 4.800036963650147e-06,
"loss": 1.7628,
"step": 694
},
{
"epoch": 0.7401490947816827,
"grad_norm": 13.0625,
"learning_rate": 4.7632976182489475e-06,
"loss": 1.1223,
"step": 695
},
{
"epoch": 0.7412140575079872,
"grad_norm": 12.1875,
"learning_rate": 4.726672857970059e-06,
"loss": 1.4561,
"step": 696
},
{
"epoch": 0.7422790202342918,
"grad_norm": 12.0625,
"learning_rate": 4.690163092774878e-06,
"loss": 1.4758,
"step": 697
},
{
"epoch": 0.7433439829605963,
"grad_norm": 8.8125,
"learning_rate": 4.65376873133757e-06,
"loss": 1.442,
"step": 698
},
{
"epoch": 0.744408945686901,
"grad_norm": 10.1875,
"learning_rate": 4.617490181040536e-06,
"loss": 1.5644,
"step": 699
},
{
"epoch": 0.7454739084132055,
"grad_norm": 12.4375,
"learning_rate": 4.581327847969832e-06,
"loss": 1.5819,
"step": 700
},
{
"epoch": 0.7465388711395101,
"grad_norm": 9.5625,
"learning_rate": 4.545282136910635e-06,
"loss": 1.7544,
"step": 701
},
{
"epoch": 0.7476038338658147,
"grad_norm": 11.0625,
"learning_rate": 4.509353451342704e-06,
"loss": 1.518,
"step": 702
},
{
"epoch": 0.7486687965921193,
"grad_norm": 11.625,
"learning_rate": 4.4735421934358625e-06,
"loss": 1.843,
"step": 703
},
{
"epoch": 0.7497337593184239,
"grad_norm": 10.5,
"learning_rate": 4.437848764045515e-06,
"loss": 2.0874,
"step": 704
},
{
"epoch": 0.7507987220447284,
"grad_norm": 9.1875,
"learning_rate": 4.402273562708119e-06,
"loss": 1.7729,
"step": 705
},
{
"epoch": 0.751863684771033,
"grad_norm": 10.875,
"learning_rate": 4.366816987636777e-06,
"loss": 2.045,
"step": 706
},
{
"epoch": 0.7529286474973376,
"grad_norm": 10.25,
"learning_rate": 4.3314794357167e-06,
"loss": 1.1957,
"step": 707
},
{
"epoch": 0.7539936102236422,
"grad_norm": 11.75,
"learning_rate": 4.2962613025008365e-06,
"loss": 1.3515,
"step": 708
},
{
"epoch": 0.7550585729499467,
"grad_norm": 9.25,
"learning_rate": 4.2611629822054035e-06,
"loss": 1.5657,
"step": 709
},
{
"epoch": 0.7561235356762513,
"grad_norm": 8.125,
"learning_rate": 4.226184867705487e-06,
"loss": 1.4376,
"step": 710
},
{
"epoch": 0.7571884984025559,
"grad_norm": 8.875,
"learning_rate": 4.1913273505306385e-06,
"loss": 1.6607,
"step": 711
},
{
"epoch": 0.7582534611288605,
"grad_norm": 10.6875,
"learning_rate": 4.156590820860506e-06,
"loss": 1.5126,
"step": 712
},
{
"epoch": 0.759318423855165,
"grad_norm": 11.5,
"learning_rate": 4.121975667520446e-06,
"loss": 1.7676,
"step": 713
},
{
"epoch": 0.7603833865814696,
"grad_norm": 9.8125,
"learning_rate": 4.087482277977188e-06,
"loss": 1.7903,
"step": 714
},
{
"epoch": 0.7614483493077743,
"grad_norm": 11.625,
"learning_rate": 4.0531110383344906e-06,
"loss": 1.6722,
"step": 715
},
{
"epoch": 0.7625133120340788,
"grad_norm": 10.125,
"learning_rate": 4.018862333328819e-06,
"loss": 1.6057,
"step": 716
},
{
"epoch": 0.7635782747603834,
"grad_norm": 10.0625,
"learning_rate": 3.984736546325043e-06,
"loss": 1.7429,
"step": 717
},
{
"epoch": 0.7646432374866879,
"grad_norm": 12.875,
"learning_rate": 3.9507340593121385e-06,
"loss": 1.6328,
"step": 718
},
{
"epoch": 0.7657082002129926,
"grad_norm": 9.0625,
"learning_rate": 3.916855252898917e-06,
"loss": 1.5328,
"step": 719
},
{
"epoch": 0.7667731629392971,
"grad_norm": 8.5625,
"learning_rate": 3.883100506309763e-06,
"loss": 1.5779,
"step": 720
},
{
"epoch": 0.7678381256656017,
"grad_norm": 10.875,
"learning_rate": 3.849470197380397e-06,
"loss": 1.3155,
"step": 721
},
{
"epoch": 0.7689030883919062,
"grad_norm": 8.875,
"learning_rate": 3.815964702553632e-06,
"loss": 1.9236,
"step": 722
},
{
"epoch": 0.7699680511182109,
"grad_norm": 9.75,
"learning_rate": 3.7825843968751665e-06,
"loss": 1.295,
"step": 723
},
{
"epoch": 0.7710330138445154,
"grad_norm": 9.125,
"learning_rate": 3.749329653989393e-06,
"loss": 1.4935,
"step": 724
},
{
"epoch": 0.77209797657082,
"grad_norm": 12.0,
"learning_rate": 3.7162008461352055e-06,
"loss": 1.1767,
"step": 725
},
{
"epoch": 0.7731629392971247,
"grad_norm": 8.6875,
"learning_rate": 3.6831983441418366e-06,
"loss": 2.067,
"step": 726
},
{
"epoch": 0.7742279020234292,
"grad_norm": 10.5625,
"learning_rate": 3.650322517424708e-06,
"loss": 1.853,
"step": 727
},
{
"epoch": 0.7752928647497338,
"grad_norm": 8.6875,
"learning_rate": 3.6175737339812968e-06,
"loss": 1.5211,
"step": 728
},
{
"epoch": 0.7763578274760383,
"grad_norm": 7.0625,
"learning_rate": 3.584952360387009e-06,
"loss": 1.5024,
"step": 729
},
{
"epoch": 0.777422790202343,
"grad_norm": 10.25,
"learning_rate": 3.5524587617910844e-06,
"loss": 1.3889,
"step": 730
},
{
"epoch": 0.7784877529286475,
"grad_norm": 10.9375,
"learning_rate": 3.520093301912505e-06,
"loss": 1.665,
"step": 731
},
{
"epoch": 0.7795527156549521,
"grad_norm": 9.125,
"learning_rate": 3.4878563430359246e-06,
"loss": 1.5885,
"step": 732
},
{
"epoch": 0.7806176783812566,
"grad_norm": 10.0625,
"learning_rate": 3.4557482460076145e-06,
"loss": 1.747,
"step": 733
},
{
"epoch": 0.7816826411075612,
"grad_norm": 10.8125,
"learning_rate": 3.4237693702314215e-06,
"loss": 1.6977,
"step": 734
},
{
"epoch": 0.7827476038338658,
"grad_norm": 9.5625,
"learning_rate": 3.3919200736647476e-06,
"loss": 2.0445,
"step": 735
},
{
"epoch": 0.7838125665601704,
"grad_norm": 8.5,
"learning_rate": 3.3602007128145485e-06,
"loss": 1.1119,
"step": 736
},
{
"epoch": 0.784877529286475,
"grad_norm": 7.90625,
"learning_rate": 3.328611642733316e-06,
"loss": 1.547,
"step": 737
},
{
"epoch": 0.7859424920127795,
"grad_norm": 10.0625,
"learning_rate": 3.297153217015155e-06,
"loss": 1.6143,
"step": 738
},
{
"epoch": 0.7870074547390842,
"grad_norm": 14.0625,
"learning_rate": 3.265825787791774e-06,
"loss": 1.0104,
"step": 739
},
{
"epoch": 0.7880724174653887,
"grad_norm": 11.5625,
"learning_rate": 3.234629705728571e-06,
"loss": 1.8247,
"step": 740
},
{
"epoch": 0.7891373801916933,
"grad_norm": 8.5,
"learning_rate": 3.203565320020701e-06,
"loss": 1.6653,
"step": 741
},
{
"epoch": 0.7902023429179978,
"grad_norm": 8.625,
"learning_rate": 3.1726329783891688e-06,
"loss": 1.3578,
"step": 742
},
{
"epoch": 0.7912673056443025,
"grad_norm": 9.3125,
"learning_rate": 3.1418330270769375e-06,
"loss": 1.3851,
"step": 743
},
{
"epoch": 0.792332268370607,
"grad_norm": 11.75,
"learning_rate": 3.1111658108450465e-06,
"loss": 1.4159,
"step": 744
},
{
"epoch": 0.7933972310969116,
"grad_norm": 8.4375,
"learning_rate": 3.080631672968769e-06,
"loss": 1.7524,
"step": 745
},
{
"epoch": 0.7944621938232161,
"grad_norm": 12.8125,
"learning_rate": 3.050230955233733e-06,
"loss": 1.5065,
"step": 746
},
{
"epoch": 0.7955271565495208,
"grad_norm": 8.3125,
"learning_rate": 3.019963997932157e-06,
"loss": 1.6124,
"step": 747
},
{
"epoch": 0.7965921192758253,
"grad_norm": 9.5625,
"learning_rate": 2.9898311398589674e-06,
"loss": 1.5844,
"step": 748
},
{
"epoch": 0.7976570820021299,
"grad_norm": 8.1875,
"learning_rate": 2.959832718308077e-06,
"loss": 1.0975,
"step": 749
},
{
"epoch": 0.7987220447284346,
"grad_norm": 11.8125,
"learning_rate": 2.929969069068539e-06,
"loss": 1.8824,
"step": 750
},
{
"epoch": 0.7997870074547391,
"grad_norm": 11.375,
"learning_rate": 2.900240526420861e-06,
"loss": 1.56,
"step": 751
},
{
"epoch": 0.8008519701810437,
"grad_norm": 10.9375,
"learning_rate": 2.8706474231332064e-06,
"loss": 1.9439,
"step": 752
},
{
"epoch": 0.8019169329073482,
"grad_norm": 7.5625,
"learning_rate": 2.8411900904576916e-06,
"loss": 1.373,
"step": 753
},
{
"epoch": 0.8029818956336529,
"grad_norm": 12.6875,
"learning_rate": 2.811868858126686e-06,
"loss": 1.4297,
"step": 754
},
{
"epoch": 0.8040468583599574,
"grad_norm": 11.875,
"learning_rate": 2.7826840543490932e-06,
"loss": 1.7362,
"step": 755
},
{
"epoch": 0.805111821086262,
"grad_norm": 9.25,
"learning_rate": 2.753636005806725e-06,
"loss": 1.8195,
"step": 756
},
{
"epoch": 0.8061767838125665,
"grad_norm": 12.5,
"learning_rate": 2.7247250376505823e-06,
"loss": 1.6419,
"step": 757
},
{
"epoch": 0.8072417465388712,
"grad_norm": 15.625,
"learning_rate": 2.69595147349728e-06,
"loss": 1.4818,
"step": 758
},
{
"epoch": 0.8083067092651757,
"grad_norm": 12.8125,
"learning_rate": 2.667315635425366e-06,
"loss": 1.4912,
"step": 759
},
{
"epoch": 0.8093716719914803,
"grad_norm": 8.3125,
"learning_rate": 2.6388178439717696e-06,
"loss": 1.8378,
"step": 760
},
{
"epoch": 0.8104366347177849,
"grad_norm": 11.0,
"learning_rate": 2.610458418128158e-06,
"loss": 0.8448,
"step": 761
},
{
"epoch": 0.8115015974440895,
"grad_norm": 9.9375,
"learning_rate": 2.5822376753374215e-06,
"loss": 1.8725,
"step": 762
},
{
"epoch": 0.8125665601703941,
"grad_norm": 9.5625,
"learning_rate": 2.554155931490085e-06,
"loss": 1.6764,
"step": 763
},
{
"epoch": 0.8136315228966986,
"grad_norm": 17.0,
"learning_rate": 2.526213500920766e-06,
"loss": 1.4876,
"step": 764
},
{
"epoch": 0.8146964856230032,
"grad_norm": 7.96875,
"learning_rate": 2.498410696404698e-06,
"loss": 1.7144,
"step": 765
},
{
"epoch": 0.8157614483493077,
"grad_norm": 8.5625,
"learning_rate": 2.47074782915417e-06,
"loss": 1.6642,
"step": 766
},
{
"epoch": 0.8168264110756124,
"grad_norm": 8.875,
"learning_rate": 2.443225208815111e-06,
"loss": 1.9021,
"step": 767
},
{
"epoch": 0.8178913738019169,
"grad_norm": 8.8125,
"learning_rate": 2.4158431434635525e-06,
"loss": 1.4165,
"step": 768
},
{
"epoch": 0.8189563365282215,
"grad_norm": 12.375,
"learning_rate": 2.388601939602246e-06,
"loss": 0.9623,
"step": 769
},
{
"epoch": 0.820021299254526,
"grad_norm": 10.625,
"learning_rate": 2.3615019021571798e-06,
"loss": 1.8855,
"step": 770
},
{
"epoch": 0.8210862619808307,
"grad_norm": 11.3125,
"learning_rate": 2.3345433344741984e-06,
"loss": 1.9843,
"step": 771
},
{
"epoch": 0.8221512247071352,
"grad_norm": 11.125,
"learning_rate": 2.3077265383155937e-06,
"loss": 1.8292,
"step": 772
},
{
"epoch": 0.8232161874334398,
"grad_norm": 9.3125,
"learning_rate": 2.281051813856732e-06,
"loss": 1.7709,
"step": 773
},
{
"epoch": 0.8242811501597445,
"grad_norm": 11.4375,
"learning_rate": 2.2545194596826867e-06,
"loss": 1.6551,
"step": 774
},
{
"epoch": 0.825346112886049,
"grad_norm": 14.3125,
"learning_rate": 2.2281297727849042e-06,
"loss": 2.0816,
"step": 775
},
{
"epoch": 0.8264110756123536,
"grad_norm": 10.375,
"learning_rate": 2.201883048557885e-06,
"loss": 1.7194,
"step": 776
},
{
"epoch": 0.8274760383386581,
"grad_norm": 11.375,
"learning_rate": 2.175779580795848e-06,
"loss": 1.5737,
"step": 777
},
{
"epoch": 0.8285410010649628,
"grad_norm": 11.875,
"learning_rate": 2.1498196616894867e-06,
"loss": 1.7461,
"step": 778
},
{
"epoch": 0.8296059637912673,
"grad_norm": 8.5,
"learning_rate": 2.1240035818226546e-06,
"loss": 0.9217,
"step": 779
},
{
"epoch": 0.8306709265175719,
"grad_norm": 12.6875,
"learning_rate": 2.09833163016914e-06,
"loss": 1.483,
"step": 780
},
{
"epoch": 0.8317358892438764,
"grad_norm": 14.8125,
"learning_rate": 2.0728040940894277e-06,
"loss": 1.5439,
"step": 781
},
{
"epoch": 0.832800851970181,
"grad_norm": 9.375,
"learning_rate": 2.047421259327472e-06,
"loss": 1.7316,
"step": 782
},
{
"epoch": 0.8338658146964856,
"grad_norm": 12.3125,
"learning_rate": 2.0221834100075086e-06,
"loss": 0.6799,
"step": 783
},
{
"epoch": 0.8349307774227902,
"grad_norm": 8.8125,
"learning_rate": 1.99709082863087e-06,
"loss": 1.2157,
"step": 784
},
{
"epoch": 0.8359957401490948,
"grad_norm": 8.3125,
"learning_rate": 1.9721437960728183e-06,
"loss": 1.783,
"step": 785
},
{
"epoch": 0.8370607028753994,
"grad_norm": 10.9375,
"learning_rate": 1.9473425915794108e-06,
"loss": 2.1057,
"step": 786
},
{
"epoch": 0.838125665601704,
"grad_norm": 7.9375,
"learning_rate": 1.922687492764379e-06,
"loss": 1.8039,
"step": 787
},
{
"epoch": 0.8391906283280085,
"grad_norm": 7.90625,
"learning_rate": 1.8981787756059933e-06,
"loss": 1.6254,
"step": 788
},
{
"epoch": 0.8402555910543131,
"grad_norm": 12.3125,
"learning_rate": 1.8738167144440026e-06,
"loss": 1.3783,
"step": 789
},
{
"epoch": 0.8413205537806177,
"grad_norm": 8.0625,
"learning_rate": 1.8496015819765548e-06,
"loss": 1.6897,
"step": 790
},
{
"epoch": 0.8423855165069223,
"grad_norm": 11.0,
"learning_rate": 1.8255336492571394e-06,
"loss": 1.3614,
"step": 791
},
{
"epoch": 0.8434504792332268,
"grad_norm": 7.78125,
"learning_rate": 1.8016131856915608e-06,
"loss": 1.8157,
"step": 792
},
{
"epoch": 0.8445154419595314,
"grad_norm": 14.75,
"learning_rate": 1.7778404590349135e-06,
"loss": 1.3761,
"step": 793
},
{
"epoch": 0.845580404685836,
"grad_norm": 10.8125,
"learning_rate": 1.754215735388595e-06,
"loss": 1.9235,
"step": 794
},
{
"epoch": 0.8466453674121406,
"grad_norm": 20.875,
"learning_rate": 1.7307392791973204e-06,
"loss": 1.0706,
"step": 795
},
{
"epoch": 0.8477103301384451,
"grad_norm": 14.9375,
"learning_rate": 1.7074113532461644e-06,
"loss": 1.6008,
"step": 796
},
{
"epoch": 0.8487752928647497,
"grad_norm": 8.4375,
"learning_rate": 1.6842322186576208e-06,
"loss": 1.3219,
"step": 797
},
{
"epoch": 0.8498402555910544,
"grad_norm": 12.125,
"learning_rate": 1.6612021348886775e-06,
"loss": 1.831,
"step": 798
},
{
"epoch": 0.8509052183173589,
"grad_norm": 9.3125,
"learning_rate": 1.6383213597279146e-06,
"loss": 1.5059,
"step": 799
},
{
"epoch": 0.8519701810436635,
"grad_norm": 9.625,
"learning_rate": 1.615590149292618e-06,
"loss": 1.371,
"step": 800
},
{
"epoch": 0.853035143769968,
"grad_norm": 10.25,
"learning_rate": 1.5930087580259089e-06,
"loss": 1.5541,
"step": 801
},
{
"epoch": 0.8541001064962727,
"grad_norm": 9.875,
"learning_rate": 1.5705774386939027e-06,
"loss": 1.5982,
"step": 802
},
{
"epoch": 0.8551650692225772,
"grad_norm": 13.1875,
"learning_rate": 1.5482964423828738e-06,
"loss": 1.4818,
"step": 803
},
{
"epoch": 0.8562300319488818,
"grad_norm": 12.1875,
"learning_rate": 1.5261660184964488e-06,
"loss": 1.4705,
"step": 804
},
{
"epoch": 0.8572949946751863,
"grad_norm": 13.5625,
"learning_rate": 1.50418641475281e-06,
"loss": 1.1884,
"step": 805
},
{
"epoch": 0.858359957401491,
"grad_norm": 9.9375,
"learning_rate": 1.4823578771819308e-06,
"loss": 1.7592,
"step": 806
},
{
"epoch": 0.8594249201277955,
"grad_norm": 9.5625,
"learning_rate": 1.4606806501228098e-06,
"loss": 1.8618,
"step": 807
},
{
"epoch": 0.8604898828541001,
"grad_norm": 11.5625,
"learning_rate": 1.439154976220753e-06,
"loss": 1.872,
"step": 808
},
{
"epoch": 0.8615548455804047,
"grad_norm": 11.125,
"learning_rate": 1.417781096424629e-06,
"loss": 1.7073,
"step": 809
},
{
"epoch": 0.8626198083067093,
"grad_norm": 15.625,
"learning_rate": 1.3965592499842133e-06,
"loss": 1.5956,
"step": 810
},
{
"epoch": 0.8636847710330139,
"grad_norm": 8.0625,
"learning_rate": 1.3754896744474704e-06,
"loss": 1.6528,
"step": 811
},
{
"epoch": 0.8647497337593184,
"grad_norm": 9.0,
"learning_rate": 1.3545726056579199e-06,
"loss": 1.6145,
"step": 812
},
{
"epoch": 0.865814696485623,
"grad_norm": 8.4375,
"learning_rate": 1.3338082777519822e-06,
"loss": 2.4227,
"step": 813
},
{
"epoch": 0.8668796592119276,
"grad_norm": 14.4375,
"learning_rate": 1.31319692315637e-06,
"loss": 1.1167,
"step": 814
},
{
"epoch": 0.8679446219382322,
"grad_norm": 8.3125,
"learning_rate": 1.2927387725854761e-06,
"loss": 1.7869,
"step": 815
},
{
"epoch": 0.8690095846645367,
"grad_norm": 9.5,
"learning_rate": 1.2724340550387963e-06,
"loss": 1.538,
"step": 816
},
{
"epoch": 0.8700745473908413,
"grad_norm": 9.0,
"learning_rate": 1.2522829977983691e-06,
"loss": 1.8242,
"step": 817
},
{
"epoch": 0.8711395101171459,
"grad_norm": 8.4375,
"learning_rate": 1.2322858264262133e-06,
"loss": 1.3644,
"step": 818
},
{
"epoch": 0.8722044728434505,
"grad_norm": 10.125,
"learning_rate": 1.2124427647618392e-06,
"loss": 1.755,
"step": 819
},
{
"epoch": 0.873269435569755,
"grad_norm": 10.5625,
"learning_rate": 1.1927540349196986e-06,
"loss": 1.7762,
"step": 820
},
{
"epoch": 0.8743343982960596,
"grad_norm": 12.0,
"learning_rate": 1.173219857286742e-06,
"loss": 1.2137,
"step": 821
},
{
"epoch": 0.8753993610223643,
"grad_norm": 11.3125,
"learning_rate": 1.1538404505199102e-06,
"loss": 1.9446,
"step": 822
},
{
"epoch": 0.8764643237486688,
"grad_norm": 9.0,
"learning_rate": 1.1346160315437282e-06,
"loss": 1.9131,
"step": 823
},
{
"epoch": 0.8775292864749734,
"grad_norm": 8.75,
"learning_rate": 1.1155468155478387e-06,
"loss": 1.6933,
"step": 824
},
{
"epoch": 0.8785942492012779,
"grad_norm": 8.5625,
"learning_rate": 1.096633015984621e-06,
"loss": 1.6843,
"step": 825
},
{
"epoch": 0.8796592119275826,
"grad_norm": 8.5625,
"learning_rate": 1.0778748445667907e-06,
"loss": 1.0095,
"step": 826
},
{
"epoch": 0.8807241746538871,
"grad_norm": 9.625,
"learning_rate": 1.0592725112650204e-06,
"loss": 1.5605,
"step": 827
},
{
"epoch": 0.8817891373801917,
"grad_norm": 9.9375,
"learning_rate": 1.040826224305616e-06,
"loss": 1.4877,
"step": 828
},
{
"epoch": 0.8828541001064962,
"grad_norm": 9.3125,
"learning_rate": 1.022536190168153e-06,
"loss": 1.7958,
"step": 829
},
{
"epoch": 0.8839190628328009,
"grad_norm": 9.875,
"learning_rate": 1.0044026135832018e-06,
"loss": 1.2038,
"step": 830
},
{
"epoch": 0.8849840255591054,
"grad_norm": 13.0,
"learning_rate": 9.864256975299912e-07,
"loss": 1.5369,
"step": 831
},
{
"epoch": 0.88604898828541,
"grad_norm": 10.5,
"learning_rate": 9.686056432341872e-07,
"loss": 1.8342,
"step": 832
},
{
"epoch": 0.8871139510117146,
"grad_norm": 10.8125,
"learning_rate": 9.509426501655921e-07,
"loss": 1.683,
"step": 833
},
{
"epoch": 0.8881789137380192,
"grad_norm": 9.4375,
"learning_rate": 9.334369160359463e-07,
"loss": 1.6419,
"step": 834
},
{
"epoch": 0.8892438764643238,
"grad_norm": 8.875,
"learning_rate": 9.16088636796708e-07,
"loss": 1.6393,
"step": 835
},
{
"epoch": 0.8903088391906283,
"grad_norm": 15.0,
"learning_rate": 8.988980066368357e-07,
"loss": 1.4805,
"step": 836
},
{
"epoch": 0.8913738019169329,
"grad_norm": 10.6875,
"learning_rate": 8.818652179806591e-07,
"loss": 1.5142,
"step": 837
},
{
"epoch": 0.8924387646432375,
"grad_norm": 9.4375,
"learning_rate": 8.649904614856746e-07,
"loss": 1.1254,
"step": 838
},
{
"epoch": 0.8935037273695421,
"grad_norm": 9.8125,
"learning_rate": 8.482739260404604e-07,
"loss": 1.5179,
"step": 839
},
{
"epoch": 0.8945686900958466,
"grad_norm": 8.8125,
"learning_rate": 8.317157987625146e-07,
"loss": 1.254,
"step": 840
},
{
"epoch": 0.8956336528221512,
"grad_norm": 12.125,
"learning_rate": 8.153162649962054e-07,
"loss": 1.655,
"step": 841
},
{
"epoch": 0.8966986155484558,
"grad_norm": 11.1875,
"learning_rate": 7.99075508310661e-07,
"loss": 1.5181,
"step": 842
},
{
"epoch": 0.8977635782747604,
"grad_norm": 10.4375,
"learning_rate": 7.829937104977347e-07,
"loss": 1.6125,
"step": 843
},
{
"epoch": 0.898828541001065,
"grad_norm": 9.0625,
"learning_rate": 7.670710515699647e-07,
"loss": 1.4513,
"step": 844
},
{
"epoch": 0.8998935037273695,
"grad_norm": 12.9375,
"learning_rate": 7.513077097585558e-07,
"loss": 1.2881,
"step": 845
},
{
"epoch": 0.9009584664536742,
"grad_norm": 10.75,
"learning_rate": 7.357038615113959e-07,
"loss": 1.9436,
"step": 846
},
{
"epoch": 0.9020234291799787,
"grad_norm": 10.6875,
"learning_rate": 7.202596814910561e-07,
"loss": 1.6863,
"step": 847
},
{
"epoch": 0.9030883919062833,
"grad_norm": 11.0,
"learning_rate": 7.049753425728723e-07,
"loss": 1.4833,
"step": 848
},
{
"epoch": 0.9041533546325878,
"grad_norm": 10.3125,
"learning_rate": 6.89851015842971e-07,
"loss": 1.5175,
"step": 849
},
{
"epoch": 0.9052183173588925,
"grad_norm": 8.1875,
"learning_rate": 6.74886870596389e-07,
"loss": 1.6937,
"step": 850
},
{
"epoch": 0.906283280085197,
"grad_norm": 8.5,
"learning_rate": 6.600830743351482e-07,
"loss": 1.3861,
"step": 851
},
{
"epoch": 0.9073482428115016,
"grad_norm": 9.75,
"learning_rate": 6.454397927664035e-07,
"loss": 1.7092,
"step": 852
},
{
"epoch": 0.9084132055378061,
"grad_norm": 12.25,
"learning_rate": 6.309571898005784e-07,
"loss": 1.4632,
"step": 853
},
{
"epoch": 0.9094781682641108,
"grad_norm": 11.0,
"learning_rate": 6.166354275495284e-07,
"loss": 1.6058,
"step": 854
},
{
"epoch": 0.9105431309904153,
"grad_norm": 9.6875,
"learning_rate": 6.02474666324731e-07,
"loss": 1.6195,
"step": 855
},
{
"epoch": 0.9116080937167199,
"grad_norm": 9.8125,
"learning_rate": 5.884750646354903e-07,
"loss": 1.9317,
"step": 856
},
{
"epoch": 0.9126730564430245,
"grad_norm": 9.75,
"learning_rate": 5.746367791871582e-07,
"loss": 1.5125,
"step": 857
},
{
"epoch": 0.9137380191693291,
"grad_norm": 11.0,
"learning_rate": 5.609599648793878e-07,
"loss": 1.8878,
"step": 858
},
{
"epoch": 0.9148029818956337,
"grad_norm": 8.875,
"learning_rate": 5.474447748043931e-07,
"loss": 1.4361,
"step": 859
},
{
"epoch": 0.9158679446219382,
"grad_norm": 9.0625,
"learning_rate": 5.340913602452385e-07,
"loss": 1.5905,
"step": 860
},
{
"epoch": 0.9169329073482428,
"grad_norm": 9.3125,
"learning_rate": 5.208998706741469e-07,
"loss": 1.8177,
"step": 861
},
{
"epoch": 0.9179978700745474,
"grad_norm": 8.9375,
"learning_rate": 5.078704537508194e-07,
"loss": 1.1538,
"step": 862
},
{
"epoch": 0.919062832800852,
"grad_norm": 9.625,
"learning_rate": 4.950032553207934e-07,
"loss": 0.8619,
"step": 863
},
{
"epoch": 0.9201277955271565,
"grad_norm": 10.5,
"learning_rate": 4.822984194138003e-07,
"loss": 1.3963,
"step": 864
},
{
"epoch": 0.9211927582534611,
"grad_norm": 9.3125,
"learning_rate": 4.6975608824215866e-07,
"loss": 2.0253,
"step": 865
},
{
"epoch": 0.9222577209797657,
"grad_norm": 11.125,
"learning_rate": 4.5737640219917885e-07,
"loss": 1.6444,
"step": 866
},
{
"epoch": 0.9233226837060703,
"grad_norm": 15.4375,
"learning_rate": 4.451594998575975e-07,
"loss": 1.637,
"step": 867
},
{
"epoch": 0.9243876464323749,
"grad_norm": 11.25,
"learning_rate": 4.331055179680188e-07,
"loss": 1.6452,
"step": 868
},
{
"epoch": 0.9254526091586794,
"grad_norm": 11.0625,
"learning_rate": 4.212145914573906e-07,
"loss": 1.1187,
"step": 869
},
{
"epoch": 0.9265175718849841,
"grad_norm": 9.5,
"learning_rate": 4.0948685342748595e-07,
"loss": 1.5924,
"step": 870
},
{
"epoch": 0.9275825346112886,
"grad_norm": 9.125,
"learning_rate": 3.9792243515342387e-07,
"loss": 1.5951,
"step": 871
},
{
"epoch": 0.9286474973375932,
"grad_norm": 9.0625,
"learning_rate": 3.865214660821892e-07,
"loss": 1.6178,
"step": 872
},
{
"epoch": 0.9297124600638977,
"grad_norm": 12.25,
"learning_rate": 3.7528407383119355e-07,
"loss": 1.6325,
"step": 873
},
{
"epoch": 0.9307774227902024,
"grad_norm": 12.625,
"learning_rate": 3.642103841868383e-07,
"loss": 1.4013,
"step": 874
},
{
"epoch": 0.9318423855165069,
"grad_norm": 9.4375,
"learning_rate": 3.533005211031104e-07,
"loss": 1.7131,
"step": 875
},
{
"epoch": 0.9329073482428115,
"grad_norm": 8.9375,
"learning_rate": 3.4255460670019723e-07,
"loss": 1.6834,
"step": 876
},
{
"epoch": 0.933972310969116,
"grad_norm": 10.0625,
"learning_rate": 3.3197276126311404e-07,
"loss": 1.8832,
"step": 877
},
{
"epoch": 0.9350372736954207,
"grad_norm": 8.4375,
"learning_rate": 3.2155510324036354e-07,
"loss": 1.5813,
"step": 878
},
{
"epoch": 0.9361022364217252,
"grad_norm": 8.0625,
"learning_rate": 3.1130174924260345e-07,
"loss": 1.7615,
"step": 879
},
{
"epoch": 0.9371671991480298,
"grad_norm": 9.75,
"learning_rate": 3.012128140413495e-07,
"loss": 1.9889,
"step": 880
},
{
"epoch": 0.9382321618743344,
"grad_norm": 8.8125,
"learning_rate": 2.9128841056767943e-07,
"loss": 1.8032,
"step": 881
},
{
"epoch": 0.939297124600639,
"grad_norm": 9.25,
"learning_rate": 2.815286499109826e-07,
"loss": 1.287,
"step": 882
},
{
"epoch": 0.9403620873269436,
"grad_norm": 11.875,
"learning_rate": 2.719336413177076e-07,
"loss": 1.4744,
"step": 883
},
{
"epoch": 0.9414270500532481,
"grad_norm": 9.3125,
"learning_rate": 2.6250349219013813e-07,
"loss": 1.9576,
"step": 884
},
{
"epoch": 0.9424920127795527,
"grad_norm": 9.8125,
"learning_rate": 2.5323830808519575e-07,
"loss": 1.7786,
"step": 885
},
{
"epoch": 0.9435569755058573,
"grad_norm": 10.3125,
"learning_rate": 2.4413819271325576e-07,
"loss": 1.5379,
"step": 886
},
{
"epoch": 0.9446219382321619,
"grad_norm": 10.8125,
"learning_rate": 2.3520324793698977e-07,
"loss": 1.546,
"step": 887
},
{
"epoch": 0.9456869009584664,
"grad_norm": 9.9375,
"learning_rate": 2.2643357377022166e-07,
"loss": 1.8081,
"step": 888
},
{
"epoch": 0.946751863684771,
"grad_norm": 11.1875,
"learning_rate": 2.1782926837680518e-07,
"loss": 1.6002,
"step": 889
},
{
"epoch": 0.9478168264110756,
"grad_norm": 10.5,
"learning_rate": 2.09390428069533e-07,
"loss": 1.2529,
"step": 890
},
{
"epoch": 0.9488817891373802,
"grad_norm": 10.0,
"learning_rate": 2.0111714730905783e-07,
"loss": 1.8403,
"step": 891
},
{
"epoch": 0.9499467518636848,
"grad_norm": 12.1875,
"learning_rate": 1.9300951870282136e-07,
"loss": 2.0224,
"step": 892
},
{
"epoch": 0.9510117145899893,
"grad_norm": 12.8125,
"learning_rate": 1.850676330040385e-07,
"loss": 2.1045,
"step": 893
},
{
"epoch": 0.952076677316294,
"grad_norm": 8.875,
"learning_rate": 1.7729157911066994e-07,
"loss": 1.7303,
"step": 894
},
{
"epoch": 0.9531416400425985,
"grad_norm": 10.3125,
"learning_rate": 1.6968144406442288e-07,
"loss": 1.1697,
"step": 895
},
{
"epoch": 0.9542066027689031,
"grad_norm": 8.1875,
"learning_rate": 1.6223731304978838e-07,
"loss": 1.6273,
"step": 896
},
{
"epoch": 0.9552715654952076,
"grad_norm": 13.3125,
"learning_rate": 1.549592693930757e-07,
"loss": 1.2158,
"step": 897
},
{
"epoch": 0.9563365282215123,
"grad_norm": 9.4375,
"learning_rate": 1.4784739456149442e-07,
"loss": 1.3284,
"step": 898
},
{
"epoch": 0.9574014909478168,
"grad_norm": 14.8125,
"learning_rate": 1.4090176816222211e-07,
"loss": 1.3294,
"step": 899
},
{
"epoch": 0.9584664536741214,
"grad_norm": 9.875,
"learning_rate": 1.3412246794153481e-07,
"loss": 1.8441,
"step": 900
},
{
"epoch": 0.9595314164004259,
"grad_norm": 12.25,
"learning_rate": 1.2750956978392124e-07,
"loss": 1.4858,
"step": 901
},
{
"epoch": 0.9605963791267306,
"grad_norm": 10.125,
"learning_rate": 1.2106314771124171e-07,
"loss": 1.1116,
"step": 902
},
{
"epoch": 0.9616613418530351,
"grad_norm": 12.25,
"learning_rate": 1.1478327388189547e-07,
"loss": 1.6537,
"step": 903
},
{
"epoch": 0.9627263045793397,
"grad_norm": 9.5,
"learning_rate": 1.0867001859001801e-07,
"loss": 2.0677,
"step": 904
},
{
"epoch": 0.9637912673056444,
"grad_norm": 8.0625,
"learning_rate": 1.0272345026468177e-07,
"loss": 1.1597,
"step": 905
},
{
"epoch": 0.9648562300319489,
"grad_norm": 11.0625,
"learning_rate": 9.694363546914664e-08,
"loss": 1.4244,
"step": 906
},
{
"epoch": 0.9659211927582535,
"grad_norm": 10.6875,
"learning_rate": 9.133063890010729e-08,
"loss": 1.0121,
"step": 907
},
{
"epoch": 0.966986155484558,
"grad_norm": 8.125,
"learning_rate": 8.588452338696206e-08,
"loss": 1.5185,
"step": 908
},
{
"epoch": 0.9680511182108626,
"grad_norm": 9.75,
"learning_rate": 8.060534989112688e-08,
"loss": 1.5174,
"step": 909
},
{
"epoch": 0.9691160809371672,
"grad_norm": 11.3125,
"learning_rate": 7.549317750533246e-08,
"loss": 1.413,
"step": 910
},
{
"epoch": 0.9701810436634718,
"grad_norm": 8.875,
"learning_rate": 7.054806345297815e-08,
"loss": 1.8943,
"step": 911
},
{
"epoch": 0.9712460063897763,
"grad_norm": 9.625,
"learning_rate": 6.577006308748579e-08,
"loss": 1.5478,
"step": 912
},
{
"epoch": 0.972310969116081,
"grad_norm": 9.0625,
"learning_rate": 6.115922989167855e-08,
"loss": 1.7652,
"step": 913
},
{
"epoch": 0.9733759318423855,
"grad_norm": 9.625,
"learning_rate": 5.6715615477188064e-08,
"loss": 1.5531,
"step": 914
},
{
"epoch": 0.9744408945686901,
"grad_norm": 9.4375,
"learning_rate": 5.243926958386658e-08,
"loss": 1.4704,
"step": 915
},
{
"epoch": 0.9755058572949947,
"grad_norm": 11.1875,
"learning_rate": 4.833024007924236e-08,
"loss": 1.9338,
"step": 916
},
{
"epoch": 0.9765708200212992,
"grad_norm": 9.5625,
"learning_rate": 4.438857295797516e-08,
"loss": 1.3931,
"step": 917
},
{
"epoch": 0.9776357827476039,
"grad_norm": 8.9375,
"learning_rate": 4.0614312341346604e-08,
"loss": 1.7876,
"step": 918
},
{
"epoch": 0.9787007454739084,
"grad_norm": 9.4375,
"learning_rate": 3.7007500476757274e-08,
"loss": 1.7527,
"step": 919
},
{
"epoch": 0.979765708200213,
"grad_norm": 10.125,
"learning_rate": 3.356817773727039e-08,
"loss": 1.3077,
"step": 920
},
{
"epoch": 0.9808306709265175,
"grad_norm": 8.4375,
"learning_rate": 3.02963826211422e-08,
"loss": 1.8316,
"step": 921
},
{
"epoch": 0.9818956336528222,
"grad_norm": 14.375,
"learning_rate": 2.7192151751400662e-08,
"loss": 1.0442,
"step": 922
},
{
"epoch": 0.9829605963791267,
"grad_norm": 13.375,
"learning_rate": 2.4255519875434062e-08,
"loss": 1.6917,
"step": 923
},
{
"epoch": 0.9840255591054313,
"grad_norm": 8.9375,
"learning_rate": 2.1486519864604703e-08,
"loss": 1.544,
"step": 924
},
{
"epoch": 0.9850905218317358,
"grad_norm": 9.1875,
"learning_rate": 1.8885182713870853e-08,
"loss": 1.7003,
"step": 925
},
{
"epoch": 0.9861554845580405,
"grad_norm": 13.125,
"learning_rate": 1.6451537541453677e-08,
"loss": 1.3597,
"step": 926
},
{
"epoch": 0.987220447284345,
"grad_norm": 10.25,
"learning_rate": 1.4185611588500847e-08,
"loss": 1.5915,
"step": 927
},
{
"epoch": 0.9882854100106496,
"grad_norm": 9.3125,
"learning_rate": 1.2087430218786776e-08,
"loss": 1.6792,
"step": 928
},
{
"epoch": 0.9893503727369543,
"grad_norm": 10.0,
"learning_rate": 1.0157016918426188e-08,
"loss": 1.3929,
"step": 929
},
{
"epoch": 0.9904153354632588,
"grad_norm": 8.5,
"learning_rate": 8.39439329561098e-09,
"loss": 1.5281,
"step": 930
},
{
"epoch": 0.9914802981895634,
"grad_norm": 8.0625,
"learning_rate": 6.799579080372098e-09,
"loss": 1.5907,
"step": 931
},
{
"epoch": 0.9925452609158679,
"grad_norm": 9.0,
"learning_rate": 5.372592124354703e-09,
"loss": 1.7476,
"step": 932
},
{
"epoch": 0.9936102236421726,
"grad_norm": 10.375,
"learning_rate": 4.113448400621667e-09,
"loss": 1.5377,
"step": 933
},
{
"epoch": 0.9946751863684771,
"grad_norm": 9.5625,
"learning_rate": 3.0221620034687203e-09,
"loss": 1.26,
"step": 934
},
{
"epoch": 0.9957401490947817,
"grad_norm": 9.625,
"learning_rate": 2.0987451482762376e-09,
"loss": 1.2999,
"step": 935
},
{
"epoch": 0.9968051118210862,
"grad_norm": 9.9375,
"learning_rate": 1.3432081713626865e-09,
"loss": 1.7814,
"step": 936
},
{
"epoch": 0.9978700745473909,
"grad_norm": 11.125,
"learning_rate": 7.555595298747165e-10,
"loss": 1.4365,
"step": 937
},
{
"epoch": 0.9989350372736954,
"grad_norm": 13.875,
"learning_rate": 3.3580580169223494e-10,
"loss": 1.4393,
"step": 938
},
{
"epoch": 1.0,
"grad_norm": 37.5,
"learning_rate": 8.395168535180187e-11,
"loss": 1.3917,
"step": 939
},
{
"epoch": 1.0,
"step": 939,
"total_flos": 1.9193025942808166e+17,
"train_loss": 1.628197498976613,
"train_runtime": 2231.2795,
"train_samples_per_second": 6.728,
"train_steps_per_second": 0.421
}
],
"logging_steps": 1.0,
"max_steps": 939,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.9193025942808166e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}