sapphire-12b-subseq-perseq / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 37,
"global_step": 294,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003401360544217687,
"grad_norm": 106.7094005171616,
"learning_rate": 0.0,
"loss": 2.9268,
"step": 1
},
{
"epoch": 0.003401360544217687,
"eval_loss": 2.5302913188934326,
"eval_runtime": 3.7953,
"eval_samples_per_second": 14.492,
"eval_steps_per_second": 1.054,
"step": 1
},
{
"epoch": 0.006802721088435374,
"grad_norm": 57.97506009705182,
"learning_rate": 6.89655172413793e-08,
"loss": 2.0122,
"step": 2
},
{
"epoch": 0.01020408163265306,
"grad_norm": 116.23413141145363,
"learning_rate": 1.379310344827586e-07,
"loss": 2.6743,
"step": 3
},
{
"epoch": 0.013605442176870748,
"grad_norm": 21.262801374024775,
"learning_rate": 2.0689655172413793e-07,
"loss": 2.0743,
"step": 4
},
{
"epoch": 0.017006802721088437,
"grad_norm": 59.319984755304056,
"learning_rate": 2.758620689655172e-07,
"loss": 2.2775,
"step": 5
},
{
"epoch": 0.02040816326530612,
"grad_norm": 159.51320885432614,
"learning_rate": 3.4482758620689656e-07,
"loss": 2.1337,
"step": 6
},
{
"epoch": 0.023809523809523808,
"grad_norm": 87.93970940325055,
"learning_rate": 4.1379310344827586e-07,
"loss": 1.9061,
"step": 7
},
{
"epoch": 0.027210884353741496,
"grad_norm": 61.133777808660895,
"learning_rate": 4.827586206896552e-07,
"loss": 1.8118,
"step": 8
},
{
"epoch": 0.030612244897959183,
"grad_norm": 48.65887299035499,
"learning_rate": 5.517241379310344e-07,
"loss": 3.4095,
"step": 9
},
{
"epoch": 0.034013605442176874,
"grad_norm": 30.592687909719288,
"learning_rate": 6.206896551724138e-07,
"loss": 2.2398,
"step": 10
},
{
"epoch": 0.03741496598639456,
"grad_norm": 74.15295766799099,
"learning_rate": 6.896551724137931e-07,
"loss": 3.4425,
"step": 11
},
{
"epoch": 0.04081632653061224,
"grad_norm": 34.94892634385338,
"learning_rate": 7.586206896551724e-07,
"loss": 2.5405,
"step": 12
},
{
"epoch": 0.04421768707482993,
"grad_norm": 26.538521745061775,
"learning_rate": 8.275862068965517e-07,
"loss": 1.9614,
"step": 13
},
{
"epoch": 0.047619047619047616,
"grad_norm": 52.23979896259082,
"learning_rate": 8.96551724137931e-07,
"loss": 2.9785,
"step": 14
},
{
"epoch": 0.05102040816326531,
"grad_norm": 30.812143999051266,
"learning_rate": 9.655172413793103e-07,
"loss": 2.0185,
"step": 15
},
{
"epoch": 0.05442176870748299,
"grad_norm": 41.48478088374125,
"learning_rate": 1.0344827586206896e-06,
"loss": 2.1126,
"step": 16
},
{
"epoch": 0.05782312925170068,
"grad_norm": 29.347588210089675,
"learning_rate": 1.1034482758620688e-06,
"loss": 2.2078,
"step": 17
},
{
"epoch": 0.061224489795918366,
"grad_norm": 28.947554594850924,
"learning_rate": 1.172413793103448e-06,
"loss": 2.442,
"step": 18
},
{
"epoch": 0.06462585034013606,
"grad_norm": 32.28592513881342,
"learning_rate": 1.2413793103448275e-06,
"loss": 2.8683,
"step": 19
},
{
"epoch": 0.06802721088435375,
"grad_norm": 38.97631997775744,
"learning_rate": 1.3103448275862068e-06,
"loss": 2.4376,
"step": 20
},
{
"epoch": 0.07142857142857142,
"grad_norm": 43.775478156068516,
"learning_rate": 1.3793103448275862e-06,
"loss": 2.4167,
"step": 21
},
{
"epoch": 0.07482993197278912,
"grad_norm": 30.904260805899465,
"learning_rate": 1.4482758620689655e-06,
"loss": 2.6971,
"step": 22
},
{
"epoch": 0.0782312925170068,
"grad_norm": 48.202871069183985,
"learning_rate": 1.5172413793103447e-06,
"loss": 2.5093,
"step": 23
},
{
"epoch": 0.08163265306122448,
"grad_norm": 55.067186300198706,
"learning_rate": 1.5862068965517242e-06,
"loss": 2.0053,
"step": 24
},
{
"epoch": 0.08503401360544217,
"grad_norm": 38.486811757681096,
"learning_rate": 1.6551724137931035e-06,
"loss": 2.2475,
"step": 25
},
{
"epoch": 0.08843537414965986,
"grad_norm": 90.78568630900098,
"learning_rate": 1.7241379310344825e-06,
"loss": 3.8342,
"step": 26
},
{
"epoch": 0.09183673469387756,
"grad_norm": 23.32050516158788,
"learning_rate": 1.793103448275862e-06,
"loss": 2.2496,
"step": 27
},
{
"epoch": 0.09523809523809523,
"grad_norm": 25.01047005218693,
"learning_rate": 1.8620689655172412e-06,
"loss": 2.6991,
"step": 28
},
{
"epoch": 0.09863945578231292,
"grad_norm": 27.40209208002175,
"learning_rate": 1.9310344827586207e-06,
"loss": 2.7017,
"step": 29
},
{
"epoch": 0.10204081632653061,
"grad_norm": 16.372774250078056,
"learning_rate": 2e-06,
"loss": 2.1315,
"step": 30
},
{
"epoch": 0.1054421768707483,
"grad_norm": 34.32100924763162,
"learning_rate": 1.999984207714351e-06,
"loss": 2.4298,
"step": 31
},
{
"epoch": 0.10884353741496598,
"grad_norm": 49.15042168439896,
"learning_rate": 1.9999368313561964e-06,
"loss": 3.1687,
"step": 32
},
{
"epoch": 0.11224489795918367,
"grad_norm": 27.553221322487154,
"learning_rate": 1.9998578724218984e-06,
"loss": 2.307,
"step": 33
},
{
"epoch": 0.11564625850340136,
"grad_norm": 25.29898708562965,
"learning_rate": 1.999747333405341e-06,
"loss": 2.6711,
"step": 34
},
{
"epoch": 0.11904761904761904,
"grad_norm": 35.13639034121329,
"learning_rate": 1.9996052177978517e-06,
"loss": 2.2923,
"step": 35
},
{
"epoch": 0.12244897959183673,
"grad_norm": 61.904951168823246,
"learning_rate": 1.999431530088091e-06,
"loss": 3.0837,
"step": 36
},
{
"epoch": 0.12585034013605442,
"grad_norm": 43.72931173152359,
"learning_rate": 1.9992262757619108e-06,
"loss": 2.9055,
"step": 37
},
{
"epoch": 0.12585034013605442,
"eval_loss": 2.2881884574890137,
"eval_runtime": 3.7387,
"eval_samples_per_second": 14.711,
"eval_steps_per_second": 1.07,
"step": 37
},
{
"epoch": 0.1292517006802721,
"grad_norm": 75.128224809043,
"learning_rate": 1.9989894613021807e-06,
"loss": 3.9717,
"step": 38
},
{
"epoch": 0.1326530612244898,
"grad_norm": 6.423556290490496,
"learning_rate": 1.998721094188584e-06,
"loss": 1.6634,
"step": 39
},
{
"epoch": 0.1360544217687075,
"grad_norm": 8.952452652609857,
"learning_rate": 1.9984211828973816e-06,
"loss": 2.1183,
"step": 40
},
{
"epoch": 0.13945578231292516,
"grad_norm": 12.837161899787583,
"learning_rate": 1.998089736901142e-06,
"loss": 2.1306,
"step": 41
},
{
"epoch": 0.14285714285714285,
"grad_norm": 7.2779063942957825,
"learning_rate": 1.9977267666684456e-06,
"loss": 1.9831,
"step": 42
},
{
"epoch": 0.14625850340136054,
"grad_norm": 30.288569770228293,
"learning_rate": 1.9973322836635515e-06,
"loss": 2.1869,
"step": 43
},
{
"epoch": 0.14965986394557823,
"grad_norm": 11.672608976353168,
"learning_rate": 1.996906300346036e-06,
"loss": 1.9566,
"step": 44
},
{
"epoch": 0.15306122448979592,
"grad_norm": 14.837719065187358,
"learning_rate": 1.9964488301704e-06,
"loss": 2.2152,
"step": 45
},
{
"epoch": 0.1564625850340136,
"grad_norm": 18.558600033713702,
"learning_rate": 1.9959598875856427e-06,
"loss": 2.06,
"step": 46
},
{
"epoch": 0.1598639455782313,
"grad_norm": 17.161073648503006,
"learning_rate": 1.995439488034806e-06,
"loss": 2.0463,
"step": 47
},
{
"epoch": 0.16326530612244897,
"grad_norm": 10.944090642041195,
"learning_rate": 1.994887647954486e-06,
"loss": 1.9676,
"step": 48
},
{
"epoch": 0.16666666666666666,
"grad_norm": 30.260773919516463,
"learning_rate": 1.9943043847743164e-06,
"loss": 2.4235,
"step": 49
},
{
"epoch": 0.17006802721088435,
"grad_norm": 17.95874457178673,
"learning_rate": 1.9936897169164135e-06,
"loss": 2.4211,
"step": 50
},
{
"epoch": 0.17346938775510204,
"grad_norm": 29.32804844947439,
"learning_rate": 1.993043663794799e-06,
"loss": 2.2786,
"step": 51
},
{
"epoch": 0.17687074829931973,
"grad_norm": 31.224760731119037,
"learning_rate": 1.9923662458147826e-06,
"loss": 2.8374,
"step": 52
},
{
"epoch": 0.18027210884353742,
"grad_norm": 4.5045539325043205,
"learning_rate": 1.9916574843723217e-06,
"loss": 1.6301,
"step": 53
},
{
"epoch": 0.1836734693877551,
"grad_norm": 10.827050277516674,
"learning_rate": 1.9909174018533427e-06,
"loss": 2.0554,
"step": 54
},
{
"epoch": 0.1870748299319728,
"grad_norm": 17.063187262605883,
"learning_rate": 1.990146021633034e-06,
"loss": 2.4202,
"step": 55
},
{
"epoch": 0.19047619047619047,
"grad_norm": 3.946679947433292,
"learning_rate": 1.98934336807511e-06,
"loss": 1.7808,
"step": 56
},
{
"epoch": 0.19387755102040816,
"grad_norm": 8.431222224384186,
"learning_rate": 1.9885094665310388e-06,
"loss": 1.7766,
"step": 57
},
{
"epoch": 0.19727891156462585,
"grad_norm": 32.28667139462841,
"learning_rate": 1.9876443433392433e-06,
"loss": 2.2299,
"step": 58
},
{
"epoch": 0.20068027210884354,
"grad_norm": 11.950555724182584,
"learning_rate": 1.986748025824268e-06,
"loss": 1.928,
"step": 59
},
{
"epoch": 0.20408163265306123,
"grad_norm": 3.6059136679066977,
"learning_rate": 1.985820542295918e-06,
"loss": 1.7761,
"step": 60
},
{
"epoch": 0.20748299319727892,
"grad_norm": 41.40947345983446,
"learning_rate": 1.984861922048363e-06,
"loss": 2.6704,
"step": 61
},
{
"epoch": 0.2108843537414966,
"grad_norm": 30.634237938465816,
"learning_rate": 1.983872195359212e-06,
"loss": 2.7336,
"step": 62
},
{
"epoch": 0.21428571428571427,
"grad_norm": 3.760013022701194,
"learning_rate": 1.9828513934885587e-06,
"loss": 1.8831,
"step": 63
},
{
"epoch": 0.21768707482993196,
"grad_norm": 37.34059674722221,
"learning_rate": 1.981799548677993e-06,
"loss": 2.27,
"step": 64
},
{
"epoch": 0.22108843537414966,
"grad_norm": 11.009700618421736,
"learning_rate": 1.980716694149581e-06,
"loss": 1.9265,
"step": 65
},
{
"epoch": 0.22448979591836735,
"grad_norm": 17.609147027884987,
"learning_rate": 1.9796028641048194e-06,
"loss": 2.3411,
"step": 66
},
{
"epoch": 0.22789115646258504,
"grad_norm": 17.432142291951372,
"learning_rate": 1.978458093723553e-06,
"loss": 2.2213,
"step": 67
},
{
"epoch": 0.23129251700680273,
"grad_norm": 14.11664326231067,
"learning_rate": 1.9772824191628632e-06,
"loss": 2.0831,
"step": 68
},
{
"epoch": 0.23469387755102042,
"grad_norm": 37.456025944063875,
"learning_rate": 1.9760758775559273e-06,
"loss": 2.7494,
"step": 69
},
{
"epoch": 0.23809523809523808,
"grad_norm": 16.30994509129653,
"learning_rate": 1.974838507010844e-06,
"loss": 2.118,
"step": 70
},
{
"epoch": 0.24149659863945577,
"grad_norm": 25.92468917111241,
"learning_rate": 1.9735703466094324e-06,
"loss": 2.1656,
"step": 71
},
{
"epoch": 0.24489795918367346,
"grad_norm": 17.23253832018251,
"learning_rate": 1.972271436405994e-06,
"loss": 2.0787,
"step": 72
},
{
"epoch": 0.24829931972789115,
"grad_norm": 6.286286593272188,
"learning_rate": 1.970941817426052e-06,
"loss": 1.7458,
"step": 73
},
{
"epoch": 0.25170068027210885,
"grad_norm": 20.87004487229478,
"learning_rate": 1.969581531665051e-06,
"loss": 2.364,
"step": 74
},
{
"epoch": 0.25170068027210885,
"eval_loss": 2.240875482559204,
"eval_runtime": 3.7328,
"eval_samples_per_second": 14.734,
"eval_steps_per_second": 1.072,
"step": 74
},
{
"epoch": 0.25510204081632654,
"grad_norm": 22.83815781491435,
"learning_rate": 1.968190622087034e-06,
"loss": 2.2176,
"step": 75
},
{
"epoch": 0.2585034013605442,
"grad_norm": 39.2204163613504,
"learning_rate": 1.9667691326232835e-06,
"loss": 2.605,
"step": 76
},
{
"epoch": 0.2619047619047619,
"grad_norm": 9.599486970591897,
"learning_rate": 1.965317108170935e-06,
"loss": 2.1652,
"step": 77
},
{
"epoch": 0.2653061224489796,
"grad_norm": 3.7571781853463175,
"learning_rate": 1.9638345945915586e-06,
"loss": 1.6055,
"step": 78
},
{
"epoch": 0.2687074829931973,
"grad_norm": 7.064670527473922,
"learning_rate": 1.962321638709709e-06,
"loss": 1.9937,
"step": 79
},
{
"epoch": 0.272108843537415,
"grad_norm": 28.207901160479654,
"learning_rate": 1.9607782883114506e-06,
"loss": 2.2552,
"step": 80
},
{
"epoch": 0.2755102040816326,
"grad_norm": 15.991872570963396,
"learning_rate": 1.959204592142843e-06,
"loss": 2.1559,
"step": 81
},
{
"epoch": 0.2789115646258503,
"grad_norm": 13.401822104278665,
"learning_rate": 1.957600599908406e-06,
"loss": 2.1652,
"step": 82
},
{
"epoch": 0.282312925170068,
"grad_norm": 14.708704691038701,
"learning_rate": 1.9559663622695455e-06,
"loss": 1.9673,
"step": 83
},
{
"epoch": 0.2857142857142857,
"grad_norm": 3.3458550475032105,
"learning_rate": 1.954301930842958e-06,
"loss": 1.6917,
"step": 84
},
{
"epoch": 0.2891156462585034,
"grad_norm": 3.479853146114766,
"learning_rate": 1.9526073581989955e-06,
"loss": 1.624,
"step": 85
},
{
"epoch": 0.2925170068027211,
"grad_norm": 25.10854427551898,
"learning_rate": 1.950882697860009e-06,
"loss": 2.3626,
"step": 86
},
{
"epoch": 0.29591836734693877,
"grad_norm": 14.389114459997433,
"learning_rate": 1.9491280042986562e-06,
"loss": 2.0549,
"step": 87
},
{
"epoch": 0.29931972789115646,
"grad_norm": 17.72897272235088,
"learning_rate": 1.9473433329361802e-06,
"loss": 2.4525,
"step": 88
},
{
"epoch": 0.30272108843537415,
"grad_norm": 8.212788560084723,
"learning_rate": 1.945528740140662e-06,
"loss": 2.1368,
"step": 89
},
{
"epoch": 0.30612244897959184,
"grad_norm": 26.76274867022125,
"learning_rate": 1.943684283225236e-06,
"loss": 2.3735,
"step": 90
},
{
"epoch": 0.30952380952380953,
"grad_norm": 23.71630229663243,
"learning_rate": 1.941810020446284e-06,
"loss": 2.6005,
"step": 91
},
{
"epoch": 0.3129251700680272,
"grad_norm": 22.889738702248234,
"learning_rate": 1.9399060110015917e-06,
"loss": 2.6924,
"step": 92
},
{
"epoch": 0.3163265306122449,
"grad_norm": 32.54631787971477,
"learning_rate": 1.9379723150284814e-06,
"loss": 2.5301,
"step": 93
},
{
"epoch": 0.3197278911564626,
"grad_norm": 3.6877224549117344,
"learning_rate": 1.936008993601912e-06,
"loss": 1.6556,
"step": 94
},
{
"epoch": 0.3231292517006803,
"grad_norm": 33.682920637388364,
"learning_rate": 1.934016108732548e-06,
"loss": 2.3709,
"step": 95
},
{
"epoch": 0.32653061224489793,
"grad_norm": 19.342157148675135,
"learning_rate": 1.9319937233648045e-06,
"loss": 1.8713,
"step": 96
},
{
"epoch": 0.3299319727891156,
"grad_norm": 36.9446891807536,
"learning_rate": 1.929941901374856e-06,
"loss": 3.1666,
"step": 97
},
{
"epoch": 0.3333333333333333,
"grad_norm": 12.769242612326224,
"learning_rate": 1.9278607075686205e-06,
"loss": 2.2024,
"step": 98
},
{
"epoch": 0.336734693877551,
"grad_norm": 7.569149644914372,
"learning_rate": 1.9257502076797123e-06,
"loss": 1.8434,
"step": 99
},
{
"epoch": 0.3401360544217687,
"grad_norm": 18.672166864254265,
"learning_rate": 1.9236104683673653e-06,
"loss": 2.6262,
"step": 100
},
{
"epoch": 0.3435374149659864,
"grad_norm": 7.251393661314555,
"learning_rate": 1.9214415572143284e-06,
"loss": 1.8447,
"step": 101
},
{
"epoch": 0.3469387755102041,
"grad_norm": 25.8588617341962,
"learning_rate": 1.919243542724731e-06,
"loss": 2.3528,
"step": 102
},
{
"epoch": 0.35034013605442177,
"grad_norm": 21.00339285362203,
"learning_rate": 1.917016494321918e-06,
"loss": 2.462,
"step": 103
},
{
"epoch": 0.35374149659863946,
"grad_norm": 19.533037226832878,
"learning_rate": 1.9147604823462585e-06,
"loss": 2.3057,
"step": 104
},
{
"epoch": 0.35714285714285715,
"grad_norm": 3.1087327492999286,
"learning_rate": 1.9124755780529243e-06,
"loss": 1.6935,
"step": 105
},
{
"epoch": 0.36054421768707484,
"grad_norm": 35.707396347148176,
"learning_rate": 1.910161853609637e-06,
"loss": 2.3652,
"step": 106
},
{
"epoch": 0.36394557823129253,
"grad_norm": 16.694934440145225,
"learning_rate": 1.9078193820943916e-06,
"loss": 2.6014,
"step": 107
},
{
"epoch": 0.3673469387755102,
"grad_norm": 12.946146725042743,
"learning_rate": 1.9054482374931466e-06,
"loss": 1.9379,
"step": 108
},
{
"epoch": 0.3707482993197279,
"grad_norm": 8.740650008889842,
"learning_rate": 1.9030484946974878e-06,
"loss": 1.9414,
"step": 109
},
{
"epoch": 0.3741496598639456,
"grad_norm": 23.13581690576701,
"learning_rate": 1.9006202295022629e-06,
"loss": 2.4563,
"step": 110
},
{
"epoch": 0.37755102040816324,
"grad_norm": 10.00026809536462,
"learning_rate": 1.8981635186031869e-06,
"loss": 1.8384,
"step": 111
},
{
"epoch": 0.37755102040816324,
"eval_loss": 2.2185332775115967,
"eval_runtime": 3.7603,
"eval_samples_per_second": 14.626,
"eval_steps_per_second": 1.064,
"step": 111
},
{
"epoch": 0.38095238095238093,
"grad_norm": 26.376801704138895,
"learning_rate": 1.89567843959442e-06,
"loss": 3.095,
"step": 112
},
{
"epoch": 0.3843537414965986,
"grad_norm": 31.801160647661863,
"learning_rate": 1.8931650709661176e-06,
"loss": 2.4186,
"step": 113
},
{
"epoch": 0.3877551020408163,
"grad_norm": 3.7202396333724406,
"learning_rate": 1.8906234921019504e-06,
"loss": 1.8483,
"step": 114
},
{
"epoch": 0.391156462585034,
"grad_norm": 20.22060079238643,
"learning_rate": 1.8880537832765975e-06,
"loss": 2.1247,
"step": 115
},
{
"epoch": 0.3945578231292517,
"grad_norm": 29.233218070907714,
"learning_rate": 1.8854560256532098e-06,
"loss": 2.3962,
"step": 116
},
{
"epoch": 0.3979591836734694,
"grad_norm": 12.311196195760077,
"learning_rate": 1.882830301280849e-06,
"loss": 1.9291,
"step": 117
},
{
"epoch": 0.4013605442176871,
"grad_norm": 24.022251844658836,
"learning_rate": 1.880176693091893e-06,
"loss": 2.0967,
"step": 118
},
{
"epoch": 0.40476190476190477,
"grad_norm": 15.5145598820515,
"learning_rate": 1.8774952848994193e-06,
"loss": 2.0164,
"step": 119
},
{
"epoch": 0.40816326530612246,
"grad_norm": 18.669552144287866,
"learning_rate": 1.874786161394556e-06,
"loss": 1.9074,
"step": 120
},
{
"epoch": 0.41156462585034015,
"grad_norm": 20.221669243742017,
"learning_rate": 1.8720494081438077e-06,
"loss": 2.0693,
"step": 121
},
{
"epoch": 0.41496598639455784,
"grad_norm": 40.16853982486705,
"learning_rate": 1.8692851115863521e-06,
"loss": 2.7133,
"step": 122
},
{
"epoch": 0.41836734693877553,
"grad_norm": 28.130765299643805,
"learning_rate": 1.8664933590313116e-06,
"loss": 2.3678,
"step": 123
},
{
"epoch": 0.4217687074829932,
"grad_norm": 3.285521259165442,
"learning_rate": 1.8636742386549936e-06,
"loss": 1.643,
"step": 124
},
{
"epoch": 0.42517006802721086,
"grad_norm": 14.918765530830019,
"learning_rate": 1.8608278394981065e-06,
"loss": 2.2832,
"step": 125
},
{
"epoch": 0.42857142857142855,
"grad_norm": 3.221047286582191,
"learning_rate": 1.8579542514629471e-06,
"loss": 1.7598,
"step": 126
},
{
"epoch": 0.43197278911564624,
"grad_norm": 30.02563146393063,
"learning_rate": 1.8550535653105621e-06,
"loss": 2.2684,
"step": 127
},
{
"epoch": 0.43537414965986393,
"grad_norm": 14.894051195947721,
"learning_rate": 1.8521258726578802e-06,
"loss": 2.2898,
"step": 128
},
{
"epoch": 0.4387755102040816,
"grad_norm": 31.346174242632404,
"learning_rate": 1.849171265974818e-06,
"loss": 2.4443,
"step": 129
},
{
"epoch": 0.4421768707482993,
"grad_norm": 18.396976082720574,
"learning_rate": 1.846189838581362e-06,
"loss": 2.4081,
"step": 130
},
{
"epoch": 0.445578231292517,
"grad_norm": 11.300098238275778,
"learning_rate": 1.843181684644617e-06,
"loss": 1.9707,
"step": 131
},
{
"epoch": 0.4489795918367347,
"grad_norm": 9.311622064720812,
"learning_rate": 1.8401468991758364e-06,
"loss": 2.0055,
"step": 132
},
{
"epoch": 0.4523809523809524,
"grad_norm": 17.268118260619143,
"learning_rate": 1.837085578027418e-06,
"loss": 2.1029,
"step": 133
},
{
"epoch": 0.4557823129251701,
"grad_norm": 13.534018757700077,
"learning_rate": 1.833997817889878e-06,
"loss": 1.6714,
"step": 134
},
{
"epoch": 0.45918367346938777,
"grad_norm": 25.67291091851184,
"learning_rate": 1.8308837162887962e-06,
"loss": 2.0809,
"step": 135
},
{
"epoch": 0.46258503401360546,
"grad_norm": 16.78554391811326,
"learning_rate": 1.827743371581737e-06,
"loss": 2.095,
"step": 136
},
{
"epoch": 0.46598639455782315,
"grad_norm": 7.0895304724541175,
"learning_rate": 1.8245768829551415e-06,
"loss": 2.0924,
"step": 137
},
{
"epoch": 0.46938775510204084,
"grad_norm": 28.325113542255774,
"learning_rate": 1.8213843504211956e-06,
"loss": 2.2312,
"step": 138
},
{
"epoch": 0.47278911564625853,
"grad_norm": 19.627621449351967,
"learning_rate": 1.8181658748146709e-06,
"loss": 2.1092,
"step": 139
},
{
"epoch": 0.47619047619047616,
"grad_norm": 3.253642214201976,
"learning_rate": 1.8149215577897394e-06,
"loss": 1.8119,
"step": 140
},
{
"epoch": 0.47959183673469385,
"grad_norm": 22.194249754011054,
"learning_rate": 1.8116515018167635e-06,
"loss": 1.8086,
"step": 141
},
{
"epoch": 0.48299319727891155,
"grad_norm": 3.291628206622755,
"learning_rate": 1.8083558101790595e-06,
"loss": 1.6961,
"step": 142
},
{
"epoch": 0.48639455782312924,
"grad_norm": 30.333797331495706,
"learning_rate": 1.8050345869696346e-06,
"loss": 2.4649,
"step": 143
},
{
"epoch": 0.4897959183673469,
"grad_norm": 35.46381155966904,
"learning_rate": 1.8016879370879004e-06,
"loss": 2.375,
"step": 144
},
{
"epoch": 0.4931972789115646,
"grad_norm": 10.065027530577671,
"learning_rate": 1.798315966236358e-06,
"loss": 1.7088,
"step": 145
},
{
"epoch": 0.4965986394557823,
"grad_norm": 31.969238069641904,
"learning_rate": 1.794918780917262e-06,
"loss": 2.2722,
"step": 146
},
{
"epoch": 0.5,
"grad_norm": 3.1706943713916287,
"learning_rate": 1.791496488429254e-06,
"loss": 1.5129,
"step": 147
},
{
"epoch": 0.5034013605442177,
"grad_norm": 40.129409477941664,
"learning_rate": 1.7880491968639751e-06,
"loss": 2.8429,
"step": 148
},
{
"epoch": 0.5034013605442177,
"eval_loss": 2.2053215503692627,
"eval_runtime": 3.8702,
"eval_samples_per_second": 14.211,
"eval_steps_per_second": 1.034,
"step": 148
},
{
"epoch": 0.5068027210884354,
"grad_norm": 26.985890370710862,
"learning_rate": 1.7845770151026513e-06,
"loss": 2.3221,
"step": 149
},
{
"epoch": 0.5102040816326531,
"grad_norm": 34.746114296368646,
"learning_rate": 1.7810800528126553e-06,
"loss": 2.3499,
"step": 150
},
{
"epoch": 0.5136054421768708,
"grad_norm": 3.902076154967714,
"learning_rate": 1.7775584204440416e-06,
"loss": 1.7411,
"step": 151
},
{
"epoch": 0.5170068027210885,
"grad_norm": 27.80193827038684,
"learning_rate": 1.7740122292260594e-06,
"loss": 2.2895,
"step": 152
},
{
"epoch": 0.5204081632653061,
"grad_norm": 3.4114906810600685,
"learning_rate": 1.7704415911636375e-06,
"loss": 1.5119,
"step": 153
},
{
"epoch": 0.5238095238095238,
"grad_norm": 9.505522369554297,
"learning_rate": 1.7668466190338483e-06,
"loss": 1.844,
"step": 154
},
{
"epoch": 0.5272108843537415,
"grad_norm": 36.46998151934392,
"learning_rate": 1.7632274263823457e-06,
"loss": 2.4713,
"step": 155
},
{
"epoch": 0.5306122448979592,
"grad_norm": 17.765108257489125,
"learning_rate": 1.759584127519778e-06,
"loss": 2.2811,
"step": 156
},
{
"epoch": 0.5340136054421769,
"grad_norm": 14.148223114236801,
"learning_rate": 1.7559168375181775e-06,
"loss": 1.8442,
"step": 157
},
{
"epoch": 0.5374149659863946,
"grad_norm": 9.76402372234183,
"learning_rate": 1.7522256722073273e-06,
"loss": 1.8945,
"step": 158
},
{
"epoch": 0.5408163265306123,
"grad_norm": 16.450896799860217,
"learning_rate": 1.748510748171101e-06,
"loss": 1.9574,
"step": 159
},
{
"epoch": 0.54421768707483,
"grad_norm": 3.912613042056259,
"learning_rate": 1.7447721827437819e-06,
"loss": 1.6032,
"step": 160
},
{
"epoch": 0.5476190476190477,
"grad_norm": 33.305605159021646,
"learning_rate": 1.7410100940063558e-06,
"loss": 2.4057,
"step": 161
},
{
"epoch": 0.5510204081632653,
"grad_norm": 38.319973023280475,
"learning_rate": 1.7372246007827833e-06,
"loss": 2.5925,
"step": 162
},
{
"epoch": 0.5544217687074829,
"grad_norm": 17.216523524482163,
"learning_rate": 1.7334158226362446e-06,
"loss": 2.0324,
"step": 163
},
{
"epoch": 0.5578231292517006,
"grad_norm": 4.9862323362748535,
"learning_rate": 1.7295838798653649e-06,
"loss": 1.7436,
"step": 164
},
{
"epoch": 0.5612244897959183,
"grad_norm": 4.0759355613648625,
"learning_rate": 1.7257288935004132e-06,
"loss": 1.7034,
"step": 165
},
{
"epoch": 0.564625850340136,
"grad_norm": 16.519960341878562,
"learning_rate": 1.7218509852994822e-06,
"loss": 2.115,
"step": 166
},
{
"epoch": 0.5680272108843537,
"grad_norm": 17.37824200525593,
"learning_rate": 1.7179502777446392e-06,
"loss": 2.0609,
"step": 167
},
{
"epoch": 0.5714285714285714,
"grad_norm": 39.604264809847564,
"learning_rate": 1.7140268940380605e-06,
"loss": 2.3861,
"step": 168
},
{
"epoch": 0.5748299319727891,
"grad_norm": 17.489048911326037,
"learning_rate": 1.7100809580981384e-06,
"loss": 1.9979,
"step": 169
},
{
"epoch": 0.5782312925170068,
"grad_norm": 6.642641185839537,
"learning_rate": 1.7061125945555679e-06,
"loss": 1.7533,
"step": 170
},
{
"epoch": 0.5816326530612245,
"grad_norm": 41.437166409250736,
"learning_rate": 1.70212192874941e-06,
"loss": 2.8676,
"step": 171
},
{
"epoch": 0.5850340136054422,
"grad_norm": 12.285090452877482,
"learning_rate": 1.6981090867231336e-06,
"loss": 1.8715,
"step": 172
},
{
"epoch": 0.5884353741496599,
"grad_norm": 20.351266920257437,
"learning_rate": 1.694074195220634e-06,
"loss": 2.5238,
"step": 173
},
{
"epoch": 0.5918367346938775,
"grad_norm": 13.128678816386138,
"learning_rate": 1.6900173816822289e-06,
"loss": 1.7191,
"step": 174
},
{
"epoch": 0.5952380952380952,
"grad_norm": 3.1331026154409565,
"learning_rate": 1.6859387742406358e-06,
"loss": 1.7885,
"step": 175
},
{
"epoch": 0.5986394557823129,
"grad_norm": 12.273944679120639,
"learning_rate": 1.6818385017169212e-06,
"loss": 1.9361,
"step": 176
},
{
"epoch": 0.6020408163265306,
"grad_norm": 18.988287394873876,
"learning_rate": 1.6777166936164354e-06,
"loss": 2.118,
"step": 177
},
{
"epoch": 0.6054421768707483,
"grad_norm": 13.330413347581118,
"learning_rate": 1.6735734801247202e-06,
"loss": 1.9923,
"step": 178
},
{
"epoch": 0.608843537414966,
"grad_norm": 8.528660885149025,
"learning_rate": 1.6694089921033976e-06,
"loss": 1.6938,
"step": 179
},
{
"epoch": 0.6122448979591837,
"grad_norm": 28.049589150374253,
"learning_rate": 1.6652233610860364e-06,
"loss": 2.4092,
"step": 180
},
{
"epoch": 0.6156462585034014,
"grad_norm": 19.077236893577115,
"learning_rate": 1.6610167192739978e-06,
"loss": 2.3235,
"step": 181
},
{
"epoch": 0.6190476190476191,
"grad_norm": 23.109888095114325,
"learning_rate": 1.6567891995322603e-06,
"loss": 2.2678,
"step": 182
},
{
"epoch": 0.6224489795918368,
"grad_norm": 19.456776496200867,
"learning_rate": 1.6525409353852221e-06,
"loss": 2.2764,
"step": 183
},
{
"epoch": 0.6258503401360545,
"grad_norm": 9.82404206796416,
"learning_rate": 1.6482720610124856e-06,
"loss": 1.8034,
"step": 184
},
{
"epoch": 0.6292517006802721,
"grad_norm": 24.2061776724548,
"learning_rate": 1.6439827112446173e-06,
"loss": 2.161,
"step": 185
},
{
"epoch": 0.6292517006802721,
"eval_loss": 2.194326400756836,
"eval_runtime": 3.7428,
"eval_samples_per_second": 14.695,
"eval_steps_per_second": 1.069,
"step": 185
},
{
"epoch": 0.6326530612244898,
"grad_norm": 30.469163171671003,
"learning_rate": 1.6396730215588912e-06,
"loss": 2.2773,
"step": 186
},
{
"epoch": 0.6360544217687075,
"grad_norm": 3.646917584621385,
"learning_rate": 1.6353431280750082e-06,
"loss": 1.5989,
"step": 187
},
{
"epoch": 0.6394557823129252,
"grad_norm": 30.30266588230692,
"learning_rate": 1.6309931675507978e-06,
"loss": 2.6169,
"step": 188
},
{
"epoch": 0.6428571428571429,
"grad_norm": 14.371186117614542,
"learning_rate": 1.6266232773778983e-06,
"loss": 1.9241,
"step": 189
},
{
"epoch": 0.6462585034013606,
"grad_norm": 18.71258411403636,
"learning_rate": 1.6222335955774176e-06,
"loss": 2.1737,
"step": 190
},
{
"epoch": 0.6496598639455783,
"grad_norm": 3.2723339662931585,
"learning_rate": 1.617824260795573e-06,
"loss": 1.8075,
"step": 191
},
{
"epoch": 0.6530612244897959,
"grad_norm": 16.496061968286824,
"learning_rate": 1.6133954122993139e-06,
"loss": 2.0147,
"step": 192
},
{
"epoch": 0.6564625850340136,
"grad_norm": 3.2013079969624805,
"learning_rate": 1.608947189971921e-06,
"loss": 1.6798,
"step": 193
},
{
"epoch": 0.6598639455782312,
"grad_norm": 20.981814890242124,
"learning_rate": 1.6044797343085898e-06,
"loss": 2.0425,
"step": 194
},
{
"epoch": 0.6632653061224489,
"grad_norm": 50.879018823375965,
"learning_rate": 1.599993186411992e-06,
"loss": 3.8504,
"step": 195
},
{
"epoch": 0.6666666666666666,
"grad_norm": 3.283241794235971,
"learning_rate": 1.59548768798782e-06,
"loss": 1.4971,
"step": 196
},
{
"epoch": 0.6700680272108843,
"grad_norm": 12.706772022061763,
"learning_rate": 1.5909633813403092e-06,
"loss": 1.9318,
"step": 197
},
{
"epoch": 0.673469387755102,
"grad_norm": 7.747043673117189,
"learning_rate": 1.5864204093677463e-06,
"loss": 1.8641,
"step": 198
},
{
"epoch": 0.6768707482993197,
"grad_norm": 12.685665761738797,
"learning_rate": 1.5818589155579529e-06,
"loss": 2.0781,
"step": 199
},
{
"epoch": 0.6802721088435374,
"grad_norm": 8.183695796856302,
"learning_rate": 1.5772790439837555e-06,
"loss": 2.1112,
"step": 200
},
{
"epoch": 0.6836734693877551,
"grad_norm": 3.6436475976280605,
"learning_rate": 1.572680939298435e-06,
"loss": 1.504,
"step": 201
},
{
"epoch": 0.6870748299319728,
"grad_norm": 7.765753459491514,
"learning_rate": 1.5680647467311555e-06,
"loss": 1.6113,
"step": 202
},
{
"epoch": 0.6904761904761905,
"grad_norm": 27.059590789587673,
"learning_rate": 1.563430612082382e-06,
"loss": 2.3797,
"step": 203
},
{
"epoch": 0.6938775510204082,
"grad_norm": 17.865181616406808,
"learning_rate": 1.5587786817192687e-06,
"loss": 2.2287,
"step": 204
},
{
"epoch": 0.6972789115646258,
"grad_norm": 11.50437842198177,
"learning_rate": 1.5541091025710434e-06,
"loss": 2.2926,
"step": 205
},
{
"epoch": 0.7006802721088435,
"grad_norm": 18.03962056520961,
"learning_rate": 1.5494220221243607e-06,
"loss": 2.3374,
"step": 206
},
{
"epoch": 0.7040816326530612,
"grad_norm": 19.808732477248256,
"learning_rate": 1.5447175884186478e-06,
"loss": 2.3215,
"step": 207
},
{
"epoch": 0.7074829931972789,
"grad_norm": 21.35228597761302,
"learning_rate": 1.539995950041426e-06,
"loss": 2.2378,
"step": 208
},
{
"epoch": 0.7108843537414966,
"grad_norm": 14.090932946927257,
"learning_rate": 1.5352572561236197e-06,
"loss": 2.22,
"step": 209
},
{
"epoch": 0.7142857142857143,
"grad_norm": 22.22875395969964,
"learning_rate": 1.5305016563348443e-06,
"loss": 2.44,
"step": 210
},
{
"epoch": 0.717687074829932,
"grad_norm": 12.732771656478363,
"learning_rate": 1.5257293008786807e-06,
"loss": 2.0598,
"step": 211
},
{
"epoch": 0.7210884353741497,
"grad_norm": 3.3024595151809777,
"learning_rate": 1.5209403404879303e-06,
"loss": 1.8514,
"step": 212
},
{
"epoch": 0.7244897959183674,
"grad_norm": 31.041628605811148,
"learning_rate": 1.5161349264198535e-06,
"loss": 2.4225,
"step": 213
},
{
"epoch": 0.7278911564625851,
"grad_norm": 11.866017531018645,
"learning_rate": 1.511313210451394e-06,
"loss": 1.9747,
"step": 214
},
{
"epoch": 0.7312925170068028,
"grad_norm": 23.77867996796224,
"learning_rate": 1.5064753448743832e-06,
"loss": 2.0971,
"step": 215
},
{
"epoch": 0.7346938775510204,
"grad_norm": 28.640512428374876,
"learning_rate": 1.5016214824907314e-06,
"loss": 2.2247,
"step": 216
},
{
"epoch": 0.7380952380952381,
"grad_norm": 9.463317499162777,
"learning_rate": 1.4967517766076015e-06,
"loss": 1.9511,
"step": 217
},
{
"epoch": 0.7414965986394558,
"grad_norm": 3.6132074342008336,
"learning_rate": 1.4918663810325659e-06,
"loss": 1.5643,
"step": 218
},
{
"epoch": 0.7448979591836735,
"grad_norm": 12.274299577611806,
"learning_rate": 1.4869654500687492e-06,
"loss": 2.0865,
"step": 219
},
{
"epoch": 0.7482993197278912,
"grad_norm": 9.577269499797044,
"learning_rate": 1.4820491385099555e-06,
"loss": 2.1494,
"step": 220
},
{
"epoch": 0.7517006802721088,
"grad_norm": 13.665325186622818,
"learning_rate": 1.477117601635777e-06,
"loss": 2.0676,
"step": 221
},
{
"epoch": 0.7551020408163265,
"grad_norm": 12.044556166373619,
"learning_rate": 1.4721709952066923e-06,
"loss": 1.7408,
"step": 222
},
{
"epoch": 0.7551020408163265,
"eval_loss": 2.1867611408233643,
"eval_runtime": 3.7388,
"eval_samples_per_second": 14.711,
"eval_steps_per_second": 1.07,
"step": 222
},
{
"epoch": 0.7585034013605442,
"grad_norm": 32.0866216128451,
"learning_rate": 1.4672094754591449e-06,
"loss": 2.6444,
"step": 223
},
{
"epoch": 0.7619047619047619,
"grad_norm": 26.272890838528287,
"learning_rate": 1.4622331991006082e-06,
"loss": 2.0286,
"step": 224
},
{
"epoch": 0.7653061224489796,
"grad_norm": 10.948966043777636,
"learning_rate": 1.4572423233046385e-06,
"loss": 1.8924,
"step": 225
},
{
"epoch": 0.7687074829931972,
"grad_norm": 10.041220633719293,
"learning_rate": 1.4522370057059079e-06,
"loss": 1.8589,
"step": 226
},
{
"epoch": 0.7721088435374149,
"grad_norm": 19.90849856575333,
"learning_rate": 1.447217404395227e-06,
"loss": 2.4632,
"step": 227
},
{
"epoch": 0.7755102040816326,
"grad_norm": 3.3718807752757134,
"learning_rate": 1.4421836779145511e-06,
"loss": 1.7402,
"step": 228
},
{
"epoch": 0.7789115646258503,
"grad_norm": 33.99543346002537,
"learning_rate": 1.4371359852519734e-06,
"loss": 2.9081,
"step": 229
},
{
"epoch": 0.782312925170068,
"grad_norm": 12.446391408704297,
"learning_rate": 1.4320744858367024e-06,
"loss": 2.0828,
"step": 230
},
{
"epoch": 0.7857142857142857,
"grad_norm": 26.19952152880794,
"learning_rate": 1.4269993395340277e-06,
"loss": 2.2178,
"step": 231
},
{
"epoch": 0.7891156462585034,
"grad_norm": 36.07799078718175,
"learning_rate": 1.4219107066402692e-06,
"loss": 2.6926,
"step": 232
},
{
"epoch": 0.7925170068027211,
"grad_norm": 11.216785179837261,
"learning_rate": 1.4168087478777152e-06,
"loss": 2.0393,
"step": 233
},
{
"epoch": 0.7959183673469388,
"grad_norm": 17.659830496744974,
"learning_rate": 1.4116936243895466e-06,
"loss": 2.1082,
"step": 234
},
{
"epoch": 0.7993197278911565,
"grad_norm": 17.001892765923902,
"learning_rate": 1.406565497734745e-06,
"loss": 1.9051,
"step": 235
},
{
"epoch": 0.8027210884353742,
"grad_norm": 31.896056687773818,
"learning_rate": 1.4014245298829935e-06,
"loss": 2.702,
"step": 236
},
{
"epoch": 0.8061224489795918,
"grad_norm": 6.972810630357569,
"learning_rate": 1.3962708832095568e-06,
"loss": 1.9466,
"step": 237
},
{
"epoch": 0.8095238095238095,
"grad_norm": 17.689383441039308,
"learning_rate": 1.3911047204901558e-06,
"loss": 2.3425,
"step": 238
},
{
"epoch": 0.8129251700680272,
"grad_norm": 16.46834046227904,
"learning_rate": 1.385926204895826e-06,
"loss": 2.1545,
"step": 239
},
{
"epoch": 0.8163265306122449,
"grad_norm": 21.69161139742313,
"learning_rate": 1.3807354999877614e-06,
"loss": 2.3222,
"step": 240
},
{
"epoch": 0.8197278911564626,
"grad_norm": 3.411794366451801,
"learning_rate": 1.3755327697121522e-06,
"loss": 1.6492,
"step": 241
},
{
"epoch": 0.8231292517006803,
"grad_norm": 13.113564486849809,
"learning_rate": 1.3703181783950031e-06,
"loss": 2.0212,
"step": 242
},
{
"epoch": 0.826530612244898,
"grad_norm": 14.798483657902382,
"learning_rate": 1.3650918907369452e-06,
"loss": 2.1974,
"step": 243
},
{
"epoch": 0.8299319727891157,
"grad_norm": 10.19780084250851,
"learning_rate": 1.3598540718080345e-06,
"loss": 1.8543,
"step": 244
},
{
"epoch": 0.8333333333333334,
"grad_norm": 30.023251305313995,
"learning_rate": 1.3546048870425354e-06,
"loss": 2.2387,
"step": 245
},
{
"epoch": 0.8367346938775511,
"grad_norm": 22.321684071392564,
"learning_rate": 1.3493445022336994e-06,
"loss": 2.4305,
"step": 246
},
{
"epoch": 0.8401360544217688,
"grad_norm": 34.98925650288134,
"learning_rate": 1.3440730835285247e-06,
"loss": 2.4364,
"step": 247
},
{
"epoch": 0.8435374149659864,
"grad_norm": 3.161092974878791,
"learning_rate": 1.3387907974225116e-06,
"loss": 1.4885,
"step": 248
},
{
"epoch": 0.8469387755102041,
"grad_norm": 50.11899935337027,
"learning_rate": 1.3334978107544024e-06,
"loss": 2.3332,
"step": 249
},
{
"epoch": 0.8503401360544217,
"grad_norm": 15.05206270554561,
"learning_rate": 1.3281942907009112e-06,
"loss": 2.2131,
"step": 250
},
{
"epoch": 0.8537414965986394,
"grad_norm": 24.869549840961,
"learning_rate": 1.3228804047714462e-06,
"loss": 2.2264,
"step": 251
},
{
"epoch": 0.8571428571428571,
"grad_norm": 16.049594008906414,
"learning_rate": 1.317556320802816e-06,
"loss": 1.7228,
"step": 252
},
{
"epoch": 0.8605442176870748,
"grad_norm": 14.258214783846427,
"learning_rate": 1.31222220695393e-06,
"loss": 1.999,
"step": 253
},
{
"epoch": 0.8639455782312925,
"grad_norm": 3.3063413494205474,
"learning_rate": 1.3068782317004874e-06,
"loss": 1.4607,
"step": 254
},
{
"epoch": 0.8673469387755102,
"grad_norm": 8.831787955552995,
"learning_rate": 1.3015245638296563e-06,
"loss": 2.1192,
"step": 255
},
{
"epoch": 0.8707482993197279,
"grad_norm": 3.121872417027736,
"learning_rate": 1.296161372434741e-06,
"loss": 1.5467,
"step": 256
},
{
"epoch": 0.8741496598639455,
"grad_norm": 33.22351218100941,
"learning_rate": 1.2907888269098416e-06,
"loss": 2.3588,
"step": 257
},
{
"epoch": 0.8775510204081632,
"grad_norm": 3.188560179185641,
"learning_rate": 1.2854070969445064e-06,
"loss": 1.5405,
"step": 258
},
{
"epoch": 0.8809523809523809,
"grad_norm": 21.318069352021737,
"learning_rate": 1.2800163525183688e-06,
"loss": 2.2063,
"step": 259
},
{
"epoch": 0.8809523809523809,
"eval_loss": 2.1820290088653564,
"eval_runtime": 3.8534,
"eval_samples_per_second": 14.273,
"eval_steps_per_second": 1.038,
"step": 259
},
{
"epoch": 0.8843537414965986,
"grad_norm": 8.243323927611506,
"learning_rate": 1.2746167638957805e-06,
"loss": 1.8474,
"step": 260
},
{
"epoch": 0.8877551020408163,
"grad_norm": 28.909948439715215,
"learning_rate": 1.2692085016204333e-06,
"loss": 2.2626,
"step": 261
},
{
"epoch": 0.891156462585034,
"grad_norm": 3.0722449835450116,
"learning_rate": 1.2637917365099725e-06,
"loss": 1.6435,
"step": 262
},
{
"epoch": 0.8945578231292517,
"grad_norm": 29.871491992872432,
"learning_rate": 1.2583666396506023e-06,
"loss": 2.1498,
"step": 263
},
{
"epoch": 0.8979591836734694,
"grad_norm": 2.977539901133042,
"learning_rate": 1.2529333823916806e-06,
"loss": 1.7024,
"step": 264
},
{
"epoch": 0.9013605442176871,
"grad_norm": 16.47476152363902,
"learning_rate": 1.2474921363403094e-06,
"loss": 2.532,
"step": 265
},
{
"epoch": 0.9047619047619048,
"grad_norm": 13.022051400004793,
"learning_rate": 1.2420430733559124e-06,
"loss": 1.8884,
"step": 266
},
{
"epoch": 0.9081632653061225,
"grad_norm": 8.97804602434911,
"learning_rate": 1.2365863655448075e-06,
"loss": 1.7885,
"step": 267
},
{
"epoch": 0.9115646258503401,
"grad_norm": 16.047174726202446,
"learning_rate": 1.2311221852547721e-06,
"loss": 2.3363,
"step": 268
},
{
"epoch": 0.9149659863945578,
"grad_norm": 3.5763323384852765,
"learning_rate": 1.2256507050695977e-06,
"loss": 1.701,
"step": 269
},
{
"epoch": 0.9183673469387755,
"grad_norm": 26.929796973835796,
"learning_rate": 1.220172097803641e-06,
"loss": 2.3601,
"step": 270
},
{
"epoch": 0.9217687074829932,
"grad_norm": 22.50281840057178,
"learning_rate": 1.2146865364963633e-06,
"loss": 2.0693,
"step": 271
},
{
"epoch": 0.9251700680272109,
"grad_norm": 11.62602578923058,
"learning_rate": 1.2091941944068665e-06,
"loss": 1.9123,
"step": 272
},
{
"epoch": 0.9285714285714286,
"grad_norm": 16.841220035990798,
"learning_rate": 1.2036952450084214e-06,
"loss": 2.2163,
"step": 273
},
{
"epoch": 0.9319727891156463,
"grad_norm": 18.055133543008612,
"learning_rate": 1.1981898619829879e-06,
"loss": 2.2485,
"step": 274
},
{
"epoch": 0.935374149659864,
"grad_norm": 26.45820099458286,
"learning_rate": 1.1926782192157273e-06,
"loss": 2.1845,
"step": 275
},
{
"epoch": 0.9387755102040817,
"grad_norm": 3.334955291200548,
"learning_rate": 1.1871604907895148e-06,
"loss": 1.7059,
"step": 276
},
{
"epoch": 0.9421768707482994,
"grad_norm": 19.511242339983163,
"learning_rate": 1.1816368509794364e-06,
"loss": 2.3601,
"step": 277
},
{
"epoch": 0.9455782312925171,
"grad_norm": 21.146925953072365,
"learning_rate": 1.1761074742472882e-06,
"loss": 1.9957,
"step": 278
},
{
"epoch": 0.9489795918367347,
"grad_norm": 3.5535024021194452,
"learning_rate": 1.1705725352360633e-06,
"loss": 1.9249,
"step": 279
},
{
"epoch": 0.9523809523809523,
"grad_norm": 13.348912305071467,
"learning_rate": 1.165032208764438e-06,
"loss": 2.0641,
"step": 280
},
{
"epoch": 0.95578231292517,
"grad_norm": 12.61033318044152,
"learning_rate": 1.1594866698212483e-06,
"loss": 2.169,
"step": 281
},
{
"epoch": 0.9591836734693877,
"grad_norm": 28.256325358544956,
"learning_rate": 1.1539360935599644e-06,
"loss": 2.0952,
"step": 282
},
{
"epoch": 0.9625850340136054,
"grad_norm": 12.61302060729169,
"learning_rate": 1.1483806552931582e-06,
"loss": 1.9411,
"step": 283
},
{
"epoch": 0.9659863945578231,
"grad_norm": 8.711391665501074,
"learning_rate": 1.142820530486966e-06,
"loss": 1.7633,
"step": 284
},
{
"epoch": 0.9693877551020408,
"grad_norm": 35.95958496013491,
"learning_rate": 1.1372558947555455e-06,
"loss": 2.1904,
"step": 285
},
{
"epoch": 0.9727891156462585,
"grad_norm": 3.429092657849847,
"learning_rate": 1.131686923855531e-06,
"loss": 1.8276,
"step": 286
},
{
"epoch": 0.9761904761904762,
"grad_norm": 12.871658288368948,
"learning_rate": 1.1261137936804811e-06,
"loss": 2.0911,
"step": 287
},
{
"epoch": 0.9795918367346939,
"grad_norm": 13.217001333800638,
"learning_rate": 1.1205366802553228e-06,
"loss": 1.9614,
"step": 288
},
{
"epoch": 0.9829931972789115,
"grad_norm": 24.712172909538513,
"learning_rate": 1.1149557597307934e-06,
"loss": 2.0412,
"step": 289
},
{
"epoch": 0.9863945578231292,
"grad_norm": 10.412944718560512,
"learning_rate": 1.1093712083778746e-06,
"loss": 1.7787,
"step": 290
},
{
"epoch": 0.9897959183673469,
"grad_norm": 15.631851389191027,
"learning_rate": 1.1037832025822265e-06,
"loss": 2.3362,
"step": 291
},
{
"epoch": 0.9931972789115646,
"grad_norm": 12.135256117907334,
"learning_rate": 1.098191918838617e-06,
"loss": 2.0212,
"step": 292
},
{
"epoch": 0.9965986394557823,
"grad_norm": 13.057522322919077,
"learning_rate": 1.0925975337453462e-06,
"loss": 2.2842,
"step": 293
},
{
"epoch": 1.0,
"grad_norm": 17.565324685523922,
"learning_rate": 1.0870002239986686e-06,
"loss": 2.5002,
"step": 294
}
],
"logging_steps": 1,
"max_steps": 588,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 294,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 95887829237760.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
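
For quick inspection, a minimal sketch (Python, standard library only; it assumes this file has been saved locally as trainer_state.json) that separates the training-loss and eval-loss entries in log_history:

import json

# Load the Trainer state dump shown above.
with open("trainer_state.json") as f:
    state = json.load(f)

# Training-step entries carry "loss"; evaluation entries carry "eval_loss".
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"global_step={state['global_step']}, epoch={state['epoch']}")
print(f"final train loss (step {train[-1][0]}): {train[-1][1]}")
for step, loss in evals:
    print(f"eval loss @ step {step}: {loss}")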