PumeTu's picture
Add files using upload-large-folder tool
dbde12c verified
raw
history blame
98.4 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.873563218390805,
"eval_steps": 10,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005747126436781609,
"grad_norm": 0.10123365372419357,
"learning_rate": 0.0,
"loss": 0.9918,
"step": 1
},
{
"epoch": 0.011494252873563218,
"grad_norm": 0.09671098738908768,
"learning_rate": 3.7735849056603773e-06,
"loss": 0.9604,
"step": 2
},
{
"epoch": 0.017241379310344827,
"grad_norm": 0.0981190875172615,
"learning_rate": 7.547169811320755e-06,
"loss": 0.9868,
"step": 3
},
{
"epoch": 0.022988505747126436,
"grad_norm": 0.10396745055913925,
"learning_rate": 1.1320754716981132e-05,
"loss": 0.962,
"step": 4
},
{
"epoch": 0.028735632183908046,
"grad_norm": 0.0982985869050026,
"learning_rate": 1.509433962264151e-05,
"loss": 0.9684,
"step": 5
},
{
"epoch": 0.034482758620689655,
"grad_norm": 0.10332155227661133,
"learning_rate": 1.8867924528301888e-05,
"loss": 0.9442,
"step": 6
},
{
"epoch": 0.040229885057471264,
"grad_norm": 0.1124059334397316,
"learning_rate": 2.2641509433962265e-05,
"loss": 0.9382,
"step": 7
},
{
"epoch": 0.04597701149425287,
"grad_norm": 0.12120208889245987,
"learning_rate": 2.641509433962264e-05,
"loss": 0.9416,
"step": 8
},
{
"epoch": 0.05172413793103448,
"grad_norm": 0.12729395925998688,
"learning_rate": 3.018867924528302e-05,
"loss": 0.9356,
"step": 9
},
{
"epoch": 0.05747126436781609,
"grad_norm": 0.13560789823532104,
"learning_rate": 3.39622641509434e-05,
"loss": 0.9293,
"step": 10
},
{
"epoch": 0.05747126436781609,
"eval_loss": 1.0470749139785767,
"eval_runtime": 412.2553,
"eval_samples_per_second": 24.009,
"eval_steps_per_second": 0.376,
"step": 10
},
{
"epoch": 0.06321839080459771,
"grad_norm": 0.1474100798368454,
"learning_rate": 3.7735849056603776e-05,
"loss": 0.9533,
"step": 11
},
{
"epoch": 0.06896551724137931,
"grad_norm": 0.16510824859142303,
"learning_rate": 4.150943396226415e-05,
"loss": 0.9206,
"step": 12
},
{
"epoch": 0.07471264367816093,
"grad_norm": 0.17097796499729156,
"learning_rate": 4.528301886792453e-05,
"loss": 0.8921,
"step": 13
},
{
"epoch": 0.08045977011494253,
"grad_norm": 0.17923878133296967,
"learning_rate": 4.9056603773584906e-05,
"loss": 0.8861,
"step": 14
},
{
"epoch": 0.08620689655172414,
"grad_norm": 0.18173959851264954,
"learning_rate": 5.283018867924528e-05,
"loss": 0.8904,
"step": 15
},
{
"epoch": 0.09195402298850575,
"grad_norm": 0.17235629260540009,
"learning_rate": 5.660377358490566e-05,
"loss": 0.8424,
"step": 16
},
{
"epoch": 0.09770114942528736,
"grad_norm": 0.16792210936546326,
"learning_rate": 6.037735849056604e-05,
"loss": 0.8395,
"step": 17
},
{
"epoch": 0.10344827586206896,
"grad_norm": 0.14939646422863007,
"learning_rate": 6.415094339622641e-05,
"loss": 0.8203,
"step": 18
},
{
"epoch": 0.10919540229885058,
"grad_norm": 0.14632105827331543,
"learning_rate": 6.79245283018868e-05,
"loss": 0.8464,
"step": 19
},
{
"epoch": 0.11494252873563218,
"grad_norm": 0.14770475029945374,
"learning_rate": 7.169811320754717e-05,
"loss": 0.8085,
"step": 20
},
{
"epoch": 0.11494252873563218,
"eval_loss": 0.8244547247886658,
"eval_runtime": 404.4489,
"eval_samples_per_second": 24.473,
"eval_steps_per_second": 0.383,
"step": 20
},
{
"epoch": 0.1206896551724138,
"grad_norm": 0.1725720465183258,
"learning_rate": 7.547169811320755e-05,
"loss": 0.8219,
"step": 21
},
{
"epoch": 0.12643678160919541,
"grad_norm": 0.1685618907213211,
"learning_rate": 7.924528301886794e-05,
"loss": 0.8148,
"step": 22
},
{
"epoch": 0.13218390804597702,
"grad_norm": 0.1653290092945099,
"learning_rate": 8.30188679245283e-05,
"loss": 0.7846,
"step": 23
},
{
"epoch": 0.13793103448275862,
"grad_norm": 0.16122524440288544,
"learning_rate": 8.679245283018869e-05,
"loss": 0.7903,
"step": 24
},
{
"epoch": 0.14367816091954022,
"grad_norm": 0.12793505191802979,
"learning_rate": 9.056603773584906e-05,
"loss": 0.7741,
"step": 25
},
{
"epoch": 0.14942528735632185,
"grad_norm": 0.10620377957820892,
"learning_rate": 9.433962264150944e-05,
"loss": 0.7308,
"step": 26
},
{
"epoch": 0.15517241379310345,
"grad_norm": 0.10993366688489914,
"learning_rate": 9.811320754716981e-05,
"loss": 0.7559,
"step": 27
},
{
"epoch": 0.16091954022988506,
"grad_norm": 0.11916384100914001,
"learning_rate": 0.0001018867924528302,
"loss": 0.7622,
"step": 28
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.13500399887561798,
"learning_rate": 0.00010566037735849057,
"loss": 0.7436,
"step": 29
},
{
"epoch": 0.1724137931034483,
"grad_norm": 0.12777844071388245,
"learning_rate": 0.00010943396226415095,
"loss": 0.7547,
"step": 30
},
{
"epoch": 0.1724137931034483,
"eval_loss": 0.7580565214157104,
"eval_runtime": 404.708,
"eval_samples_per_second": 24.457,
"eval_steps_per_second": 0.383,
"step": 30
},
{
"epoch": 0.1781609195402299,
"grad_norm": 0.11721828579902649,
"learning_rate": 0.00011320754716981132,
"loss": 0.7337,
"step": 31
},
{
"epoch": 0.1839080459770115,
"grad_norm": 0.08667382597923279,
"learning_rate": 0.0001169811320754717,
"loss": 0.7538,
"step": 32
},
{
"epoch": 0.1896551724137931,
"grad_norm": 0.06665026396512985,
"learning_rate": 0.00012075471698113207,
"loss": 0.7186,
"step": 33
},
{
"epoch": 0.19540229885057472,
"grad_norm": 0.04627465456724167,
"learning_rate": 0.00012452830188679244,
"loss": 0.7719,
"step": 34
},
{
"epoch": 0.20114942528735633,
"grad_norm": 0.04290887340903282,
"learning_rate": 0.00012830188679245283,
"loss": 0.752,
"step": 35
},
{
"epoch": 0.20689655172413793,
"grad_norm": 0.056834809482097626,
"learning_rate": 0.0001320754716981132,
"loss": 0.7429,
"step": 36
},
{
"epoch": 0.21264367816091953,
"grad_norm": 0.062055498361587524,
"learning_rate": 0.0001358490566037736,
"loss": 0.7208,
"step": 37
},
{
"epoch": 0.21839080459770116,
"grad_norm": 0.070551298558712,
"learning_rate": 0.00013962264150943395,
"loss": 0.7651,
"step": 38
},
{
"epoch": 0.22413793103448276,
"grad_norm": 0.07514140754938126,
"learning_rate": 0.00014339622641509434,
"loss": 0.7456,
"step": 39
},
{
"epoch": 0.22988505747126436,
"grad_norm": 0.06458627432584763,
"learning_rate": 0.00014716981132075472,
"loss": 0.7289,
"step": 40
},
{
"epoch": 0.22988505747126436,
"eval_loss": 0.7386028170585632,
"eval_runtime": 407.409,
"eval_samples_per_second": 24.295,
"eval_steps_per_second": 0.38,
"step": 40
},
{
"epoch": 0.23563218390804597,
"grad_norm": 0.056490588933229446,
"learning_rate": 0.0001509433962264151,
"loss": 0.7503,
"step": 41
},
{
"epoch": 0.2413793103448276,
"grad_norm": 0.036972932517528534,
"learning_rate": 0.0001547169811320755,
"loss": 0.7392,
"step": 42
},
{
"epoch": 0.2471264367816092,
"grad_norm": 0.038239240646362305,
"learning_rate": 0.00015849056603773587,
"loss": 0.7206,
"step": 43
},
{
"epoch": 0.25287356321839083,
"grad_norm": 0.033113010227680206,
"learning_rate": 0.00016226415094339625,
"loss": 0.7198,
"step": 44
},
{
"epoch": 0.25862068965517243,
"grad_norm": 0.03197947517037392,
"learning_rate": 0.0001660377358490566,
"loss": 0.7393,
"step": 45
},
{
"epoch": 0.26436781609195403,
"grad_norm": 0.03696918115019798,
"learning_rate": 0.000169811320754717,
"loss": 0.7576,
"step": 46
},
{
"epoch": 0.27011494252873564,
"grad_norm": 0.04209383204579353,
"learning_rate": 0.00017358490566037738,
"loss": 0.7157,
"step": 47
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.035038772970438004,
"learning_rate": 0.00017735849056603776,
"loss": 0.7256,
"step": 48
},
{
"epoch": 0.28160919540229884,
"grad_norm": 0.03674735128879547,
"learning_rate": 0.00018113207547169812,
"loss": 0.7295,
"step": 49
},
{
"epoch": 0.28735632183908044,
"grad_norm": 0.046050041913986206,
"learning_rate": 0.0001849056603773585,
"loss": 0.6965,
"step": 50
},
{
"epoch": 0.28735632183908044,
"eval_loss": 0.724204421043396,
"eval_runtime": 405.0004,
"eval_samples_per_second": 24.439,
"eval_steps_per_second": 0.383,
"step": 50
},
{
"epoch": 0.29310344827586204,
"grad_norm": 0.036520447582006454,
"learning_rate": 0.00018867924528301889,
"loss": 0.7273,
"step": 51
},
{
"epoch": 0.2988505747126437,
"grad_norm": 0.03720232844352722,
"learning_rate": 0.00019245283018867927,
"loss": 0.7084,
"step": 52
},
{
"epoch": 0.3045977011494253,
"grad_norm": 0.03159736469388008,
"learning_rate": 0.00019622641509433963,
"loss": 0.7485,
"step": 53
},
{
"epoch": 0.3103448275862069,
"grad_norm": 0.03695262596011162,
"learning_rate": 0.0002,
"loss": 0.745,
"step": 54
},
{
"epoch": 0.3160919540229885,
"grad_norm": 0.041795678436756134,
"learning_rate": 0.00019999775651876987,
"loss": 0.7165,
"step": 55
},
{
"epoch": 0.3218390804597701,
"grad_norm": 0.03494727239012718,
"learning_rate": 0.00019999102617574365,
"loss": 0.7499,
"step": 56
},
{
"epoch": 0.3275862068965517,
"grad_norm": 0.033885981887578964,
"learning_rate": 0.00019997980927290927,
"loss": 0.7118,
"step": 57
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.03606434166431427,
"learning_rate": 0.00019996410631356498,
"loss": 0.6945,
"step": 58
},
{
"epoch": 0.3390804597701149,
"grad_norm": 0.04015219211578369,
"learning_rate": 0.00019994391800229666,
"loss": 0.6982,
"step": 59
},
{
"epoch": 0.3448275862068966,
"grad_norm": 0.0380714014172554,
"learning_rate": 0.00019991924524494627,
"loss": 0.6848,
"step": 60
},
{
"epoch": 0.3448275862068966,
"eval_loss": 0.7109408378601074,
"eval_runtime": 404.9798,
"eval_samples_per_second": 24.441,
"eval_steps_per_second": 0.383,
"step": 60
},
{
"epoch": 0.3505747126436782,
"grad_norm": 0.04110811650753021,
"learning_rate": 0.00019989008914857116,
"loss": 0.6899,
"step": 61
},
{
"epoch": 0.3563218390804598,
"grad_norm": 0.03853503614664078,
"learning_rate": 0.0001998564510213944,
"loss": 0.7094,
"step": 62
},
{
"epoch": 0.3620689655172414,
"grad_norm": 0.0391794852912426,
"learning_rate": 0.00019981833237274618,
"loss": 0.6975,
"step": 63
},
{
"epoch": 0.367816091954023,
"grad_norm": 0.03894927725195885,
"learning_rate": 0.00019977573491299598,
"loss": 0.714,
"step": 64
},
{
"epoch": 0.3735632183908046,
"grad_norm": 0.04239923506975174,
"learning_rate": 0.00019972866055347572,
"loss": 0.7339,
"step": 65
},
{
"epoch": 0.3793103448275862,
"grad_norm": 0.03982697054743767,
"learning_rate": 0.0001996771114063943,
"loss": 0.6821,
"step": 66
},
{
"epoch": 0.3850574712643678,
"grad_norm": 0.04431302100419998,
"learning_rate": 0.00019962108978474263,
"loss": 0.7273,
"step": 67
},
{
"epoch": 0.39080459770114945,
"grad_norm": 0.043787937611341476,
"learning_rate": 0.00019956059820218982,
"loss": 0.6984,
"step": 68
},
{
"epoch": 0.39655172413793105,
"grad_norm": 0.054389603435993195,
"learning_rate": 0.00019949563937297045,
"loss": 0.6778,
"step": 69
},
{
"epoch": 0.40229885057471265,
"grad_norm": 0.041256386786699295,
"learning_rate": 0.00019942621621176282,
"loss": 0.693,
"step": 70
},
{
"epoch": 0.40229885057471265,
"eval_loss": 0.7021871209144592,
"eval_runtime": 406.6755,
"eval_samples_per_second": 24.339,
"eval_steps_per_second": 0.381,
"step": 70
},
{
"epoch": 0.40804597701149425,
"grad_norm": 0.05022790655493736,
"learning_rate": 0.0001993523318335581,
"loss": 0.6967,
"step": 71
},
{
"epoch": 0.41379310344827586,
"grad_norm": 0.06086933612823486,
"learning_rate": 0.00019927398955352061,
"loss": 0.7279,
"step": 72
},
{
"epoch": 0.41954022988505746,
"grad_norm": 0.04689742252230644,
"learning_rate": 0.00019919119288683908,
"loss": 0.6792,
"step": 73
},
{
"epoch": 0.42528735632183906,
"grad_norm": 0.04852883517742157,
"learning_rate": 0.00019910394554856876,
"loss": 0.701,
"step": 74
},
{
"epoch": 0.43103448275862066,
"grad_norm": 0.06196567416191101,
"learning_rate": 0.0001990122514534651,
"loss": 0.6805,
"step": 75
},
{
"epoch": 0.4367816091954023,
"grad_norm": 0.047033004462718964,
"learning_rate": 0.00019891611471580764,
"loss": 0.7058,
"step": 76
},
{
"epoch": 0.4425287356321839,
"grad_norm": 0.047392234206199646,
"learning_rate": 0.00019881553964921572,
"loss": 0.6861,
"step": 77
},
{
"epoch": 0.4482758620689655,
"grad_norm": 0.054070815443992615,
"learning_rate": 0.00019871053076645488,
"loss": 0.6969,
"step": 78
},
{
"epoch": 0.4540229885057471,
"grad_norm": 0.055412329733371735,
"learning_rate": 0.00019860109277923418,
"loss": 0.7001,
"step": 79
},
{
"epoch": 0.45977011494252873,
"grad_norm": 0.05274376645684242,
"learning_rate": 0.00019848723059799506,
"loss": 0.7101,
"step": 80
},
{
"epoch": 0.45977011494252873,
"eval_loss": 0.694656252861023,
"eval_runtime": 410.9173,
"eval_samples_per_second": 24.088,
"eval_steps_per_second": 0.377,
"step": 80
},
{
"epoch": 0.46551724137931033,
"grad_norm": 0.05915577709674835,
"learning_rate": 0.00019836894933169088,
"loss": 0.6836,
"step": 81
},
{
"epoch": 0.47126436781609193,
"grad_norm": 0.051574286073446274,
"learning_rate": 0.0001982462542875576,
"loss": 0.7181,
"step": 82
},
{
"epoch": 0.47701149425287354,
"grad_norm": 0.050167519599199295,
"learning_rate": 0.00019811915097087587,
"loss": 0.6645,
"step": 83
},
{
"epoch": 0.4827586206896552,
"grad_norm": 0.06501943618059158,
"learning_rate": 0.00019798764508472373,
"loss": 0.6891,
"step": 84
},
{
"epoch": 0.4885057471264368,
"grad_norm": 0.05396122857928276,
"learning_rate": 0.00019785174252972092,
"loss": 0.6842,
"step": 85
},
{
"epoch": 0.4942528735632184,
"grad_norm": 0.051826637238264084,
"learning_rate": 0.0001977114494037641,
"loss": 0.7047,
"step": 86
},
{
"epoch": 0.5,
"grad_norm": 0.05442539602518082,
"learning_rate": 0.00019756677200175315,
"loss": 0.7261,
"step": 87
},
{
"epoch": 0.5057471264367817,
"grad_norm": 0.05559674650430679,
"learning_rate": 0.0001974177168153088,
"loss": 0.6699,
"step": 88
},
{
"epoch": 0.5114942528735632,
"grad_norm": 0.058047693222761154,
"learning_rate": 0.0001972642905324813,
"loss": 0.6831,
"step": 89
},
{
"epoch": 0.5172413793103449,
"grad_norm": 0.051893047988414764,
"learning_rate": 0.0001971065000374504,
"loss": 0.7293,
"step": 90
},
{
"epoch": 0.5172413793103449,
"eval_loss": 0.6888386607170105,
"eval_runtime": 405.4362,
"eval_samples_per_second": 24.413,
"eval_steps_per_second": 0.382,
"step": 90
},
{
"epoch": 0.5229885057471264,
"grad_norm": 0.051870737224817276,
"learning_rate": 0.0001969443524102163,
"loss": 0.6945,
"step": 91
},
{
"epoch": 0.5287356321839081,
"grad_norm": 0.04907568544149399,
"learning_rate": 0.0001967778549262822,
"loss": 0.6985,
"step": 92
},
{
"epoch": 0.5344827586206896,
"grad_norm": 0.05802120640873909,
"learning_rate": 0.00019660701505632772,
"loss": 0.6911,
"step": 93
},
{
"epoch": 0.5402298850574713,
"grad_norm": 0.06809733808040619,
"learning_rate": 0.0001964318404658737,
"loss": 0.6815,
"step": 94
},
{
"epoch": 0.5459770114942529,
"grad_norm": 0.05489501729607582,
"learning_rate": 0.00019625233901493822,
"loss": 0.6664,
"step": 95
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.0648936778306961,
"learning_rate": 0.000196068518757684,
"loss": 0.6689,
"step": 96
},
{
"epoch": 0.5574712643678161,
"grad_norm": 0.054548367857933044,
"learning_rate": 0.00019588038794205703,
"loss": 0.6695,
"step": 97
},
{
"epoch": 0.5632183908045977,
"grad_norm": 0.0626642182469368,
"learning_rate": 0.00019568795500941635,
"loss": 0.7062,
"step": 98
},
{
"epoch": 0.5689655172413793,
"grad_norm": 0.0539688840508461,
"learning_rate": 0.00019549122859415538,
"loss": 0.6891,
"step": 99
},
{
"epoch": 0.5747126436781609,
"grad_norm": 0.05761811137199402,
"learning_rate": 0.00019529021752331453,
"loss": 0.6852,
"step": 100
},
{
"epoch": 0.5747126436781609,
"eval_loss": 0.6821601986885071,
"eval_runtime": 404.287,
"eval_samples_per_second": 24.483,
"eval_steps_per_second": 0.383,
"step": 100
},
{
"epoch": 0.5804597701149425,
"grad_norm": 0.054896607995033264,
"learning_rate": 0.00019508493081618513,
"loss": 0.6785,
"step": 101
},
{
"epoch": 0.5862068965517241,
"grad_norm": 0.06048964709043503,
"learning_rate": 0.00019487537768390464,
"loss": 0.6724,
"step": 102
},
{
"epoch": 0.5919540229885057,
"grad_norm": 0.06828396022319794,
"learning_rate": 0.00019466156752904343,
"loss": 0.7117,
"step": 103
},
{
"epoch": 0.5977011494252874,
"grad_norm": 0.06610234081745148,
"learning_rate": 0.0001944435099451829,
"loss": 0.6982,
"step": 104
},
{
"epoch": 0.603448275862069,
"grad_norm": 0.06762486696243286,
"learning_rate": 0.00019422121471648497,
"loss": 0.6768,
"step": 105
},
{
"epoch": 0.6091954022988506,
"grad_norm": 0.05772867798805237,
"learning_rate": 0.0001939946918172531,
"loss": 0.7073,
"step": 106
},
{
"epoch": 0.6149425287356322,
"grad_norm": 0.11993183940649033,
"learning_rate": 0.00019376395141148476,
"loss": 0.6831,
"step": 107
},
{
"epoch": 0.6206896551724138,
"grad_norm": 0.08105713874101639,
"learning_rate": 0.00019352900385241536,
"loss": 0.6857,
"step": 108
},
{
"epoch": 0.6264367816091954,
"grad_norm": 0.06035466492176056,
"learning_rate": 0.0001932898596820536,
"loss": 0.672,
"step": 109
},
{
"epoch": 0.632183908045977,
"grad_norm": 0.09288731962442398,
"learning_rate": 0.0001930465296307087,
"loss": 0.7033,
"step": 110
},
{
"epoch": 0.632183908045977,
"eval_loss": 0.677044153213501,
"eval_runtime": 405.2323,
"eval_samples_per_second": 24.425,
"eval_steps_per_second": 0.382,
"step": 110
},
{
"epoch": 0.6379310344827587,
"grad_norm": 0.06630638986825943,
"learning_rate": 0.00019279902461650866,
"loss": 0.6831,
"step": 111
},
{
"epoch": 0.6436781609195402,
"grad_norm": 0.05605092644691467,
"learning_rate": 0.00019254735574491058,
"loss": 0.6654,
"step": 112
},
{
"epoch": 0.6494252873563219,
"grad_norm": 0.07270795851945877,
"learning_rate": 0.00019229153430820232,
"loss": 0.6744,
"step": 113
},
{
"epoch": 0.6551724137931034,
"grad_norm": 0.06772006303071976,
"learning_rate": 0.0001920315717849956,
"loss": 0.6833,
"step": 114
},
{
"epoch": 0.6609195402298851,
"grad_norm": 0.06296226382255554,
"learning_rate": 0.0001917674798397113,
"loss": 0.677,
"step": 115
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.06553810834884644,
"learning_rate": 0.00019149927032205587,
"loss": 0.6828,
"step": 116
},
{
"epoch": 0.6724137931034483,
"grad_norm": 0.057245928794145584,
"learning_rate": 0.00019122695526648968,
"loss": 0.6858,
"step": 117
},
{
"epoch": 0.6781609195402298,
"grad_norm": 0.06503669917583466,
"learning_rate": 0.00019095054689168705,
"loss": 0.6591,
"step": 118
},
{
"epoch": 0.6839080459770115,
"grad_norm": 0.05912588909268379,
"learning_rate": 0.00019067005759998797,
"loss": 0.6669,
"step": 119
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.06517963111400604,
"learning_rate": 0.0001903854999768417,
"loss": 0.6815,
"step": 120
},
{
"epoch": 0.6896551724137931,
"eval_loss": 0.6735538244247437,
"eval_runtime": 405.8319,
"eval_samples_per_second": 24.389,
"eval_steps_per_second": 0.382,
"step": 120
},
{
"epoch": 0.6954022988505747,
"grad_norm": 0.06089121848344803,
"learning_rate": 0.0001900968867902419,
"loss": 0.67,
"step": 121
},
{
"epoch": 0.7011494252873564,
"grad_norm": 0.05764375999569893,
"learning_rate": 0.00018980423099015402,
"loss": 0.6733,
"step": 122
},
{
"epoch": 0.7068965517241379,
"grad_norm": 0.06278955936431885,
"learning_rate": 0.00018950754570793384,
"loss": 0.6702,
"step": 123
},
{
"epoch": 0.7126436781609196,
"grad_norm": 0.06360521912574768,
"learning_rate": 0.00018920684425573865,
"loss": 0.6619,
"step": 124
},
{
"epoch": 0.7183908045977011,
"grad_norm": 0.0599365159869194,
"learning_rate": 0.00018890214012592975,
"loss": 0.6851,
"step": 125
},
{
"epoch": 0.7241379310344828,
"grad_norm": 0.061885766685009,
"learning_rate": 0.000188593446990467,
"loss": 0.6346,
"step": 126
},
{
"epoch": 0.7298850574712644,
"grad_norm": 0.061761509627103806,
"learning_rate": 0.00018828077870029552,
"loss": 0.6834,
"step": 127
},
{
"epoch": 0.735632183908046,
"grad_norm": 0.075982965528965,
"learning_rate": 0.00018796414928472417,
"loss": 0.6279,
"step": 128
},
{
"epoch": 0.7413793103448276,
"grad_norm": 0.05802853778004646,
"learning_rate": 0.0001876435729507959,
"loss": 0.6348,
"step": 129
},
{
"epoch": 0.7471264367816092,
"grad_norm": 0.06642711162567139,
"learning_rate": 0.0001873190640826505,
"loss": 0.679,
"step": 130
},
{
"epoch": 0.7471264367816092,
"eval_loss": 0.6707044243812561,
"eval_runtime": 407.4212,
"eval_samples_per_second": 24.294,
"eval_steps_per_second": 0.38,
"step": 130
},
{
"epoch": 0.7528735632183908,
"grad_norm": 0.06452522426843643,
"learning_rate": 0.00018699063724087904,
"loss": 0.6423,
"step": 131
},
{
"epoch": 0.7586206896551724,
"grad_norm": 0.05988775193691254,
"learning_rate": 0.00018665830716187065,
"loss": 0.6654,
"step": 132
},
{
"epoch": 0.764367816091954,
"grad_norm": 0.059349820017814636,
"learning_rate": 0.0001863220887571512,
"loss": 0.6866,
"step": 133
},
{
"epoch": 0.7701149425287356,
"grad_norm": 0.06473397463560104,
"learning_rate": 0.0001859819971127143,
"loss": 0.7014,
"step": 134
},
{
"epoch": 0.7758620689655172,
"grad_norm": 0.06945810467004776,
"learning_rate": 0.00018563804748834438,
"loss": 0.6769,
"step": 135
},
{
"epoch": 0.7816091954022989,
"grad_norm": 0.06217830255627632,
"learning_rate": 0.000185290255316932,
"loss": 0.6821,
"step": 136
},
{
"epoch": 0.7873563218390804,
"grad_norm": 0.07021711021661758,
"learning_rate": 0.00018493863620378122,
"loss": 0.6614,
"step": 137
},
{
"epoch": 0.7931034482758621,
"grad_norm": 0.0640297532081604,
"learning_rate": 0.00018458320592590975,
"loss": 0.6699,
"step": 138
},
{
"epoch": 0.7988505747126436,
"grad_norm": 0.0640842542052269,
"learning_rate": 0.00018422398043134067,
"loss": 0.6795,
"step": 139
},
{
"epoch": 0.8045977011494253,
"grad_norm": 0.07371507585048676,
"learning_rate": 0.00018386097583838714,
"loss": 0.6571,
"step": 140
},
{
"epoch": 0.8045977011494253,
"eval_loss": 0.6682229042053223,
"eval_runtime": 404.8694,
"eval_samples_per_second": 24.447,
"eval_steps_per_second": 0.383,
"step": 140
},
{
"epoch": 0.8103448275862069,
"grad_norm": 0.06185011938214302,
"learning_rate": 0.00018349420843492888,
"loss": 0.6524,
"step": 141
},
{
"epoch": 0.8160919540229885,
"grad_norm": 0.08427827060222626,
"learning_rate": 0.00018312369467768166,
"loss": 0.6685,
"step": 142
},
{
"epoch": 0.8218390804597702,
"grad_norm": 0.06529568880796432,
"learning_rate": 0.0001827494511914587,
"loss": 0.659,
"step": 143
},
{
"epoch": 0.8275862068965517,
"grad_norm": 0.07357680797576904,
"learning_rate": 0.0001823714947684247,
"loss": 0.6792,
"step": 144
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.09026575833559036,
"learning_rate": 0.00018198984236734246,
"loss": 0.6954,
"step": 145
},
{
"epoch": 0.8390804597701149,
"grad_norm": 0.06157710403203964,
"learning_rate": 0.000181604511112812,
"loss": 0.6527,
"step": 146
},
{
"epoch": 0.8448275862068966,
"grad_norm": 0.08122924715280533,
"learning_rate": 0.000181215518294502,
"loss": 0.6571,
"step": 147
},
{
"epoch": 0.8505747126436781,
"grad_norm": 0.05926045402884483,
"learning_rate": 0.00018082288136637422,
"loss": 0.6773,
"step": 148
},
{
"epoch": 0.8563218390804598,
"grad_norm": 0.07869191467761993,
"learning_rate": 0.00018042661794590023,
"loss": 0.7066,
"step": 149
},
{
"epoch": 0.8620689655172413,
"grad_norm": 0.07564139366149902,
"learning_rate": 0.00018002674581327094,
"loss": 0.6491,
"step": 150
},
{
"epoch": 0.8620689655172413,
"eval_loss": 0.6660047769546509,
"eval_runtime": 406.5581,
"eval_samples_per_second": 24.346,
"eval_steps_per_second": 0.381,
"step": 150
},
{
"epoch": 0.867816091954023,
"grad_norm": 0.05749671533703804,
"learning_rate": 0.00017962328291059888,
"loss": 0.7081,
"step": 151
},
{
"epoch": 0.8735632183908046,
"grad_norm": 0.08154609054327011,
"learning_rate": 0.00017921624734111292,
"loss": 0.6622,
"step": 152
},
{
"epoch": 0.8793103448275862,
"grad_norm": 0.08773736655712128,
"learning_rate": 0.0001788056573683464,
"loss": 0.6393,
"step": 153
},
{
"epoch": 0.8850574712643678,
"grad_norm": 0.06756340712308884,
"learning_rate": 0.00017839153141531718,
"loss": 0.6384,
"step": 154
},
{
"epoch": 0.8908045977011494,
"grad_norm": 0.08763930201530457,
"learning_rate": 0.00017797388806370132,
"loss": 0.6512,
"step": 155
},
{
"epoch": 0.896551724137931,
"grad_norm": 0.0647486001253128,
"learning_rate": 0.00017755274605299923,
"loss": 0.6502,
"step": 156
},
{
"epoch": 0.9022988505747126,
"grad_norm": 0.11679747700691223,
"learning_rate": 0.00017712812427969485,
"loss": 0.6666,
"step": 157
},
{
"epoch": 0.9080459770114943,
"grad_norm": 0.06472433358430862,
"learning_rate": 0.00017670004179640774,
"loss": 0.6495,
"step": 158
},
{
"epoch": 0.9137931034482759,
"grad_norm": 0.09902803599834442,
"learning_rate": 0.0001762685178110382,
"loss": 0.6747,
"step": 159
},
{
"epoch": 0.9195402298850575,
"grad_norm": 0.06362438946962357,
"learning_rate": 0.0001758335716859055,
"loss": 0.7015,
"step": 160
},
{
"epoch": 0.9195402298850575,
"eval_loss": 0.663636326789856,
"eval_runtime": 404.5915,
"eval_samples_per_second": 24.464,
"eval_steps_per_second": 0.383,
"step": 160
},
{
"epoch": 0.9252873563218391,
"grad_norm": 0.07304941862821579,
"learning_rate": 0.00017539522293687898,
"loss": 0.6825,
"step": 161
},
{
"epoch": 0.9310344827586207,
"grad_norm": 0.08923015743494034,
"learning_rate": 0.00017495349123250242,
"loss": 0.674,
"step": 162
},
{
"epoch": 0.9367816091954023,
"grad_norm": 0.062135376036167145,
"learning_rate": 0.00017450839639311162,
"loss": 0.6477,
"step": 163
},
{
"epoch": 0.9425287356321839,
"grad_norm": 0.1098598912358284,
"learning_rate": 0.00017405995838994494,
"loss": 0.6742,
"step": 164
},
{
"epoch": 0.9482758620689655,
"grad_norm": 0.06947540491819382,
"learning_rate": 0.00017360819734424715,
"loss": 0.6509,
"step": 165
},
{
"epoch": 0.9540229885057471,
"grad_norm": 0.11134368181228638,
"learning_rate": 0.0001731531335263669,
"loss": 0.6602,
"step": 166
},
{
"epoch": 0.9597701149425287,
"grad_norm": 0.06717904657125473,
"learning_rate": 0.00017269478735484683,
"loss": 0.6697,
"step": 167
},
{
"epoch": 0.9655172413793104,
"grad_norm": 0.06737629324197769,
"learning_rate": 0.00017223317939550753,
"loss": 0.6636,
"step": 168
},
{
"epoch": 0.9712643678160919,
"grad_norm": 0.08558724075555801,
"learning_rate": 0.00017176833036052495,
"loss": 0.6733,
"step": 169
},
{
"epoch": 0.9770114942528736,
"grad_norm": 0.07127804309129715,
"learning_rate": 0.0001713002611075007,
"loss": 0.6523,
"step": 170
},
{
"epoch": 0.9770114942528736,
"eval_loss": 0.6618800759315491,
"eval_runtime": 411.375,
"eval_samples_per_second": 24.061,
"eval_steps_per_second": 0.377,
"step": 170
},
{
"epoch": 0.9827586206896551,
"grad_norm": 0.08060283958911896,
"learning_rate": 0.0001708289926385265,
"loss": 0.658,
"step": 171
},
{
"epoch": 0.9885057471264368,
"grad_norm": 0.06496579200029373,
"learning_rate": 0.0001703545460992416,
"loss": 0.6697,
"step": 172
},
{
"epoch": 0.9942528735632183,
"grad_norm": 0.0646037757396698,
"learning_rate": 0.00016987694277788417,
"loss": 0.6231,
"step": 173
},
{
"epoch": 1.0,
"grad_norm": 0.08516079187393188,
"learning_rate": 0.0001693962041043359,
"loss": 0.6374,
"step": 174
},
{
"epoch": 1.0057471264367817,
"grad_norm": 0.06554190069437027,
"learning_rate": 0.00016891235164916065,
"loss": 0.6271,
"step": 175
},
{
"epoch": 1.0114942528735633,
"grad_norm": 0.06361629068851471,
"learning_rate": 0.00016842540712263637,
"loss": 0.649,
"step": 176
},
{
"epoch": 1.0172413793103448,
"grad_norm": 0.0814083069562912,
"learning_rate": 0.00016793539237378128,
"loss": 0.654,
"step": 177
},
{
"epoch": 1.0229885057471264,
"grad_norm": 0.06498701125383377,
"learning_rate": 0.00016744232938937308,
"loss": 0.6313,
"step": 178
},
{
"epoch": 1.028735632183908,
"grad_norm": 0.11292543262243271,
"learning_rate": 0.0001669462402929629,
"loss": 0.6803,
"step": 179
},
{
"epoch": 1.0344827586206897,
"grad_norm": 0.0661187544465065,
"learning_rate": 0.00016644714734388217,
"loss": 0.6672,
"step": 180
},
{
"epoch": 1.0344827586206897,
"eval_loss": 0.6602174043655396,
"eval_runtime": 410.2914,
"eval_samples_per_second": 24.124,
"eval_steps_per_second": 0.378,
"step": 180
},
{
"epoch": 1.0402298850574712,
"grad_norm": 0.08441785722970963,
"learning_rate": 0.00016594507293624425,
"loss": 0.6257,
"step": 181
},
{
"epoch": 1.0459770114942528,
"grad_norm": 0.09075969457626343,
"learning_rate": 0.00016544003959793925,
"loss": 0.641,
"step": 182
},
{
"epoch": 1.0517241379310345,
"grad_norm": 0.07677901536226273,
"learning_rate": 0.00016493206998962354,
"loss": 0.6351,
"step": 183
},
{
"epoch": 1.0574712643678161,
"grad_norm": 0.09646302461624146,
"learning_rate": 0.0001644211869037027,
"loss": 0.6635,
"step": 184
},
{
"epoch": 1.0632183908045978,
"grad_norm": 0.06928115338087082,
"learning_rate": 0.00016390741326330907,
"loss": 0.6458,
"step": 185
},
{
"epoch": 1.0689655172413792,
"grad_norm": 0.1076992079615593,
"learning_rate": 0.00016339077212127294,
"loss": 0.6209,
"step": 186
},
{
"epoch": 1.0747126436781609,
"grad_norm": 0.08489565551280975,
"learning_rate": 0.0001628712866590885,
"loss": 0.6336,
"step": 187
},
{
"epoch": 1.0804597701149425,
"grad_norm": 0.11920158565044403,
"learning_rate": 0.00016234898018587337,
"loss": 0.6496,
"step": 188
},
{
"epoch": 1.0862068965517242,
"grad_norm": 0.07987701892852783,
"learning_rate": 0.00016182387613732291,
"loss": 0.668,
"step": 189
},
{
"epoch": 1.0919540229885056,
"grad_norm": 0.1095438227057457,
"learning_rate": 0.00016129599807465875,
"loss": 0.6862,
"step": 190
},
{
"epoch": 1.0919540229885056,
"eval_loss": 0.6588147282600403,
"eval_runtime": 406.5115,
"eval_samples_per_second": 24.349,
"eval_steps_per_second": 0.381,
"step": 190
},
{
"epoch": 1.0977011494252873,
"grad_norm": 0.08076825737953186,
"learning_rate": 0.0001607653696835713,
"loss": 0.6367,
"step": 191
},
{
"epoch": 1.103448275862069,
"grad_norm": 0.09829648584127426,
"learning_rate": 0.00016023201477315731,
"loss": 0.6391,
"step": 192
},
{
"epoch": 1.1091954022988506,
"grad_norm": 0.09008080512285233,
"learning_rate": 0.0001596959572748514,
"loss": 0.6462,
"step": 193
},
{
"epoch": 1.1149425287356323,
"grad_norm": 0.07725552469491959,
"learning_rate": 0.00015915722124135227,
"loss": 0.6356,
"step": 194
},
{
"epoch": 1.1206896551724137,
"grad_norm": 0.08215273171663284,
"learning_rate": 0.00015861583084554349,
"loss": 0.6557,
"step": 195
},
{
"epoch": 1.1264367816091954,
"grad_norm": 0.07044622302055359,
"learning_rate": 0.0001580718103794089,
"loss": 0.6401,
"step": 196
},
{
"epoch": 1.132183908045977,
"grad_norm": 0.06852877885103226,
"learning_rate": 0.00015752518425294257,
"loss": 0.6641,
"step": 197
},
{
"epoch": 1.1379310344827587,
"grad_norm": 0.07775932550430298,
"learning_rate": 0.00015697597699305366,
"loss": 0.6689,
"step": 198
},
{
"epoch": 1.1436781609195403,
"grad_norm": 0.07384389638900757,
"learning_rate": 0.00015642421324246568,
"loss": 0.663,
"step": 199
},
{
"epoch": 1.1494252873563218,
"grad_norm": 0.074593685567379,
"learning_rate": 0.00015586991775861102,
"loss": 0.6755,
"step": 200
},
{
"epoch": 1.1494252873563218,
"eval_loss": 0.6577329635620117,
"eval_runtime": 406.5534,
"eval_samples_per_second": 24.346,
"eval_steps_per_second": 0.381,
"step": 200
},
{
"epoch": 1.1551724137931034,
"grad_norm": 0.07201389968395233,
"learning_rate": 0.00015531311541251995,
"loss": 0.62,
"step": 201
},
{
"epoch": 1.160919540229885,
"grad_norm": 0.07052464783191681,
"learning_rate": 0.00015475383118770472,
"loss": 0.6456,
"step": 202
},
{
"epoch": 1.1666666666666667,
"grad_norm": 0.07045558094978333,
"learning_rate": 0.00015419209017903852,
"loss": 0.6421,
"step": 203
},
{
"epoch": 1.1724137931034484,
"grad_norm": 0.0870729386806488,
"learning_rate": 0.0001536279175916296,
"loss": 0.6342,
"step": 204
},
{
"epoch": 1.1781609195402298,
"grad_norm": 0.0703926831483841,
"learning_rate": 0.0001530613387396901,
"loss": 0.6533,
"step": 205
},
{
"epoch": 1.1839080459770115,
"grad_norm": 0.07181324064731598,
"learning_rate": 0.0001524923790454004,
"loss": 0.6511,
"step": 206
},
{
"epoch": 1.1896551724137931,
"grad_norm": 0.07455940544605255,
"learning_rate": 0.00015192106403776848,
"loss": 0.6363,
"step": 207
},
{
"epoch": 1.1954022988505748,
"grad_norm": 0.08370154350996017,
"learning_rate": 0.0001513474193514842,
"loss": 0.6517,
"step": 208
},
{
"epoch": 1.2011494252873562,
"grad_norm": 0.08015818893909454,
"learning_rate": 0.00015077147072576933,
"loss": 0.6264,
"step": 209
},
{
"epoch": 1.206896551724138,
"grad_norm": 0.093206986784935,
"learning_rate": 0.00015019324400322243,
"loss": 0.6279,
"step": 210
},
{
"epoch": 1.206896551724138,
"eval_loss": 0.6562607884407043,
"eval_runtime": 407.9222,
"eval_samples_per_second": 24.264,
"eval_steps_per_second": 0.38,
"step": 210
},
{
"epoch": 1.2126436781609196,
"grad_norm": 0.07707002758979797,
"learning_rate": 0.00014961276512865954,
"loss": 0.6726,
"step": 211
},
{
"epoch": 1.2183908045977012,
"grad_norm": 0.08275868743658066,
"learning_rate": 0.00014903006014794983,
"loss": 0.6493,
"step": 212
},
{
"epoch": 1.2241379310344827,
"grad_norm": 0.11222587525844574,
"learning_rate": 0.00014844515520684703,
"loss": 0.6367,
"step": 213
},
{
"epoch": 1.2298850574712643,
"grad_norm": 0.09210342168807983,
"learning_rate": 0.00014785807654981627,
"loss": 0.6734,
"step": 214
},
{
"epoch": 1.235632183908046,
"grad_norm": 0.08821109682321548,
"learning_rate": 0.00014726885051885653,
"loss": 0.6354,
"step": 215
},
{
"epoch": 1.2413793103448276,
"grad_norm": 0.12253956496715546,
"learning_rate": 0.0001466775035523186,
"loss": 0.6412,
"step": 216
},
{
"epoch": 1.2471264367816093,
"grad_norm": 0.08476684242486954,
"learning_rate": 0.00014608406218371894,
"loss": 0.6635,
"step": 217
},
{
"epoch": 1.2528735632183907,
"grad_norm": 0.08554086089134216,
"learning_rate": 0.00014548855304054886,
"loss": 0.6403,
"step": 218
},
{
"epoch": 1.2586206896551724,
"grad_norm": 0.10986476391553879,
"learning_rate": 0.00014489100284308017,
"loss": 0.6253,
"step": 219
},
{
"epoch": 1.264367816091954,
"grad_norm": 0.09221742302179337,
"learning_rate": 0.00014429143840316585,
"loss": 0.6622,
"step": 220
},
{
"epoch": 1.264367816091954,
"eval_loss": 0.6551185250282288,
"eval_runtime": 408.2025,
"eval_samples_per_second": 24.248,
"eval_steps_per_second": 0.38,
"step": 220
},
{
"epoch": 1.2701149425287357,
"grad_norm": 0.08050013333559036,
"learning_rate": 0.00014368988662303732,
"loss": 0.6226,
"step": 221
},
{
"epoch": 1.2758620689655173,
"grad_norm": 0.16257594525814056,
"learning_rate": 0.00014308637449409706,
"loss": 0.6661,
"step": 222
},
{
"epoch": 1.2816091954022988,
"grad_norm": 0.07793809473514557,
"learning_rate": 0.00014248092909570774,
"loss": 0.6243,
"step": 223
},
{
"epoch": 1.2873563218390804,
"grad_norm": 0.0975632593035698,
"learning_rate": 0.00014187357759397714,
"loss": 0.6348,
"step": 224
},
{
"epoch": 1.293103448275862,
"grad_norm": 0.07041144371032715,
"learning_rate": 0.00014126434724053913,
"loss": 0.6386,
"step": 225
},
{
"epoch": 1.2988505747126438,
"grad_norm": 0.12080610543489456,
"learning_rate": 0.00014065326537133094,
"loss": 0.6276,
"step": 226
},
{
"epoch": 1.3045977011494254,
"grad_norm": 0.09340126812458038,
"learning_rate": 0.0001400403594053667,
"loss": 0.6431,
"step": 227
},
{
"epoch": 1.3103448275862069,
"grad_norm": 0.09178619831800461,
"learning_rate": 0.00013942565684350698,
"loss": 0.6457,
"step": 228
},
{
"epoch": 1.3160919540229885,
"grad_norm": 0.134804829955101,
"learning_rate": 0.00013880918526722497,
"loss": 0.6247,
"step": 229
},
{
"epoch": 1.3218390804597702,
"grad_norm": 0.07517404854297638,
"learning_rate": 0.00013819097233736888,
"loss": 0.6329,
"step": 230
},
{
"epoch": 1.3218390804597702,
"eval_loss": 0.6541800498962402,
"eval_runtime": 404.9523,
"eval_samples_per_second": 24.442,
"eval_steps_per_second": 0.383,
"step": 230
},
{
"epoch": 1.3275862068965516,
"grad_norm": 0.1385478675365448,
"learning_rate": 0.00013757104579292082,
"loss": 0.6697,
"step": 231
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.08156240731477737,
"learning_rate": 0.00013694943344975212,
"loss": 0.6279,
"step": 232
},
{
"epoch": 1.339080459770115,
"grad_norm": 0.10937108844518661,
"learning_rate": 0.00013632616319937522,
"loss": 0.6487,
"step": 233
},
{
"epoch": 1.3448275862068966,
"grad_norm": 0.12300366908311844,
"learning_rate": 0.00013570126300769232,
"loss": 0.6456,
"step": 234
},
{
"epoch": 1.3505747126436782,
"grad_norm": 0.07707128673791885,
"learning_rate": 0.0001350747609137404,
"loss": 0.6302,
"step": 235
},
{
"epoch": 1.3563218390804597,
"grad_norm": 0.0954674631357193,
"learning_rate": 0.0001344466850284333,
"loss": 0.6184,
"step": 236
},
{
"epoch": 1.3620689655172413,
"grad_norm": 0.10317125916481018,
"learning_rate": 0.00013381706353330014,
"loss": 0.6618,
"step": 237
},
{
"epoch": 1.367816091954023,
"grad_norm": 0.08765599131584167,
"learning_rate": 0.0001331859246792211,
"loss": 0.6191,
"step": 238
},
{
"epoch": 1.3735632183908046,
"grad_norm": 0.10305018723011017,
"learning_rate": 0.0001325532967851596,
"loss": 0.6397,
"step": 239
},
{
"epoch": 1.3793103448275863,
"grad_norm": 0.08769567310810089,
"learning_rate": 0.00013191920823689177,
"loss": 0.6559,
"step": 240
},
{
"epoch": 1.3793103448275863,
"eval_loss": 0.6528159379959106,
"eval_runtime": 407.607,
"eval_samples_per_second": 24.283,
"eval_steps_per_second": 0.38,
"step": 240
},
{
"epoch": 1.3850574712643677,
"grad_norm": 0.09783841669559479,
"learning_rate": 0.00013128368748573273,
"loss": 0.6736,
"step": 241
},
{
"epoch": 1.3908045977011494,
"grad_norm": 0.08165410906076431,
"learning_rate": 0.00013064676304726,
"loss": 0.6467,
"step": 242
},
{
"epoch": 1.396551724137931,
"grad_norm": 0.10928885638713837,
"learning_rate": 0.0001300084635000341,
"loss": 0.6956,
"step": 243
},
{
"epoch": 1.4022988505747127,
"grad_norm": 0.09388460218906403,
"learning_rate": 0.000129368817484316,
"loss": 0.6474,
"step": 244
},
{
"epoch": 1.4080459770114944,
"grad_norm": 0.08257792145013809,
"learning_rate": 0.0001287278537007824,
"loss": 0.6301,
"step": 245
},
{
"epoch": 1.4137931034482758,
"grad_norm": 0.07570406794548035,
"learning_rate": 0.00012808560090923758,
"loss": 0.6238,
"step": 246
},
{
"epoch": 1.4195402298850575,
"grad_norm": 0.097509004175663,
"learning_rate": 0.00012744208792732324,
"loss": 0.6383,
"step": 247
},
{
"epoch": 1.4252873563218391,
"grad_norm": 0.07778667658567429,
"learning_rate": 0.00012679734362922528,
"loss": 0.642,
"step": 248
},
{
"epoch": 1.4310344827586206,
"grad_norm": 0.08389262855052948,
"learning_rate": 0.00012615139694437835,
"loss": 0.6152,
"step": 249
},
{
"epoch": 1.4367816091954024,
"grad_norm": 0.08290071040391922,
"learning_rate": 0.00012550427685616765,
"loss": 0.6389,
"step": 250
},
{
"epoch": 1.4367816091954024,
"eval_loss": 0.6516815423965454,
"eval_runtime": 411.2719,
"eval_samples_per_second": 24.067,
"eval_steps_per_second": 0.377,
"step": 250
},
{
"epoch": 1.4425287356321839,
"grad_norm": 0.08134254068136215,
"learning_rate": 0.00012485601240062869,
"loss": 0.6365,
"step": 251
},
{
"epoch": 1.4482758620689655,
"grad_norm": 0.11836981773376465,
"learning_rate": 0.00012420663266514417,
"loss": 0.6345,
"step": 252
},
{
"epoch": 1.4540229885057472,
"grad_norm": 0.07629366219043732,
"learning_rate": 0.0001235561667871391,
"loss": 0.6365,
"step": 253
},
{
"epoch": 1.4597701149425286,
"grad_norm": 0.09142953902482986,
"learning_rate": 0.0001229046439527732,
"loss": 0.6316,
"step": 254
},
{
"epoch": 1.4655172413793103,
"grad_norm": 0.12063657492399216,
"learning_rate": 0.00012225209339563145,
"loss": 0.6221,
"step": 255
},
{
"epoch": 1.471264367816092,
"grad_norm": 0.07524894177913666,
"learning_rate": 0.00012159854439541245,
"loss": 0.6485,
"step": 256
},
{
"epoch": 1.4770114942528736,
"grad_norm": 0.08384133875370026,
"learning_rate": 0.00012094402627661447,
"loss": 0.6607,
"step": 257
},
{
"epoch": 1.4827586206896552,
"grad_norm": 0.08039575815200806,
"learning_rate": 0.00012028856840721974,
"loss": 0.6764,
"step": 258
},
{
"epoch": 1.4885057471264367,
"grad_norm": 0.09115740656852722,
"learning_rate": 0.00011963220019737691,
"loss": 0.6587,
"step": 259
},
{
"epoch": 1.4942528735632183,
"grad_norm": 0.08291927725076675,
"learning_rate": 0.00011897495109808107,
"loss": 0.6476,
"step": 260
},
{
"epoch": 1.4942528735632183,
"eval_loss": 0.6506026983261108,
"eval_runtime": 407.6949,
"eval_samples_per_second": 24.278,
"eval_steps_per_second": 0.38,
"step": 260
},
{
"epoch": 1.5,
"grad_norm": 0.09679999202489853,
"learning_rate": 0.00011831685059985262,
"loss": 0.6378,
"step": 261
},
{
"epoch": 1.5057471264367817,
"grad_norm": 0.07858405262231827,
"learning_rate": 0.00011765792823141384,
"loss": 0.6679,
"step": 262
},
{
"epoch": 1.5114942528735633,
"grad_norm": 0.07274090498685837,
"learning_rate": 0.00011699821355836409,
"loss": 0.6199,
"step": 263
},
{
"epoch": 1.5172413793103448,
"grad_norm": 0.11862179636955261,
"learning_rate": 0.00011633773618185302,
"loss": 0.6369,
"step": 264
},
{
"epoch": 1.5229885057471264,
"grad_norm": 0.08915189653635025,
"learning_rate": 0.00011567652573725262,
"loss": 0.6248,
"step": 265
},
{
"epoch": 1.528735632183908,
"grad_norm": 0.12184260040521622,
"learning_rate": 0.00011501461189282733,
"loss": 0.645,
"step": 266
},
{
"epoch": 1.5344827586206895,
"grad_norm": 0.09939936548471451,
"learning_rate": 0.00011435202434840287,
"loss": 0.6382,
"step": 267
},
{
"epoch": 1.5402298850574714,
"grad_norm": 0.07167995721101761,
"learning_rate": 0.0001136887928340336,
"loss": 0.6064,
"step": 268
},
{
"epoch": 1.5459770114942528,
"grad_norm": 0.09978017210960388,
"learning_rate": 0.00011302494710866857,
"loss": 0.6467,
"step": 269
},
{
"epoch": 1.5517241379310345,
"grad_norm": 0.09598653763532639,
"learning_rate": 0.00011236051695881633,
"loss": 0.6412,
"step": 270
},
{
"epoch": 1.5517241379310345,
"eval_loss": 0.6497076749801636,
"eval_runtime": 407.5672,
"eval_samples_per_second": 24.286,
"eval_steps_per_second": 0.38,
"step": 270
},
{
"epoch": 1.5574712643678161,
"grad_norm": 0.08118661493062973,
"learning_rate": 0.00011169553219720828,
"loss": 0.6659,
"step": 271
},
{
"epoch": 1.5632183908045976,
"grad_norm": 0.11158329248428345,
"learning_rate": 0.00011103002266146096,
"loss": 0.6578,
"step": 272
},
{
"epoch": 1.5689655172413794,
"grad_norm": 0.12230509519577026,
"learning_rate": 0.0001103640182127375,
"loss": 0.6187,
"step": 273
},
{
"epoch": 1.5747126436781609,
"grad_norm": 0.07973505556583405,
"learning_rate": 0.00010969754873440743,
"loss": 0.6507,
"step": 274
},
{
"epoch": 1.5804597701149425,
"grad_norm": 0.07436943054199219,
"learning_rate": 0.00010903064413070612,
"loss": 0.6381,
"step": 275
},
{
"epoch": 1.5862068965517242,
"grad_norm": 0.0804380401968956,
"learning_rate": 0.00010836333432539272,
"loss": 0.6302,
"step": 276
},
{
"epoch": 1.5919540229885056,
"grad_norm": 0.07640023529529572,
"learning_rate": 0.00010769564926040769,
"loss": 0.618,
"step": 277
},
{
"epoch": 1.5977011494252875,
"grad_norm": 0.0787947028875351,
"learning_rate": 0.0001070276188945293,
"loss": 0.6308,
"step": 278
},
{
"epoch": 1.603448275862069,
"grad_norm": 0.08764500916004181,
"learning_rate": 0.00010635927320202928,
"loss": 0.6316,
"step": 279
},
{
"epoch": 1.6091954022988506,
"grad_norm": 0.07885821908712387,
"learning_rate": 0.00010569064217132791,
"loss": 0.6232,
"step": 280
},
{
"epoch": 1.6091954022988506,
"eval_loss": 0.6484516859054565,
"eval_runtime": 406.5349,
"eval_samples_per_second": 24.347,
"eval_steps_per_second": 0.381,
"step": 280
},
{
"epoch": 1.6149425287356323,
"grad_norm": 0.08910427987575531,
"learning_rate": 0.00010502175580364857,
"loss": 0.6207,
"step": 281
},
{
"epoch": 1.6206896551724137,
"grad_norm": 0.08195802569389343,
"learning_rate": 0.00010435264411167148,
"loss": 0.6604,
"step": 282
},
{
"epoch": 1.6264367816091954,
"grad_norm": 0.09276524186134338,
"learning_rate": 0.0001036833371181871,
"loss": 0.6444,
"step": 283
},
{
"epoch": 1.632183908045977,
"grad_norm": 0.07577691972255707,
"learning_rate": 0.00010301386485474889,
"loss": 0.6439,
"step": 284
},
{
"epoch": 1.6379310344827587,
"grad_norm": 0.07871613651514053,
"learning_rate": 0.00010234425736032607,
"loss": 0.639,
"step": 285
},
{
"epoch": 1.6436781609195403,
"grad_norm": 0.07570876181125641,
"learning_rate": 0.00010167454467995549,
"loss": 0.6056,
"step": 286
},
{
"epoch": 1.6494252873563218,
"grad_norm": 0.09836837649345398,
"learning_rate": 0.00010100475686339379,
"loss": 0.6341,
"step": 287
},
{
"epoch": 1.6551724137931034,
"grad_norm": 0.08796896785497665,
"learning_rate": 0.00010033492396376878,
"loss": 0.6193,
"step": 288
},
{
"epoch": 1.660919540229885,
"grad_norm": 0.07815764099359512,
"learning_rate": 9.966507603623125e-05,
"loss": 0.6227,
"step": 289
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.13016292452812195,
"learning_rate": 9.899524313660624e-05,
"loss": 0.6243,
"step": 290
},
{
"epoch": 1.6666666666666665,
"eval_loss": 0.6477526426315308,
"eval_runtime": 405.0855,
"eval_samples_per_second": 24.434,
"eval_steps_per_second": 0.383,
"step": 290
},
{
"epoch": 1.6724137931034484,
"grad_norm": 0.09747885912656784,
"learning_rate": 9.832545532004454e-05,
"loss": 0.6328,
"step": 291
},
{
"epoch": 1.6781609195402298,
"grad_norm": 0.10131366550922394,
"learning_rate": 9.765574263967396e-05,
"loss": 0.6212,
"step": 292
},
{
"epoch": 1.6839080459770115,
"grad_norm": 0.1203976571559906,
"learning_rate": 9.698613514525116e-05,
"loss": 0.6563,
"step": 293
},
{
"epoch": 1.6896551724137931,
"grad_norm": 0.07119957357645035,
"learning_rate": 9.631666288181293e-05,
"loss": 0.6278,
"step": 294
},
{
"epoch": 1.6954022988505746,
"grad_norm": 0.11370845884084702,
"learning_rate": 9.564735588832856e-05,
"loss": 0.6376,
"step": 295
},
{
"epoch": 1.7011494252873565,
"grad_norm": 0.07851264625787735,
"learning_rate": 9.497824419635144e-05,
"loss": 0.6149,
"step": 296
},
{
"epoch": 1.706896551724138,
"grad_norm": 0.0818655788898468,
"learning_rate": 9.430935782867212e-05,
"loss": 0.6048,
"step": 297
},
{
"epoch": 1.7126436781609196,
"grad_norm": 0.07335007190704346,
"learning_rate": 9.364072679797073e-05,
"loss": 0.6292,
"step": 298
},
{
"epoch": 1.7183908045977012,
"grad_norm": 0.07759315520524979,
"learning_rate": 9.297238110547074e-05,
"loss": 0.6464,
"step": 299
},
{
"epoch": 1.7241379310344827,
"grad_norm": 0.0833640992641449,
"learning_rate": 9.230435073959232e-05,
"loss": 0.6467,
"step": 300
},
{
"epoch": 1.7241379310344827,
"eval_loss": 0.6469475030899048,
"eval_runtime": 408.9385,
"eval_samples_per_second": 24.204,
"eval_steps_per_second": 0.379,
"step": 300
},
{
"epoch": 1.7298850574712645,
"grad_norm": 0.08030898869037628,
"learning_rate": 9.163666567460733e-05,
"loss": 0.6268,
"step": 301
},
{
"epoch": 1.735632183908046,
"grad_norm": 0.08017026633024216,
"learning_rate": 9.096935586929392e-05,
"loss": 0.6367,
"step": 302
},
{
"epoch": 1.7413793103448276,
"grad_norm": 0.07945988327264786,
"learning_rate": 9.030245126559262e-05,
"loss": 0.6318,
"step": 303
},
{
"epoch": 1.7471264367816093,
"grad_norm": 0.09426795691251755,
"learning_rate": 8.963598178726254e-05,
"loss": 0.6399,
"step": 304
},
{
"epoch": 1.7528735632183907,
"grad_norm": 0.08182523399591446,
"learning_rate": 8.896997733853903e-05,
"loss": 0.6203,
"step": 305
},
{
"epoch": 1.7586206896551724,
"grad_norm": 0.07778620719909668,
"learning_rate": 8.830446780279176e-05,
"loss": 0.6816,
"step": 306
},
{
"epoch": 1.764367816091954,
"grad_norm": 0.11482707411050797,
"learning_rate": 8.763948304118368e-05,
"loss": 0.6442,
"step": 307
},
{
"epoch": 1.7701149425287355,
"grad_norm": 0.07546856999397278,
"learning_rate": 8.697505289133145e-05,
"loss": 0.6445,
"step": 308
},
{
"epoch": 1.7758620689655173,
"grad_norm": 0.11665278673171997,
"learning_rate": 8.631120716596641e-05,
"loss": 0.6374,
"step": 309
},
{
"epoch": 1.7816091954022988,
"grad_norm": 0.1181105300784111,
"learning_rate": 8.564797565159714e-05,
"loss": 0.6146,
"step": 310
},
{
"epoch": 1.7816091954022988,
"eval_loss": 0.6459708213806152,
"eval_runtime": 405.0602,
"eval_samples_per_second": 24.436,
"eval_steps_per_second": 0.383,
"step": 310
},
{
"epoch": 1.7873563218390804,
"grad_norm": 0.07805997133255005,
"learning_rate": 8.498538810717267e-05,
"loss": 0.6679,
"step": 311
},
{
"epoch": 1.793103448275862,
"grad_norm": 0.08421120047569275,
"learning_rate": 8.432347426274739e-05,
"loss": 0.642,
"step": 312
},
{
"epoch": 1.7988505747126435,
"grad_norm": 0.10425391793251038,
"learning_rate": 8.366226381814697e-05,
"loss": 0.6354,
"step": 313
},
{
"epoch": 1.8045977011494254,
"grad_norm": 0.08861584216356277,
"learning_rate": 8.300178644163594e-05,
"loss": 0.6397,
"step": 314
},
{
"epoch": 1.8103448275862069,
"grad_norm": 0.08726219832897186,
"learning_rate": 8.234207176858614e-05,
"loss": 0.6474,
"step": 315
},
{
"epoch": 1.8160919540229885,
"grad_norm": 0.12218604981899261,
"learning_rate": 8.16831494001474e-05,
"loss": 0.6459,
"step": 316
},
{
"epoch": 1.8218390804597702,
"grad_norm": 0.08113615214824677,
"learning_rate": 8.102504890191892e-05,
"loss": 0.6114,
"step": 317
},
{
"epoch": 1.8275862068965516,
"grad_norm": 0.08763635903596878,
"learning_rate": 8.036779980262311e-05,
"loss": 0.6602,
"step": 318
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.1053246557712555,
"learning_rate": 7.971143159278026e-05,
"loss": 0.6182,
"step": 319
},
{
"epoch": 1.839080459770115,
"grad_norm": 0.09522312134504318,
"learning_rate": 7.905597372338558e-05,
"loss": 0.6386,
"step": 320
},
{
"epoch": 1.839080459770115,
"eval_loss": 0.6449984908103943,
"eval_runtime": 405.9165,
"eval_samples_per_second": 24.384,
"eval_steps_per_second": 0.382,
"step": 320
},
{
"epoch": 1.8448275862068966,
"grad_norm": 0.09493348747491837,
"learning_rate": 7.840145560458756e-05,
"loss": 0.6522,
"step": 321
},
{
"epoch": 1.8505747126436782,
"grad_norm": 0.10554379224777222,
"learning_rate": 7.774790660436858e-05,
"loss": 0.6401,
"step": 322
},
{
"epoch": 1.8563218390804597,
"grad_norm": 0.09237196296453476,
"learning_rate": 7.709535604722684e-05,
"loss": 0.6315,
"step": 323
},
{
"epoch": 1.8620689655172413,
"grad_norm": 0.07175464183092117,
"learning_rate": 7.644383321286094e-05,
"loss": 0.6559,
"step": 324
},
{
"epoch": 1.867816091954023,
"grad_norm": 0.08578918129205704,
"learning_rate": 7.579336733485584e-05,
"loss": 0.6297,
"step": 325
},
{
"epoch": 1.8735632183908046,
"grad_norm": 0.14390091598033905,
"learning_rate": 7.514398759937135e-05,
"loss": 0.6155,
"step": 326
},
{
"epoch": 1.8793103448275863,
"grad_norm": 0.07774030417203903,
"learning_rate": 7.449572314383237e-05,
"loss": 0.6551,
"step": 327
},
{
"epoch": 1.8850574712643677,
"grad_norm": 0.07927459478378296,
"learning_rate": 7.384860305562172e-05,
"loss": 0.6312,
"step": 328
},
{
"epoch": 1.8908045977011494,
"grad_norm": 0.11287631094455719,
"learning_rate": 7.320265637077473e-05,
"loss": 0.66,
"step": 329
},
{
"epoch": 1.896551724137931,
"grad_norm": 0.09955232590436935,
"learning_rate": 7.255791207267679e-05,
"loss": 0.6456,
"step": 330
},
{
"epoch": 1.896551724137931,
"eval_loss": 0.6442980766296387,
"eval_runtime": 404.2901,
"eval_samples_per_second": 24.482,
"eval_steps_per_second": 0.383,
"step": 330
},
{
"epoch": 1.9022988505747125,
"grad_norm": 0.07881880551576614,
"learning_rate": 7.191439909076243e-05,
"loss": 0.6398,
"step": 331
},
{
"epoch": 1.9080459770114944,
"grad_norm": 0.15244217216968536,
"learning_rate": 7.127214629921765e-05,
"loss": 0.6614,
"step": 332
},
{
"epoch": 1.9137931034482758,
"grad_norm": 0.07337264716625214,
"learning_rate": 7.0631182515684e-05,
"loss": 0.6294,
"step": 333
},
{
"epoch": 1.9195402298850575,
"grad_norm": 0.07102935016155243,
"learning_rate": 6.999153649996595e-05,
"loss": 0.6237,
"step": 334
},
{
"epoch": 1.9252873563218391,
"grad_norm": 0.09349462389945984,
"learning_rate": 6.935323695274002e-05,
"loss": 0.6051,
"step": 335
},
{
"epoch": 1.9310344827586206,
"grad_norm": 0.0851803794503212,
"learning_rate": 6.871631251426728e-05,
"loss": 0.6548,
"step": 336
},
{
"epoch": 1.9367816091954024,
"grad_norm": 0.08571562170982361,
"learning_rate": 6.808079176310827e-05,
"loss": 0.6136,
"step": 337
},
{
"epoch": 1.9425287356321839,
"grad_norm": 0.0772768035531044,
"learning_rate": 6.744670321484043e-05,
"loss": 0.6668,
"step": 338
},
{
"epoch": 1.9482758620689655,
"grad_norm": 0.08812547475099564,
"learning_rate": 6.681407532077895e-05,
"loss": 0.6427,
"step": 339
},
{
"epoch": 1.9540229885057472,
"grad_norm": 0.09011583775281906,
"learning_rate": 6.618293646669986e-05,
"loss": 0.6402,
"step": 340
},
{
"epoch": 1.9540229885057472,
"eval_loss": 0.6436823606491089,
"eval_runtime": 413.0204,
"eval_samples_per_second": 23.965,
"eval_steps_per_second": 0.375,
"step": 340
},
{
"epoch": 1.9597701149425286,
"grad_norm": 0.08234158158302307,
"learning_rate": 6.555331497156672e-05,
"loss": 0.6362,
"step": 341
},
{
"epoch": 1.9655172413793105,
"grad_norm": 0.0780014768242836,
"learning_rate": 6.492523908625959e-05,
"loss": 0.6454,
"step": 342
},
{
"epoch": 1.971264367816092,
"grad_norm": 0.08458276093006134,
"learning_rate": 6.42987369923077e-05,
"loss": 0.6587,
"step": 343
},
{
"epoch": 1.9770114942528736,
"grad_norm": 0.11979149281978607,
"learning_rate": 6.367383680062478e-05,
"loss": 0.6369,
"step": 344
},
{
"epoch": 1.9827586206896552,
"grad_norm": 0.08782167732715607,
"learning_rate": 6.30505665502479e-05,
"loss": 0.6382,
"step": 345
},
{
"epoch": 1.9885057471264367,
"grad_norm": 0.07542918622493744,
"learning_rate": 6.242895420707917e-05,
"loss": 0.6238,
"step": 346
},
{
"epoch": 1.9942528735632183,
"grad_norm": 0.09390002489089966,
"learning_rate": 6.180902766263113e-05,
"loss": 0.632,
"step": 347
},
{
"epoch": 2.0,
"grad_norm": 0.10154885053634644,
"learning_rate": 6.119081473277501e-05,
"loss": 0.6078,
"step": 348
},
{
"epoch": 2.0057471264367814,
"grad_norm": 0.09035320580005646,
"learning_rate": 6.057434315649304e-05,
"loss": 0.6331,
"step": 349
},
{
"epoch": 2.0114942528735633,
"grad_norm": 0.1151895746588707,
"learning_rate": 5.99596405946333e-05,
"loss": 0.6455,
"step": 350
},
{
"epoch": 2.0114942528735633,
"eval_loss": 0.6433547139167786,
"eval_runtime": 409.0063,
"eval_samples_per_second": 24.2,
"eval_steps_per_second": 0.379,
"step": 350
},
{
"epoch": 2.0172413793103448,
"grad_norm": 0.10666079819202423,
"learning_rate": 5.9346734628669065e-05,
"loss": 0.6473,
"step": 351
},
{
"epoch": 2.0229885057471266,
"grad_norm": 0.09095422178506851,
"learning_rate": 5.873565275946088e-05,
"loss": 0.6335,
"step": 352
},
{
"epoch": 2.028735632183908,
"grad_norm": 0.09256957471370697,
"learning_rate": 5.8126422406022885e-05,
"loss": 0.5969,
"step": 353
},
{
"epoch": 2.0344827586206895,
"grad_norm": 0.1397576928138733,
"learning_rate": 5.7519070904292247e-05,
"loss": 0.5919,
"step": 354
},
{
"epoch": 2.0402298850574714,
"grad_norm": 0.0867573469877243,
"learning_rate": 5.691362550590297e-05,
"loss": 0.5909,
"step": 355
},
{
"epoch": 2.045977011494253,
"grad_norm": 0.07953327894210815,
"learning_rate": 5.631011337696271e-05,
"loss": 0.5959,
"step": 356
},
{
"epoch": 2.0517241379310347,
"grad_norm": 0.09324570745229721,
"learning_rate": 5.570856159683418e-05,
"loss": 0.6216,
"step": 357
},
{
"epoch": 2.057471264367816,
"grad_norm": 0.10510014742612839,
"learning_rate": 5.510899715691984e-05,
"loss": 0.6172,
"step": 358
},
{
"epoch": 2.0632183908045976,
"grad_norm": 0.08669542521238327,
"learning_rate": 5.451144695945116e-05,
"loss": 0.5931,
"step": 359
},
{
"epoch": 2.0689655172413794,
"grad_norm": 0.09054102748632431,
"learning_rate": 5.3915937816281095e-05,
"loss": 0.5888,
"step": 360
},
{
"epoch": 2.0689655172413794,
"eval_loss": 0.643742024898529,
"eval_runtime": 404.2471,
"eval_samples_per_second": 24.485,
"eval_steps_per_second": 0.383,
"step": 360
},
{
"epoch": 2.074712643678161,
"grad_norm": 0.11839323490858078,
"learning_rate": 5.3322496447681414e-05,
"loss": 0.6093,
"step": 361
},
{
"epoch": 2.0804597701149423,
"grad_norm": 0.1050933375954628,
"learning_rate": 5.273114948114346e-05,
"loss": 0.6247,
"step": 362
},
{
"epoch": 2.086206896551724,
"grad_norm": 0.09781333059072495,
"learning_rate": 5.214192345018374e-05,
"loss": 0.6274,
"step": 363
},
{
"epoch": 2.0919540229885056,
"grad_norm": 0.09329628199338913,
"learning_rate": 5.1554844793153e-05,
"loss": 0.6243,
"step": 364
},
{
"epoch": 2.0977011494252875,
"grad_norm": 0.08716364949941635,
"learning_rate": 5.096993985205023e-05,
"loss": 0.6149,
"step": 365
},
{
"epoch": 2.103448275862069,
"grad_norm": 0.09969545155763626,
"learning_rate": 5.0387234871340486e-05,
"loss": 0.635,
"step": 366
},
{
"epoch": 2.1091954022988504,
"grad_norm": 0.10841623693704605,
"learning_rate": 4.980675599677757e-05,
"loss": 0.6544,
"step": 367
},
{
"epoch": 2.1149425287356323,
"grad_norm": 0.07902085781097412,
"learning_rate": 4.9228529274230695e-05,
"loss": 0.6144,
"step": 368
},
{
"epoch": 2.1206896551724137,
"grad_norm": 0.11440268158912659,
"learning_rate": 4.865258064851579e-05,
"loss": 0.6217,
"step": 369
},
{
"epoch": 2.1264367816091956,
"grad_norm": 0.09594007581472397,
"learning_rate": 4.807893596223152e-05,
"loss": 0.6267,
"step": 370
},
{
"epoch": 2.1264367816091956,
"eval_loss": 0.6434890031814575,
"eval_runtime": 404.1508,
"eval_samples_per_second": 24.491,
"eval_steps_per_second": 0.384,
"step": 370
},
{
"epoch": 2.132183908045977,
"grad_norm": 0.09025128185749054,
"learning_rate": 4.75076209545996e-05,
"loss": 0.6122,
"step": 371
},
{
"epoch": 2.1379310344827585,
"grad_norm": 0.09677668660879135,
"learning_rate": 4.693866126030995e-05,
"loss": 0.6339,
"step": 372
},
{
"epoch": 2.1436781609195403,
"grad_norm": 0.08178266882896423,
"learning_rate": 4.637208240837042e-05,
"loss": 0.6392,
"step": 373
},
{
"epoch": 2.1494252873563218,
"grad_norm": 0.10616466403007507,
"learning_rate": 4.5807909820961494e-05,
"loss": 0.6207,
"step": 374
},
{
"epoch": 2.1551724137931036,
"grad_norm": 0.08333076536655426,
"learning_rate": 4.5246168812295286e-05,
"loss": 0.6148,
"step": 375
},
{
"epoch": 2.160919540229885,
"grad_norm": 0.1016552671790123,
"learning_rate": 4.468688458748006e-05,
"loss": 0.6306,
"step": 376
},
{
"epoch": 2.1666666666666665,
"grad_norm": 0.08546506613492966,
"learning_rate": 4.413008224138897e-05,
"loss": 0.606,
"step": 377
},
{
"epoch": 2.1724137931034484,
"grad_norm": 0.08369904011487961,
"learning_rate": 4.357578675753432e-05,
"loss": 0.6007,
"step": 378
},
{
"epoch": 2.17816091954023,
"grad_norm": 0.08523935824632645,
"learning_rate": 4.302402300694636e-05,
"loss": 0.5884,
"step": 379
},
{
"epoch": 2.1839080459770113,
"grad_norm": 0.0944519191980362,
"learning_rate": 4.247481574705744e-05,
"loss": 0.6292,
"step": 380
},
{
"epoch": 2.1839080459770113,
"eval_loss": 0.6433520913124084,
"eval_runtime": 404.2218,
"eval_samples_per_second": 24.487,
"eval_steps_per_second": 0.383,
"step": 380
},
{
"epoch": 2.189655172413793,
"grad_norm": 0.11311980336904526,
"learning_rate": 4.1928189620591116e-05,
"loss": 0.6103,
"step": 381
},
{
"epoch": 2.1954022988505746,
"grad_norm": 0.08662451803684235,
"learning_rate": 4.138416915445655e-05,
"loss": 0.5852,
"step": 382
},
{
"epoch": 2.2011494252873565,
"grad_norm": 0.09417479485273361,
"learning_rate": 4.084277875864776e-05,
"loss": 0.6467,
"step": 383
},
{
"epoch": 2.206896551724138,
"grad_norm": 0.09818896651268005,
"learning_rate": 4.030404272514864e-05,
"loss": 0.6112,
"step": 384
},
{
"epoch": 2.2126436781609193,
"grad_norm": 0.08806431293487549,
"learning_rate": 3.9767985226842696e-05,
"loss": 0.5822,
"step": 385
},
{
"epoch": 2.218390804597701,
"grad_norm": 0.0837361216545105,
"learning_rate": 3.923463031642872e-05,
"loss": 0.6137,
"step": 386
},
{
"epoch": 2.2241379310344827,
"grad_norm": 0.10712449252605438,
"learning_rate": 3.870400192534128e-05,
"loss": 0.602,
"step": 387
},
{
"epoch": 2.2298850574712645,
"grad_norm": 0.11590448766946793,
"learning_rate": 3.81761238626771e-05,
"loss": 0.6215,
"step": 388
},
{
"epoch": 2.235632183908046,
"grad_norm": 0.08264652639627457,
"learning_rate": 3.7651019814126654e-05,
"loss": 0.6002,
"step": 389
},
{
"epoch": 2.2413793103448274,
"grad_norm": 0.08986306935548782,
"learning_rate": 3.7128713340911535e-05,
"loss": 0.6058,
"step": 390
},
{
"epoch": 2.2413793103448274,
"eval_loss": 0.6431533098220825,
"eval_runtime": 419.2567,
"eval_samples_per_second": 23.608,
"eval_steps_per_second": 0.37,
"step": 390
},
{
"epoch": 2.2471264367816093,
"grad_norm": 0.3949902057647705,
"learning_rate": 3.660922787872706e-05,
"loss": 0.643,
"step": 391
},
{
"epoch": 2.2528735632183907,
"grad_norm": 0.09183293581008911,
"learning_rate": 3.609258673669097e-05,
"loss": 0.5931,
"step": 392
},
{
"epoch": 2.2586206896551726,
"grad_norm": 0.0786626785993576,
"learning_rate": 3.557881309629729e-05,
"loss": 0.5795,
"step": 393
},
{
"epoch": 2.264367816091954,
"grad_norm": 0.08318330347537994,
"learning_rate": 3.5067930010376484e-05,
"loss": 0.6173,
"step": 394
},
{
"epoch": 2.2701149425287355,
"grad_norm": 0.09149078279733658,
"learning_rate": 3.455996040206076e-05,
"loss": 0.6238,
"step": 395
},
{
"epoch": 2.2758620689655173,
"grad_norm": 0.09578599780797958,
"learning_rate": 3.4054927063755796e-05,
"loss": 0.6264,
"step": 396
},
{
"epoch": 2.281609195402299,
"grad_norm": 0.08735264092683792,
"learning_rate": 3.355285265611784e-05,
"loss": 0.6269,
"step": 397
},
{
"epoch": 2.2873563218390807,
"grad_norm": 0.0886816754937172,
"learning_rate": 3.305375970703711e-05,
"loss": 0.6043,
"step": 398
},
{
"epoch": 2.293103448275862,
"grad_norm": 0.07559609413146973,
"learning_rate": 3.2557670610626925e-05,
"loss": 0.6416,
"step": 399
},
{
"epoch": 2.2988505747126435,
"grad_norm": 0.11379113793373108,
"learning_rate": 3.206460762621873e-05,
"loss": 0.6221,
"step": 400
},
{
"epoch": 2.2988505747126435,
"eval_loss": 0.6427375078201294,
"eval_runtime": 405.8229,
"eval_samples_per_second": 24.39,
"eval_steps_per_second": 0.382,
"step": 400
},
{
"epoch": 2.3045977011494254,
"grad_norm": 0.08930199593305588,
"learning_rate": 3.157459287736362e-05,
"loss": 0.599,
"step": 401
},
{
"epoch": 2.310344827586207,
"grad_norm": 0.11189960688352585,
"learning_rate": 3.108764835083938e-05,
"loss": 0.6243,
"step": 402
},
{
"epoch": 2.3160919540229887,
"grad_norm": 0.0793476328253746,
"learning_rate": 3.0603795895664124e-05,
"loss": 0.615,
"step": 403
},
{
"epoch": 2.32183908045977,
"grad_norm": 0.0860418751835823,
"learning_rate": 3.0123057222115836e-05,
"loss": 0.5968,
"step": 404
},
{
"epoch": 2.3275862068965516,
"grad_norm": 0.08753317594528198,
"learning_rate": 2.964545390075841e-05,
"loss": 0.6192,
"step": 405
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.09598301351070404,
"learning_rate": 2.9171007361473514e-05,
"loss": 0.6237,
"step": 406
},
{
"epoch": 2.339080459770115,
"grad_norm": 0.10627751052379608,
"learning_rate": 2.8699738892499328e-05,
"loss": 0.6123,
"step": 407
},
{
"epoch": 2.344827586206897,
"grad_norm": 0.08839675039052963,
"learning_rate": 2.8231669639475067e-05,
"loss": 0.6123,
"step": 408
},
{
"epoch": 2.3505747126436782,
"grad_norm": 0.08533503860235214,
"learning_rate": 2.776682060449247e-05,
"loss": 0.6251,
"step": 409
},
{
"epoch": 2.3563218390804597,
"grad_norm": 0.10517686605453491,
"learning_rate": 2.7305212645153212e-05,
"loss": 0.6254,
"step": 410
},
{
"epoch": 2.3563218390804597,
"eval_loss": 0.6428195238113403,
"eval_runtime": 404.1758,
"eval_samples_per_second": 24.489,
"eval_steps_per_second": 0.383,
"step": 410
},
{
"epoch": 2.3620689655172415,
"grad_norm": 0.10578128695487976,
"learning_rate": 2.6846866473633125e-05,
"loss": 0.6216,
"step": 411
},
{
"epoch": 2.367816091954023,
"grad_norm": 0.10083532333374023,
"learning_rate": 2.6391802655752853e-05,
"loss": 0.6052,
"step": 412
},
{
"epoch": 2.3735632183908044,
"grad_norm": 0.08413968980312347,
"learning_rate": 2.594004161005511e-05,
"loss": 0.6007,
"step": 413
},
{
"epoch": 2.3793103448275863,
"grad_norm": 0.08840201050043106,
"learning_rate": 2.549160360688838e-05,
"loss": 0.5876,
"step": 414
},
{
"epoch": 2.3850574712643677,
"grad_norm": 0.09680577367544174,
"learning_rate": 2.50465087674976e-05,
"loss": 0.6183,
"step": 415
},
{
"epoch": 2.3908045977011496,
"grad_norm": 0.09196774661540985,
"learning_rate": 2.4604777063121033e-05,
"loss": 0.613,
"step": 416
},
{
"epoch": 2.396551724137931,
"grad_norm": 0.0849708616733551,
"learning_rate": 2.4166428314094514e-05,
"loss": 0.6443,
"step": 417
},
{
"epoch": 2.4022988505747125,
"grad_norm": 0.09316956251859665,
"learning_rate": 2.3731482188961818e-05,
"loss": 0.6062,
"step": 418
},
{
"epoch": 2.4080459770114944,
"grad_norm": 0.08482903987169266,
"learning_rate": 2.32999582035923e-05,
"loss": 0.6099,
"step": 419
},
{
"epoch": 2.413793103448276,
"grad_norm": 0.08352029323577881,
"learning_rate": 2.287187572030516e-05,
"loss": 0.6178,
"step": 420
},
{
"epoch": 2.413793103448276,
"eval_loss": 0.6422638297080994,
"eval_runtime": 404.4609,
"eval_samples_per_second": 24.472,
"eval_steps_per_second": 0.383,
"step": 420
},
{
"epoch": 2.4195402298850572,
"grad_norm": 0.09856913238763809,
"learning_rate": 2.244725394700079e-05,
"loss": 0.6166,
"step": 421
},
{
"epoch": 2.425287356321839,
"grad_norm": 0.10127527266740799,
"learning_rate": 2.202611193629869e-05,
"loss": 0.6195,
"step": 422
},
{
"epoch": 2.4310344827586206,
"grad_norm": 0.09415800124406815,
"learning_rate": 2.160846858468285e-05,
"loss": 0.6157,
"step": 423
},
{
"epoch": 2.4367816091954024,
"grad_norm": 0.08563528954982758,
"learning_rate": 2.1194342631653607e-05,
"loss": 0.6212,
"step": 424
},
{
"epoch": 2.442528735632184,
"grad_norm": 0.0861605629324913,
"learning_rate": 2.0783752658887066e-05,
"loss": 0.6095,
"step": 425
},
{
"epoch": 2.4482758620689653,
"grad_norm": 0.1125798374414444,
"learning_rate": 2.0376717089401164e-05,
"loss": 0.606,
"step": 426
},
{
"epoch": 2.454022988505747,
"grad_norm": 0.09633134305477142,
"learning_rate": 1.9973254186729086e-05,
"loss": 0.6109,
"step": 427
},
{
"epoch": 2.4597701149425286,
"grad_norm": 0.08123010396957397,
"learning_rate": 1.9573382054099786e-05,
"loss": 0.5896,
"step": 428
},
{
"epoch": 2.4655172413793105,
"grad_norm": 0.08620712906122208,
"learning_rate": 1.9177118633625814e-05,
"loss": 0.6022,
"step": 429
},
{
"epoch": 2.471264367816092,
"grad_norm": 0.08710537105798721,
"learning_rate": 1.8784481705498015e-05,
"loss": 0.6161,
"step": 430
},
{
"epoch": 2.471264367816092,
"eval_loss": 0.642048180103302,
"eval_runtime": 405.7821,
"eval_samples_per_second": 24.392,
"eval_steps_per_second": 0.382,
"step": 430
},
{
"epoch": 2.4770114942528734,
"grad_norm": 0.08711250126361847,
"learning_rate": 1.8395488887188005e-05,
"loss": 0.581,
"step": 431
},
{
"epoch": 2.4827586206896552,
"grad_norm": 0.08405685424804688,
"learning_rate": 1.8010157632657543e-05,
"loss": 0.6149,
"step": 432
},
{
"epoch": 2.4885057471264367,
"grad_norm": 0.08080325275659561,
"learning_rate": 1.762850523157532e-05,
"loss": 0.6264,
"step": 433
},
{
"epoch": 2.4942528735632186,
"grad_norm": 0.09836191684007645,
"learning_rate": 1.7250548808541322e-05,
"loss": 0.6055,
"step": 434
},
{
"epoch": 2.5,
"grad_norm": 0.10626177489757538,
"learning_rate": 1.687630532231833e-05,
"loss": 0.5907,
"step": 435
},
{
"epoch": 2.5057471264367814,
"grad_norm": 0.08308445662260056,
"learning_rate": 1.6505791565071138e-05,
"loss": 0.6189,
"step": 436
},
{
"epoch": 2.5114942528735633,
"grad_norm": 0.10249936580657959,
"learning_rate": 1.613902416161288e-05,
"loss": 0.6084,
"step": 437
},
{
"epoch": 2.5172413793103448,
"grad_norm": 0.08516431599855423,
"learning_rate": 1.5776019568659338e-05,
"loss": 0.624,
"step": 438
},
{
"epoch": 2.5229885057471266,
"grad_norm": 0.08852159231901169,
"learning_rate": 1.5416794074090258e-05,
"loss": 0.6374,
"step": 439
},
{
"epoch": 2.528735632183908,
"grad_norm": 0.09616044908761978,
"learning_rate": 1.5061363796218785e-05,
"loss": 0.634,
"step": 440
},
{
"epoch": 2.528735632183908,
"eval_loss": 0.6419377326965332,
"eval_runtime": 416.5131,
"eval_samples_per_second": 23.764,
"eval_steps_per_second": 0.372,
"step": 440
},
{
"epoch": 2.5344827586206895,
"grad_norm": 0.1012992411851883,
"learning_rate": 1.4709744683068039e-05,
"loss": 0.6443,
"step": 441
},
{
"epoch": 2.5402298850574714,
"grad_norm": 0.102021224796772,
"learning_rate": 1.4361952511655618e-05,
"loss": 0.6111,
"step": 442
},
{
"epoch": 2.545977011494253,
"grad_norm": 0.08464264124631882,
"learning_rate": 1.4018002887285687e-05,
"loss": 0.6007,
"step": 443
},
{
"epoch": 2.5517241379310347,
"grad_norm": 0.0829034224152565,
"learning_rate": 1.3677911242848806e-05,
"loss": 0.6083,
"step": 444
},
{
"epoch": 2.557471264367816,
"grad_norm": 0.08752921968698502,
"learning_rate": 1.334169283812936e-05,
"loss": 0.6227,
"step": 445
},
{
"epoch": 2.5632183908045976,
"grad_norm": 0.080236054956913,
"learning_rate": 1.300936275912098e-05,
"loss": 0.6212,
"step": 446
},
{
"epoch": 2.5689655172413794,
"grad_norm": 0.08524277061223984,
"learning_rate": 1.2680935917349523e-05,
"loss": 0.5915,
"step": 447
},
{
"epoch": 2.574712643678161,
"grad_norm": 0.09109287708997726,
"learning_rate": 1.2356427049204122e-05,
"loss": 0.5972,
"step": 448
},
{
"epoch": 2.5804597701149428,
"grad_norm": 0.11969230324029922,
"learning_rate": 1.2035850715275865e-05,
"loss": 0.6358,
"step": 449
},
{
"epoch": 2.586206896551724,
"grad_norm": 0.08512509614229202,
"learning_rate": 1.1719221299704497e-05,
"loss": 0.6241,
"step": 450
},
{
"epoch": 2.586206896551724,
"eval_loss": 0.641758382320404,
"eval_runtime": 404.7765,
"eval_samples_per_second": 24.453,
"eval_steps_per_second": 0.383,
"step": 450
},
{
"epoch": 2.5919540229885056,
"grad_norm": 0.08563876152038574,
"learning_rate": 1.1406553009533027e-05,
"loss": 0.6027,
"step": 451
},
{
"epoch": 2.5977011494252875,
"grad_norm": 0.07882750034332275,
"learning_rate": 1.1097859874070294e-05,
"loss": 0.6226,
"step": 452
},
{
"epoch": 2.603448275862069,
"grad_norm": 0.08562333881855011,
"learning_rate": 1.0793155744261351e-05,
"loss": 0.6145,
"step": 453
},
{
"epoch": 2.609195402298851,
"grad_norm": 0.08439898490905762,
"learning_rate": 1.0492454292066178e-05,
"loss": 0.6131,
"step": 454
},
{
"epoch": 2.6149425287356323,
"grad_norm": 0.09046713262796402,
"learning_rate": 1.019576900984599e-05,
"loss": 0.6312,
"step": 455
},
{
"epoch": 2.6206896551724137,
"grad_norm": 0.1001957505941391,
"learning_rate": 9.903113209758096e-06,
"loss": 0.6167,
"step": 456
},
{
"epoch": 2.626436781609195,
"grad_norm": 0.08048044890165329,
"learning_rate": 9.614500023158336e-06,
"loss": 0.5969,
"step": 457
},
{
"epoch": 2.632183908045977,
"grad_norm": 0.07949711382389069,
"learning_rate": 9.32994240001206e-06,
"loss": 0.6324,
"step": 458
},
{
"epoch": 2.637931034482759,
"grad_norm": 0.0978640615940094,
"learning_rate": 9.049453108312966e-06,
"loss": 0.5779,
"step": 459
},
{
"epoch": 2.6436781609195403,
"grad_norm": 0.08483273535966873,
"learning_rate": 8.773044733510338e-06,
"loss": 0.6084,
"step": 460
},
{
"epoch": 2.6436781609195403,
"eval_loss": 0.6415662169456482,
"eval_runtime": 404.188,
"eval_samples_per_second": 24.489,
"eval_steps_per_second": 0.383,
"step": 460
},
{
"epoch": 2.6494252873563218,
"grad_norm": 0.08597224205732346,
"learning_rate": 8.50072967794413e-06,
"loss": 0.5962,
"step": 461
},
{
"epoch": 2.655172413793103,
"grad_norm": 0.08336161822080612,
"learning_rate": 8.232520160288704e-06,
"loss": 0.6276,
"step": 462
},
{
"epoch": 2.660919540229885,
"grad_norm": 0.08224053680896759,
"learning_rate": 7.96842821500442e-06,
"loss": 0.6047,
"step": 463
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.08457629382610321,
"learning_rate": 7.708465691797717e-06,
"loss": 0.6006,
"step": 464
},
{
"epoch": 2.6724137931034484,
"grad_norm": 0.09363652020692825,
"learning_rate": 7.452644255089425e-06,
"loss": 0.6261,
"step": 465
},
{
"epoch": 2.67816091954023,
"grad_norm": 0.08728937804698944,
"learning_rate": 7.20097538349136e-06,
"loss": 0.6146,
"step": 466
},
{
"epoch": 2.6839080459770113,
"grad_norm": 0.08341008424758911,
"learning_rate": 6.953470369291348e-06,
"loss": 0.6237,
"step": 467
},
{
"epoch": 2.689655172413793,
"grad_norm": 0.08936601877212524,
"learning_rate": 6.710140317946423e-06,
"loss": 0.643,
"step": 468
},
{
"epoch": 2.6954022988505746,
"grad_norm": 0.09783781319856644,
"learning_rate": 6.470996147584685e-06,
"loss": 0.5764,
"step": 469
},
{
"epoch": 2.7011494252873565,
"grad_norm": 0.08959370106458664,
"learning_rate": 6.236048588515242e-06,
"loss": 0.6264,
"step": 470
},
{
"epoch": 2.7011494252873565,
"eval_loss": 0.6414589881896973,
"eval_runtime": 405.1776,
"eval_samples_per_second": 24.429,
"eval_steps_per_second": 0.383,
"step": 470
},
{
"epoch": 2.706896551724138,
"grad_norm": 0.08131396770477295,
"learning_rate": 6.0053081827469045e-06,
"loss": 0.6455,
"step": 471
},
{
"epoch": 2.7126436781609193,
"grad_norm": 0.08353292942047119,
"learning_rate": 5.778785283515053e-06,
"loss": 0.6254,
"step": 472
},
{
"epoch": 2.718390804597701,
"grad_norm": 0.0802810862660408,
"learning_rate": 5.556490054817132e-06,
"loss": 0.6284,
"step": 473
},
{
"epoch": 2.7241379310344827,
"grad_norm": 0.08118069916963577,
"learning_rate": 5.338432470956589e-06,
"loss": 0.6092,
"step": 474
},
{
"epoch": 2.7298850574712645,
"grad_norm": 0.08621113002300262,
"learning_rate": 5.1246223160953845e-06,
"loss": 0.6489,
"step": 475
},
{
"epoch": 2.735632183908046,
"grad_norm": 0.08560863137245178,
"learning_rate": 4.91506918381488e-06,
"loss": 0.6154,
"step": 476
},
{
"epoch": 2.7413793103448274,
"grad_norm": 0.081720270216465,
"learning_rate": 4.7097824766854756e-06,
"loss": 0.6232,
"step": 477
},
{
"epoch": 2.7471264367816093,
"grad_norm": 0.08384092152118683,
"learning_rate": 4.508771405844636e-06,
"loss": 0.6209,
"step": 478
},
{
"epoch": 2.7528735632183907,
"grad_norm": 0.08142372965812683,
"learning_rate": 4.312044990583675e-06,
"loss": 0.6298,
"step": 479
},
{
"epoch": 2.7586206896551726,
"grad_norm": 0.0810447633266449,
"learning_rate": 4.119612057942978e-06,
"loss": 0.608,
"step": 480
},
{
"epoch": 2.7586206896551726,
"eval_loss": 0.6413341164588928,
"eval_runtime": 410.6577,
"eval_samples_per_second": 24.103,
"eval_steps_per_second": 0.377,
"step": 480
},
{
"epoch": 2.764367816091954,
"grad_norm": 0.08321461826562881,
"learning_rate": 3.931481242315993e-06,
"loss": 0.6426,
"step": 481
},
{
"epoch": 2.7701149425287355,
"grad_norm": 0.0784662514925003,
"learning_rate": 3.747660985061785e-06,
"loss": 0.6126,
"step": 482
},
{
"epoch": 2.7758620689655173,
"grad_norm": 0.09238499402999878,
"learning_rate": 3.568159534126314e-06,
"loss": 0.5786,
"step": 483
},
{
"epoch": 2.781609195402299,
"grad_norm": 0.08142554014921188,
"learning_rate": 3.3929849436722728e-06,
"loss": 0.6341,
"step": 484
},
{
"epoch": 2.7873563218390807,
"grad_norm": 0.08540128916501999,
"learning_rate": 3.2221450737178083e-06,
"loss": 0.6062,
"step": 485
},
{
"epoch": 2.793103448275862,
"grad_norm": 0.08547057211399078,
"learning_rate": 3.0556475897837166e-06,
"loss": 0.5974,
"step": 486
},
{
"epoch": 2.7988505747126435,
"grad_norm": 0.1007808968424797,
"learning_rate": 2.8934999625496282e-06,
"loss": 0.6157,
"step": 487
},
{
"epoch": 2.8045977011494254,
"grad_norm": 0.08533742278814316,
"learning_rate": 2.735709467518699e-06,
"loss": 0.625,
"step": 488
},
{
"epoch": 2.810344827586207,
"grad_norm": 0.08325877785682678,
"learning_rate": 2.5822831846912033e-06,
"loss": 0.5991,
"step": 489
},
{
"epoch": 2.8160919540229887,
"grad_norm": 0.08522289991378784,
"learning_rate": 2.4332279982468453e-06,
"loss": 0.6039,
"step": 490
},
{
"epoch": 2.8160919540229887,
"eval_loss": 0.6412601470947266,
"eval_runtime": 405.8893,
"eval_samples_per_second": 24.386,
"eval_steps_per_second": 0.382,
"step": 490
},
{
"epoch": 2.82183908045977,
"grad_norm": 0.08191868662834167,
"learning_rate": 2.2885505962359054e-06,
"loss": 0.5907,
"step": 491
},
{
"epoch": 2.8275862068965516,
"grad_norm": 0.08263259381055832,
"learning_rate": 2.1482574702790803e-06,
"loss": 0.615,
"step": 492
},
{
"epoch": 2.8333333333333335,
"grad_norm": 0.08231104165315628,
"learning_rate": 2.0123549152762823e-06,
"loss": 0.6334,
"step": 493
},
{
"epoch": 2.839080459770115,
"grad_norm": 0.08760181069374084,
"learning_rate": 1.8808490291241432e-06,
"loss": 0.6186,
"step": 494
},
{
"epoch": 2.844827586206897,
"grad_norm": 0.07865423709154129,
"learning_rate": 1.7537457124423895e-06,
"loss": 0.6324,
"step": 495
},
{
"epoch": 2.8505747126436782,
"grad_norm": 0.08259916305541992,
"learning_rate": 1.631050668309131e-06,
"loss": 0.6406,
"step": 496
},
{
"epoch": 2.8563218390804597,
"grad_norm": 0.08283340930938721,
"learning_rate": 1.5127694020049432e-06,
"loss": 0.6253,
"step": 497
},
{
"epoch": 2.862068965517241,
"grad_norm": 0.0877593606710434,
"learning_rate": 1.3989072207658328e-06,
"loss": 0.6158,
"step": 498
},
{
"epoch": 2.867816091954023,
"grad_norm": 0.08183769136667252,
"learning_rate": 1.2894692335451375e-06,
"loss": 0.6091,
"step": 499
},
{
"epoch": 2.873563218390805,
"grad_norm": 0.08991672843694687,
"learning_rate": 1.1844603507842668e-06,
"loss": 0.6445,
"step": 500
},
{
"epoch": 2.873563218390805,
"eval_loss": 0.641264796257019,
"eval_runtime": 405.0206,
"eval_samples_per_second": 24.438,
"eval_steps_per_second": 0.383,
"step": 500
}
],
"logging_steps": 1.0,
"max_steps": 522,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.880737746399789e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}