Training in progress, step 200, checkpoint
{
"best_metric": 1.0081819295883179,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.47058823529411764,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002352941176470588,
"grad_norm": 3.062838554382324,
"learning_rate": 1.0100000000000002e-05,
"loss": 2.3793,
"step": 1
},
{
"epoch": 0.002352941176470588,
"eval_loss": 2.571444034576416,
"eval_runtime": 13.5133,
"eval_samples_per_second": 423.657,
"eval_steps_per_second": 13.246,
"step": 1
},
{
"epoch": 0.004705882352941176,
"grad_norm": 3.511554002761841,
"learning_rate": 2.0200000000000003e-05,
"loss": 2.4046,
"step": 2
},
{
"epoch": 0.007058823529411765,
"grad_norm": 3.7289419174194336,
"learning_rate": 3.0299999999999998e-05,
"loss": 2.4423,
"step": 3
},
{
"epoch": 0.009411764705882352,
"grad_norm": 3.826002597808838,
"learning_rate": 4.0400000000000006e-05,
"loss": 2.433,
"step": 4
},
{
"epoch": 0.011764705882352941,
"grad_norm": 3.733748435974121,
"learning_rate": 5.05e-05,
"loss": 2.3183,
"step": 5
},
{
"epoch": 0.01411764705882353,
"grad_norm": 3.9309937953948975,
"learning_rate": 6.0599999999999996e-05,
"loss": 2.1593,
"step": 6
},
{
"epoch": 0.01647058823529412,
"grad_norm": 1.619845986366272,
"learning_rate": 7.07e-05,
"loss": 1.9091,
"step": 7
},
{
"epoch": 0.018823529411764704,
"grad_norm": 1.6118720769882202,
"learning_rate": 8.080000000000001e-05,
"loss": 1.7707,
"step": 8
},
{
"epoch": 0.021176470588235293,
"grad_norm": 1.6020575761795044,
"learning_rate": 9.09e-05,
"loss": 1.6202,
"step": 9
},
{
"epoch": 0.023529411764705882,
"grad_norm": 1.5743026733398438,
"learning_rate": 0.000101,
"loss": 1.5756,
"step": 10
},
{
"epoch": 0.02588235294117647,
"grad_norm": 1.6522502899169922,
"learning_rate": 0.00010046842105263158,
"loss": 1.4119,
"step": 11
},
{
"epoch": 0.02823529411764706,
"grad_norm": 1.905711054801941,
"learning_rate": 9.993684210526315e-05,
"loss": 1.3029,
"step": 12
},
{
"epoch": 0.03058823529411765,
"grad_norm": 1.981130838394165,
"learning_rate": 9.940526315789473e-05,
"loss": 1.6009,
"step": 13
},
{
"epoch": 0.03294117647058824,
"grad_norm": 1.2570853233337402,
"learning_rate": 9.887368421052632e-05,
"loss": 1.5536,
"step": 14
},
{
"epoch": 0.03529411764705882,
"grad_norm": 0.8584917187690735,
"learning_rate": 9.83421052631579e-05,
"loss": 1.4409,
"step": 15
},
{
"epoch": 0.03764705882352941,
"grad_norm": 1.3553175926208496,
"learning_rate": 9.781052631578948e-05,
"loss": 1.3756,
"step": 16
},
{
"epoch": 0.04,
"grad_norm": 0.9470474123954773,
"learning_rate": 9.727894736842106e-05,
"loss": 1.2287,
"step": 17
},
{
"epoch": 0.042352941176470586,
"grad_norm": 0.9331281185150146,
"learning_rate": 9.674736842105263e-05,
"loss": 1.1582,
"step": 18
},
{
"epoch": 0.04470588235294118,
"grad_norm": 1.0281063318252563,
"learning_rate": 9.621578947368421e-05,
"loss": 1.3719,
"step": 19
},
{
"epoch": 0.047058823529411764,
"grad_norm": 1.2275820970535278,
"learning_rate": 9.568421052631578e-05,
"loss": 1.5648,
"step": 20
},
{
"epoch": 0.04941176470588235,
"grad_norm": 0.8868420124053955,
"learning_rate": 9.515263157894737e-05,
"loss": 1.4091,
"step": 21
},
{
"epoch": 0.05176470588235294,
"grad_norm": 0.8574855327606201,
"learning_rate": 9.462105263157895e-05,
"loss": 1.3385,
"step": 22
},
{
"epoch": 0.05411764705882353,
"grad_norm": 1.0610370635986328,
"learning_rate": 9.408947368421054e-05,
"loss": 1.2697,
"step": 23
},
{
"epoch": 0.05647058823529412,
"grad_norm": 1.0069725513458252,
"learning_rate": 9.355789473684211e-05,
"loss": 1.1621,
"step": 24
},
{
"epoch": 0.058823529411764705,
"grad_norm": 0.9563344717025757,
"learning_rate": 9.302631578947369e-05,
"loss": 1.0005,
"step": 25
},
{
"epoch": 0.0611764705882353,
"grad_norm": 0.9165502786636353,
"learning_rate": 9.249473684210526e-05,
"loss": 1.4816,
"step": 26
},
{
"epoch": 0.06352941176470588,
"grad_norm": 1.1508539915084839,
"learning_rate": 9.196315789473685e-05,
"loss": 1.4342,
"step": 27
},
{
"epoch": 0.06588235294117648,
"grad_norm": 1.090813398361206,
"learning_rate": 9.143157894736843e-05,
"loss": 1.3629,
"step": 28
},
{
"epoch": 0.06823529411764706,
"grad_norm": 0.8920236229896545,
"learning_rate": 9.09e-05,
"loss": 1.2553,
"step": 29
},
{
"epoch": 0.07058823529411765,
"grad_norm": 0.7090237736701965,
"learning_rate": 9.036842105263158e-05,
"loss": 1.1073,
"step": 30
},
{
"epoch": 0.07294117647058823,
"grad_norm": 1.0130761861801147,
"learning_rate": 8.983684210526316e-05,
"loss": 1.0312,
"step": 31
},
{
"epoch": 0.07529411764705882,
"grad_norm": 1.0755921602249146,
"learning_rate": 8.930526315789474e-05,
"loss": 1.4467,
"step": 32
},
{
"epoch": 0.07764705882352942,
"grad_norm": 0.9146537184715271,
"learning_rate": 8.877368421052632e-05,
"loss": 1.4095,
"step": 33
},
{
"epoch": 0.08,
"grad_norm": 0.6506552696228027,
"learning_rate": 8.82421052631579e-05,
"loss": 1.3034,
"step": 34
},
{
"epoch": 0.08235294117647059,
"grad_norm": 0.6594511866569519,
"learning_rate": 8.771052631578948e-05,
"loss": 1.1873,
"step": 35
},
{
"epoch": 0.08470588235294117,
"grad_norm": 0.7837107181549072,
"learning_rate": 8.717894736842105e-05,
"loss": 1.1256,
"step": 36
},
{
"epoch": 0.08705882352941176,
"grad_norm": 0.8229796290397644,
"learning_rate": 8.664736842105263e-05,
"loss": 1.0601,
"step": 37
},
{
"epoch": 0.08941176470588236,
"grad_norm": 0.6585485339164734,
"learning_rate": 8.61157894736842e-05,
"loss": 1.3792,
"step": 38
},
{
"epoch": 0.09176470588235294,
"grad_norm": 0.6591870784759521,
"learning_rate": 8.55842105263158e-05,
"loss": 1.3891,
"step": 39
},
{
"epoch": 0.09411764705882353,
"grad_norm": 0.5894742012023926,
"learning_rate": 8.505263157894737e-05,
"loss": 1.3068,
"step": 40
},
{
"epoch": 0.09647058823529411,
"grad_norm": 0.5963307023048401,
"learning_rate": 8.452105263157896e-05,
"loss": 1.1833,
"step": 41
},
{
"epoch": 0.0988235294117647,
"grad_norm": 0.7109506726264954,
"learning_rate": 8.398947368421053e-05,
"loss": 1.1566,
"step": 42
},
{
"epoch": 0.1011764705882353,
"grad_norm": 0.7429890632629395,
"learning_rate": 8.345789473684211e-05,
"loss": 1.0686,
"step": 43
},
{
"epoch": 0.10352941176470588,
"grad_norm": 0.7084085941314697,
"learning_rate": 8.292631578947368e-05,
"loss": 1.2941,
"step": 44
},
{
"epoch": 0.10588235294117647,
"grad_norm": 0.5622501373291016,
"learning_rate": 8.239473684210526e-05,
"loss": 1.3853,
"step": 45
},
{
"epoch": 0.10823529411764705,
"grad_norm": 0.5678622126579285,
"learning_rate": 8.186315789473683e-05,
"loss": 1.2757,
"step": 46
},
{
"epoch": 0.11058823529411765,
"grad_norm": 0.5646488666534424,
"learning_rate": 8.133157894736842e-05,
"loss": 1.2279,
"step": 47
},
{
"epoch": 0.11294117647058824,
"grad_norm": 0.6251313090324402,
"learning_rate": 8.080000000000001e-05,
"loss": 1.1053,
"step": 48
},
{
"epoch": 0.11529411764705882,
"grad_norm": 0.6798649430274963,
"learning_rate": 8.026842105263159e-05,
"loss": 1.0443,
"step": 49
},
{
"epoch": 0.11764705882352941,
"grad_norm": 0.7527917623519897,
"learning_rate": 7.973684210526316e-05,
"loss": 0.881,
"step": 50
},
{
"epoch": 0.11764705882352941,
"eval_loss": 1.176622986793518,
"eval_runtime": 13.5969,
"eval_samples_per_second": 421.05,
"eval_steps_per_second": 13.165,
"step": 50
},
{
"epoch": 0.12,
"grad_norm": 0.671344518661499,
"learning_rate": 7.920526315789474e-05,
"loss": 1.4253,
"step": 51
},
{
"epoch": 0.1223529411764706,
"grad_norm": 0.5880969166755676,
"learning_rate": 7.867368421052631e-05,
"loss": 1.3348,
"step": 52
},
{
"epoch": 0.12470588235294118,
"grad_norm": 0.6282343864440918,
"learning_rate": 7.814210526315789e-05,
"loss": 1.2422,
"step": 53
},
{
"epoch": 0.12705882352941175,
"grad_norm": 0.5805298089981079,
"learning_rate": 7.761052631578946e-05,
"loss": 1.1409,
"step": 54
},
{
"epoch": 0.12941176470588237,
"grad_norm": 0.6550160050392151,
"learning_rate": 7.707894736842105e-05,
"loss": 1.0416,
"step": 55
},
{
"epoch": 0.13176470588235295,
"grad_norm": 0.8028090596199036,
"learning_rate": 7.654736842105264e-05,
"loss": 0.9202,
"step": 56
},
{
"epoch": 0.13411764705882354,
"grad_norm": 0.5485410690307617,
"learning_rate": 7.601578947368422e-05,
"loss": 1.3603,
"step": 57
},
{
"epoch": 0.13647058823529412,
"grad_norm": 0.5518571138381958,
"learning_rate": 7.548421052631579e-05,
"loss": 1.293,
"step": 58
},
{
"epoch": 0.1388235294117647,
"grad_norm": 0.5716201066970825,
"learning_rate": 7.495263157894737e-05,
"loss": 1.2133,
"step": 59
},
{
"epoch": 0.1411764705882353,
"grad_norm": 0.6243718266487122,
"learning_rate": 7.442105263157894e-05,
"loss": 1.1545,
"step": 60
},
{
"epoch": 0.14352941176470588,
"grad_norm": 0.68055260181427,
"learning_rate": 7.388947368421053e-05,
"loss": 1.1208,
"step": 61
},
{
"epoch": 0.14588235294117646,
"grad_norm": 0.7047899961471558,
"learning_rate": 7.335789473684211e-05,
"loss": 0.9698,
"step": 62
},
{
"epoch": 0.14823529411764705,
"grad_norm": 0.6293048858642578,
"learning_rate": 7.282631578947368e-05,
"loss": 1.2973,
"step": 63
},
{
"epoch": 0.15058823529411763,
"grad_norm": 0.556554913520813,
"learning_rate": 7.229473684210527e-05,
"loss": 1.3389,
"step": 64
},
{
"epoch": 0.15294117647058825,
"grad_norm": 0.5991462469100952,
"learning_rate": 7.176315789473685e-05,
"loss": 1.2742,
"step": 65
},
{
"epoch": 0.15529411764705883,
"grad_norm": 0.6164782643318176,
"learning_rate": 7.123157894736842e-05,
"loss": 1.2032,
"step": 66
},
{
"epoch": 0.15764705882352942,
"grad_norm": 0.6064922213554382,
"learning_rate": 7.07e-05,
"loss": 1.056,
"step": 67
},
{
"epoch": 0.16,
"grad_norm": 0.6620113849639893,
"learning_rate": 7.016842105263159e-05,
"loss": 1.0083,
"step": 68
},
{
"epoch": 0.1623529411764706,
"grad_norm": 0.6773979067802429,
"learning_rate": 6.963684210526316e-05,
"loss": 1.1461,
"step": 69
},
{
"epoch": 0.16470588235294117,
"grad_norm": 0.5662854313850403,
"learning_rate": 6.910526315789474e-05,
"loss": 1.3559,
"step": 70
},
{
"epoch": 0.16705882352941176,
"grad_norm": 0.5798998475074768,
"learning_rate": 6.857368421052631e-05,
"loss": 1.2389,
"step": 71
},
{
"epoch": 0.16941176470588235,
"grad_norm": 0.6113397479057312,
"learning_rate": 6.80421052631579e-05,
"loss": 1.1883,
"step": 72
},
{
"epoch": 0.17176470588235293,
"grad_norm": 0.6429812908172607,
"learning_rate": 6.751052631578948e-05,
"loss": 1.0725,
"step": 73
},
{
"epoch": 0.17411764705882352,
"grad_norm": 0.669151782989502,
"learning_rate": 6.697894736842105e-05,
"loss": 0.9818,
"step": 74
},
{
"epoch": 0.17647058823529413,
"grad_norm": 0.7604007124900818,
"learning_rate": 6.644736842105264e-05,
"loss": 0.8495,
"step": 75
},
{
"epoch": 0.17882352941176471,
"grad_norm": 0.541003942489624,
"learning_rate": 6.591578947368422e-05,
"loss": 1.3588,
"step": 76
},
{
"epoch": 0.1811764705882353,
"grad_norm": 0.6061081290245056,
"learning_rate": 6.538421052631579e-05,
"loss": 1.2932,
"step": 77
},
{
"epoch": 0.18352941176470589,
"grad_norm": 0.6060153841972351,
"learning_rate": 6.485263157894737e-05,
"loss": 1.1908,
"step": 78
},
{
"epoch": 0.18588235294117647,
"grad_norm": 0.6308295726776123,
"learning_rate": 6.432105263157894e-05,
"loss": 1.1446,
"step": 79
},
{
"epoch": 0.18823529411764706,
"grad_norm": 0.6454964280128479,
"learning_rate": 6.378947368421053e-05,
"loss": 0.9889,
"step": 80
},
{
"epoch": 0.19058823529411764,
"grad_norm": 0.7258140444755554,
"learning_rate": 6.32578947368421e-05,
"loss": 0.886,
"step": 81
},
{
"epoch": 0.19294117647058823,
"grad_norm": 0.5802991986274719,
"learning_rate": 6.27263157894737e-05,
"loss": 1.3406,
"step": 82
},
{
"epoch": 0.1952941176470588,
"grad_norm": 0.6241512894630432,
"learning_rate": 6.219473684210527e-05,
"loss": 1.3188,
"step": 83
},
{
"epoch": 0.1976470588235294,
"grad_norm": 0.6080717444419861,
"learning_rate": 6.166315789473685e-05,
"loss": 1.1992,
"step": 84
},
{
"epoch": 0.2,
"grad_norm": 0.6189625263214111,
"learning_rate": 6.113157894736842e-05,
"loss": 1.1226,
"step": 85
},
{
"epoch": 0.2023529411764706,
"grad_norm": 0.6604960560798645,
"learning_rate": 6.0599999999999996e-05,
"loss": 1.0563,
"step": 86
},
{
"epoch": 0.20470588235294118,
"grad_norm": 0.7085356116294861,
"learning_rate": 6.006842105263158e-05,
"loss": 0.9438,
"step": 87
},
{
"epoch": 0.20705882352941177,
"grad_norm": 0.6556686162948608,
"learning_rate": 5.953684210526315e-05,
"loss": 1.3147,
"step": 88
},
{
"epoch": 0.20941176470588235,
"grad_norm": 0.637535572052002,
"learning_rate": 5.900526315789474e-05,
"loss": 1.2895,
"step": 89
},
{
"epoch": 0.21176470588235294,
"grad_norm": 0.5857135057449341,
"learning_rate": 5.847368421052632e-05,
"loss": 1.2138,
"step": 90
},
{
"epoch": 0.21411764705882352,
"grad_norm": 0.6470074653625488,
"learning_rate": 5.79421052631579e-05,
"loss": 1.1728,
"step": 91
},
{
"epoch": 0.2164705882352941,
"grad_norm": 0.6902785301208496,
"learning_rate": 5.7410526315789475e-05,
"loss": 1.0183,
"step": 92
},
{
"epoch": 0.2188235294117647,
"grad_norm": 0.721517026424408,
"learning_rate": 5.687894736842105e-05,
"loss": 0.9262,
"step": 93
},
{
"epoch": 0.2211764705882353,
"grad_norm": 0.7650291919708252,
"learning_rate": 5.6347368421052625e-05,
"loss": 1.1511,
"step": 94
},
{
"epoch": 0.2235294117647059,
"grad_norm": 0.6252285838127136,
"learning_rate": 5.5815789473684214e-05,
"loss": 1.3332,
"step": 95
},
{
"epoch": 0.22588235294117648,
"grad_norm": 0.6287341117858887,
"learning_rate": 5.5284210526315796e-05,
"loss": 1.1899,
"step": 96
},
{
"epoch": 0.22823529411764706,
"grad_norm": 0.7121133208274841,
"learning_rate": 5.475263157894737e-05,
"loss": 1.1943,
"step": 97
},
{
"epoch": 0.23058823529411765,
"grad_norm": 0.6616747379302979,
"learning_rate": 5.422105263157895e-05,
"loss": 1.0497,
"step": 98
},
{
"epoch": 0.23294117647058823,
"grad_norm": 0.7290382385253906,
"learning_rate": 5.368947368421053e-05,
"loss": 0.9758,
"step": 99
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.783645749092102,
"learning_rate": 5.3157894736842104e-05,
"loss": 0.8081,
"step": 100
},
{
"epoch": 0.23529411764705882,
"eval_loss": 1.0883766412734985,
"eval_runtime": 13.5365,
"eval_samples_per_second": 422.93,
"eval_steps_per_second": 13.224,
"step": 100
},
{
"epoch": 0.2376470588235294,
"grad_norm": 0.5632948279380798,
"learning_rate": 5.262631578947368e-05,
"loss": 1.2828,
"step": 101
},
{
"epoch": 0.24,
"grad_norm": 0.6145716309547424,
"learning_rate": 5.209473684210527e-05,
"loss": 1.2529,
"step": 102
},
{
"epoch": 0.24235294117647058,
"grad_norm": 0.6459245085716248,
"learning_rate": 5.1563157894736844e-05,
"loss": 1.1525,
"step": 103
},
{
"epoch": 0.2447058823529412,
"grad_norm": 0.6708313822746277,
"learning_rate": 5.1031578947368426e-05,
"loss": 1.0458,
"step": 104
},
{
"epoch": 0.24705882352941178,
"grad_norm": 0.73152756690979,
"learning_rate": 5.05e-05,
"loss": 0.9634,
"step": 105
},
{
"epoch": 0.24941176470588236,
"grad_norm": 0.7417388558387756,
"learning_rate": 4.9968421052631576e-05,
"loss": 0.8512,
"step": 106
},
{
"epoch": 0.25176470588235295,
"grad_norm": 0.61714106798172,
"learning_rate": 4.943684210526316e-05,
"loss": 1.2966,
"step": 107
},
{
"epoch": 0.2541176470588235,
"grad_norm": 0.6486982703208923,
"learning_rate": 4.890526315789474e-05,
"loss": 1.296,
"step": 108
},
{
"epoch": 0.2564705882352941,
"grad_norm": 0.6546630859375,
"learning_rate": 4.8373684210526316e-05,
"loss": 1.1757,
"step": 109
},
{
"epoch": 0.25882352941176473,
"grad_norm": 0.6595495939254761,
"learning_rate": 4.784210526315789e-05,
"loss": 1.0585,
"step": 110
},
{
"epoch": 0.2611764705882353,
"grad_norm": 0.6996961832046509,
"learning_rate": 4.731052631578947e-05,
"loss": 0.926,
"step": 111
},
{
"epoch": 0.2635294117647059,
"grad_norm": 0.7664803266525269,
"learning_rate": 4.6778947368421055e-05,
"loss": 0.9221,
"step": 112
},
{
"epoch": 0.26588235294117646,
"grad_norm": 0.7366296648979187,
"learning_rate": 4.624736842105263e-05,
"loss": 1.2654,
"step": 113
},
{
"epoch": 0.26823529411764707,
"grad_norm": 0.5936444997787476,
"learning_rate": 4.571578947368421e-05,
"loss": 1.2535,
"step": 114
},
{
"epoch": 0.27058823529411763,
"grad_norm": 0.6593197584152222,
"learning_rate": 4.518421052631579e-05,
"loss": 1.2031,
"step": 115
},
{
"epoch": 0.27294117647058824,
"grad_norm": 0.6682748198509216,
"learning_rate": 4.465263157894737e-05,
"loss": 1.083,
"step": 116
},
{
"epoch": 0.2752941176470588,
"grad_norm": 0.723254919052124,
"learning_rate": 4.412105263157895e-05,
"loss": 0.9981,
"step": 117
},
{
"epoch": 0.2776470588235294,
"grad_norm": 0.7454279661178589,
"learning_rate": 4.358947368421053e-05,
"loss": 0.858,
"step": 118
},
{
"epoch": 0.28,
"grad_norm": 0.7310999035835266,
"learning_rate": 4.30578947368421e-05,
"loss": 1.0973,
"step": 119
},
{
"epoch": 0.2823529411764706,
"grad_norm": 0.6533228158950806,
"learning_rate": 4.2526315789473685e-05,
"loss": 1.2816,
"step": 120
},
{
"epoch": 0.2847058823529412,
"grad_norm": 0.6907062530517578,
"learning_rate": 4.199473684210527e-05,
"loss": 1.1821,
"step": 121
},
{
"epoch": 0.28705882352941176,
"grad_norm": 0.655579686164856,
"learning_rate": 4.146315789473684e-05,
"loss": 1.0816,
"step": 122
},
{
"epoch": 0.28941176470588237,
"grad_norm": 0.7422165274620056,
"learning_rate": 4.093157894736842e-05,
"loss": 1.0255,
"step": 123
},
{
"epoch": 0.2917647058823529,
"grad_norm": 0.7321489453315735,
"learning_rate": 4.0400000000000006e-05,
"loss": 0.9031,
"step": 124
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.8530080318450928,
"learning_rate": 3.986842105263158e-05,
"loss": 0.7604,
"step": 125
},
{
"epoch": 0.2964705882352941,
"grad_norm": 0.6039284467697144,
"learning_rate": 3.933684210526316e-05,
"loss": 1.3394,
"step": 126
},
{
"epoch": 0.2988235294117647,
"grad_norm": 0.6608713865280151,
"learning_rate": 3.880526315789473e-05,
"loss": 1.2596,
"step": 127
},
{
"epoch": 0.30117647058823527,
"grad_norm": 0.6703007221221924,
"learning_rate": 3.827368421052632e-05,
"loss": 1.1305,
"step": 128
},
{
"epoch": 0.3035294117647059,
"grad_norm": 0.7194546461105347,
"learning_rate": 3.7742105263157896e-05,
"loss": 1.0206,
"step": 129
},
{
"epoch": 0.3058823529411765,
"grad_norm": 0.7508780360221863,
"learning_rate": 3.721052631578947e-05,
"loss": 0.9207,
"step": 130
},
{
"epoch": 0.30823529411764705,
"grad_norm": 0.747114360332489,
"learning_rate": 3.6678947368421054e-05,
"loss": 0.7736,
"step": 131
},
{
"epoch": 0.31058823529411766,
"grad_norm": 0.6456690430641174,
"learning_rate": 3.6147368421052636e-05,
"loss": 1.2843,
"step": 132
},
{
"epoch": 0.3129411764705882,
"grad_norm": 0.6609508991241455,
"learning_rate": 3.561578947368421e-05,
"loss": 1.2404,
"step": 133
},
{
"epoch": 0.31529411764705884,
"grad_norm": 0.6665840744972229,
"learning_rate": 3.508421052631579e-05,
"loss": 1.119,
"step": 134
},
{
"epoch": 0.3176470588235294,
"grad_norm": 0.6979455351829529,
"learning_rate": 3.455263157894737e-05,
"loss": 1.0221,
"step": 135
},
{
"epoch": 0.32,
"grad_norm": 0.7018398642539978,
"learning_rate": 3.402105263157895e-05,
"loss": 0.9381,
"step": 136
},
{
"epoch": 0.32235294117647056,
"grad_norm": 0.8422653675079346,
"learning_rate": 3.3489473684210526e-05,
"loss": 0.841,
"step": 137
},
{
"epoch": 0.3247058823529412,
"grad_norm": 0.7037672400474548,
"learning_rate": 3.295789473684211e-05,
"loss": 1.2042,
"step": 138
},
{
"epoch": 0.3270588235294118,
"grad_norm": 0.6488674283027649,
"learning_rate": 3.242631578947368e-05,
"loss": 1.2254,
"step": 139
},
{
"epoch": 0.32941176470588235,
"grad_norm": 0.6584794521331787,
"learning_rate": 3.1894736842105265e-05,
"loss": 1.1519,
"step": 140
},
{
"epoch": 0.33176470588235296,
"grad_norm": 0.7326436638832092,
"learning_rate": 3.136315789473685e-05,
"loss": 1.0655,
"step": 141
},
{
"epoch": 0.3341176470588235,
"grad_norm": 0.7417482137680054,
"learning_rate": 3.083157894736842e-05,
"loss": 0.9448,
"step": 142
},
{
"epoch": 0.33647058823529413,
"grad_norm": 0.7665135264396667,
"learning_rate": 3.0299999999999998e-05,
"loss": 0.8526,
"step": 143
},
{
"epoch": 0.3388235294117647,
"grad_norm": 0.7983363270759583,
"learning_rate": 2.9768421052631577e-05,
"loss": 1.1367,
"step": 144
},
{
"epoch": 0.3411764705882353,
"grad_norm": 0.6683626770973206,
"learning_rate": 2.923684210526316e-05,
"loss": 1.2683,
"step": 145
},
{
"epoch": 0.34352941176470586,
"grad_norm": 0.6849150657653809,
"learning_rate": 2.8705263157894737e-05,
"loss": 1.1433,
"step": 146
},
{
"epoch": 0.3458823529411765,
"grad_norm": 0.7283281683921814,
"learning_rate": 2.8173684210526313e-05,
"loss": 1.0574,
"step": 147
},
{
"epoch": 0.34823529411764703,
"grad_norm": 0.7541592121124268,
"learning_rate": 2.7642105263157898e-05,
"loss": 0.9838,
"step": 148
},
{
"epoch": 0.35058823529411764,
"grad_norm": 0.7645028233528137,
"learning_rate": 2.7110526315789473e-05,
"loss": 0.9245,
"step": 149
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.8516260981559753,
"learning_rate": 2.6578947368421052e-05,
"loss": 0.7641,
"step": 150
},
{
"epoch": 0.35294117647058826,
"eval_loss": 1.0314304828643799,
"eval_runtime": 13.5252,
"eval_samples_per_second": 423.285,
"eval_steps_per_second": 13.235,
"step": 150
},
{
"epoch": 0.3552941176470588,
"grad_norm": 0.6219114661216736,
"learning_rate": 2.6047368421052634e-05,
"loss": 1.2679,
"step": 151
},
{
"epoch": 0.35764705882352943,
"grad_norm": 0.6947327852249146,
"learning_rate": 2.5515789473684213e-05,
"loss": 1.1812,
"step": 152
},
{
"epoch": 0.36,
"grad_norm": 0.6886230707168579,
"learning_rate": 2.4984210526315788e-05,
"loss": 1.0995,
"step": 153
},
{
"epoch": 0.3623529411764706,
"grad_norm": 0.7634572982788086,
"learning_rate": 2.445263157894737e-05,
"loss": 1.026,
"step": 154
},
{
"epoch": 0.36470588235294116,
"grad_norm": 0.7814245223999023,
"learning_rate": 2.3921052631578946e-05,
"loss": 0.9492,
"step": 155
},
{
"epoch": 0.36705882352941177,
"grad_norm": 0.8378550410270691,
"learning_rate": 2.3389473684210528e-05,
"loss": 0.7962,
"step": 156
},
{
"epoch": 0.36941176470588233,
"grad_norm": 0.7198935151100159,
"learning_rate": 2.2857894736842106e-05,
"loss": 1.2946,
"step": 157
},
{
"epoch": 0.37176470588235294,
"grad_norm": 0.6716011166572571,
"learning_rate": 2.2326315789473685e-05,
"loss": 1.2314,
"step": 158
},
{
"epoch": 0.37411764705882355,
"grad_norm": 0.7164915800094604,
"learning_rate": 2.1794736842105264e-05,
"loss": 1.0985,
"step": 159
},
{
"epoch": 0.3764705882352941,
"grad_norm": 0.7440788745880127,
"learning_rate": 2.1263157894736842e-05,
"loss": 1.0568,
"step": 160
},
{
"epoch": 0.3788235294117647,
"grad_norm": 0.7702521085739136,
"learning_rate": 2.073157894736842e-05,
"loss": 0.9384,
"step": 161
},
{
"epoch": 0.3811764705882353,
"grad_norm": 0.8371986746788025,
"learning_rate": 2.0200000000000003e-05,
"loss": 0.7604,
"step": 162
},
{
"epoch": 0.3835294117647059,
"grad_norm": 0.7777345776557922,
"learning_rate": 1.966842105263158e-05,
"loss": 1.202,
"step": 163
},
{
"epoch": 0.38588235294117645,
"grad_norm": 0.6666561365127563,
"learning_rate": 1.913684210526316e-05,
"loss": 1.2439,
"step": 164
},
{
"epoch": 0.38823529411764707,
"grad_norm": 0.6794990301132202,
"learning_rate": 1.8605263157894736e-05,
"loss": 1.1159,
"step": 165
},
{
"epoch": 0.3905882352941176,
"grad_norm": 0.7484257221221924,
"learning_rate": 1.8073684210526318e-05,
"loss": 1.0882,
"step": 166
},
{
"epoch": 0.39294117647058824,
"grad_norm": 0.7433997988700867,
"learning_rate": 1.7542105263157897e-05,
"loss": 0.9214,
"step": 167
},
{
"epoch": 0.3952941176470588,
"grad_norm": 0.8234543800354004,
"learning_rate": 1.7010526315789475e-05,
"loss": 0.8793,
"step": 168
},
{
"epoch": 0.3976470588235294,
"grad_norm": 0.8275732398033142,
"learning_rate": 1.6478947368421054e-05,
"loss": 1.0662,
"step": 169
},
{
"epoch": 0.4,
"grad_norm": 0.6753153800964355,
"learning_rate": 1.5947368421052633e-05,
"loss": 1.2686,
"step": 170
},
{
"epoch": 0.4023529411764706,
"grad_norm": 0.7300885319709778,
"learning_rate": 1.541578947368421e-05,
"loss": 1.1619,
"step": 171
},
{
"epoch": 0.4047058823529412,
"grad_norm": 0.720065176486969,
"learning_rate": 1.4884210526315788e-05,
"loss": 1.0637,
"step": 172
},
{
"epoch": 0.40705882352941175,
"grad_norm": 0.7786014080047607,
"learning_rate": 1.4352631578947369e-05,
"loss": 0.9741,
"step": 173
},
{
"epoch": 0.40941176470588236,
"grad_norm": 0.8148536086082458,
"learning_rate": 1.3821052631578949e-05,
"loss": 0.8535,
"step": 174
},
{
"epoch": 0.4117647058823529,
"grad_norm": 0.8557289242744446,
"learning_rate": 1.3289473684210526e-05,
"loss": 0.7194,
"step": 175
},
{
"epoch": 0.41411764705882353,
"grad_norm": 0.6708107590675354,
"learning_rate": 1.2757894736842106e-05,
"loss": 1.2988,
"step": 176
},
{
"epoch": 0.4164705882352941,
"grad_norm": 0.695059597492218,
"learning_rate": 1.2226315789473685e-05,
"loss": 1.1462,
"step": 177
},
{
"epoch": 0.4188235294117647,
"grad_norm": 0.7289761304855347,
"learning_rate": 1.1694736842105264e-05,
"loss": 1.0924,
"step": 178
},
{
"epoch": 0.4211764705882353,
"grad_norm": 0.7350468635559082,
"learning_rate": 1.1163157894736842e-05,
"loss": 0.9741,
"step": 179
},
{
"epoch": 0.4235294117647059,
"grad_norm": 0.7673172950744629,
"learning_rate": 1.0631578947368421e-05,
"loss": 0.8985,
"step": 180
},
{
"epoch": 0.4258823529411765,
"grad_norm": 0.907876193523407,
"learning_rate": 1.0100000000000002e-05,
"loss": 0.7691,
"step": 181
},
{
"epoch": 0.42823529411764705,
"grad_norm": 0.6842520236968994,
"learning_rate": 9.56842105263158e-06,
"loss": 1.2146,
"step": 182
},
{
"epoch": 0.43058823529411766,
"grad_norm": 0.6845569014549255,
"learning_rate": 9.036842105263159e-06,
"loss": 1.1655,
"step": 183
},
{
"epoch": 0.4329411764705882,
"grad_norm": 0.7404122352600098,
"learning_rate": 8.505263157894738e-06,
"loss": 1.1055,
"step": 184
},
{
"epoch": 0.43529411764705883,
"grad_norm": 0.7731722593307495,
"learning_rate": 7.973684210526316e-06,
"loss": 1.067,
"step": 185
},
{
"epoch": 0.4376470588235294,
"grad_norm": 0.8028244972229004,
"learning_rate": 7.442105263157894e-06,
"loss": 0.9007,
"step": 186
},
{
"epoch": 0.44,
"grad_norm": 0.9162750244140625,
"learning_rate": 6.9105263157894745e-06,
"loss": 0.8109,
"step": 187
},
{
"epoch": 0.4423529411764706,
"grad_norm": 0.729189932346344,
"learning_rate": 6.378947368421053e-06,
"loss": 1.1673,
"step": 188
},
{
"epoch": 0.4447058823529412,
"grad_norm": 0.691579282283783,
"learning_rate": 5.847368421052632e-06,
"loss": 1.2002,
"step": 189
},
{
"epoch": 0.4470588235294118,
"grad_norm": 0.6862315535545349,
"learning_rate": 5.315789473684211e-06,
"loss": 1.075,
"step": 190
},
{
"epoch": 0.44941176470588234,
"grad_norm": 0.795240044593811,
"learning_rate": 4.78421052631579e-06,
"loss": 1.0699,
"step": 191
},
{
"epoch": 0.45176470588235296,
"grad_norm": 0.7700913548469543,
"learning_rate": 4.252631578947369e-06,
"loss": 0.8609,
"step": 192
},
{
"epoch": 0.4541176470588235,
"grad_norm": 0.8226372003555298,
"learning_rate": 3.721052631578947e-06,
"loss": 0.8284,
"step": 193
},
{
"epoch": 0.45647058823529413,
"grad_norm": 0.7959129810333252,
"learning_rate": 3.1894736842105266e-06,
"loss": 0.9911,
"step": 194
},
{
"epoch": 0.4588235294117647,
"grad_norm": 0.6747735142707825,
"learning_rate": 2.6578947368421053e-06,
"loss": 1.2293,
"step": 195
},
{
"epoch": 0.4611764705882353,
"grad_norm": 0.729983925819397,
"learning_rate": 2.1263157894736844e-06,
"loss": 1.1679,
"step": 196
},
{
"epoch": 0.46352941176470586,
"grad_norm": 0.7446680665016174,
"learning_rate": 1.5947368421052633e-06,
"loss": 0.9839,
"step": 197
},
{
"epoch": 0.46588235294117647,
"grad_norm": 0.7689118385314941,
"learning_rate": 1.0631578947368422e-06,
"loss": 0.9591,
"step": 198
},
{
"epoch": 0.4682352941176471,
"grad_norm": 0.8310439586639404,
"learning_rate": 5.315789473684211e-07,
"loss": 0.8448,
"step": 199
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.9187881350517273,
"learning_rate": 0.0,
"loss": 0.7259,
"step": 200
},
{
"epoch": 0.47058823529411764,
"eval_loss": 1.0081819295883179,
"eval_runtime": 13.5434,
"eval_samples_per_second": 422.714,
"eval_steps_per_second": 13.217,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.205091905167688e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
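
A minimal sketch of how one might read this checkpoint state with the Python standard library; the filename and path are assumptions (adjust to wherever this file is saved). It separates the training-loss entries from the evaluation entries in "log_history" and reports the fields shown above:

import json

# Load the trainer state written at checkpoint-200.
# (Path is an assumption; point it at this file's actual location.)
with open("trainer_state.json") as f:
    state = json.load(f)

# Training steps carry "loss"; evaluation entries carry "eval_loss".
train_log = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_log = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print("best eval loss:", state["best_metric"])             # 1.0081819295883179
print("best checkpoint:", state["best_model_checkpoint"])  # miner_id_24/checkpoint-200
for step, loss in eval_log:
    print(f"step {step:>3}: eval_loss = {loss:.4f}")

Run against this file, the loop would print the five evaluation points (steps 1, 50, 100, 150, 200), showing eval_loss falling from about 2.57 to about 1.01.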