{
"best_metric": 1.0366058349609375,
"best_model_checkpoint": "miner_id_24/checkpoint-150",
"epoch": 1.0,
"eval_steps": 50,
"global_step": 162,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006172839506172839,
"grad_norm": 5.904749393463135,
"learning_rate": 1e-05,
"loss": 2.3317,
"step": 1
},
{
"epoch": 0.006172839506172839,
"eval_loss": 5.023244380950928,
"eval_runtime": 6.6516,
"eval_samples_per_second": 41.043,
"eval_steps_per_second": 10.373,
"step": 1
},
{
"epoch": 0.012345679012345678,
"grad_norm": 9.1792573928833,
"learning_rate": 2e-05,
"loss": 2.951,
"step": 2
},
{
"epoch": 0.018518518518518517,
"grad_norm": 9.795952796936035,
"learning_rate": 3e-05,
"loss": 3.2335,
"step": 3
},
{
"epoch": 0.024691358024691357,
"grad_norm": 10.837777137756348,
"learning_rate": 4e-05,
"loss": 3.2221,
"step": 4
},
{
"epoch": 0.030864197530864196,
"grad_norm": 9.804049491882324,
"learning_rate": 5e-05,
"loss": 3.1522,
"step": 5
},
{
"epoch": 0.037037037037037035,
"grad_norm": 9.929956436157227,
"learning_rate": 6e-05,
"loss": 2.8287,
"step": 6
},
{
"epoch": 0.043209876543209874,
"grad_norm": 9.047441482543945,
"learning_rate": 7e-05,
"loss": 2.4763,
"step": 7
},
{
"epoch": 0.04938271604938271,
"grad_norm": 7.844836711883545,
"learning_rate": 8e-05,
"loss": 2.5286,
"step": 8
},
{
"epoch": 0.05555555555555555,
"grad_norm": 11.074647903442383,
"learning_rate": 9e-05,
"loss": 2.0952,
"step": 9
},
{
"epoch": 0.06172839506172839,
"grad_norm": 11.319666862487793,
"learning_rate": 0.0001,
"loss": 2.1192,
"step": 10
},
{
"epoch": 0.06790123456790123,
"grad_norm": 11.624617576599121,
"learning_rate": 9.998932083939656e-05,
"loss": 1.9353,
"step": 11
},
{
"epoch": 0.07407407407407407,
"grad_norm": 8.572334289550781,
"learning_rate": 9.995728791936504e-05,
"loss": 2.0145,
"step": 12
},
{
"epoch": 0.08024691358024691,
"grad_norm": 8.755339622497559,
"learning_rate": 9.990391492329341e-05,
"loss": 1.8649,
"step": 13
},
{
"epoch": 0.08641975308641975,
"grad_norm": 8.599105834960938,
"learning_rate": 9.98292246503335e-05,
"loss": 1.7685,
"step": 14
},
{
"epoch": 0.09259259259259259,
"grad_norm": 7.195314407348633,
"learning_rate": 9.973324900566213e-05,
"loss": 1.4497,
"step": 15
},
{
"epoch": 0.09876543209876543,
"grad_norm": 8.05854606628418,
"learning_rate": 9.961602898685226e-05,
"loss": 1.8822,
"step": 16
},
{
"epoch": 0.10493827160493827,
"grad_norm": 6.703824043273926,
"learning_rate": 9.947761466636014e-05,
"loss": 1.6837,
"step": 17
},
{
"epoch": 0.1111111111111111,
"grad_norm": 8.368515014648438,
"learning_rate": 9.931806517013612e-05,
"loss": 2.0682,
"step": 18
},
{
"epoch": 0.11728395061728394,
"grad_norm": 8.363588333129883,
"learning_rate": 9.913744865236798e-05,
"loss": 1.546,
"step": 19
},
{
"epoch": 0.12345679012345678,
"grad_norm": 7.589108467102051,
"learning_rate": 9.893584226636772e-05,
"loss": 1.8713,
"step": 20
},
{
"epoch": 0.12962962962962962,
"grad_norm": 6.3271660804748535,
"learning_rate": 9.871333213161438e-05,
"loss": 1.5173,
"step": 21
},
{
"epoch": 0.13580246913580246,
"grad_norm": 7.326416492462158,
"learning_rate": 9.847001329696653e-05,
"loss": 1.7497,
"step": 22
},
{
"epoch": 0.1419753086419753,
"grad_norm": 6.967326641082764,
"learning_rate": 9.820598970006069e-05,
"loss": 1.4374,
"step": 23
},
{
"epoch": 0.14814814814814814,
"grad_norm": 8.470453262329102,
"learning_rate": 9.792137412291265e-05,
"loss": 1.6789,
"step": 24
},
{
"epoch": 0.15432098765432098,
"grad_norm": 7.615231037139893,
"learning_rate": 9.761628814374073e-05,
"loss": 1.4128,
"step": 25
},
{
"epoch": 0.16049382716049382,
"grad_norm": 6.821658134460449,
"learning_rate": 9.729086208503174e-05,
"loss": 1.2843,
"step": 26
},
{
"epoch": 0.16666666666666666,
"grad_norm": 5.842962265014648,
"learning_rate": 9.694523495787149e-05,
"loss": 1.1883,
"step": 27
},
{
"epoch": 0.1728395061728395,
"grad_norm": 5.989674091339111,
"learning_rate": 9.657955440256395e-05,
"loss": 1.3021,
"step": 28
},
{
"epoch": 0.17901234567901234,
"grad_norm": 5.922726154327393,
"learning_rate": 9.619397662556435e-05,
"loss": 1.3141,
"step": 29
},
{
"epoch": 0.18518518518518517,
"grad_norm": 8.421772956848145,
"learning_rate": 9.578866633275288e-05,
"loss": 1.56,
"step": 30
},
{
"epoch": 0.19135802469135801,
"grad_norm": 6.709497928619385,
"learning_rate": 9.5363796659078e-05,
"loss": 1.4611,
"step": 31
},
{
"epoch": 0.19753086419753085,
"grad_norm": 6.848280906677246,
"learning_rate": 9.491954909459895e-05,
"loss": 1.2324,
"step": 32
},
{
"epoch": 0.2037037037037037,
"grad_norm": 7.685532569885254,
"learning_rate": 9.445611340695926e-05,
"loss": 1.5825,
"step": 33
},
{
"epoch": 0.20987654320987653,
"grad_norm": 7.788570880889893,
"learning_rate": 9.397368756032445e-05,
"loss": 1.7091,
"step": 34
},
{
"epoch": 0.21604938271604937,
"grad_norm": 6.32633113861084,
"learning_rate": 9.347247763081835e-05,
"loss": 1.1916,
"step": 35
},
{
"epoch": 0.2222222222222222,
"grad_norm": 7.544274806976318,
"learning_rate": 9.295269771849427e-05,
"loss": 1.0334,
"step": 36
},
{
"epoch": 0.22839506172839505,
"grad_norm": 7.663769245147705,
"learning_rate": 9.241456985587868e-05,
"loss": 1.5383,
"step": 37
},
{
"epoch": 0.2345679012345679,
"grad_norm": 7.407631874084473,
"learning_rate": 9.185832391312644e-05,
"loss": 1.5609,
"step": 38
},
{
"epoch": 0.24074074074074073,
"grad_norm": 7.543033123016357,
"learning_rate": 9.12841974998278e-05,
"loss": 0.9782,
"step": 39
},
{
"epoch": 0.24691358024691357,
"grad_norm": 12.270012855529785,
"learning_rate": 9.069243586350975e-05,
"loss": 1.9799,
"step": 40
},
{
"epoch": 0.25308641975308643,
"grad_norm": 12.287930488586426,
"learning_rate": 9.008329178487442e-05,
"loss": 2.0773,
"step": 41
},
{
"epoch": 0.25925925925925924,
"grad_norm": 11.401373863220215,
"learning_rate": 8.945702546981969e-05,
"loss": 2.0059,
"step": 42
},
{
"epoch": 0.2654320987654321,
"grad_norm": 10.236944198608398,
"learning_rate": 8.881390443828787e-05,
"loss": 2.0458,
"step": 43
},
{
"epoch": 0.2716049382716049,
"grad_norm": 8.457300186157227,
"learning_rate": 8.815420340999033e-05,
"loss": 1.7319,
"step": 44
},
{
"epoch": 0.2777777777777778,
"grad_norm": 6.980026721954346,
"learning_rate": 8.74782041870563e-05,
"loss": 1.5251,
"step": 45
},
{
"epoch": 0.2839506172839506,
"grad_norm": 5.24791145324707,
"learning_rate": 8.678619553365659e-05,
"loss": 1.4336,
"step": 46
},
{
"epoch": 0.29012345679012347,
"grad_norm": 6.145897388458252,
"learning_rate": 8.60784730526531e-05,
"loss": 1.5023,
"step": 47
},
{
"epoch": 0.2962962962962963,
"grad_norm": 5.121026992797852,
"learning_rate": 8.535533905932738e-05,
"loss": 1.2577,
"step": 48
},
{
"epoch": 0.30246913580246915,
"grad_norm": 6.231418132781982,
"learning_rate": 8.461710245224148e-05,
"loss": 1.1562,
"step": 49
},
{
"epoch": 0.30864197530864196,
"grad_norm": 6.639575958251953,
"learning_rate": 8.386407858128706e-05,
"loss": 1.3981,
"step": 50
},
{
"epoch": 0.30864197530864196,
"eval_loss": 1.3253625631332397,
"eval_runtime": 6.5979,
"eval_samples_per_second": 41.377,
"eval_steps_per_second": 10.458,
"step": 50
},
{
"epoch": 0.3148148148148148,
"grad_norm": 3.98197603225708,
"learning_rate": 8.309658911297834e-05,
"loss": 1.2837,
"step": 51
},
{
"epoch": 0.32098765432098764,
"grad_norm": 4.89648962020874,
"learning_rate": 8.231496189304704e-05,
"loss": 1.369,
"step": 52
},
{
"epoch": 0.3271604938271605,
"grad_norm": 5.329682350158691,
"learning_rate": 8.151953080639775e-05,
"loss": 1.4328,
"step": 53
},
{
"epoch": 0.3333333333333333,
"grad_norm": 4.384424209594727,
"learning_rate": 8.07106356344834e-05,
"loss": 1.0606,
"step": 54
},
{
"epoch": 0.3395061728395062,
"grad_norm": 6.257839202880859,
"learning_rate": 7.988862191016205e-05,
"loss": 1.4948,
"step": 55
},
{
"epoch": 0.345679012345679,
"grad_norm": 6.1981658935546875,
"learning_rate": 7.905384077009693e-05,
"loss": 1.3013,
"step": 56
},
{
"epoch": 0.35185185185185186,
"grad_norm": 5.686239719390869,
"learning_rate": 7.820664880476256e-05,
"loss": 1.0967,
"step": 57
},
{
"epoch": 0.35802469135802467,
"grad_norm": 4.9572248458862305,
"learning_rate": 7.734740790612136e-05,
"loss": 1.2383,
"step": 58
},
{
"epoch": 0.36419753086419754,
"grad_norm": 5.314846992492676,
"learning_rate": 7.647648511303544e-05,
"loss": 1.422,
"step": 59
},
{
"epoch": 0.37037037037037035,
"grad_norm": 4.13405704498291,
"learning_rate": 7.559425245448006e-05,
"loss": 1.1135,
"step": 60
},
{
"epoch": 0.3765432098765432,
"grad_norm": 7.146441459655762,
"learning_rate": 7.470108679062521e-05,
"loss": 0.9621,
"step": 61
},
{
"epoch": 0.38271604938271603,
"grad_norm": 4.9275970458984375,
"learning_rate": 7.379736965185368e-05,
"loss": 0.9482,
"step": 62
},
{
"epoch": 0.3888888888888889,
"grad_norm": 4.272430419921875,
"learning_rate": 7.288348707578408e-05,
"loss": 0.8654,
"step": 63
},
{
"epoch": 0.3950617283950617,
"grad_norm": 4.993551731109619,
"learning_rate": 7.195982944236851e-05,
"loss": 0.9993,
"step": 64
},
{
"epoch": 0.4012345679012346,
"grad_norm": 5.2262983322143555,
"learning_rate": 7.102679130713537e-05,
"loss": 0.9431,
"step": 65
},
{
"epoch": 0.4074074074074074,
"grad_norm": 4.463367938995361,
"learning_rate": 7.008477123264848e-05,
"loss": 0.8866,
"step": 66
},
{
"epoch": 0.41358024691358025,
"grad_norm": 5.570180892944336,
"learning_rate": 6.91341716182545e-05,
"loss": 1.0092,
"step": 67
},
{
"epoch": 0.41975308641975306,
"grad_norm": 6.994357585906982,
"learning_rate": 6.817539852819149e-05,
"loss": 1.5365,
"step": 68
},
{
"epoch": 0.42592592592592593,
"grad_norm": 5.369216442108154,
"learning_rate": 6.720886151813194e-05,
"loss": 1.3261,
"step": 69
},
{
"epoch": 0.43209876543209874,
"grad_norm": 8.574287414550781,
"learning_rate": 6.623497346023418e-05,
"loss": 1.53,
"step": 70
},
{
"epoch": 0.4382716049382716,
"grad_norm": 5.898441314697266,
"learning_rate": 6.525415036677744e-05,
"loss": 1.4404,
"step": 71
},
{
"epoch": 0.4444444444444444,
"grad_norm": 5.377044200897217,
"learning_rate": 6.426681121245527e-05,
"loss": 1.2586,
"step": 72
},
{
"epoch": 0.4506172839506173,
"grad_norm": 6.086128234863281,
"learning_rate": 6.327337775540362e-05,
"loss": 1.283,
"step": 73
},
{
"epoch": 0.4567901234567901,
"grad_norm": 5.488633155822754,
"learning_rate": 6.227427435703997e-05,
"loss": 1.0245,
"step": 74
},
{
"epoch": 0.46296296296296297,
"grad_norm": 6.107352256774902,
"learning_rate": 6.126992780079031e-05,
"loss": 1.3278,
"step": 75
},
{
"epoch": 0.4691358024691358,
"grad_norm": 7.161839962005615,
"learning_rate": 6.026076710978171e-05,
"loss": 1.4159,
"step": 76
},
{
"epoch": 0.47530864197530864,
"grad_norm": 7.746915817260742,
"learning_rate": 5.924722336357793e-05,
"loss": 1.213,
"step": 77
},
{
"epoch": 0.48148148148148145,
"grad_norm": 7.127296447753906,
"learning_rate": 5.8229729514036705e-05,
"loss": 1.1374,
"step": 78
},
{
"epoch": 0.4876543209876543,
"grad_norm": 7.508954048156738,
"learning_rate": 5.720872020036734e-05,
"loss": 1.2544,
"step": 79
},
{
"epoch": 0.49382716049382713,
"grad_norm": 12.170858383178711,
"learning_rate": 5.618463156346739e-05,
"loss": 1.4475,
"step": 80
},
{
"epoch": 0.5,
"grad_norm": 4.2009758949279785,
"learning_rate": 5.515790105961786e-05,
"loss": 1.4977,
"step": 81
},
{
"epoch": 0.5061728395061729,
"grad_norm": 4.75076150894165,
"learning_rate": 5.4128967273616625e-05,
"loss": 1.2753,
"step": 82
},
{
"epoch": 0.5123456790123457,
"grad_norm": 4.539117813110352,
"learning_rate": 5.3098269731429736e-05,
"loss": 1.3245,
"step": 83
},
{
"epoch": 0.5185185185185185,
"grad_norm": 4.6597514152526855,
"learning_rate": 5.2066248712440656e-05,
"loss": 1.2439,
"step": 84
},
{
"epoch": 0.5246913580246914,
"grad_norm": 4.974895477294922,
"learning_rate": 5.103334506137772e-05,
"loss": 1.2295,
"step": 85
},
{
"epoch": 0.5308641975308642,
"grad_norm": 4.4906744956970215,
"learning_rate": 5e-05,
"loss": 1.1152,
"step": 86
},
{
"epoch": 0.5370370370370371,
"grad_norm": 3.8232007026672363,
"learning_rate": 4.8966654938622295e-05,
"loss": 0.9539,
"step": 87
},
{
"epoch": 0.5432098765432098,
"grad_norm": 3.584791660308838,
"learning_rate": 4.7933751287559335e-05,
"loss": 1.1015,
"step": 88
},
{
"epoch": 0.5493827160493827,
"grad_norm": 3.5272321701049805,
"learning_rate": 4.6901730268570275e-05,
"loss": 0.9281,
"step": 89
},
{
"epoch": 0.5555555555555556,
"grad_norm": 3.680220365524292,
"learning_rate": 4.5871032726383386e-05,
"loss": 0.8081,
"step": 90
},
{
"epoch": 0.5617283950617284,
"grad_norm": 3.9015774726867676,
"learning_rate": 4.4842098940382155e-05,
"loss": 0.9318,
"step": 91
},
{
"epoch": 0.5679012345679012,
"grad_norm": 3.8515422344207764,
"learning_rate": 4.381536843653262e-05,
"loss": 0.9208,
"step": 92
},
{
"epoch": 0.5740740740740741,
"grad_norm": 5.001286506652832,
"learning_rate": 4.2791279799632666e-05,
"loss": 1.2188,
"step": 93
},
{
"epoch": 0.5802469135802469,
"grad_norm": 4.476446151733398,
"learning_rate": 4.17702704859633e-05,
"loss": 1.1415,
"step": 94
},
{
"epoch": 0.5864197530864198,
"grad_norm": 3.9653468132019043,
"learning_rate": 4.075277663642208e-05,
"loss": 0.8843,
"step": 95
},
{
"epoch": 0.5925925925925926,
"grad_norm": 3.713266611099243,
"learning_rate": 3.973923289021829e-05,
"loss": 0.9288,
"step": 96
},
{
"epoch": 0.5987654320987654,
"grad_norm": 4.325329303741455,
"learning_rate": 3.87300721992097e-05,
"loss": 0.9417,
"step": 97
},
{
"epoch": 0.6049382716049383,
"grad_norm": 4.720461845397949,
"learning_rate": 3.772572564296005e-05,
"loss": 1.1637,
"step": 98
},
{
"epoch": 0.6111111111111112,
"grad_norm": 4.1974711418151855,
"learning_rate": 3.67266222445964e-05,
"loss": 1.0539,
"step": 99
},
{
"epoch": 0.6172839506172839,
"grad_norm": 5.233874320983887,
"learning_rate": 3.5733188787544745e-05,
"loss": 1.1811,
"step": 100
},
{
"epoch": 0.6172839506172839,
"eval_loss": 1.112654447555542,
"eval_runtime": 6.61,
"eval_samples_per_second": 41.301,
"eval_steps_per_second": 10.439,
"step": 100
},
{
"epoch": 0.6234567901234568,
"grad_norm": 4.717146396636963,
"learning_rate": 3.474584963322257e-05,
"loss": 1.2253,
"step": 101
},
{
"epoch": 0.6296296296296297,
"grad_norm": 4.815060615539551,
"learning_rate": 3.3765026539765834e-05,
"loss": 1.0185,
"step": 102
},
{
"epoch": 0.6358024691358025,
"grad_norm": 5.483373165130615,
"learning_rate": 3.279113848186808e-05,
"loss": 1.1232,
"step": 103
},
{
"epoch": 0.6419753086419753,
"grad_norm": 4.842006206512451,
"learning_rate": 3.18246014718085e-05,
"loss": 0.8888,
"step": 104
},
{
"epoch": 0.6481481481481481,
"grad_norm": 4.802065372467041,
"learning_rate": 3.086582838174551e-05,
"loss": 1.3338,
"step": 105
},
{
"epoch": 0.654320987654321,
"grad_norm": 3.6822216510772705,
"learning_rate": 2.991522876735154e-05,
"loss": 0.6658,
"step": 106
},
{
"epoch": 0.6604938271604939,
"grad_norm": 4.310146808624268,
"learning_rate": 2.8973208692864624e-05,
"loss": 1.0561,
"step": 107
},
{
"epoch": 0.6666666666666666,
"grad_norm": 4.102274417877197,
"learning_rate": 2.804017055763149e-05,
"loss": 0.841,
"step": 108
},
{
"epoch": 0.6728395061728395,
"grad_norm": 6.015258312225342,
"learning_rate": 2.711651292421593e-05,
"loss": 1.1361,
"step": 109
},
{
"epoch": 0.6790123456790124,
"grad_norm": 7.070762634277344,
"learning_rate": 2.6202630348146324e-05,
"loss": 1.338,
"step": 110
},
{
"epoch": 0.6851851851851852,
"grad_norm": 4.881083965301514,
"learning_rate": 2.529891320937481e-05,
"loss": 1.0047,
"step": 111
},
{
"epoch": 0.691358024691358,
"grad_norm": 6.118247032165527,
"learning_rate": 2.4405747545519963e-05,
"loss": 1.4045,
"step": 112
},
{
"epoch": 0.6975308641975309,
"grad_norm": 4.34047794342041,
"learning_rate": 2.352351488696457e-05,
"loss": 1.0392,
"step": 113
},
{
"epoch": 0.7037037037037037,
"grad_norm": 7.272618770599365,
"learning_rate": 2.2652592093878666e-05,
"loss": 1.3566,
"step": 114
},
{
"epoch": 0.7098765432098766,
"grad_norm": 6.038531303405762,
"learning_rate": 2.179335119523745e-05,
"loss": 1.0086,
"step": 115
},
{
"epoch": 0.7160493827160493,
"grad_norm": 6.147348880767822,
"learning_rate": 2.094615922990309e-05,
"loss": 1.1353,
"step": 116
},
{
"epoch": 0.7222222222222222,
"grad_norm": 5.421988487243652,
"learning_rate": 2.0111378089837956e-05,
"loss": 1.0498,
"step": 117
},
{
"epoch": 0.7283950617283951,
"grad_norm": 5.742246150970459,
"learning_rate": 1.928936436551661e-05,
"loss": 1.1477,
"step": 118
},
{
"epoch": 0.7345679012345679,
"grad_norm": 9.083409309387207,
"learning_rate": 1.848046919360225e-05,
"loss": 1.0184,
"step": 119
},
{
"epoch": 0.7407407407407407,
"grad_norm": 9.322260856628418,
"learning_rate": 1.768503810695295e-05,
"loss": 1.5916,
"step": 120
},
{
"epoch": 0.7469135802469136,
"grad_norm": 2.834907054901123,
"learning_rate": 1.6903410887021676e-05,
"loss": 1.2386,
"step": 121
},
{
"epoch": 0.7530864197530864,
"grad_norm": 3.143998384475708,
"learning_rate": 1.6135921418712956e-05,
"loss": 1.328,
"step": 122
},
{
"epoch": 0.7592592592592593,
"grad_norm": 2.8615365028381348,
"learning_rate": 1.5382897547758514e-05,
"loss": 1.1832,
"step": 123
},
{
"epoch": 0.7654320987654321,
"grad_norm": 3.3596787452697754,
"learning_rate": 1.4644660940672627e-05,
"loss": 1.0036,
"step": 124
},
{
"epoch": 0.7716049382716049,
"grad_norm": 3.3503646850585938,
"learning_rate": 1.3921526947346902e-05,
"loss": 0.8836,
"step": 125
},
{
"epoch": 0.7777777777777778,
"grad_norm": 3.049670696258545,
"learning_rate": 1.3213804466343421e-05,
"loss": 0.9173,
"step": 126
},
{
"epoch": 0.7839506172839507,
"grad_norm": 3.030001163482666,
"learning_rate": 1.2521795812943704e-05,
"loss": 0.8346,
"step": 127
},
{
"epoch": 0.7901234567901234,
"grad_norm": 3.547058582305908,
"learning_rate": 1.1845796590009683e-05,
"loss": 1.0046,
"step": 128
},
{
"epoch": 0.7962962962962963,
"grad_norm": 3.6192965507507324,
"learning_rate": 1.118609556171213e-05,
"loss": 0.8706,
"step": 129
},
{
"epoch": 0.8024691358024691,
"grad_norm": 3.6605732440948486,
"learning_rate": 1.0542974530180327e-05,
"loss": 0.7687,
"step": 130
},
{
"epoch": 0.808641975308642,
"grad_norm": 4.206137180328369,
"learning_rate": 9.916708215125587e-06,
"loss": 1.0612,
"step": 131
},
{
"epoch": 0.8148148148148148,
"grad_norm": 3.2321722507476807,
"learning_rate": 9.307564136490254e-06,
"loss": 0.8656,
"step": 132
},
{
"epoch": 0.8209876543209876,
"grad_norm": 4.059853553771973,
"learning_rate": 8.715802500172216e-06,
"loss": 1.0091,
"step": 133
},
{
"epoch": 0.8271604938271605,
"grad_norm": 3.761200189590454,
"learning_rate": 8.141676086873572e-06,
"loss": 0.8369,
"step": 134
},
{
"epoch": 0.8333333333333334,
"grad_norm": 4.452486991882324,
"learning_rate": 7.585430144121319e-06,
"loss": 1.2127,
"step": 135
},
{
"epoch": 0.8395061728395061,
"grad_norm": 3.1412620544433594,
"learning_rate": 7.047302281505736e-06,
"loss": 0.5492,
"step": 136
},
{
"epoch": 0.845679012345679,
"grad_norm": 5.175487995147705,
"learning_rate": 6.527522369181655e-06,
"loss": 1.1802,
"step": 137
},
{
"epoch": 0.8518518518518519,
"grad_norm": 4.258315086364746,
"learning_rate": 6.026312439675552e-06,
"loss": 0.9737,
"step": 138
},
{
"epoch": 0.8580246913580247,
"grad_norm": 4.971921920776367,
"learning_rate": 5.543886593040737e-06,
"loss": 1.1732,
"step": 139
},
{
"epoch": 0.8641975308641975,
"grad_norm": 5.052080154418945,
"learning_rate": 5.080450905401057e-06,
"loss": 1.0749,
"step": 140
},
{
"epoch": 0.8703703703703703,
"grad_norm": 4.974813461303711,
"learning_rate": 4.636203340922008e-06,
"loss": 0.8149,
"step": 141
},
{
"epoch": 0.8765432098765432,
"grad_norm": 4.113583564758301,
"learning_rate": 4.2113336672471245e-06,
"loss": 0.9646,
"step": 142
},
{
"epoch": 0.8827160493827161,
"grad_norm": 3.6815974712371826,
"learning_rate": 3.8060233744356633e-06,
"loss": 0.7519,
"step": 143
},
{
"epoch": 0.8888888888888888,
"grad_norm": 5.150738716125488,
"learning_rate": 3.420445597436056e-06,
"loss": 1.0115,
"step": 144
},
{
"epoch": 0.8950617283950617,
"grad_norm": 5.06300163269043,
"learning_rate": 3.054765042128521e-06,
"loss": 1.0224,
"step": 145
},
{
"epoch": 0.9012345679012346,
"grad_norm": 4.764848232269287,
"learning_rate": 2.7091379149682685e-06,
"loss": 0.9915,
"step": 146
},
{
"epoch": 0.9074074074074074,
"grad_norm": 4.230654716491699,
"learning_rate": 2.3837118562592797e-06,
"loss": 0.8216,
"step": 147
},
{
"epoch": 0.9135802469135802,
"grad_norm": 5.193974018096924,
"learning_rate": 2.0786258770873647e-06,
"loss": 1.0874,
"step": 148
},
{
"epoch": 0.9197530864197531,
"grad_norm": 4.15454626083374,
"learning_rate": 1.7940102999393194e-06,
"loss": 0.9001,
"step": 149
},
{
"epoch": 0.9259259259259259,
"grad_norm": 6.763053894042969,
"learning_rate": 1.5299867030334814e-06,
"loss": 1.2224,
"step": 150
},
{
"epoch": 0.9259259259259259,
"eval_loss": 1.0366058349609375,
"eval_runtime": 6.6176,
"eval_samples_per_second": 41.253,
"eval_steps_per_second": 10.427,
"step": 150
},
{
"epoch": 0.9320987654320988,
"grad_norm": 5.5308146476745605,
"learning_rate": 1.286667868385627e-06,
"loss": 1.351,
"step": 151
},
{
"epoch": 0.9382716049382716,
"grad_norm": 6.575994968414307,
"learning_rate": 1.064157733632276e-06,
"loss": 1.3306,
"step": 152
},
{
"epoch": 0.9444444444444444,
"grad_norm": 8.211260795593262,
"learning_rate": 8.62551347632029e-07,
"loss": 1.2282,
"step": 153
},
{
"epoch": 0.9506172839506173,
"grad_norm": 4.557495594024658,
"learning_rate": 6.819348298638839e-07,
"loss": 1.0393,
"step": 154
},
{
"epoch": 0.9567901234567902,
"grad_norm": 5.561540126800537,
"learning_rate": 5.223853336398632e-07,
"loss": 0.81,
"step": 155
},
{
"epoch": 0.9629629629629629,
"grad_norm": 5.5861077308654785,
"learning_rate": 3.839710131477492e-07,
"loss": 1.1083,
"step": 156
},
{
"epoch": 0.9691358024691358,
"grad_norm": 7.85378885269165,
"learning_rate": 2.667509943378721e-07,
"loss": 1.53,
"step": 157
},
{
"epoch": 0.9753086419753086,
"grad_norm": 5.781926155090332,
"learning_rate": 1.7077534966650766e-07,
"loss": 1.125,
"step": 158
},
{
"epoch": 0.9814814814814815,
"grad_norm": 6.874696731567383,
"learning_rate": 9.60850767065924e-08,
"loss": 1.2242,
"step": 159
},
{
"epoch": 0.9876543209876543,
"grad_norm": 6.842217922210693,
"learning_rate": 4.2712080634949024e-08,
"loss": 0.9251,
"step": 160
},
{
"epoch": 0.9938271604938271,
"grad_norm": 3.6685681343078613,
"learning_rate": 1.0679160603449534e-08,
"loss": 0.9695,
"step": 161
},
{
"epoch": 1.0,
"grad_norm": 5.732940673828125,
"learning_rate": 0.0,
"loss": 1.3258,
"step": 162
}
],
"logging_steps": 1,
"max_steps": 162,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2505650634948608e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}