OLMo-2-1124-7B-Instruct_SFTv02.05 / trainer_state.json
Neelectric · Model save · commit 3aed3cc (verified)
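What follows is the raw trainer_state.json written by the Hugging Face Trainer for this SFT run: a few run-level fields (epoch, global_step, eval_steps), then a log_history list with one entry per logged step (epoch, grad_norm, learning_rate, loss, num_tokens, step). As a minimal sketch, assuming a complete local copy saved as trainer_state.json (the file path and print formatting below are illustrative, not part of the checkpoint; the rendering on this page is truncated partway through the log):

import json

# Load the trainer state written alongside the checkpoint.
with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]  # one dict per logged step
print(f"logged entries: {len(history)} (global_step {state['global_step']})")
print(f"epochs completed: {state['epoch']:.4f}")
print(f"loss: {history[0]['loss']:.4f} (step {history[0]['step']}) "
      f"-> {history[-1]['loss']:.4f} (step {history[-1]['step']})")

# The schedule visible in this log: linear warmup from 0 over roughly
# the first 100 steps, then a constant learning rate of 1e-06.
peak_lr = max(entry["learning_rate"] for entry in history)
print(f"peak learning rate: {peak_lr:g}")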
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9985082048731975,
"eval_steps": 500,
"global_step": 1004,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001989060169070114,
"grad_norm": 4.783787250518799,
"learning_rate": 0.0,
"loss": 1.1878,
"num_tokens": 1379928.0,
"step": 1
},
{
"epoch": 0.003978120338140228,
"grad_norm": 4.881955623626709,
"learning_rate": 9.9009900990099e-09,
"loss": 1.2018,
"num_tokens": 2791003.0,
"step": 2
},
{
"epoch": 0.005967180507210343,
"grad_norm": 4.82396125793457,
"learning_rate": 1.98019801980198e-08,
"loss": 1.1913,
"num_tokens": 4194463.0,
"step": 3
},
{
"epoch": 0.007956240676280457,
"grad_norm": 4.832489967346191,
"learning_rate": 2.97029702970297e-08,
"loss": 1.1974,
"num_tokens": 5584143.0,
"step": 4
},
{
"epoch": 0.009945300845350571,
"grad_norm": 4.7777581214904785,
"learning_rate": 3.96039603960396e-08,
"loss": 1.1893,
"num_tokens": 7031244.0,
"step": 5
},
{
"epoch": 0.011934361014420686,
"grad_norm": 4.757320880889893,
"learning_rate": 4.950495049504951e-08,
"loss": 1.1874,
"num_tokens": 8436181.0,
"step": 6
},
{
"epoch": 0.0139234211834908,
"grad_norm": 4.777390480041504,
"learning_rate": 5.94059405940594e-08,
"loss": 1.1855,
"num_tokens": 9836769.0,
"step": 7
},
{
"epoch": 0.015912481352560914,
"grad_norm": 4.798787593841553,
"learning_rate": 6.930693069306931e-08,
"loss": 1.1791,
"num_tokens": 11244836.0,
"step": 8
},
{
"epoch": 0.01790154152163103,
"grad_norm": 4.807232856750488,
"learning_rate": 7.92079207920792e-08,
"loss": 1.181,
"num_tokens": 12623567.0,
"step": 9
},
{
"epoch": 0.019890601690701143,
"grad_norm": 4.721550464630127,
"learning_rate": 8.91089108910891e-08,
"loss": 1.1763,
"num_tokens": 14028360.0,
"step": 10
},
{
"epoch": 0.02187966185977126,
"grad_norm": 4.797355651855469,
"learning_rate": 9.900990099009901e-08,
"loss": 1.1935,
"num_tokens": 15405448.0,
"step": 11
},
{
"epoch": 0.023868722028841372,
"grad_norm": 4.754054069519043,
"learning_rate": 1.089108910891089e-07,
"loss": 1.1914,
"num_tokens": 16852417.0,
"step": 12
},
{
"epoch": 0.025857782197911485,
"grad_norm": 4.781761169433594,
"learning_rate": 1.188118811881188e-07,
"loss": 1.1883,
"num_tokens": 18245229.0,
"step": 13
},
{
"epoch": 0.0278468423669816,
"grad_norm": 4.794190406799316,
"learning_rate": 1.2871287128712872e-07,
"loss": 1.1829,
"num_tokens": 19630250.0,
"step": 14
},
{
"epoch": 0.029835902536051714,
"grad_norm": 4.7866363525390625,
"learning_rate": 1.3861386138613863e-07,
"loss": 1.2058,
"num_tokens": 21090140.0,
"step": 15
},
{
"epoch": 0.03182496270512183,
"grad_norm": 4.720057964324951,
"learning_rate": 1.485148514851485e-07,
"loss": 1.1942,
"num_tokens": 22459305.0,
"step": 16
},
{
"epoch": 0.03381402287419195,
"grad_norm": 4.714616775512695,
"learning_rate": 1.584158415841584e-07,
"loss": 1.1765,
"num_tokens": 23864152.0,
"step": 17
},
{
"epoch": 0.03580308304326206,
"grad_norm": 4.747438907623291,
"learning_rate": 1.6831683168316832e-07,
"loss": 1.1952,
"num_tokens": 25269279.0,
"step": 18
},
{
"epoch": 0.03779214321233217,
"grad_norm": 4.7715654373168945,
"learning_rate": 1.782178217821782e-07,
"loss": 1.1815,
"num_tokens": 26685193.0,
"step": 19
},
{
"epoch": 0.039781203381402286,
"grad_norm": 4.7065911293029785,
"learning_rate": 1.8811881188118812e-07,
"loss": 1.1685,
"num_tokens": 28076540.0,
"step": 20
},
{
"epoch": 0.0417702635504724,
"grad_norm": 4.74373197555542,
"learning_rate": 1.9801980198019803e-07,
"loss": 1.1932,
"num_tokens": 29489172.0,
"step": 21
},
{
"epoch": 0.04375932371954252,
"grad_norm": 4.572362899780273,
"learning_rate": 2.079207920792079e-07,
"loss": 1.1591,
"num_tokens": 30889595.0,
"step": 22
},
{
"epoch": 0.04574838388861263,
"grad_norm": 4.560796737670898,
"learning_rate": 2.178217821782178e-07,
"loss": 1.1752,
"num_tokens": 32298146.0,
"step": 23
},
{
"epoch": 0.047737444057682744,
"grad_norm": 4.545941352844238,
"learning_rate": 2.2772277227722772e-07,
"loss": 1.1707,
"num_tokens": 33729949.0,
"step": 24
},
{
"epoch": 0.04972650422675286,
"grad_norm": 4.548399448394775,
"learning_rate": 2.376237623762376e-07,
"loss": 1.1759,
"num_tokens": 35146745.0,
"step": 25
},
{
"epoch": 0.05171556439582297,
"grad_norm": 4.558365345001221,
"learning_rate": 2.475247524752475e-07,
"loss": 1.1859,
"num_tokens": 36534224.0,
"step": 26
},
{
"epoch": 0.05370462456489309,
"grad_norm": 4.54498291015625,
"learning_rate": 2.5742574257425743e-07,
"loss": 1.1644,
"num_tokens": 37942016.0,
"step": 27
},
{
"epoch": 0.0556936847339632,
"grad_norm": 4.540126323699951,
"learning_rate": 2.673267326732673e-07,
"loss": 1.1689,
"num_tokens": 39362015.0,
"step": 28
},
{
"epoch": 0.057682744903033316,
"grad_norm": 4.505973815917969,
"learning_rate": 2.7722772277227726e-07,
"loss": 1.158,
"num_tokens": 40746995.0,
"step": 29
},
{
"epoch": 0.05967180507210343,
"grad_norm": 4.145939350128174,
"learning_rate": 2.871287128712871e-07,
"loss": 1.147,
"num_tokens": 42184610.0,
"step": 30
},
{
"epoch": 0.06166086524117355,
"grad_norm": 4.096794128417969,
"learning_rate": 2.97029702970297e-07,
"loss": 1.1433,
"num_tokens": 43594850.0,
"step": 31
},
{
"epoch": 0.06364992541024365,
"grad_norm": 4.0818586349487305,
"learning_rate": 3.069306930693069e-07,
"loss": 1.1351,
"num_tokens": 45000725.0,
"step": 32
},
{
"epoch": 0.06563898557931377,
"grad_norm": 4.103175640106201,
"learning_rate": 3.168316831683168e-07,
"loss": 1.1303,
"num_tokens": 46389383.0,
"step": 33
},
{
"epoch": 0.0676280457483839,
"grad_norm": 4.075052738189697,
"learning_rate": 3.2673267326732674e-07,
"loss": 1.1427,
"num_tokens": 47780652.0,
"step": 34
},
{
"epoch": 0.069617105917454,
"grad_norm": 4.112435340881348,
"learning_rate": 3.3663366336633663e-07,
"loss": 1.14,
"num_tokens": 49201017.0,
"step": 35
},
{
"epoch": 0.07160616608652412,
"grad_norm": 4.016624450683594,
"learning_rate": 3.465346534653465e-07,
"loss": 1.142,
"num_tokens": 50577829.0,
"step": 36
},
{
"epoch": 0.07359522625559423,
"grad_norm": 3.9627022743225098,
"learning_rate": 3.564356435643564e-07,
"loss": 1.1189,
"num_tokens": 52003696.0,
"step": 37
},
{
"epoch": 0.07558428642466435,
"grad_norm": 4.006690502166748,
"learning_rate": 3.663366336633663e-07,
"loss": 1.1307,
"num_tokens": 53420296.0,
"step": 38
},
{
"epoch": 0.07757334659373447,
"grad_norm": 3.9357409477233887,
"learning_rate": 3.7623762376237623e-07,
"loss": 1.1279,
"num_tokens": 54812494.0,
"step": 39
},
{
"epoch": 0.07956240676280457,
"grad_norm": 3.903628349304199,
"learning_rate": 3.861386138613861e-07,
"loss": 1.1042,
"num_tokens": 56180118.0,
"step": 40
},
{
"epoch": 0.08155146693187469,
"grad_norm": 3.4105091094970703,
"learning_rate": 3.9603960396039606e-07,
"loss": 1.0558,
"num_tokens": 57631209.0,
"step": 41
},
{
"epoch": 0.0835405271009448,
"grad_norm": 3.0491228103637695,
"learning_rate": 4.0594059405940595e-07,
"loss": 1.0404,
"num_tokens": 59050835.0,
"step": 42
},
{
"epoch": 0.08552958727001492,
"grad_norm": 2.9168484210968018,
"learning_rate": 4.158415841584158e-07,
"loss": 1.033,
"num_tokens": 60439622.0,
"step": 43
},
{
"epoch": 0.08751864743908504,
"grad_norm": 2.8887298107147217,
"learning_rate": 4.257425742574257e-07,
"loss": 1.026,
"num_tokens": 61853701.0,
"step": 44
},
{
"epoch": 0.08950770760815514,
"grad_norm": 2.882795810699463,
"learning_rate": 4.356435643564356e-07,
"loss": 1.0189,
"num_tokens": 63243921.0,
"step": 45
},
{
"epoch": 0.09149676777722526,
"grad_norm": 2.826184034347534,
"learning_rate": 4.4554455445544555e-07,
"loss": 1.0267,
"num_tokens": 64692234.0,
"step": 46
},
{
"epoch": 0.09348582794629537,
"grad_norm": 2.837517261505127,
"learning_rate": 4.5544554455445543e-07,
"loss": 1.0188,
"num_tokens": 66095339.0,
"step": 47
},
{
"epoch": 0.09547488811536549,
"grad_norm": 2.7744252681732178,
"learning_rate": 4.6534653465346537e-07,
"loss": 1.021,
"num_tokens": 67470384.0,
"step": 48
},
{
"epoch": 0.09746394828443561,
"grad_norm": 2.7776126861572266,
"learning_rate": 4.752475247524752e-07,
"loss": 1.0072,
"num_tokens": 68907983.0,
"step": 49
},
{
"epoch": 0.09945300845350571,
"grad_norm": 2.696685552597046,
"learning_rate": 4.851485148514851e-07,
"loss": 1.0018,
"num_tokens": 70284975.0,
"step": 50
},
{
"epoch": 0.10144206862257583,
"grad_norm": 2.6469790935516357,
"learning_rate": 4.95049504950495e-07,
"loss": 0.999,
"num_tokens": 71668994.0,
"step": 51
},
{
"epoch": 0.10343112879164594,
"grad_norm": 2.5766100883483887,
"learning_rate": 5.04950495049505e-07,
"loss": 0.9941,
"num_tokens": 73087542.0,
"step": 52
},
{
"epoch": 0.10542018896071606,
"grad_norm": 2.5015177726745605,
"learning_rate": 5.148514851485149e-07,
"loss": 0.9808,
"num_tokens": 74502903.0,
"step": 53
},
{
"epoch": 0.10740924912978618,
"grad_norm": 2.4191646575927734,
"learning_rate": 5.247524752475247e-07,
"loss": 0.9769,
"num_tokens": 75902927.0,
"step": 54
},
{
"epoch": 0.10939830929885629,
"grad_norm": 2.2691502571105957,
"learning_rate": 5.346534653465346e-07,
"loss": 0.9584,
"num_tokens": 77315581.0,
"step": 55
},
{
"epoch": 0.1113873694679264,
"grad_norm": 2.0384151935577393,
"learning_rate": 5.445544554455445e-07,
"loss": 0.9216,
"num_tokens": 78698236.0,
"step": 56
},
{
"epoch": 0.11337642963699653,
"grad_norm": 1.7994420528411865,
"learning_rate": 5.544554455445545e-07,
"loss": 0.8971,
"num_tokens": 80129837.0,
"step": 57
},
{
"epoch": 0.11536548980606663,
"grad_norm": 1.6083354949951172,
"learning_rate": 5.643564356435643e-07,
"loss": 0.887,
"num_tokens": 81529934.0,
"step": 58
},
{
"epoch": 0.11735454997513675,
"grad_norm": 1.4472858905792236,
"learning_rate": 5.742574257425742e-07,
"loss": 0.8759,
"num_tokens": 82926044.0,
"step": 59
},
{
"epoch": 0.11934361014420686,
"grad_norm": 1.3287198543548584,
"learning_rate": 5.841584158415841e-07,
"loss": 0.8572,
"num_tokens": 84300935.0,
"step": 60
},
{
"epoch": 0.12133267031327698,
"grad_norm": 1.2509266138076782,
"learning_rate": 5.94059405940594e-07,
"loss": 0.8497,
"num_tokens": 85709827.0,
"step": 61
},
{
"epoch": 0.1233217304823471,
"grad_norm": 1.2169718742370605,
"learning_rate": 6.03960396039604e-07,
"loss": 0.8503,
"num_tokens": 87094581.0,
"step": 62
},
{
"epoch": 0.1253107906514172,
"grad_norm": 1.1747347116470337,
"learning_rate": 6.138613861386138e-07,
"loss": 0.8499,
"num_tokens": 88486857.0,
"step": 63
},
{
"epoch": 0.1272998508204873,
"grad_norm": 1.108124852180481,
"learning_rate": 6.237623762376237e-07,
"loss": 0.8385,
"num_tokens": 89892391.0,
"step": 64
},
{
"epoch": 0.12928891098955744,
"grad_norm": 1.080589771270752,
"learning_rate": 6.336633663366336e-07,
"loss": 0.8415,
"num_tokens": 91290866.0,
"step": 65
},
{
"epoch": 0.13127797115862755,
"grad_norm": 1.0345003604888916,
"learning_rate": 6.435643564356436e-07,
"loss": 0.8287,
"num_tokens": 92700954.0,
"step": 66
},
{
"epoch": 0.13326703132769765,
"grad_norm": 0.9741297364234924,
"learning_rate": 6.534653465346535e-07,
"loss": 0.8193,
"num_tokens": 94131765.0,
"step": 67
},
{
"epoch": 0.1352560914967678,
"grad_norm": 0.9597648978233337,
"learning_rate": 6.633663366336634e-07,
"loss": 0.8245,
"num_tokens": 95516370.0,
"step": 68
},
{
"epoch": 0.1372451516658379,
"grad_norm": 0.9024690985679626,
"learning_rate": 6.732673267326733e-07,
"loss": 0.8203,
"num_tokens": 96921506.0,
"step": 69
},
{
"epoch": 0.139234211834908,
"grad_norm": 0.8436392545700073,
"learning_rate": 6.831683168316831e-07,
"loss": 0.8005,
"num_tokens": 98292636.0,
"step": 70
},
{
"epoch": 0.14122327200397813,
"grad_norm": 0.7839356660842896,
"learning_rate": 6.93069306930693e-07,
"loss": 0.832,
"num_tokens": 99743934.0,
"step": 71
},
{
"epoch": 0.14321233217304824,
"grad_norm": 0.7109583616256714,
"learning_rate": 7.029702970297029e-07,
"loss": 0.794,
"num_tokens": 101144081.0,
"step": 72
},
{
"epoch": 0.14520139234211835,
"grad_norm": 0.6535574793815613,
"learning_rate": 7.128712871287128e-07,
"loss": 0.8081,
"num_tokens": 102546821.0,
"step": 73
},
{
"epoch": 0.14719045251118845,
"grad_norm": 0.5921033620834351,
"learning_rate": 7.227722772277227e-07,
"loss": 0.7886,
"num_tokens": 103957723.0,
"step": 74
},
{
"epoch": 0.14917951268025859,
"grad_norm": 0.5575245022773743,
"learning_rate": 7.326732673267326e-07,
"loss": 0.783,
"num_tokens": 105353702.0,
"step": 75
},
{
"epoch": 0.1511685728493287,
"grad_norm": 0.5279855132102966,
"learning_rate": 7.425742574257426e-07,
"loss": 0.7729,
"num_tokens": 106790931.0,
"step": 76
},
{
"epoch": 0.1531576330183988,
"grad_norm": 0.5107793211936951,
"learning_rate": 7.524752475247525e-07,
"loss": 0.7583,
"num_tokens": 108181851.0,
"step": 77
},
{
"epoch": 0.15514669318746893,
"grad_norm": 0.5129069089889526,
"learning_rate": 7.623762376237624e-07,
"loss": 0.7624,
"num_tokens": 109558589.0,
"step": 78
},
{
"epoch": 0.15713575335653904,
"grad_norm": 0.479915589094162,
"learning_rate": 7.722772277227722e-07,
"loss": 0.7378,
"num_tokens": 110915629.0,
"step": 79
},
{
"epoch": 0.15912481352560914,
"grad_norm": 0.4730769991874695,
"learning_rate": 7.821782178217821e-07,
"loss": 0.7619,
"num_tokens": 112326351.0,
"step": 80
},
{
"epoch": 0.16111387369467928,
"grad_norm": 0.4519067406654358,
"learning_rate": 7.920792079207921e-07,
"loss": 0.7459,
"num_tokens": 113740610.0,
"step": 81
},
{
"epoch": 0.16310293386374938,
"grad_norm": 0.4359944760799408,
"learning_rate": 8.01980198019802e-07,
"loss": 0.749,
"num_tokens": 115148611.0,
"step": 82
},
{
"epoch": 0.1650919940328195,
"grad_norm": 0.4051918685436249,
"learning_rate": 8.118811881188119e-07,
"loss": 0.7447,
"num_tokens": 116569175.0,
"step": 83
},
{
"epoch": 0.1670810542018896,
"grad_norm": 0.38103756308555603,
"learning_rate": 8.217821782178217e-07,
"loss": 0.7469,
"num_tokens": 117975546.0,
"step": 84
},
{
"epoch": 0.16907011437095973,
"grad_norm": 0.360709011554718,
"learning_rate": 8.316831683168316e-07,
"loss": 0.722,
"num_tokens": 119391367.0,
"step": 85
},
{
"epoch": 0.17105917454002983,
"grad_norm": 0.3431180417537689,
"learning_rate": 8.415841584158416e-07,
"loss": 0.7323,
"num_tokens": 120769077.0,
"step": 86
},
{
"epoch": 0.17304823470909994,
"grad_norm": 0.32238441705703735,
"learning_rate": 8.514851485148514e-07,
"loss": 0.7254,
"num_tokens": 122218630.0,
"step": 87
},
{
"epoch": 0.17503729487817007,
"grad_norm": 0.3144312798976898,
"learning_rate": 8.613861386138613e-07,
"loss": 0.741,
"num_tokens": 123638408.0,
"step": 88
},
{
"epoch": 0.17702635504724018,
"grad_norm": 0.2969042658805847,
"learning_rate": 8.712871287128712e-07,
"loss": 0.707,
"num_tokens": 125024280.0,
"step": 89
},
{
"epoch": 0.1790154152163103,
"grad_norm": 0.28798267245292664,
"learning_rate": 8.811881188118812e-07,
"loss": 0.7082,
"num_tokens": 126462816.0,
"step": 90
},
{
"epoch": 0.18100447538538042,
"grad_norm": 0.2822662591934204,
"learning_rate": 8.910891089108911e-07,
"loss": 0.7139,
"num_tokens": 127856241.0,
"step": 91
},
{
"epoch": 0.18299353555445053,
"grad_norm": 0.275879830121994,
"learning_rate": 9.00990099009901e-07,
"loss": 0.7113,
"num_tokens": 129247744.0,
"step": 92
},
{
"epoch": 0.18498259572352063,
"grad_norm": 0.26023879647254944,
"learning_rate": 9.108910891089109e-07,
"loss": 0.7,
"num_tokens": 130653045.0,
"step": 93
},
{
"epoch": 0.18697165589259074,
"grad_norm": 0.2475547045469284,
"learning_rate": 9.207920792079208e-07,
"loss": 0.7075,
"num_tokens": 132075768.0,
"step": 94
},
{
"epoch": 0.18896071606166087,
"grad_norm": 0.23815281689167023,
"learning_rate": 9.306930693069307e-07,
"loss": 0.7034,
"num_tokens": 133471787.0,
"step": 95
},
{
"epoch": 0.19094977623073098,
"grad_norm": 0.2257402390241623,
"learning_rate": 9.405940594059405e-07,
"loss": 0.7005,
"num_tokens": 134884172.0,
"step": 96
},
{
"epoch": 0.19293883639980108,
"grad_norm": 0.21398675441741943,
"learning_rate": 9.504950495049504e-07,
"loss": 0.6954,
"num_tokens": 136321122.0,
"step": 97
},
{
"epoch": 0.19492789656887122,
"grad_norm": 0.21331369876861572,
"learning_rate": 9.603960396039604e-07,
"loss": 0.6931,
"num_tokens": 137714653.0,
"step": 98
},
{
"epoch": 0.19691695673794132,
"grad_norm": 0.21159325540065765,
"learning_rate": 9.702970297029702e-07,
"loss": 0.7108,
"num_tokens": 139151591.0,
"step": 99
},
{
"epoch": 0.19890601690701143,
"grad_norm": 0.20192930102348328,
"learning_rate": 9.801980198019802e-07,
"loss": 0.6838,
"num_tokens": 140552794.0,
"step": 100
},
{
"epoch": 0.20089507707608156,
"grad_norm": 0.20033682882785797,
"learning_rate": 9.9009900990099e-07,
"loss": 0.6839,
"num_tokens": 141962296.0,
"step": 101
},
{
"epoch": 0.20288413724515167,
"grad_norm": 0.1956896036863327,
"learning_rate": 1e-06,
"loss": 0.6853,
"num_tokens": 143391533.0,
"step": 102
},
{
"epoch": 0.20487319741422177,
"grad_norm": 0.1973898708820343,
"learning_rate": 1e-06,
"loss": 0.6867,
"num_tokens": 144782734.0,
"step": 103
},
{
"epoch": 0.20686225758329188,
"grad_norm": 0.17975734174251556,
"learning_rate": 1e-06,
"loss": 0.6773,
"num_tokens": 146195534.0,
"step": 104
},
{
"epoch": 0.20885131775236201,
"grad_norm": 0.17711520195007324,
"learning_rate": 1e-06,
"loss": 0.6582,
"num_tokens": 147581967.0,
"step": 105
},
{
"epoch": 0.21084037792143212,
"grad_norm": 0.1741390824317932,
"learning_rate": 1e-06,
"loss": 0.6852,
"num_tokens": 148962092.0,
"step": 106
},
{
"epoch": 0.21282943809050223,
"grad_norm": 0.1642421931028366,
"learning_rate": 1e-06,
"loss": 0.6722,
"num_tokens": 150367148.0,
"step": 107
},
{
"epoch": 0.21481849825957236,
"grad_norm": 0.1667676866054535,
"learning_rate": 1e-06,
"loss": 0.6628,
"num_tokens": 151756604.0,
"step": 108
},
{
"epoch": 0.21680755842864247,
"grad_norm": 0.1586826741695404,
"learning_rate": 1e-06,
"loss": 0.6622,
"num_tokens": 153175572.0,
"step": 109
},
{
"epoch": 0.21879661859771257,
"grad_norm": 0.15848655998706818,
"learning_rate": 1e-06,
"loss": 0.6775,
"num_tokens": 154616863.0,
"step": 110
},
{
"epoch": 0.2207856787667827,
"grad_norm": 0.15296334028244019,
"learning_rate": 1e-06,
"loss": 0.6544,
"num_tokens": 156003361.0,
"step": 111
},
{
"epoch": 0.2227747389358528,
"grad_norm": 0.1545649766921997,
"learning_rate": 1e-06,
"loss": 0.6646,
"num_tokens": 157395330.0,
"step": 112
},
{
"epoch": 0.22476379910492292,
"grad_norm": 0.15351980924606323,
"learning_rate": 1e-06,
"loss": 0.6646,
"num_tokens": 158812399.0,
"step": 113
},
{
"epoch": 0.22675285927399305,
"grad_norm": 0.14907206594944,
"learning_rate": 1e-06,
"loss": 0.6508,
"num_tokens": 160216957.0,
"step": 114
},
{
"epoch": 0.22874191944306316,
"grad_norm": 0.14644992351531982,
"learning_rate": 1e-06,
"loss": 0.6644,
"num_tokens": 161681970.0,
"step": 115
},
{
"epoch": 0.23073097961213326,
"grad_norm": 0.15030954778194427,
"learning_rate": 1e-06,
"loss": 0.6432,
"num_tokens": 163031303.0,
"step": 116
},
{
"epoch": 0.23272003978120337,
"grad_norm": 0.1434543877840042,
"learning_rate": 1e-06,
"loss": 0.647,
"num_tokens": 164438690.0,
"step": 117
},
{
"epoch": 0.2347090999502735,
"grad_norm": 0.14714758098125458,
"learning_rate": 1e-06,
"loss": 0.6547,
"num_tokens": 165848650.0,
"step": 118
},
{
"epoch": 0.2366981601193436,
"grad_norm": 0.14545480906963348,
"learning_rate": 1e-06,
"loss": 0.6666,
"num_tokens": 167315232.0,
"step": 119
},
{
"epoch": 0.23868722028841372,
"grad_norm": 0.14221689105033875,
"learning_rate": 1e-06,
"loss": 0.6481,
"num_tokens": 168709749.0,
"step": 120
},
{
"epoch": 0.24067628045748385,
"grad_norm": 0.14459247887134552,
"learning_rate": 1e-06,
"loss": 0.6361,
"num_tokens": 170093537.0,
"step": 121
},
{
"epoch": 0.24266534062655395,
"grad_norm": 0.14327335357666016,
"learning_rate": 1e-06,
"loss": 0.6513,
"num_tokens": 171464842.0,
"step": 122
},
{
"epoch": 0.24465440079562406,
"grad_norm": 0.13770653307437897,
"learning_rate": 1e-06,
"loss": 0.641,
"num_tokens": 172860600.0,
"step": 123
},
{
"epoch": 0.2466434609646942,
"grad_norm": 0.1363484412431717,
"learning_rate": 1e-06,
"loss": 0.6445,
"num_tokens": 174305923.0,
"step": 124
},
{
"epoch": 0.2486325211337643,
"grad_norm": 0.1345747411251068,
"learning_rate": 1e-06,
"loss": 0.6374,
"num_tokens": 175699135.0,
"step": 125
},
{
"epoch": 0.2506215813028344,
"grad_norm": 0.13693904876708984,
"learning_rate": 1e-06,
"loss": 0.6491,
"num_tokens": 177132442.0,
"step": 126
},
{
"epoch": 0.25261064147190454,
"grad_norm": 0.1340012401342392,
"learning_rate": 1e-06,
"loss": 0.6349,
"num_tokens": 178530726.0,
"step": 127
},
{
"epoch": 0.2545997016409746,
"grad_norm": 0.13670295476913452,
"learning_rate": 1e-06,
"loss": 0.6439,
"num_tokens": 179929069.0,
"step": 128
},
{
"epoch": 0.25658876181004475,
"grad_norm": 0.13612930476665497,
"learning_rate": 1e-06,
"loss": 0.6382,
"num_tokens": 181362595.0,
"step": 129
},
{
"epoch": 0.2585778219791149,
"grad_norm": 0.13262143731117249,
"learning_rate": 1e-06,
"loss": 0.636,
"num_tokens": 182758225.0,
"step": 130
},
{
"epoch": 0.26056688214818496,
"grad_norm": 0.13065095245838165,
"learning_rate": 1e-06,
"loss": 0.6339,
"num_tokens": 184186457.0,
"step": 131
},
{
"epoch": 0.2625559423172551,
"grad_norm": 0.13491177558898926,
"learning_rate": 1e-06,
"loss": 0.6424,
"num_tokens": 185597538.0,
"step": 132
},
{
"epoch": 0.26454500248632523,
"grad_norm": 0.13115911185741425,
"learning_rate": 1e-06,
"loss": 0.636,
"num_tokens": 186999644.0,
"step": 133
},
{
"epoch": 0.2665340626553953,
"grad_norm": 0.13199158012866974,
"learning_rate": 1e-06,
"loss": 0.6404,
"num_tokens": 188383971.0,
"step": 134
},
{
"epoch": 0.26852312282446544,
"grad_norm": 0.1268879771232605,
"learning_rate": 1e-06,
"loss": 0.6458,
"num_tokens": 189800770.0,
"step": 135
},
{
"epoch": 0.2705121829935356,
"grad_norm": 0.12490399926900864,
"learning_rate": 1e-06,
"loss": 0.6437,
"num_tokens": 191189967.0,
"step": 136
},
{
"epoch": 0.27250124316260566,
"grad_norm": 0.1264086365699768,
"learning_rate": 1e-06,
"loss": 0.6015,
"num_tokens": 192578506.0,
"step": 137
},
{
"epoch": 0.2744903033316758,
"grad_norm": 0.1346651315689087,
"learning_rate": 1e-06,
"loss": 0.6271,
"num_tokens": 193939223.0,
"step": 138
},
{
"epoch": 0.2764793635007459,
"grad_norm": 0.12618686258792877,
"learning_rate": 1e-06,
"loss": 0.6249,
"num_tokens": 195333601.0,
"step": 139
},
{
"epoch": 0.278468423669816,
"grad_norm": 0.12344113737344742,
"learning_rate": 1e-06,
"loss": 0.6314,
"num_tokens": 196788664.0,
"step": 140
},
{
"epoch": 0.28045748383888613,
"grad_norm": 0.12506203353405,
"learning_rate": 1e-06,
"loss": 0.6282,
"num_tokens": 198163401.0,
"step": 141
},
{
"epoch": 0.28244654400795627,
"grad_norm": 0.12806549668312073,
"learning_rate": 1e-06,
"loss": 0.6304,
"num_tokens": 199567499.0,
"step": 142
},
{
"epoch": 0.28443560417702635,
"grad_norm": 0.12447736412286758,
"learning_rate": 1e-06,
"loss": 0.6145,
"num_tokens": 200961717.0,
"step": 143
},
{
"epoch": 0.2864246643460965,
"grad_norm": 0.123995341360569,
"learning_rate": 1e-06,
"loss": 0.6296,
"num_tokens": 202374356.0,
"step": 144
},
{
"epoch": 0.28841372451516656,
"grad_norm": 0.12191283702850342,
"learning_rate": 1e-06,
"loss": 0.6255,
"num_tokens": 203763067.0,
"step": 145
},
{
"epoch": 0.2904027846842367,
"grad_norm": 0.12908770143985748,
"learning_rate": 1e-06,
"loss": 0.6134,
"num_tokens": 205148329.0,
"step": 146
},
{
"epoch": 0.2923918448533068,
"grad_norm": 0.12782661616802216,
"learning_rate": 1e-06,
"loss": 0.6366,
"num_tokens": 206571749.0,
"step": 147
},
{
"epoch": 0.2943809050223769,
"grad_norm": 0.12458013743162155,
"learning_rate": 1e-06,
"loss": 0.6274,
"num_tokens": 207945787.0,
"step": 148
},
{
"epoch": 0.29636996519144704,
"grad_norm": 0.1228199154138565,
"learning_rate": 1e-06,
"loss": 0.6126,
"num_tokens": 209368496.0,
"step": 149
},
{
"epoch": 0.29835902536051717,
"grad_norm": 0.11965636909008026,
"learning_rate": 1e-06,
"loss": 0.6222,
"num_tokens": 210815947.0,
"step": 150
},
{
"epoch": 0.30034808552958725,
"grad_norm": 0.1323099434375763,
"learning_rate": 1e-06,
"loss": 0.6242,
"num_tokens": 212216246.0,
"step": 151
},
{
"epoch": 0.3023371456986574,
"grad_norm": 0.12547767162322998,
"learning_rate": 1e-06,
"loss": 0.6129,
"num_tokens": 213565534.0,
"step": 152
},
{
"epoch": 0.3043262058677275,
"grad_norm": 0.12437719106674194,
"learning_rate": 1e-06,
"loss": 0.6268,
"num_tokens": 215022812.0,
"step": 153
},
{
"epoch": 0.3063152660367976,
"grad_norm": 0.12172248214483261,
"learning_rate": 1e-06,
"loss": 0.6158,
"num_tokens": 216418038.0,
"step": 154
},
{
"epoch": 0.30830432620586773,
"grad_norm": 0.12287899106740952,
"learning_rate": 1e-06,
"loss": 0.6165,
"num_tokens": 217835190.0,
"step": 155
},
{
"epoch": 0.31029338637493786,
"grad_norm": 0.12283938378095627,
"learning_rate": 1e-06,
"loss": 0.6088,
"num_tokens": 219203881.0,
"step": 156
},
{
"epoch": 0.31228244654400794,
"grad_norm": 0.12894928455352783,
"learning_rate": 1e-06,
"loss": 0.6245,
"num_tokens": 220596713.0,
"step": 157
},
{
"epoch": 0.3142715067130781,
"grad_norm": 0.12768110632896423,
"learning_rate": 1e-06,
"loss": 0.6148,
"num_tokens": 221990986.0,
"step": 158
},
{
"epoch": 0.3162605668821482,
"grad_norm": 0.11902462691068649,
"learning_rate": 1e-06,
"loss": 0.6134,
"num_tokens": 223430785.0,
"step": 159
},
{
"epoch": 0.3182496270512183,
"grad_norm": 0.12414086610078812,
"learning_rate": 1e-06,
"loss": 0.6236,
"num_tokens": 224823735.0,
"step": 160
},
{
"epoch": 0.3202386872202884,
"grad_norm": 0.12295661866664886,
"learning_rate": 1e-06,
"loss": 0.6222,
"num_tokens": 226213815.0,
"step": 161
},
{
"epoch": 0.32222774738935855,
"grad_norm": 0.12154286354780197,
"learning_rate": 1e-06,
"loss": 0.6154,
"num_tokens": 227589479.0,
"step": 162
},
{
"epoch": 0.32421680755842863,
"grad_norm": 0.12159918993711472,
"learning_rate": 1e-06,
"loss": 0.6112,
"num_tokens": 228978021.0,
"step": 163
},
{
"epoch": 0.32620586772749877,
"grad_norm": 0.1207566186785698,
"learning_rate": 1e-06,
"loss": 0.6101,
"num_tokens": 230373569.0,
"step": 164
},
{
"epoch": 0.32819492789656884,
"grad_norm": 0.1222677007317543,
"learning_rate": 1e-06,
"loss": 0.6089,
"num_tokens": 231771815.0,
"step": 165
},
{
"epoch": 0.330183988065639,
"grad_norm": 0.1245197132229805,
"learning_rate": 1e-06,
"loss": 0.6095,
"num_tokens": 233172925.0,
"step": 166
},
{
"epoch": 0.3321730482347091,
"grad_norm": 0.1191013976931572,
"learning_rate": 1e-06,
"loss": 0.6138,
"num_tokens": 234578536.0,
"step": 167
},
{
"epoch": 0.3341621084037792,
"grad_norm": 0.12282473593950272,
"learning_rate": 1e-06,
"loss": 0.605,
"num_tokens": 235997856.0,
"step": 168
},
{
"epoch": 0.3361511685728493,
"grad_norm": 0.11851899325847626,
"learning_rate": 1e-06,
"loss": 0.6224,
"num_tokens": 237456733.0,
"step": 169
},
{
"epoch": 0.33814022874191946,
"grad_norm": 0.12077327817678452,
"learning_rate": 1e-06,
"loss": 0.6026,
"num_tokens": 238849801.0,
"step": 170
},
{
"epoch": 0.34012928891098954,
"grad_norm": 0.11882904917001724,
"learning_rate": 1e-06,
"loss": 0.6126,
"num_tokens": 240265062.0,
"step": 171
},
{
"epoch": 0.34211834908005967,
"grad_norm": 0.12201754748821259,
"learning_rate": 1e-06,
"loss": 0.5968,
"num_tokens": 241660845.0,
"step": 172
},
{
"epoch": 0.3441074092491298,
"grad_norm": 0.11851049214601517,
"learning_rate": 1e-06,
"loss": 0.6045,
"num_tokens": 243073416.0,
"step": 173
},
{
"epoch": 0.3460964694181999,
"grad_norm": 0.1223016306757927,
"learning_rate": 1e-06,
"loss": 0.6081,
"num_tokens": 244457775.0,
"step": 174
},
{
"epoch": 0.34808552958727,
"grad_norm": 0.11766277253627777,
"learning_rate": 1e-06,
"loss": 0.6115,
"num_tokens": 245880299.0,
"step": 175
},
{
"epoch": 0.35007458975634015,
"grad_norm": 0.12223277240991592,
"learning_rate": 1e-06,
"loss": 0.6147,
"num_tokens": 247285182.0,
"step": 176
},
{
"epoch": 0.3520636499254102,
"grad_norm": 0.12305691093206406,
"learning_rate": 1e-06,
"loss": 0.621,
"num_tokens": 248698180.0,
"step": 177
},
{
"epoch": 0.35405271009448036,
"grad_norm": 0.12199797481298447,
"learning_rate": 1e-06,
"loss": 0.6028,
"num_tokens": 250069197.0,
"step": 178
},
{
"epoch": 0.3560417702635505,
"grad_norm": 0.12232685089111328,
"learning_rate": 1e-06,
"loss": 0.6219,
"num_tokens": 251503045.0,
"step": 179
},
{
"epoch": 0.3580308304326206,
"grad_norm": 0.12115549296140671,
"learning_rate": 1e-06,
"loss": 0.6139,
"num_tokens": 252942250.0,
"step": 180
},
{
"epoch": 0.3600198906016907,
"grad_norm": 0.12164245545864105,
"learning_rate": 1e-06,
"loss": 0.6158,
"num_tokens": 254386360.0,
"step": 181
},
{
"epoch": 0.36200895077076084,
"grad_norm": 0.12038590759038925,
"learning_rate": 1e-06,
"loss": 0.6171,
"num_tokens": 255791663.0,
"step": 182
},
{
"epoch": 0.3639980109398309,
"grad_norm": 0.12380865961313248,
"learning_rate": 1e-06,
"loss": 0.6146,
"num_tokens": 257180435.0,
"step": 183
},
{
"epoch": 0.36598707110890105,
"grad_norm": 0.12001994252204895,
"learning_rate": 1e-06,
"loss": 0.5962,
"num_tokens": 258575017.0,
"step": 184
},
{
"epoch": 0.3679761312779712,
"grad_norm": 0.12672315537929535,
"learning_rate": 1e-06,
"loss": 0.5982,
"num_tokens": 259971664.0,
"step": 185
},
{
"epoch": 0.36996519144704126,
"grad_norm": 0.12033303827047348,
"learning_rate": 1e-06,
"loss": 0.5976,
"num_tokens": 261365988.0,
"step": 186
},
{
"epoch": 0.3719542516161114,
"grad_norm": 0.12143931537866592,
"learning_rate": 1e-06,
"loss": 0.5963,
"num_tokens": 262783906.0,
"step": 187
},
{
"epoch": 0.3739433117851815,
"grad_norm": 0.12255272269248962,
"learning_rate": 1e-06,
"loss": 0.6123,
"num_tokens": 264206687.0,
"step": 188
},
{
"epoch": 0.3759323719542516,
"grad_norm": 0.1221979632973671,
"learning_rate": 1e-06,
"loss": 0.5967,
"num_tokens": 265600406.0,
"step": 189
},
{
"epoch": 0.37792143212332174,
"grad_norm": 0.11851304769515991,
"learning_rate": 1e-06,
"loss": 0.5885,
"num_tokens": 266998184.0,
"step": 190
},
{
"epoch": 0.3799104922923918,
"grad_norm": 0.11754640191793442,
"learning_rate": 1e-06,
"loss": 0.6109,
"num_tokens": 268432018.0,
"step": 191
},
{
"epoch": 0.38189955246146196,
"grad_norm": 0.12079335004091263,
"learning_rate": 1e-06,
"loss": 0.6018,
"num_tokens": 269811989.0,
"step": 192
},
{
"epoch": 0.3838886126305321,
"grad_norm": 0.12268039584159851,
"learning_rate": 1e-06,
"loss": 0.6076,
"num_tokens": 271216935.0,
"step": 193
},
{
"epoch": 0.38587767279960217,
"grad_norm": 0.12338917702436447,
"learning_rate": 1e-06,
"loss": 0.6127,
"num_tokens": 272636458.0,
"step": 194
},
{
"epoch": 0.3878667329686723,
"grad_norm": 0.11832890659570694,
"learning_rate": 1e-06,
"loss": 0.6117,
"num_tokens": 274045339.0,
"step": 195
},
{
"epoch": 0.38985579313774243,
"grad_norm": 0.11602655798196793,
"learning_rate": 1e-06,
"loss": 0.6099,
"num_tokens": 275455795.0,
"step": 196
},
{
"epoch": 0.3918448533068125,
"grad_norm": 0.11740183085203171,
"learning_rate": 1e-06,
"loss": 0.6065,
"num_tokens": 276879525.0,
"step": 197
},
{
"epoch": 0.39383391347588265,
"grad_norm": 0.12076492607593536,
"learning_rate": 1e-06,
"loss": 0.6082,
"num_tokens": 278290807.0,
"step": 198
},
{
"epoch": 0.3958229736449528,
"grad_norm": 0.12188921868801117,
"learning_rate": 1e-06,
"loss": 0.5984,
"num_tokens": 279716188.0,
"step": 199
},
{
"epoch": 0.39781203381402286,
"grad_norm": 0.12162777781486511,
"learning_rate": 1e-06,
"loss": 0.6029,
"num_tokens": 281099544.0,
"step": 200
},
{
"epoch": 0.399801093983093,
"grad_norm": 0.11848218739032745,
"learning_rate": 1e-06,
"loss": 0.5823,
"num_tokens": 282469441.0,
"step": 201
},
{
"epoch": 0.4017901541521631,
"grad_norm": 0.1235736832022667,
"learning_rate": 1e-06,
"loss": 0.5956,
"num_tokens": 283884792.0,
"step": 202
},
{
"epoch": 0.4037792143212332,
"grad_norm": 0.1228078156709671,
"learning_rate": 1e-06,
"loss": 0.6079,
"num_tokens": 285286535.0,
"step": 203
},
{
"epoch": 0.40576827449030334,
"grad_norm": 0.12109891325235367,
"learning_rate": 1e-06,
"loss": 0.6001,
"num_tokens": 286679118.0,
"step": 204
},
{
"epoch": 0.40775733465937347,
"grad_norm": 0.12217779457569122,
"learning_rate": 1e-06,
"loss": 0.6023,
"num_tokens": 288108758.0,
"step": 205
},
{
"epoch": 0.40974639482844355,
"grad_norm": 0.12233875691890717,
"learning_rate": 1e-06,
"loss": 0.5983,
"num_tokens": 289524538.0,
"step": 206
},
{
"epoch": 0.4117354549975137,
"grad_norm": 0.12142330408096313,
"learning_rate": 1e-06,
"loss": 0.6072,
"num_tokens": 290911340.0,
"step": 207
},
{
"epoch": 0.41372451516658376,
"grad_norm": 0.12050288170576096,
"learning_rate": 1e-06,
"loss": 0.5899,
"num_tokens": 292301970.0,
"step": 208
},
{
"epoch": 0.4157135753356539,
"grad_norm": 0.12356843799352646,
"learning_rate": 1e-06,
"loss": 0.5958,
"num_tokens": 293718750.0,
"step": 209
},
{
"epoch": 0.41770263550472403,
"grad_norm": 0.12285276502370834,
"learning_rate": 1e-06,
"loss": 0.5915,
"num_tokens": 295137029.0,
"step": 210
},
{
"epoch": 0.4196916956737941,
"grad_norm": 0.12178198248147964,
"learning_rate": 1e-06,
"loss": 0.6015,
"num_tokens": 296516937.0,
"step": 211
},
{
"epoch": 0.42168075584286424,
"grad_norm": 0.12042722851037979,
"learning_rate": 1e-06,
"loss": 0.592,
"num_tokens": 297914166.0,
"step": 212
},
{
"epoch": 0.4236698160119344,
"grad_norm": 0.12015018612146378,
"learning_rate": 1e-06,
"loss": 0.5917,
"num_tokens": 299291477.0,
"step": 213
},
{
"epoch": 0.42565887618100445,
"grad_norm": 0.11729878187179565,
"learning_rate": 1e-06,
"loss": 0.5902,
"num_tokens": 300690831.0,
"step": 214
},
{
"epoch": 0.4276479363500746,
"grad_norm": 0.11877918988466263,
"learning_rate": 1e-06,
"loss": 0.5837,
"num_tokens": 302125996.0,
"step": 215
},
{
"epoch": 0.4296369965191447,
"grad_norm": 0.12102558463811874,
"learning_rate": 1e-06,
"loss": 0.6027,
"num_tokens": 303541215.0,
"step": 216
},
{
"epoch": 0.4316260566882148,
"grad_norm": 0.11814971268177032,
"learning_rate": 1e-06,
"loss": 0.5912,
"num_tokens": 304932140.0,
"step": 217
},
{
"epoch": 0.43361511685728493,
"grad_norm": 0.11896595358848572,
"learning_rate": 1e-06,
"loss": 0.6004,
"num_tokens": 306387713.0,
"step": 218
},
{
"epoch": 0.43560417702635507,
"grad_norm": 0.12220481783151627,
"learning_rate": 1e-06,
"loss": 0.6155,
"num_tokens": 307820317.0,
"step": 219
},
{
"epoch": 0.43759323719542514,
"grad_norm": 0.11805558949708939,
"learning_rate": 1e-06,
"loss": 0.5929,
"num_tokens": 309219520.0,
"step": 220
},
{
"epoch": 0.4395822973644953,
"grad_norm": 0.12132007628679276,
"learning_rate": 1e-06,
"loss": 0.5837,
"num_tokens": 310591459.0,
"step": 221
},
{
"epoch": 0.4415713575335654,
"grad_norm": 0.11762286722660065,
"learning_rate": 1e-06,
"loss": 0.5861,
"num_tokens": 312008252.0,
"step": 222
},
{
"epoch": 0.4435604177026355,
"grad_norm": 0.11986927688121796,
"learning_rate": 1e-06,
"loss": 0.594,
"num_tokens": 313425810.0,
"step": 223
},
{
"epoch": 0.4455494778717056,
"grad_norm": 0.11796683073043823,
"learning_rate": 1e-06,
"loss": 0.5866,
"num_tokens": 314853879.0,
"step": 224
},
{
"epoch": 0.44753853804077576,
"grad_norm": 0.12010174244642258,
"learning_rate": 1e-06,
"loss": 0.6066,
"num_tokens": 316294854.0,
"step": 225
},
{
"epoch": 0.44952759820984584,
"grad_norm": 0.11594483256340027,
"learning_rate": 1e-06,
"loss": 0.5822,
"num_tokens": 317700098.0,
"step": 226
},
{
"epoch": 0.45151665837891597,
"grad_norm": 0.11875201016664505,
"learning_rate": 1e-06,
"loss": 0.6061,
"num_tokens": 319128053.0,
"step": 227
},
{
"epoch": 0.4535057185479861,
"grad_norm": 0.1269533336162567,
"learning_rate": 1e-06,
"loss": 0.5777,
"num_tokens": 320515582.0,
"step": 228
},
{
"epoch": 0.4554947787170562,
"grad_norm": 0.11610081046819687,
"learning_rate": 1e-06,
"loss": 0.5855,
"num_tokens": 321928550.0,
"step": 229
},
{
"epoch": 0.4574838388861263,
"grad_norm": 0.12167216837406158,
"learning_rate": 1e-06,
"loss": 0.5834,
"num_tokens": 323331818.0,
"step": 230
},
{
"epoch": 0.4594728990551964,
"grad_norm": 0.1215236485004425,
"learning_rate": 1e-06,
"loss": 0.5922,
"num_tokens": 324737162.0,
"step": 231
},
{
"epoch": 0.4614619592242665,
"grad_norm": 0.12563477456569672,
"learning_rate": 1e-06,
"loss": 0.5898,
"num_tokens": 326131119.0,
"step": 232
},
{
"epoch": 0.46345101939333666,
"grad_norm": 0.12811586260795593,
"learning_rate": 1e-06,
"loss": 0.5749,
"num_tokens": 327476246.0,
"step": 233
},
{
"epoch": 0.46544007956240674,
"grad_norm": 0.12052274495363235,
"learning_rate": 1e-06,
"loss": 0.5946,
"num_tokens": 328874811.0,
"step": 234
},
{
"epoch": 0.4674291397314769,
"grad_norm": 0.11987277865409851,
"learning_rate": 1e-06,
"loss": 0.5886,
"num_tokens": 330249034.0,
"step": 235
},
{
"epoch": 0.469418199900547,
"grad_norm": 0.11991394311189651,
"learning_rate": 1e-06,
"loss": 0.5796,
"num_tokens": 331639806.0,
"step": 236
},
{
"epoch": 0.4714072600696171,
"grad_norm": 0.11994659900665283,
"learning_rate": 1e-06,
"loss": 0.5989,
"num_tokens": 333038051.0,
"step": 237
},
{
"epoch": 0.4733963202386872,
"grad_norm": 0.12097202241420746,
"learning_rate": 1e-06,
"loss": 0.5783,
"num_tokens": 334431046.0,
"step": 238
},
{
"epoch": 0.47538538040775735,
"grad_norm": 0.12271056324243546,
"learning_rate": 1e-06,
"loss": 0.5896,
"num_tokens": 335827008.0,
"step": 239
},
{
"epoch": 0.47737444057682743,
"grad_norm": 0.12297134101390839,
"learning_rate": 1e-06,
"loss": 0.5999,
"num_tokens": 337226457.0,
"step": 240
},
{
"epoch": 0.47936350074589756,
"grad_norm": 0.1218937486410141,
"learning_rate": 1e-06,
"loss": 0.5926,
"num_tokens": 338666075.0,
"step": 241
},
{
"epoch": 0.4813525609149677,
"grad_norm": 0.12227629870176315,
"learning_rate": 1e-06,
"loss": 0.58,
"num_tokens": 340097602.0,
"step": 242
},
{
"epoch": 0.4833416210840378,
"grad_norm": 0.11952868849039078,
"learning_rate": 1e-06,
"loss": 0.5948,
"num_tokens": 341495194.0,
"step": 243
},
{
"epoch": 0.4853306812531079,
"grad_norm": 0.11985652893781662,
"learning_rate": 1e-06,
"loss": 0.5774,
"num_tokens": 342859196.0,
"step": 244
},
{
"epoch": 0.48731974142217804,
"grad_norm": 0.11957862228155136,
"learning_rate": 1e-06,
"loss": 0.5755,
"num_tokens": 344277995.0,
"step": 245
},
{
"epoch": 0.4893088015912481,
"grad_norm": 0.12266214936971664,
"learning_rate": 1e-06,
"loss": 0.5877,
"num_tokens": 345669645.0,
"step": 246
},
{
"epoch": 0.49129786176031826,
"grad_norm": 0.11971444636583328,
"learning_rate": 1e-06,
"loss": 0.5962,
"num_tokens": 347043955.0,
"step": 247
},
{
"epoch": 0.4932869219293884,
"grad_norm": 0.12104374915361404,
"learning_rate": 1e-06,
"loss": 0.5878,
"num_tokens": 348465321.0,
"step": 248
},
{
"epoch": 0.49527598209845847,
"grad_norm": 0.12102899700403214,
"learning_rate": 1e-06,
"loss": 0.5904,
"num_tokens": 349862835.0,
"step": 249
},
{
"epoch": 0.4972650422675286,
"grad_norm": 0.11829105764627457,
"learning_rate": 1e-06,
"loss": 0.5826,
"num_tokens": 351283798.0,
"step": 250
},
{
"epoch": 0.49925410243659873,
"grad_norm": 0.11999989300966263,
"learning_rate": 1e-06,
"loss": 0.5983,
"num_tokens": 352684836.0,
"step": 251
},
{
"epoch": 0.5012431626056688,
"grad_norm": 0.11723977327346802,
"learning_rate": 1e-06,
"loss": 0.5951,
"num_tokens": 354084748.0,
"step": 252
},
{
"epoch": 0.5032322227747389,
"grad_norm": 0.12956112623214722,
"learning_rate": 1e-06,
"loss": 0.5898,
"num_tokens": 355478595.0,
"step": 253
},
{
"epoch": 0.5052212829438091,
"grad_norm": 0.11831249296665192,
"learning_rate": 1e-06,
"loss": 0.5901,
"num_tokens": 356886933.0,
"step": 254
},
{
"epoch": 0.5072103431128792,
"grad_norm": 0.11731645464897156,
"learning_rate": 1e-06,
"loss": 0.5787,
"num_tokens": 358295994.0,
"step": 255
},
{
"epoch": 0.5091994032819492,
"grad_norm": 0.12075242400169373,
"learning_rate": 1e-06,
"loss": 0.591,
"num_tokens": 359683170.0,
"step": 256
},
{
"epoch": 0.5111884634510194,
"grad_norm": 0.12081367522478104,
"learning_rate": 1e-06,
"loss": 0.5903,
"num_tokens": 361079150.0,
"step": 257
},
{
"epoch": 0.5131775236200895,
"grad_norm": 0.11680179834365845,
"learning_rate": 1e-06,
"loss": 0.5706,
"num_tokens": 362477549.0,
"step": 258
},
{
"epoch": 0.5151665837891596,
"grad_norm": 0.11828526854515076,
"learning_rate": 1e-06,
"loss": 0.5811,
"num_tokens": 363848018.0,
"step": 259
},
{
"epoch": 0.5171556439582298,
"grad_norm": 0.1175350472331047,
"learning_rate": 1e-06,
"loss": 0.5771,
"num_tokens": 365236527.0,
"step": 260
},
{
"epoch": 0.5191447041272998,
"grad_norm": 0.1198112890124321,
"learning_rate": 1e-06,
"loss": 0.5787,
"num_tokens": 366640485.0,
"step": 261
},
{
"epoch": 0.5211337642963699,
"grad_norm": 0.11720699816942215,
"learning_rate": 1e-06,
"loss": 0.584,
"num_tokens": 368033122.0,
"step": 262
},
{
"epoch": 0.5231228244654401,
"grad_norm": 0.12382423877716064,
"learning_rate": 1e-06,
"loss": 0.5866,
"num_tokens": 369417377.0,
"step": 263
},
{
"epoch": 0.5251118846345102,
"grad_norm": 0.12134955078363419,
"learning_rate": 1e-06,
"loss": 0.5757,
"num_tokens": 370794212.0,
"step": 264
},
{
"epoch": 0.5271009448035803,
"grad_norm": 0.12391626089811325,
"learning_rate": 1e-06,
"loss": 0.601,
"num_tokens": 372225021.0,
"step": 265
},
{
"epoch": 0.5290900049726505,
"grad_norm": 0.11900907754898071,
"learning_rate": 1e-06,
"loss": 0.5909,
"num_tokens": 373610685.0,
"step": 266
},
{
"epoch": 0.5310790651417205,
"grad_norm": 0.11934248358011246,
"learning_rate": 1e-06,
"loss": 0.5876,
"num_tokens": 375021896.0,
"step": 267
},
{
"epoch": 0.5330681253107906,
"grad_norm": 0.12139896303415298,
"learning_rate": 1e-06,
"loss": 0.5821,
"num_tokens": 376456216.0,
"step": 268
},
{
"epoch": 0.5350571854798608,
"grad_norm": 0.12349140644073486,
"learning_rate": 1e-06,
"loss": 0.58,
"num_tokens": 377828715.0,
"step": 269
},
{
"epoch": 0.5370462456489309,
"grad_norm": 0.12981721758842468,
"learning_rate": 1e-06,
"loss": 0.5782,
"num_tokens": 379262595.0,
"step": 270
},
{
"epoch": 0.539035305818001,
"grad_norm": 0.12098333984613419,
"learning_rate": 1e-06,
"loss": 0.5906,
"num_tokens": 380657341.0,
"step": 271
},
{
"epoch": 0.5410243659870712,
"grad_norm": 0.1278562843799591,
"learning_rate": 1e-06,
"loss": 0.5891,
"num_tokens": 382061345.0,
"step": 272
},
{
"epoch": 0.5430134261561412,
"grad_norm": 0.11872877925634384,
"learning_rate": 1e-06,
"loss": 0.5765,
"num_tokens": 383439874.0,
"step": 273
},
{
"epoch": 0.5450024863252113,
"grad_norm": 0.11846951395273209,
"learning_rate": 1e-06,
"loss": 0.5803,
"num_tokens": 384849495.0,
"step": 274
},
{
"epoch": 0.5469915464942815,
"grad_norm": 0.11676832288503647,
"learning_rate": 1e-06,
"loss": 0.5859,
"num_tokens": 386272096.0,
"step": 275
},
{
"epoch": 0.5489806066633516,
"grad_norm": 0.118569515645504,
"learning_rate": 1e-06,
"loss": 0.5706,
"num_tokens": 387673261.0,
"step": 276
},
{
"epoch": 0.5509696668324217,
"grad_norm": 0.11762821674346924,
"learning_rate": 1e-06,
"loss": 0.5849,
"num_tokens": 389067728.0,
"step": 277
},
{
"epoch": 0.5529587270014918,
"grad_norm": 0.11877725273370743,
"learning_rate": 1e-06,
"loss": 0.5909,
"num_tokens": 390485805.0,
"step": 278
},
{
"epoch": 0.5549477871705619,
"grad_norm": 0.11775851249694824,
"learning_rate": 1e-06,
"loss": 0.5836,
"num_tokens": 391908547.0,
"step": 279
},
{
"epoch": 0.556936847339632,
"grad_norm": 0.11834586411714554,
"learning_rate": 1e-06,
"loss": 0.5788,
"num_tokens": 393310396.0,
"step": 280
},
{
"epoch": 0.5589259075087022,
"grad_norm": 0.1218111664056778,
"learning_rate": 1e-06,
"loss": 0.5927,
"num_tokens": 394698059.0,
"step": 281
},
{
"epoch": 0.5609149676777723,
"grad_norm": 0.12251269072294235,
"learning_rate": 1e-06,
"loss": 0.5801,
"num_tokens": 396077595.0,
"step": 282
},
{
"epoch": 0.5629040278468423,
"grad_norm": 0.11502744257450104,
"learning_rate": 1e-06,
"loss": 0.5719,
"num_tokens": 397494815.0,
"step": 283
},
{
"epoch": 0.5648930880159125,
"grad_norm": 0.11885383725166321,
"learning_rate": 1e-06,
"loss": 0.5706,
"num_tokens": 398915154.0,
"step": 284
},
{
"epoch": 0.5668821481849826,
"grad_norm": 0.1333908885717392,
"learning_rate": 1e-06,
"loss": 0.5702,
"num_tokens": 400321357.0,
"step": 285
},
{
"epoch": 0.5688712083540527,
"grad_norm": 0.12071363627910614,
"learning_rate": 1e-06,
"loss": 0.5818,
"num_tokens": 401692589.0,
"step": 286
},
{
"epoch": 0.5708602685231228,
"grad_norm": 0.12001436948776245,
"learning_rate": 1e-06,
"loss": 0.5859,
"num_tokens": 403117801.0,
"step": 287
},
{
"epoch": 0.572849328692193,
"grad_norm": 0.12118804454803467,
"learning_rate": 1e-06,
"loss": 0.5738,
"num_tokens": 404520032.0,
"step": 288
},
{
"epoch": 0.574838388861263,
"grad_norm": 0.12114690989255905,
"learning_rate": 1e-06,
"loss": 0.581,
"num_tokens": 405931871.0,
"step": 289
},
{
"epoch": 0.5768274490303331,
"grad_norm": 0.11723317950963974,
"learning_rate": 1e-06,
"loss": 0.569,
"num_tokens": 407337363.0,
"step": 290
},
{
"epoch": 0.5788165091994033,
"grad_norm": 0.11783155053853989,
"learning_rate": 1e-06,
"loss": 0.58,
"num_tokens": 408742361.0,
"step": 291
},
{
"epoch": 0.5808055693684734,
"grad_norm": 0.12249549478292465,
"learning_rate": 1e-06,
"loss": 0.5759,
"num_tokens": 410152716.0,
"step": 292
},
{
"epoch": 0.5827946295375435,
"grad_norm": 0.11785644292831421,
"learning_rate": 1e-06,
"loss": 0.5719,
"num_tokens": 411547709.0,
"step": 293
},
{
"epoch": 0.5847836897066137,
"grad_norm": 0.11994913220405579,
"learning_rate": 1e-06,
"loss": 0.5839,
"num_tokens": 412958152.0,
"step": 294
},
{
"epoch": 0.5867727498756837,
"grad_norm": 0.1180441826581955,
"learning_rate": 1e-06,
"loss": 0.5874,
"num_tokens": 414335814.0,
"step": 295
},
{
"epoch": 0.5887618100447538,
"grad_norm": 0.1199953630566597,
"learning_rate": 1e-06,
"loss": 0.5892,
"num_tokens": 415753335.0,
"step": 296
},
{
"epoch": 0.590750870213824,
"grad_norm": 0.12051574885845184,
"learning_rate": 1e-06,
"loss": 0.587,
"num_tokens": 417123268.0,
"step": 297
},
{
"epoch": 0.5927399303828941,
"grad_norm": 0.12905830144882202,
"learning_rate": 1e-06,
"loss": 0.5822,
"num_tokens": 418534320.0,
"step": 298
},
{
"epoch": 0.5947289905519642,
"grad_norm": 0.11819034814834595,
"learning_rate": 1e-06,
"loss": 0.5738,
"num_tokens": 419934167.0,
"step": 299
},
{
"epoch": 0.5967180507210343,
"grad_norm": 0.1195630431175232,
"learning_rate": 1e-06,
"loss": 0.5891,
"num_tokens": 421330163.0,
"step": 300
},
{
"epoch": 0.5987071108901044,
"grad_norm": 0.11934220045804977,
"learning_rate": 1e-06,
"loss": 0.5681,
"num_tokens": 422702201.0,
"step": 301
},
{
"epoch": 0.6006961710591745,
"grad_norm": 0.11755826324224472,
"learning_rate": 1e-06,
"loss": 0.5843,
"num_tokens": 424135521.0,
"step": 302
},
{
"epoch": 0.6026852312282447,
"grad_norm": 0.11706581711769104,
"learning_rate": 1e-06,
"loss": 0.5748,
"num_tokens": 425481108.0,
"step": 303
},
{
"epoch": 0.6046742913973148,
"grad_norm": 0.11658696830272675,
"learning_rate": 1e-06,
"loss": 0.5702,
"num_tokens": 426880860.0,
"step": 304
},
{
"epoch": 0.6066633515663848,
"grad_norm": 0.12044768035411835,
"learning_rate": 1e-06,
"loss": 0.5723,
"num_tokens": 428272765.0,
"step": 305
},
{
"epoch": 0.608652411735455,
"grad_norm": 0.12221231311559677,
"learning_rate": 1e-06,
"loss": 0.5645,
"num_tokens": 429650167.0,
"step": 306
},
{
"epoch": 0.6106414719045251,
"grad_norm": 0.11917625367641449,
"learning_rate": 1e-06,
"loss": 0.5748,
"num_tokens": 431055294.0,
"step": 307
},
{
"epoch": 0.6126305320735952,
"grad_norm": 0.11839272826910019,
"learning_rate": 1e-06,
"loss": 0.5692,
"num_tokens": 432471600.0,
"step": 308
},
{
"epoch": 0.6146195922426654,
"grad_norm": 0.1222674548625946,
"learning_rate": 1e-06,
"loss": 0.5798,
"num_tokens": 433875649.0,
"step": 309
},
{
"epoch": 0.6166086524117355,
"grad_norm": 0.11733808368444443,
"learning_rate": 1e-06,
"loss": 0.5641,
"num_tokens": 435283504.0,
"step": 310
},
{
"epoch": 0.6185977125808055,
"grad_norm": 0.1265021413564682,
"learning_rate": 1e-06,
"loss": 0.5758,
"num_tokens": 436661567.0,
"step": 311
},
{
"epoch": 0.6205867727498757,
"grad_norm": 0.11938843131065369,
"learning_rate": 1e-06,
"loss": 0.5749,
"num_tokens": 438042305.0,
"step": 312
},
{
"epoch": 0.6225758329189458,
"grad_norm": 0.11977506428956985,
"learning_rate": 1e-06,
"loss": 0.5715,
"num_tokens": 439453603.0,
"step": 313
},
{
"epoch": 0.6245648930880159,
"grad_norm": 0.1187228411436081,
"learning_rate": 1e-06,
"loss": 0.5738,
"num_tokens": 440857261.0,
"step": 314
},
{
"epoch": 0.6265539532570861,
"grad_norm": 0.11981964856386185,
"learning_rate": 1e-06,
"loss": 0.5798,
"num_tokens": 442261964.0,
"step": 315
},
{
"epoch": 0.6285430134261561,
"grad_norm": 0.11806244403123856,
"learning_rate": 1e-06,
"loss": 0.5789,
"num_tokens": 443693559.0,
"step": 316
},
{
"epoch": 0.6305320735952262,
"grad_norm": 0.11995609104633331,
"learning_rate": 1e-06,
"loss": 0.5736,
"num_tokens": 445134735.0,
"step": 317
},
{
"epoch": 0.6325211337642964,
"grad_norm": 0.11736578494310379,
"learning_rate": 1e-06,
"loss": 0.5782,
"num_tokens": 446570052.0,
"step": 318
},
{
"epoch": 0.6345101939333665,
"grad_norm": 0.11504673957824707,
"learning_rate": 1e-06,
"loss": 0.57,
"num_tokens": 447974813.0,
"step": 319
},
{
"epoch": 0.6364992541024366,
"grad_norm": 0.11872579902410507,
"learning_rate": 1e-06,
"loss": 0.5781,
"num_tokens": 449374619.0,
"step": 320
},
{
"epoch": 0.6384883142715068,
"grad_norm": 0.11843977123498917,
"learning_rate": 1e-06,
"loss": 0.5719,
"num_tokens": 450791966.0,
"step": 321
},
{
"epoch": 0.6404773744405768,
"grad_norm": 0.11978229135274887,
"learning_rate": 1e-06,
"loss": 0.5728,
"num_tokens": 452194196.0,
"step": 322
},
{
"epoch": 0.6424664346096469,
"grad_norm": 0.11723372340202332,
"learning_rate": 1e-06,
"loss": 0.5683,
"num_tokens": 453551548.0,
"step": 323
},
{
"epoch": 0.6444554947787171,
"grad_norm": 0.11690861731767654,
"learning_rate": 1e-06,
"loss": 0.5702,
"num_tokens": 454937681.0,
"step": 324
},
{
"epoch": 0.6464445549477872,
"grad_norm": 0.11684879660606384,
"learning_rate": 1e-06,
"loss": 0.5692,
"num_tokens": 456361552.0,
"step": 325
},
{
"epoch": 0.6484336151168573,
"grad_norm": 0.11809241771697998,
"learning_rate": 1e-06,
"loss": 0.5952,
"num_tokens": 457751294.0,
"step": 326
},
{
"epoch": 0.6504226752859275,
"grad_norm": 0.11862190812826157,
"learning_rate": 1e-06,
"loss": 0.5756,
"num_tokens": 459155713.0,
"step": 327
},
{
"epoch": 0.6524117354549975,
"grad_norm": 0.12103772908449173,
"learning_rate": 1e-06,
"loss": 0.5678,
"num_tokens": 460560134.0,
"step": 328
},
{
"epoch": 0.6544007956240676,
"grad_norm": 0.1281164437532425,
"learning_rate": 1e-06,
"loss": 0.5845,
"num_tokens": 461962174.0,
"step": 329
},
{
"epoch": 0.6563898557931377,
"grad_norm": 0.1147463396191597,
"learning_rate": 1e-06,
"loss": 0.5784,
"num_tokens": 463406269.0,
"step": 330
},
{
"epoch": 0.6583789159622079,
"grad_norm": 0.11639434099197388,
"learning_rate": 1e-06,
"loss": 0.5762,
"num_tokens": 464831720.0,
"step": 331
},
{
"epoch": 0.660367976131278,
"grad_norm": 0.12189288437366486,
"learning_rate": 1e-06,
"loss": 0.5708,
"num_tokens": 466236911.0,
"step": 332
},
{
"epoch": 0.662357036300348,
"grad_norm": 0.11594757437705994,
"learning_rate": 1e-06,
"loss": 0.5785,
"num_tokens": 467667060.0,
"step": 333
},
{
"epoch": 0.6643460964694182,
"grad_norm": 0.120713010430336,
"learning_rate": 1e-06,
"loss": 0.56,
"num_tokens": 469076105.0,
"step": 334
},
{
"epoch": 0.6663351566384883,
"grad_norm": 0.1252930611371994,
"learning_rate": 1e-06,
"loss": 0.5586,
"num_tokens": 470479085.0,
"step": 335
},
{
"epoch": 0.6683242168075584,
"grad_norm": 0.12119864672422409,
"learning_rate": 1e-06,
"loss": 0.5696,
"num_tokens": 471878453.0,
"step": 336
},
{
"epoch": 0.6703132769766286,
"grad_norm": 0.11741020530462265,
"learning_rate": 1e-06,
"loss": 0.578,
"num_tokens": 473317934.0,
"step": 337
},
{
"epoch": 0.6723023371456986,
"grad_norm": 0.11722774058580399,
"learning_rate": 1e-06,
"loss": 0.5705,
"num_tokens": 474718637.0,
"step": 338
},
{
"epoch": 0.6742913973147687,
"grad_norm": 0.1168806403875351,
"learning_rate": 1e-06,
"loss": 0.576,
"num_tokens": 476109339.0,
"step": 339
},
{
"epoch": 0.6762804574838389,
"grad_norm": 0.11499282717704773,
"learning_rate": 1e-06,
"loss": 0.5695,
"num_tokens": 477530852.0,
"step": 340
},
{
"epoch": 0.678269517652909,
"grad_norm": 0.11914825439453125,
"learning_rate": 1e-06,
"loss": 0.5647,
"num_tokens": 478878070.0,
"step": 341
},
{
"epoch": 0.6802585778219791,
"grad_norm": 0.11614906787872314,
"learning_rate": 1e-06,
"loss": 0.5749,
"num_tokens": 480284827.0,
"step": 342
},
{
"epoch": 0.6822476379910493,
"grad_norm": 0.11405870318412781,
"learning_rate": 1e-06,
"loss": 0.5728,
"num_tokens": 481705922.0,
"step": 343
},
{
"epoch": 0.6842366981601193,
"grad_norm": 0.12021178752183914,
"learning_rate": 1e-06,
"loss": 0.5757,
"num_tokens": 483096669.0,
"step": 344
},
{
"epoch": 0.6862257583291894,
"grad_norm": 0.11697462201118469,
"learning_rate": 1e-06,
"loss": 0.5778,
"num_tokens": 484497174.0,
"step": 345
},
{
"epoch": 0.6882148184982596,
"grad_norm": 0.11658283323049545,
"learning_rate": 1e-06,
"loss": 0.5527,
"num_tokens": 485910302.0,
"step": 346
},
{
"epoch": 0.6902038786673297,
"grad_norm": 0.11884041875600815,
"learning_rate": 1e-06,
"loss": 0.5774,
"num_tokens": 487314373.0,
"step": 347
},
{
"epoch": 0.6921929388363998,
"grad_norm": 0.11939556896686554,
"learning_rate": 1e-06,
"loss": 0.5808,
"num_tokens": 488749735.0,
"step": 348
},
{
"epoch": 0.69418199900547,
"grad_norm": 0.12483149766921997,
"learning_rate": 1e-06,
"loss": 0.558,
"num_tokens": 490118122.0,
"step": 349
},
{
"epoch": 0.69617105917454,
"grad_norm": 0.12257801741361618,
"learning_rate": 1e-06,
"loss": 0.5624,
"num_tokens": 491510538.0,
"step": 350
},
{
"epoch": 0.6981601193436101,
"grad_norm": 0.11767923086881638,
"learning_rate": 1e-06,
"loss": 0.5718,
"num_tokens": 492926246.0,
"step": 351
},
{
"epoch": 0.7001491795126803,
"grad_norm": 0.11635064333677292,
"learning_rate": 1e-06,
"loss": 0.5803,
"num_tokens": 494380244.0,
"step": 352
},
{
"epoch": 0.7021382396817504,
"grad_norm": 0.12177010625600815,
"learning_rate": 1e-06,
"loss": 0.5652,
"num_tokens": 495730583.0,
"step": 353
},
{
"epoch": 0.7041272998508205,
"grad_norm": 0.11822908371686935,
"learning_rate": 1e-06,
"loss": 0.5663,
"num_tokens": 497125075.0,
"step": 354
},
{
"epoch": 0.7061163600198906,
"grad_norm": 0.12682731449604034,
"learning_rate": 1e-06,
"loss": 0.5658,
"num_tokens": 498549275.0,
"step": 355
},
{
"epoch": 0.7081054201889607,
"grad_norm": 0.12045978009700775,
"learning_rate": 1e-06,
"loss": 0.57,
"num_tokens": 499930592.0,
"step": 356
},
{
"epoch": 0.7100944803580308,
"grad_norm": 0.12545664608478546,
"learning_rate": 1e-06,
"loss": 0.5652,
"num_tokens": 501333957.0,
"step": 357
},
{
"epoch": 0.712083540527101,
"grad_norm": 0.11865141242742538,
"learning_rate": 1e-06,
"loss": 0.566,
"num_tokens": 502737493.0,
"step": 358
},
{
"epoch": 0.7140726006961711,
"grad_norm": 0.12852944433689117,
"learning_rate": 1e-06,
"loss": 0.5777,
"num_tokens": 504130760.0,
"step": 359
},
{
"epoch": 0.7160616608652411,
"grad_norm": 0.11820497363805771,
"learning_rate": 1e-06,
"loss": 0.5542,
"num_tokens": 505509701.0,
"step": 360
},
{
"epoch": 0.7180507210343113,
"grad_norm": 0.1146169900894165,
"learning_rate": 1e-06,
"loss": 0.5568,
"num_tokens": 506911298.0,
"step": 361
},
{
"epoch": 0.7200397812033814,
"grad_norm": 0.11726175993680954,
"learning_rate": 1e-06,
"loss": 0.5609,
"num_tokens": 508293436.0,
"step": 362
},
{
"epoch": 0.7220288413724515,
"grad_norm": 0.12022433429956436,
"learning_rate": 1e-06,
"loss": 0.5605,
"num_tokens": 509696103.0,
"step": 363
},
{
"epoch": 0.7240179015415217,
"grad_norm": 0.11907174438238144,
"learning_rate": 1e-06,
"loss": 0.5652,
"num_tokens": 511108327.0,
"step": 364
},
{
"epoch": 0.7260069617105918,
"grad_norm": 0.11890577524900436,
"learning_rate": 1e-06,
"loss": 0.5618,
"num_tokens": 512511723.0,
"step": 365
},
{
"epoch": 0.7279960218796618,
"grad_norm": 0.12470176070928574,
"learning_rate": 1e-06,
"loss": 0.5584,
"num_tokens": 513907412.0,
"step": 366
},
{
"epoch": 0.729985082048732,
"grad_norm": 0.12026024609804153,
"learning_rate": 1e-06,
"loss": 0.5774,
"num_tokens": 515324644.0,
"step": 367
},
{
"epoch": 0.7319741422178021,
"grad_norm": 0.12734608352184296,
"learning_rate": 1e-06,
"loss": 0.5739,
"num_tokens": 516674022.0,
"step": 368
},
{
"epoch": 0.7339632023868722,
"grad_norm": 0.11902155727148056,
"learning_rate": 1e-06,
"loss": 0.5717,
"num_tokens": 518087720.0,
"step": 369
},
{
"epoch": 0.7359522625559424,
"grad_norm": 0.11661865562200546,
"learning_rate": 1e-06,
"loss": 0.5528,
"num_tokens": 519497699.0,
"step": 370
},
{
"epoch": 0.7379413227250124,
"grad_norm": 0.12583503127098083,
"learning_rate": 1e-06,
"loss": 0.5767,
"num_tokens": 520908702.0,
"step": 371
},
{
"epoch": 0.7399303828940825,
"grad_norm": 0.11799920350313187,
"learning_rate": 1e-06,
"loss": 0.5692,
"num_tokens": 522317938.0,
"step": 372
},
{
"epoch": 0.7419194430631526,
"grad_norm": 0.12218224257230759,
"learning_rate": 1e-06,
"loss": 0.5642,
"num_tokens": 523719142.0,
"step": 373
},
{
"epoch": 0.7439085032322228,
"grad_norm": 0.11948370188474655,
"learning_rate": 1e-06,
"loss": 0.5702,
"num_tokens": 525136176.0,
"step": 374
},
{
"epoch": 0.7458975634012929,
"grad_norm": 0.1194443628191948,
"learning_rate": 1e-06,
"loss": 0.5472,
"num_tokens": 526481164.0,
"step": 375
},
{
"epoch": 0.747886623570363,
"grad_norm": 0.12138593941926956,
"learning_rate": 1e-06,
"loss": 0.5667,
"num_tokens": 527888658.0,
"step": 376
},
{
"epoch": 0.7498756837394331,
"grad_norm": 0.12166504561901093,
"learning_rate": 1e-06,
"loss": 0.5552,
"num_tokens": 529287967.0,
"step": 377
},
{
"epoch": 0.7518647439085032,
"grad_norm": 0.1202344223856926,
"learning_rate": 1e-06,
"loss": 0.5527,
"num_tokens": 530654844.0,
"step": 378
},
{
"epoch": 0.7538538040775733,
"grad_norm": 0.11958526074886322,
"learning_rate": 1e-06,
"loss": 0.5647,
"num_tokens": 532043235.0,
"step": 379
},
{
"epoch": 0.7558428642466435,
"grad_norm": 0.11844973266124725,
"learning_rate": 1e-06,
"loss": 0.5721,
"num_tokens": 533462609.0,
"step": 380
},
{
"epoch": 0.7578319244157136,
"grad_norm": 0.12316368520259857,
"learning_rate": 1e-06,
"loss": 0.5659,
"num_tokens": 534847372.0,
"step": 381
},
{
"epoch": 0.7598209845847836,
"grad_norm": 0.1315903514623642,
"learning_rate": 1e-06,
"loss": 0.5636,
"num_tokens": 536262717.0,
"step": 382
},
{
"epoch": 0.7618100447538538,
"grad_norm": 0.11977870017290115,
"learning_rate": 1e-06,
"loss": 0.5708,
"num_tokens": 537686599.0,
"step": 383
},
{
"epoch": 0.7637991049229239,
"grad_norm": 0.12060414999723434,
"learning_rate": 1e-06,
"loss": 0.5605,
"num_tokens": 539059998.0,
"step": 384
},
{
"epoch": 0.765788165091994,
"grad_norm": 0.11461476981639862,
"learning_rate": 1e-06,
"loss": 0.569,
"num_tokens": 540488356.0,
"step": 385
},
{
"epoch": 0.7677772252610642,
"grad_norm": 0.11902087926864624,
"learning_rate": 1e-06,
"loss": 0.563,
"num_tokens": 541895951.0,
"step": 386
},
{
"epoch": 0.7697662854301343,
"grad_norm": 0.11860883980989456,
"learning_rate": 1e-06,
"loss": 0.575,
"num_tokens": 543281120.0,
"step": 387
},
{
"epoch": 0.7717553455992043,
"grad_norm": 0.12020647525787354,
"learning_rate": 1e-06,
"loss": 0.5614,
"num_tokens": 544646661.0,
"step": 388
},
{
"epoch": 0.7737444057682745,
"grad_norm": 0.11638668179512024,
"learning_rate": 1e-06,
"loss": 0.5742,
"num_tokens": 546097259.0,
"step": 389
},
{
"epoch": 0.7757334659373446,
"grad_norm": 0.11645980924367905,
"learning_rate": 1e-06,
"loss": 0.5647,
"num_tokens": 547518580.0,
"step": 390
},
{
"epoch": 0.7777225261064147,
"grad_norm": 0.12269024550914764,
"learning_rate": 1e-06,
"loss": 0.5689,
"num_tokens": 548932428.0,
"step": 391
},
{
"epoch": 0.7797115862754849,
"grad_norm": 0.11761284619569778,
"learning_rate": 1e-06,
"loss": 0.554,
"num_tokens": 550354089.0,
"step": 392
},
{
"epoch": 0.781700646444555,
"grad_norm": 0.1194002628326416,
"learning_rate": 1e-06,
"loss": 0.5691,
"num_tokens": 551785808.0,
"step": 393
},
{
"epoch": 0.783689706613625,
"grad_norm": 0.11770683526992798,
"learning_rate": 1e-06,
"loss": 0.5675,
"num_tokens": 553166180.0,
"step": 394
},
{
"epoch": 0.7856787667826952,
"grad_norm": 0.12067251652479172,
"learning_rate": 1e-06,
"loss": 0.5698,
"num_tokens": 554595305.0,
"step": 395
},
{
"epoch": 0.7876678269517653,
"grad_norm": 0.1211480051279068,
"learning_rate": 1e-06,
"loss": 0.5545,
"num_tokens": 555975917.0,
"step": 396
},
{
"epoch": 0.7896568871208354,
"grad_norm": 0.12212938815355301,
"learning_rate": 1e-06,
"loss": 0.563,
"num_tokens": 557357748.0,
"step": 397
},
{
"epoch": 0.7916459472899056,
"grad_norm": 0.12312401086091995,
"learning_rate": 1e-06,
"loss": 0.5635,
"num_tokens": 558758541.0,
"step": 398
},
{
"epoch": 0.7936350074589756,
"grad_norm": 0.11911605298519135,
"learning_rate": 1e-06,
"loss": 0.569,
"num_tokens": 560135846.0,
"step": 399
},
{
"epoch": 0.7956240676280457,
"grad_norm": 0.8023229241371155,
"learning_rate": 1e-06,
"loss": 0.5761,
"num_tokens": 561561065.0,
"step": 400
},
{
"epoch": 0.7976131277971159,
"grad_norm": 0.12671087682247162,
"learning_rate": 1e-06,
"loss": 0.5718,
"num_tokens": 562950768.0,
"step": 401
},
{
"epoch": 0.799602187966186,
"grad_norm": 0.12295536696910858,
"learning_rate": 1e-06,
"loss": 0.5531,
"num_tokens": 564325036.0,
"step": 402
},
{
"epoch": 0.8015912481352561,
"grad_norm": 0.11960072070360184,
"learning_rate": 1e-06,
"loss": 0.5551,
"num_tokens": 565703350.0,
"step": 403
},
{
"epoch": 0.8035803083043263,
"grad_norm": 0.11970996856689453,
"learning_rate": 1e-06,
"loss": 0.5758,
"num_tokens": 567129767.0,
"step": 404
},
{
"epoch": 0.8055693684733963,
"grad_norm": 0.11514374613761902,
"learning_rate": 1e-06,
"loss": 0.5667,
"num_tokens": 568533394.0,
"step": 405
},
{
"epoch": 0.8075584286424664,
"grad_norm": 0.12267459183931351,
"learning_rate": 1e-06,
"loss": 0.5698,
"num_tokens": 569966937.0,
"step": 406
},
{
"epoch": 0.8095474888115366,
"grad_norm": 0.1218687891960144,
"learning_rate": 1e-06,
"loss": 0.5588,
"num_tokens": 571338522.0,
"step": 407
},
{
"epoch": 0.8115365489806067,
"grad_norm": 0.12409517914056778,
"learning_rate": 1e-06,
"loss": 0.564,
"num_tokens": 572731805.0,
"step": 408
},
{
"epoch": 0.8135256091496768,
"grad_norm": 0.11474578827619553,
"learning_rate": 1e-06,
"loss": 0.5704,
"num_tokens": 574159917.0,
"step": 409
},
{
"epoch": 0.8155146693187469,
"grad_norm": 0.11738289892673492,
"learning_rate": 1e-06,
"loss": 0.5615,
"num_tokens": 575571352.0,
"step": 410
},
{
"epoch": 0.817503729487817,
"grad_norm": 0.12023717910051346,
"learning_rate": 1e-06,
"loss": 0.5569,
"num_tokens": 576984870.0,
"step": 411
},
{
"epoch": 0.8194927896568871,
"grad_norm": 0.1194944977760315,
"learning_rate": 1e-06,
"loss": 0.5666,
"num_tokens": 578414303.0,
"step": 412
},
{
"epoch": 0.8214818498259573,
"grad_norm": 0.12020692229270935,
"learning_rate": 1e-06,
"loss": 0.5478,
"num_tokens": 579812487.0,
"step": 413
},
{
"epoch": 0.8234709099950274,
"grad_norm": 0.11791769415140152,
"learning_rate": 1e-06,
"loss": 0.5681,
"num_tokens": 581203528.0,
"step": 414
},
{
"epoch": 0.8254599701640974,
"grad_norm": 0.11593407392501831,
"learning_rate": 1e-06,
"loss": 0.5573,
"num_tokens": 582588268.0,
"step": 415
},
{
"epoch": 0.8274490303331675,
"grad_norm": 0.12016279250383377,
"learning_rate": 1e-06,
"loss": 0.5777,
"num_tokens": 583983078.0,
"step": 416
},
{
"epoch": 0.8294380905022377,
"grad_norm": 0.12653815746307373,
"learning_rate": 1e-06,
"loss": 0.5839,
"num_tokens": 585400140.0,
"step": 417
},
{
"epoch": 0.8314271506713078,
"grad_norm": 0.11608505994081497,
"learning_rate": 1e-06,
"loss": 0.5575,
"num_tokens": 586830320.0,
"step": 418
},
{
"epoch": 0.8334162108403779,
"grad_norm": 0.11865837126970291,
"learning_rate": 1e-06,
"loss": 0.5744,
"num_tokens": 588258999.0,
"step": 419
},
{
"epoch": 0.8354052710094481,
"grad_norm": 0.11725205928087234,
"learning_rate": 1e-06,
"loss": 0.5693,
"num_tokens": 589692364.0,
"step": 420
},
{
"epoch": 0.8373943311785181,
"grad_norm": 0.11638808995485306,
"learning_rate": 1e-06,
"loss": 0.5597,
"num_tokens": 591094415.0,
"step": 421
},
{
"epoch": 0.8393833913475882,
"grad_norm": 0.12141529470682144,
"learning_rate": 1e-06,
"loss": 0.5559,
"num_tokens": 592500735.0,
"step": 422
},
{
"epoch": 0.8413724515166584,
"grad_norm": 0.11554522812366486,
"learning_rate": 1e-06,
"loss": 0.5666,
"num_tokens": 593903743.0,
"step": 423
},
{
"epoch": 0.8433615116857285,
"grad_norm": 0.11890975385904312,
"learning_rate": 1e-06,
"loss": 0.5543,
"num_tokens": 595314514.0,
"step": 424
},
{
"epoch": 0.8453505718547986,
"grad_norm": 0.12231077998876572,
"learning_rate": 1e-06,
"loss": 0.5615,
"num_tokens": 596671023.0,
"step": 425
},
{
"epoch": 0.8473396320238687,
"grad_norm": 0.12027207762002945,
"learning_rate": 1e-06,
"loss": 0.5583,
"num_tokens": 598031581.0,
"step": 426
},
{
"epoch": 0.8493286921929388,
"grad_norm": 0.11685006320476532,
"learning_rate": 1e-06,
"loss": 0.5628,
"num_tokens": 599477745.0,
"step": 427
},
{
"epoch": 0.8513177523620089,
"grad_norm": 0.17819620668888092,
"learning_rate": 1e-06,
"loss": 0.5714,
"num_tokens": 600848881.0,
"step": 428
},
{
"epoch": 0.8533068125310791,
"grad_norm": 0.11678969860076904,
"learning_rate": 1e-06,
"loss": 0.5534,
"num_tokens": 602266449.0,
"step": 429
},
{
"epoch": 0.8552958727001492,
"grad_norm": 0.11610161513090134,
"learning_rate": 1e-06,
"loss": 0.5609,
"num_tokens": 603675715.0,
"step": 430
},
{
"epoch": 0.8572849328692193,
"grad_norm": 0.1225874274969101,
"learning_rate": 1e-06,
"loss": 0.5554,
"num_tokens": 605072725.0,
"step": 431
},
{
"epoch": 0.8592739930382894,
"grad_norm": 0.11319524794816971,
"learning_rate": 1e-06,
"loss": 0.5538,
"num_tokens": 606474929.0,
"step": 432
},
{
"epoch": 0.8612630532073595,
"grad_norm": 0.11506962776184082,
"learning_rate": 1e-06,
"loss": 0.5531,
"num_tokens": 607855829.0,
"step": 433
},
{
"epoch": 0.8632521133764296,
"grad_norm": 0.1158682256937027,
"learning_rate": 1e-06,
"loss": 0.567,
"num_tokens": 609285755.0,
"step": 434
},
{
"epoch": 0.8652411735454998,
"grad_norm": 0.11761818826198578,
"learning_rate": 1e-06,
"loss": 0.5671,
"num_tokens": 610697282.0,
"step": 435
},
{
"epoch": 0.8672302337145699,
"grad_norm": 0.11838319152593613,
"learning_rate": 1e-06,
"loss": 0.5534,
"num_tokens": 612094535.0,
"step": 436
},
{
"epoch": 0.8692192938836399,
"grad_norm": 0.11570697277784348,
"learning_rate": 1e-06,
"loss": 0.5632,
"num_tokens": 613501408.0,
"step": 437
},
{
"epoch": 0.8712083540527101,
"grad_norm": 0.11999750137329102,
"learning_rate": 1e-06,
"loss": 0.5594,
"num_tokens": 614872296.0,
"step": 438
},
{
"epoch": 0.8731974142217802,
"grad_norm": 0.1152532622218132,
"learning_rate": 1e-06,
"loss": 0.5573,
"num_tokens": 616300717.0,
"step": 439
},
{
"epoch": 0.8751864743908503,
"grad_norm": 0.11520667374134064,
"learning_rate": 1e-06,
"loss": 0.5441,
"num_tokens": 617678025.0,
"step": 440
},
{
"epoch": 0.8771755345599205,
"grad_norm": 0.11722690612077713,
"learning_rate": 1e-06,
"loss": 0.5535,
"num_tokens": 619082991.0,
"step": 441
},
{
"epoch": 0.8791645947289906,
"grad_norm": 0.11644702404737473,
"learning_rate": 1e-06,
"loss": 0.5595,
"num_tokens": 620470914.0,
"step": 442
},
{
"epoch": 0.8811536548980606,
"grad_norm": 0.11839190125465393,
"learning_rate": 1e-06,
"loss": 0.5481,
"num_tokens": 621862665.0,
"step": 443
},
{
"epoch": 0.8831427150671308,
"grad_norm": 0.1157626211643219,
"learning_rate": 1e-06,
"loss": 0.5663,
"num_tokens": 623310812.0,
"step": 444
},
{
"epoch": 0.8851317752362009,
"grad_norm": 0.11904613673686981,
"learning_rate": 1e-06,
"loss": 0.5594,
"num_tokens": 624718938.0,
"step": 445
},
{
"epoch": 0.887120835405271,
"grad_norm": 0.11730271577835083,
"learning_rate": 1e-06,
"loss": 0.5515,
"num_tokens": 626091716.0,
"step": 446
},
{
"epoch": 0.8891098955743412,
"grad_norm": 0.11457692086696625,
"learning_rate": 1e-06,
"loss": 0.5506,
"num_tokens": 627487215.0,
"step": 447
},
{
"epoch": 0.8910989557434112,
"grad_norm": 0.11875864863395691,
"learning_rate": 1e-06,
"loss": 0.5526,
"num_tokens": 628850853.0,
"step": 448
},
{
"epoch": 0.8930880159124813,
"grad_norm": 0.11865832656621933,
"learning_rate": 1e-06,
"loss": 0.5526,
"num_tokens": 630249653.0,
"step": 449
},
{
"epoch": 0.8950770760815515,
"grad_norm": 0.11921685934066772,
"learning_rate": 1e-06,
"loss": 0.5651,
"num_tokens": 631626896.0,
"step": 450
},
{
"epoch": 0.8970661362506216,
"grad_norm": 0.11800325661897659,
"learning_rate": 1e-06,
"loss": 0.5463,
"num_tokens": 633018745.0,
"step": 451
},
{
"epoch": 0.8990551964196917,
"grad_norm": 0.11664669215679169,
"learning_rate": 1e-06,
"loss": 0.5529,
"num_tokens": 634434489.0,
"step": 452
},
{
"epoch": 0.9010442565887619,
"grad_norm": 0.11907332390546799,
"learning_rate": 1e-06,
"loss": 0.5588,
"num_tokens": 635836521.0,
"step": 453
},
{
"epoch": 0.9030333167578319,
"grad_norm": 0.119346983730793,
"learning_rate": 1e-06,
"loss": 0.5605,
"num_tokens": 637251420.0,
"step": 454
},
{
"epoch": 0.905022376926902,
"grad_norm": 0.11810777336359024,
"learning_rate": 1e-06,
"loss": 0.5538,
"num_tokens": 638673973.0,
"step": 455
},
{
"epoch": 0.9070114370959722,
"grad_norm": 0.12147301435470581,
"learning_rate": 1e-06,
"loss": 0.5477,
"num_tokens": 640053944.0,
"step": 456
},
{
"epoch": 0.9090004972650423,
"grad_norm": 0.11813312768936157,
"learning_rate": 1e-06,
"loss": 0.5699,
"num_tokens": 641459580.0,
"step": 457
},
{
"epoch": 0.9109895574341124,
"grad_norm": 0.1261916309595108,
"learning_rate": 1e-06,
"loss": 0.5565,
"num_tokens": 642848898.0,
"step": 458
},
{
"epoch": 0.9129786176031826,
"grad_norm": 0.11821988970041275,
"learning_rate": 1e-06,
"loss": 0.5527,
"num_tokens": 644312542.0,
"step": 459
},
{
"epoch": 0.9149676777722526,
"grad_norm": 0.11542089283466339,
"learning_rate": 1e-06,
"loss": 0.5614,
"num_tokens": 645735657.0,
"step": 460
},
{
"epoch": 0.9169567379413227,
"grad_norm": 0.131094828248024,
"learning_rate": 1e-06,
"loss": 0.5682,
"num_tokens": 647137018.0,
"step": 461
},
{
"epoch": 0.9189457981103928,
"grad_norm": 0.11712004989385605,
"learning_rate": 1e-06,
"loss": 0.5578,
"num_tokens": 648516983.0,
"step": 462
},
{
"epoch": 0.920934858279463,
"grad_norm": 0.11788227409124374,
"learning_rate": 1e-06,
"loss": 0.5591,
"num_tokens": 649953519.0,
"step": 463
},
{
"epoch": 0.922923918448533,
"grad_norm": 0.11795365810394287,
"learning_rate": 1e-06,
"loss": 0.5463,
"num_tokens": 651366264.0,
"step": 464
},
{
"epoch": 0.9249129786176031,
"grad_norm": 0.11824613064527512,
"learning_rate": 1e-06,
"loss": 0.5664,
"num_tokens": 652792598.0,
"step": 465
},
{
"epoch": 0.9269020387866733,
"grad_norm": 0.11839722841978073,
"learning_rate": 1e-06,
"loss": 0.5493,
"num_tokens": 654162858.0,
"step": 466
},
{
"epoch": 0.9288910989557434,
"grad_norm": 0.11836480349302292,
"learning_rate": 1e-06,
"loss": 0.5514,
"num_tokens": 655539001.0,
"step": 467
},
{
"epoch": 0.9308801591248135,
"grad_norm": 0.17381928861141205,
"learning_rate": 1e-06,
"loss": 0.5526,
"num_tokens": 656952170.0,
"step": 468
},
{
"epoch": 0.9328692192938837,
"grad_norm": 0.11770905554294586,
"learning_rate": 1e-06,
"loss": 0.5615,
"num_tokens": 658358118.0,
"step": 469
},
{
"epoch": 0.9348582794629537,
"grad_norm": 0.11960657685995102,
"learning_rate": 1e-06,
"loss": 0.5495,
"num_tokens": 659752280.0,
"step": 470
},
{
"epoch": 0.9368473396320238,
"grad_norm": 0.11758306622505188,
"learning_rate": 1e-06,
"loss": 0.5592,
"num_tokens": 661127510.0,
"step": 471
},
{
"epoch": 0.938836399801094,
"grad_norm": 0.1147281602025032,
"learning_rate": 1e-06,
"loss": 0.5431,
"num_tokens": 662539378.0,
"step": 472
},
{
"epoch": 0.9408254599701641,
"grad_norm": 0.11964991688728333,
"learning_rate": 1e-06,
"loss": 0.5578,
"num_tokens": 663935199.0,
"step": 473
},
{
"epoch": 0.9428145201392342,
"grad_norm": 0.12270357459783554,
"learning_rate": 1e-06,
"loss": 0.5535,
"num_tokens": 665328684.0,
"step": 474
},
{
"epoch": 0.9448035803083044,
"grad_norm": 0.11691749840974808,
"learning_rate": 1e-06,
"loss": 0.5598,
"num_tokens": 666742297.0,
"step": 475
},
{
"epoch": 0.9467926404773744,
"grad_norm": 0.11798378825187683,
"learning_rate": 1e-06,
"loss": 0.5563,
"num_tokens": 668121085.0,
"step": 476
},
{
"epoch": 0.9487817006464445,
"grad_norm": 0.11690951138734818,
"learning_rate": 1e-06,
"loss": 0.553,
"num_tokens": 669537516.0,
"step": 477
},
{
"epoch": 0.9507707608155147,
"grad_norm": 0.12072078138589859,
"learning_rate": 1e-06,
"loss": 0.5578,
"num_tokens": 670932178.0,
"step": 478
},
{
"epoch": 0.9527598209845848,
"grad_norm": 0.11482840776443481,
"learning_rate": 1e-06,
"loss": 0.5484,
"num_tokens": 672306421.0,
"step": 479
},
{
"epoch": 0.9547488811536549,
"grad_norm": 0.12619654834270477,
"learning_rate": 1e-06,
"loss": 0.5604,
"num_tokens": 673688811.0,
"step": 480
},
{
"epoch": 0.956737941322725,
"grad_norm": 0.12017329037189484,
"learning_rate": 1e-06,
"loss": 0.5599,
"num_tokens": 675082703.0,
"step": 481
},
{
"epoch": 0.9587270014917951,
"grad_norm": 0.11547863483428955,
"learning_rate": 1e-06,
"loss": 0.5475,
"num_tokens": 676488414.0,
"step": 482
},
{
"epoch": 0.9607160616608652,
"grad_norm": 0.11614055931568146,
"learning_rate": 1e-06,
"loss": 0.5621,
"num_tokens": 677896011.0,
"step": 483
},
{
"epoch": 0.9627051218299354,
"grad_norm": 0.11469247937202454,
"learning_rate": 1e-06,
"loss": 0.5503,
"num_tokens": 679301564.0,
"step": 484
},
{
"epoch": 0.9646941819990055,
"grad_norm": 0.12310828268527985,
"learning_rate": 1e-06,
"loss": 0.5526,
"num_tokens": 680674072.0,
"step": 485
},
{
"epoch": 0.9666832421680756,
"grad_norm": 0.11646957695484161,
"learning_rate": 1e-06,
"loss": 0.5505,
"num_tokens": 682077773.0,
"step": 486
},
{
"epoch": 0.9686723023371457,
"grad_norm": 0.11596749722957611,
"learning_rate": 1e-06,
"loss": 0.5596,
"num_tokens": 683492461.0,
"step": 487
},
{
"epoch": 0.9706613625062158,
"grad_norm": 0.11795108765363693,
"learning_rate": 1e-06,
"loss": 0.5568,
"num_tokens": 684913267.0,
"step": 488
},
{
"epoch": 0.9726504226752859,
"grad_norm": 0.1201254203915596,
"learning_rate": 1e-06,
"loss": 0.5645,
"num_tokens": 686315028.0,
"step": 489
},
{
"epoch": 0.9746394828443561,
"grad_norm": 0.1162814274430275,
"learning_rate": 1e-06,
"loss": 0.5494,
"num_tokens": 687711185.0,
"step": 490
},
{
"epoch": 0.9766285430134262,
"grad_norm": 0.11750290542840958,
"learning_rate": 1e-06,
"loss": 0.5628,
"num_tokens": 689115928.0,
"step": 491
},
{
"epoch": 0.9786176031824962,
"grad_norm": 0.13300985097885132,
"learning_rate": 1e-06,
"loss": 0.563,
"num_tokens": 690534590.0,
"step": 492
},
{
"epoch": 0.9806066633515664,
"grad_norm": 0.12720529735088348,
"learning_rate": 1e-06,
"loss": 0.5567,
"num_tokens": 691981545.0,
"step": 493
},
{
"epoch": 0.9825957235206365,
"grad_norm": 0.11822197586297989,
"learning_rate": 1e-06,
"loss": 0.5659,
"num_tokens": 693402676.0,
"step": 494
},
{
"epoch": 0.9845847836897066,
"grad_norm": 0.11941008269786835,
"learning_rate": 1e-06,
"loss": 0.5568,
"num_tokens": 694809843.0,
"step": 495
},
{
"epoch": 0.9865738438587768,
"grad_norm": 0.11860588937997818,
"learning_rate": 1e-06,
"loss": 0.5531,
"num_tokens": 696211950.0,
"step": 496
},
{
"epoch": 0.9885629040278469,
"grad_norm": 0.12157568335533142,
"learning_rate": 1e-06,
"loss": 0.5569,
"num_tokens": 697624687.0,
"step": 497
},
{
"epoch": 0.9905519641969169,
"grad_norm": 0.11806346476078033,
"learning_rate": 1e-06,
"loss": 0.5743,
"num_tokens": 699049049.0,
"step": 498
},
{
"epoch": 0.9925410243659871,
"grad_norm": 0.1187855452299118,
"learning_rate": 1e-06,
"loss": 0.5558,
"num_tokens": 700440889.0,
"step": 499
},
{
"epoch": 0.9945300845350572,
"grad_norm": 0.11523641645908356,
"learning_rate": 1e-06,
"loss": 0.56,
"num_tokens": 701864042.0,
"step": 500
},
{
"epoch": 0.9965191447041273,
"grad_norm": 0.11771584302186966,
"learning_rate": 1e-06,
"loss": 0.5536,
"num_tokens": 703255455.0,
"step": 501
},
{
"epoch": 0.9985082048731975,
"grad_norm": 0.11954519152641296,
"learning_rate": 1e-06,
"loss": 0.5487,
"num_tokens": 704646212.0,
"step": 502
},
{
"epoch": 1.0019890601690702,
"grad_norm": 0.24864843487739563,
"learning_rate": 1e-06,
"loss": 1.1087,
"num_tokens": 706359781.0,
"step": 503
},
{
"epoch": 1.0039781203381402,
"grad_norm": 0.15665407478809357,
"learning_rate": 1e-06,
"loss": 0.542,
"num_tokens": 707777523.0,
"step": 504
},
{
"epoch": 1.0059671805072103,
"grad_norm": 0.12076932191848755,
"learning_rate": 1e-06,
"loss": 0.5393,
"num_tokens": 709165414.0,
"step": 505
},
{
"epoch": 1.0079562406762805,
"grad_norm": 0.11568966507911682,
"learning_rate": 1e-06,
"loss": 0.5503,
"num_tokens": 710541219.0,
"step": 506
},
{
"epoch": 1.0099453008453505,
"grad_norm": 0.1189185082912445,
"learning_rate": 1e-06,
"loss": 0.555,
"num_tokens": 711904899.0,
"step": 507
},
{
"epoch": 1.0119343610144207,
"grad_norm": 0.11912715435028076,
"learning_rate": 1e-06,
"loss": 0.5399,
"num_tokens": 713297626.0,
"step": 508
},
{
"epoch": 1.0139234211834909,
"grad_norm": 0.11964980512857437,
"learning_rate": 1e-06,
"loss": 0.5585,
"num_tokens": 714712902.0,
"step": 509
},
{
"epoch": 1.0159124813525608,
"grad_norm": 0.11817710846662521,
"learning_rate": 1e-06,
"loss": 0.5518,
"num_tokens": 716119424.0,
"step": 510
},
{
"epoch": 1.017901541521631,
"grad_norm": 0.11699645221233368,
"learning_rate": 1e-06,
"loss": 0.5447,
"num_tokens": 717518297.0,
"step": 511
},
{
"epoch": 1.0198906016907012,
"grad_norm": 0.11419110000133514,
"learning_rate": 1e-06,
"loss": 0.5532,
"num_tokens": 718934437.0,
"step": 512
},
{
"epoch": 1.0218796618597712,
"grad_norm": 0.11955999583005905,
"learning_rate": 1e-06,
"loss": 0.5365,
"num_tokens": 720296609.0,
"step": 513
},
{
"epoch": 1.0238687220288414,
"grad_norm": 0.11673971265554428,
"learning_rate": 1e-06,
"loss": 0.5549,
"num_tokens": 721713131.0,
"step": 514
},
{
"epoch": 1.0258577821979116,
"grad_norm": 0.11641617864370346,
"learning_rate": 1e-06,
"loss": 0.5654,
"num_tokens": 723093279.0,
"step": 515
},
{
"epoch": 1.0278468423669815,
"grad_norm": 0.11750617623329163,
"learning_rate": 1e-06,
"loss": 0.5391,
"num_tokens": 724491829.0,
"step": 516
},
{
"epoch": 1.0298359025360517,
"grad_norm": 0.1122995987534523,
"learning_rate": 1e-06,
"loss": 0.5421,
"num_tokens": 725927036.0,
"step": 517
},
{
"epoch": 1.031824962705122,
"grad_norm": 0.11935453861951828,
"learning_rate": 1e-06,
"loss": 0.5522,
"num_tokens": 727309822.0,
"step": 518
},
{
"epoch": 1.0338140228741919,
"grad_norm": 0.1215103417634964,
"learning_rate": 1e-06,
"loss": 0.5543,
"num_tokens": 728731842.0,
"step": 519
},
{
"epoch": 1.035803083043262,
"grad_norm": 0.12213215231895447,
"learning_rate": 1e-06,
"loss": 0.5502,
"num_tokens": 730121195.0,
"step": 520
},
{
"epoch": 1.0377921432123323,
"grad_norm": 0.11967755854129791,
"learning_rate": 1e-06,
"loss": 0.5513,
"num_tokens": 731512351.0,
"step": 521
},
{
"epoch": 1.0397812033814022,
"grad_norm": 0.11795569211244583,
"learning_rate": 1e-06,
"loss": 0.548,
"num_tokens": 732931422.0,
"step": 522
},
{
"epoch": 1.0417702635504724,
"grad_norm": 0.11842501163482666,
"learning_rate": 1e-06,
"loss": 0.55,
"num_tokens": 734353348.0,
"step": 523
},
{
"epoch": 1.0437593237195426,
"grad_norm": 0.1207478791475296,
"learning_rate": 1e-06,
"loss": 0.5516,
"num_tokens": 735742554.0,
"step": 524
},
{
"epoch": 1.0457483838886126,
"grad_norm": 0.11943433433771133,
"learning_rate": 1e-06,
"loss": 0.5513,
"num_tokens": 737158566.0,
"step": 525
},
{
"epoch": 1.0477374440576828,
"grad_norm": 0.12060469388961792,
"learning_rate": 1e-06,
"loss": 0.5623,
"num_tokens": 738552506.0,
"step": 526
},
{
"epoch": 1.049726504226753,
"grad_norm": 0.11873313784599304,
"learning_rate": 1e-06,
"loss": 0.5502,
"num_tokens": 739982404.0,
"step": 527
},
{
"epoch": 1.051715564395823,
"grad_norm": 0.11538344621658325,
"learning_rate": 1e-06,
"loss": 0.5523,
"num_tokens": 741379974.0,
"step": 528
},
{
"epoch": 1.053704624564893,
"grad_norm": 0.11816058307886124,
"learning_rate": 1e-06,
"loss": 0.5625,
"num_tokens": 742756391.0,
"step": 529
},
{
"epoch": 1.0556936847339633,
"grad_norm": 0.12244360148906708,
"learning_rate": 1e-06,
"loss": 0.5433,
"num_tokens": 744195716.0,
"step": 530
},
{
"epoch": 1.0576827449030333,
"grad_norm": 0.12584052979946136,
"learning_rate": 1e-06,
"loss": 0.5553,
"num_tokens": 745589558.0,
"step": 531
},
{
"epoch": 1.0596718050721035,
"grad_norm": 0.11298387497663498,
"learning_rate": 1e-06,
"loss": 0.5477,
"num_tokens": 746987877.0,
"step": 532
},
{
"epoch": 1.0616608652411736,
"grad_norm": 0.12312706559896469,
"learning_rate": 1e-06,
"loss": 0.5701,
"num_tokens": 748360690.0,
"step": 533
},
{
"epoch": 1.0636499254102436,
"grad_norm": 0.11732471734285355,
"learning_rate": 1e-06,
"loss": 0.545,
"num_tokens": 749752632.0,
"step": 534
},
{
"epoch": 1.0656389855793138,
"grad_norm": 0.11663486808538437,
"learning_rate": 1e-06,
"loss": 0.5556,
"num_tokens": 751135143.0,
"step": 535
},
{
"epoch": 1.067628045748384,
"grad_norm": 0.11738405376672745,
"learning_rate": 1e-06,
"loss": 0.542,
"num_tokens": 752504589.0,
"step": 536
},
{
"epoch": 1.069617105917454,
"grad_norm": 0.11650814861059189,
"learning_rate": 1e-06,
"loss": 0.5501,
"num_tokens": 753908272.0,
"step": 537
},
{
"epoch": 1.0716061660865241,
"grad_norm": 0.11730290204286575,
"learning_rate": 1e-06,
"loss": 0.5525,
"num_tokens": 755345567.0,
"step": 538
},
{
"epoch": 1.0735952262555943,
"grad_norm": 0.11732745170593262,
"learning_rate": 1e-06,
"loss": 0.5471,
"num_tokens": 756742209.0,
"step": 539
},
{
"epoch": 1.0755842864246643,
"grad_norm": 0.11920227110385895,
"learning_rate": 1e-06,
"loss": 0.5453,
"num_tokens": 758140909.0,
"step": 540
},
{
"epoch": 1.0775733465937345,
"grad_norm": 0.11768822371959686,
"learning_rate": 1e-06,
"loss": 0.5427,
"num_tokens": 759550869.0,
"step": 541
},
{
"epoch": 1.0795624067628045,
"grad_norm": 0.11679795384407043,
"learning_rate": 1e-06,
"loss": 0.5615,
"num_tokens": 760954672.0,
"step": 542
},
{
"epoch": 1.0815514669318746,
"grad_norm": 0.11920733749866486,
"learning_rate": 1e-06,
"loss": 0.5505,
"num_tokens": 762355822.0,
"step": 543
},
{
"epoch": 1.0835405271009448,
"grad_norm": 0.1188458576798439,
"learning_rate": 1e-06,
"loss": 0.5678,
"num_tokens": 763788308.0,
"step": 544
},
{
"epoch": 1.085529587270015,
"grad_norm": 0.11863641440868378,
"learning_rate": 1e-06,
"loss": 0.5467,
"num_tokens": 765177690.0,
"step": 545
},
{
"epoch": 1.087518647439085,
"grad_norm": 0.11603251844644547,
"learning_rate": 1e-06,
"loss": 0.5369,
"num_tokens": 766617783.0,
"step": 546
},
{
"epoch": 1.0895077076081552,
"grad_norm": 0.1217975988984108,
"learning_rate": 1e-06,
"loss": 0.5431,
"num_tokens": 768029229.0,
"step": 547
},
{
"epoch": 1.0914967677772252,
"grad_norm": 0.11564943194389343,
"learning_rate": 1e-06,
"loss": 0.5438,
"num_tokens": 769417054.0,
"step": 548
},
{
"epoch": 1.0934858279462953,
"grad_norm": 0.1208975538611412,
"learning_rate": 1e-06,
"loss": 0.5502,
"num_tokens": 770836479.0,
"step": 549
},
{
"epoch": 1.0954748881153655,
"grad_norm": 0.11667314916849136,
"learning_rate": 1e-06,
"loss": 0.5547,
"num_tokens": 772254674.0,
"step": 550
},
{
"epoch": 1.0974639482844357,
"grad_norm": 0.11516263335943222,
"learning_rate": 1e-06,
"loss": 0.55,
"num_tokens": 773706020.0,
"step": 551
},
{
"epoch": 1.0994530084535057,
"grad_norm": 0.11822597682476044,
"learning_rate": 1e-06,
"loss": 0.54,
"num_tokens": 775110778.0,
"step": 552
},
{
"epoch": 1.1014420686225759,
"grad_norm": 0.12346749752759933,
"learning_rate": 1e-06,
"loss": 0.5471,
"num_tokens": 776538512.0,
"step": 553
},
{
"epoch": 1.1034311287916458,
"grad_norm": 0.11938782781362534,
"learning_rate": 1e-06,
"loss": 0.5472,
"num_tokens": 777966414.0,
"step": 554
},
{
"epoch": 1.105420188960716,
"grad_norm": 0.14581921696662903,
"learning_rate": 1e-06,
"loss": 0.5537,
"num_tokens": 779354221.0,
"step": 555
},
{
"epoch": 1.1074092491297862,
"grad_norm": 0.11971963196992874,
"learning_rate": 1e-06,
"loss": 0.5487,
"num_tokens": 780779804.0,
"step": 556
},
{
"epoch": 1.1093983092988562,
"grad_norm": 0.11833789944648743,
"learning_rate": 1e-06,
"loss": 0.5463,
"num_tokens": 782198212.0,
"step": 557
},
{
"epoch": 1.1113873694679264,
"grad_norm": 0.12102148681879044,
"learning_rate": 1e-06,
"loss": 0.5565,
"num_tokens": 783630792.0,
"step": 558
},
{
"epoch": 1.1133764296369966,
"grad_norm": 0.11505177617073059,
"learning_rate": 1e-06,
"loss": 0.5361,
"num_tokens": 785041609.0,
"step": 559
},
{
"epoch": 1.1153654898060665,
"grad_norm": 0.12186893075704575,
"learning_rate": 1e-06,
"loss": 0.5406,
"num_tokens": 786417350.0,
"step": 560
},
{
"epoch": 1.1173545499751367,
"grad_norm": 0.12019500881433487,
"learning_rate": 1e-06,
"loss": 0.5568,
"num_tokens": 787852563.0,
"step": 561
},
{
"epoch": 1.119343610144207,
"grad_norm": 0.11836836487054825,
"learning_rate": 1e-06,
"loss": 0.5338,
"num_tokens": 789256403.0,
"step": 562
},
{
"epoch": 1.1213326703132769,
"grad_norm": 0.1224868968129158,
"learning_rate": 1e-06,
"loss": 0.5363,
"num_tokens": 790627959.0,
"step": 563
},
{
"epoch": 1.123321730482347,
"grad_norm": 0.1227540671825409,
"learning_rate": 1e-06,
"loss": 0.5398,
"num_tokens": 792047776.0,
"step": 564
},
{
"epoch": 1.1253107906514173,
"grad_norm": 0.12231338769197464,
"learning_rate": 1e-06,
"loss": 0.5393,
"num_tokens": 793437309.0,
"step": 565
},
{
"epoch": 1.1272998508204872,
"grad_norm": 0.11635982990264893,
"learning_rate": 1e-06,
"loss": 0.5491,
"num_tokens": 794832359.0,
"step": 566
},
{
"epoch": 1.1292889109895574,
"grad_norm": 0.11689839512109756,
"learning_rate": 1e-06,
"loss": 0.5521,
"num_tokens": 796221119.0,
"step": 567
},
{
"epoch": 1.1312779711586276,
"grad_norm": 0.11974216252565384,
"learning_rate": 1e-06,
"loss": 0.5472,
"num_tokens": 797660744.0,
"step": 568
},
{
"epoch": 1.1332670313276976,
"grad_norm": 0.1189328134059906,
"learning_rate": 1e-06,
"loss": 0.5444,
"num_tokens": 799060817.0,
"step": 569
},
{
"epoch": 1.1352560914967678,
"grad_norm": 0.1164221465587616,
"learning_rate": 1e-06,
"loss": 0.5442,
"num_tokens": 800454569.0,
"step": 570
},
{
"epoch": 1.137245151665838,
"grad_norm": 0.11653874069452286,
"learning_rate": 1e-06,
"loss": 0.55,
"num_tokens": 801905348.0,
"step": 571
},
{
"epoch": 1.139234211834908,
"grad_norm": 0.11661481857299805,
"learning_rate": 1e-06,
"loss": 0.5492,
"num_tokens": 803300441.0,
"step": 572
},
{
"epoch": 1.141223272003978,
"grad_norm": 0.11541904509067535,
"learning_rate": 1e-06,
"loss": 0.5299,
"num_tokens": 804671703.0,
"step": 573
},
{
"epoch": 1.1432123321730483,
"grad_norm": 0.11833840608596802,
"learning_rate": 1e-06,
"loss": 0.5442,
"num_tokens": 806085229.0,
"step": 574
},
{
"epoch": 1.1452013923421183,
"grad_norm": 0.11650761216878891,
"learning_rate": 1e-06,
"loss": 0.5363,
"num_tokens": 807470114.0,
"step": 575
},
{
"epoch": 1.1471904525111885,
"grad_norm": 0.11970090866088867,
"learning_rate": 1e-06,
"loss": 0.5517,
"num_tokens": 808920938.0,
"step": 576
},
{
"epoch": 1.1491795126802586,
"grad_norm": 0.11860202997922897,
"learning_rate": 1e-06,
"loss": 0.5531,
"num_tokens": 810330812.0,
"step": 577
},
{
"epoch": 1.1511685728493286,
"grad_norm": 0.11822472512722015,
"learning_rate": 1e-06,
"loss": 0.5386,
"num_tokens": 811715751.0,
"step": 578
},
{
"epoch": 1.1531576330183988,
"grad_norm": 0.11776979267597198,
"learning_rate": 1e-06,
"loss": 0.538,
"num_tokens": 813102121.0,
"step": 579
},
{
"epoch": 1.155146693187469,
"grad_norm": 0.11876077950000763,
"learning_rate": 1e-06,
"loss": 0.5578,
"num_tokens": 814501435.0,
"step": 580
},
{
"epoch": 1.157135753356539,
"grad_norm": 0.12163852900266647,
"learning_rate": 1e-06,
"loss": 0.5301,
"num_tokens": 815900526.0,
"step": 581
},
{
"epoch": 1.1591248135256091,
"grad_norm": 0.11880628764629364,
"learning_rate": 1e-06,
"loss": 0.553,
"num_tokens": 817316569.0,
"step": 582
},
{
"epoch": 1.1611138736946793,
"grad_norm": 0.11747530102729797,
"learning_rate": 1e-06,
"loss": 0.5407,
"num_tokens": 818712696.0,
"step": 583
},
{
"epoch": 1.1631029338637493,
"grad_norm": 0.11508717387914658,
"learning_rate": 1e-06,
"loss": 0.5525,
"num_tokens": 820179550.0,
"step": 584
},
{
"epoch": 1.1650919940328195,
"grad_norm": 0.11923891305923462,
"learning_rate": 1e-06,
"loss": 0.5299,
"num_tokens": 821519204.0,
"step": 585
},
{
"epoch": 1.1670810542018897,
"grad_norm": 0.12130296975374222,
"learning_rate": 1e-06,
"loss": 0.5584,
"num_tokens": 822934109.0,
"step": 586
},
{
"epoch": 1.1690701143709596,
"grad_norm": 0.11868572235107422,
"learning_rate": 1e-06,
"loss": 0.5237,
"num_tokens": 824350114.0,
"step": 587
},
{
"epoch": 1.1710591745400298,
"grad_norm": 0.11723876744508743,
"learning_rate": 1e-06,
"loss": 0.5535,
"num_tokens": 825723811.0,
"step": 588
},
{
"epoch": 1.1730482347091,
"grad_norm": 0.11835741996765137,
"learning_rate": 1e-06,
"loss": 0.5346,
"num_tokens": 827152454.0,
"step": 589
},
{
"epoch": 1.17503729487817,
"grad_norm": 0.11868591606616974,
"learning_rate": 1e-06,
"loss": 0.5351,
"num_tokens": 828579537.0,
"step": 590
},
{
"epoch": 1.1770263550472402,
"grad_norm": 0.11987331509590149,
"learning_rate": 1e-06,
"loss": 0.5367,
"num_tokens": 829969684.0,
"step": 591
},
{
"epoch": 1.1790154152163104,
"grad_norm": 0.11743640154600143,
"learning_rate": 1e-06,
"loss": 0.5548,
"num_tokens": 831391395.0,
"step": 592
},
{
"epoch": 1.1810044753853803,
"grad_norm": 0.1182253286242485,
"learning_rate": 1e-06,
"loss": 0.5516,
"num_tokens": 832783491.0,
"step": 593
},
{
"epoch": 1.1829935355544505,
"grad_norm": 0.11603699624538422,
"learning_rate": 1e-06,
"loss": 0.5356,
"num_tokens": 834171588.0,
"step": 594
},
{
"epoch": 1.1849825957235207,
"grad_norm": 0.1183595210313797,
"learning_rate": 1e-06,
"loss": 0.5511,
"num_tokens": 835573502.0,
"step": 595
},
{
"epoch": 1.1869716558925907,
"grad_norm": 0.11468026787042618,
"learning_rate": 1e-06,
"loss": 0.5509,
"num_tokens": 837028240.0,
"step": 596
},
{
"epoch": 1.1889607160616609,
"grad_norm": 0.11567319929599762,
"learning_rate": 1e-06,
"loss": 0.5472,
"num_tokens": 838416832.0,
"step": 597
},
{
"epoch": 1.190949776230731,
"grad_norm": 0.11915028095245361,
"learning_rate": 1e-06,
"loss": 0.553,
"num_tokens": 839812316.0,
"step": 598
},
{
"epoch": 1.192938836399801,
"grad_norm": 0.11828889697790146,
"learning_rate": 1e-06,
"loss": 0.5442,
"num_tokens": 841215463.0,
"step": 599
},
{
"epoch": 1.1949278965688712,
"grad_norm": 0.11770551651716232,
"learning_rate": 1e-06,
"loss": 0.547,
"num_tokens": 842662727.0,
"step": 600
},
{
"epoch": 1.1969169567379414,
"grad_norm": 0.1193631961941719,
"learning_rate": 1e-06,
"loss": 0.5452,
"num_tokens": 844081793.0,
"step": 601
},
{
"epoch": 1.1989060169070114,
"grad_norm": 0.11217671632766724,
"learning_rate": 1e-06,
"loss": 0.5401,
"num_tokens": 845485094.0,
"step": 602
},
{
"epoch": 1.2008950770760816,
"grad_norm": 0.11600279062986374,
"learning_rate": 1e-06,
"loss": 0.5565,
"num_tokens": 846899288.0,
"step": 603
},
{
"epoch": 1.2028841372451518,
"grad_norm": 0.11796706914901733,
"learning_rate": 1e-06,
"loss": 0.5434,
"num_tokens": 848304697.0,
"step": 604
},
{
"epoch": 1.2048731974142217,
"grad_norm": 0.11813243478536606,
"learning_rate": 1e-06,
"loss": 0.5535,
"num_tokens": 849715462.0,
"step": 605
},
{
"epoch": 1.206862257583292,
"grad_norm": 0.1221594288945198,
"learning_rate": 1e-06,
"loss": 0.5461,
"num_tokens": 851059379.0,
"step": 606
},
{
"epoch": 1.208851317752362,
"grad_norm": 0.11475210636854172,
"learning_rate": 1e-06,
"loss": 0.5394,
"num_tokens": 852449620.0,
"step": 607
},
{
"epoch": 1.210840377921432,
"grad_norm": 0.1158720999956131,
"learning_rate": 1e-06,
"loss": 0.5415,
"num_tokens": 853876338.0,
"step": 608
},
{
"epoch": 1.2128294380905023,
"grad_norm": 0.11944854259490967,
"learning_rate": 1e-06,
"loss": 0.5398,
"num_tokens": 855326253.0,
"step": 609
},
{
"epoch": 1.2148184982595724,
"grad_norm": 0.11523836106061935,
"learning_rate": 1e-06,
"loss": 0.5415,
"num_tokens": 856726725.0,
"step": 610
},
{
"epoch": 1.2168075584286424,
"grad_norm": 0.11895252019166946,
"learning_rate": 1e-06,
"loss": 0.5499,
"num_tokens": 858107209.0,
"step": 611
},
{
"epoch": 1.2187966185977126,
"grad_norm": 0.11535745114088058,
"learning_rate": 1e-06,
"loss": 0.5286,
"num_tokens": 859540802.0,
"step": 612
},
{
"epoch": 1.2207856787667828,
"grad_norm": 0.1177186667919159,
"learning_rate": 1e-06,
"loss": 0.5516,
"num_tokens": 860956386.0,
"step": 613
},
{
"epoch": 1.2227747389358528,
"grad_norm": 0.11561235785484314,
"learning_rate": 1e-06,
"loss": 0.5345,
"num_tokens": 862351381.0,
"step": 614
},
{
"epoch": 1.224763799104923,
"grad_norm": 0.12278357893228531,
"learning_rate": 1e-06,
"loss": 0.5436,
"num_tokens": 863752120.0,
"step": 615
},
{
"epoch": 1.2267528592739931,
"grad_norm": 0.12000274658203125,
"learning_rate": 1e-06,
"loss": 0.5409,
"num_tokens": 865134261.0,
"step": 616
},
{
"epoch": 1.228741919443063,
"grad_norm": 0.11960814148187637,
"learning_rate": 1e-06,
"loss": 0.5421,
"num_tokens": 866500301.0,
"step": 617
},
{
"epoch": 1.2307309796121333,
"grad_norm": 0.11419054120779037,
"learning_rate": 1e-06,
"loss": 0.5407,
"num_tokens": 867914829.0,
"step": 618
},
{
"epoch": 1.2327200397812033,
"grad_norm": 0.11924876272678375,
"learning_rate": 1e-06,
"loss": 0.533,
"num_tokens": 869296945.0,
"step": 619
},
{
"epoch": 1.2347090999502734,
"grad_norm": 0.12687626481056213,
"learning_rate": 1e-06,
"loss": 0.5433,
"num_tokens": 870720355.0,
"step": 620
},
{
"epoch": 1.2366981601193436,
"grad_norm": 0.11868540942668915,
"learning_rate": 1e-06,
"loss": 0.5427,
"num_tokens": 872090554.0,
"step": 621
},
{
"epoch": 1.2386872202884138,
"grad_norm": 0.11346638202667236,
"learning_rate": 1e-06,
"loss": 0.5455,
"num_tokens": 873519656.0,
"step": 622
},
{
"epoch": 1.2406762804574838,
"grad_norm": 0.12468260526657104,
"learning_rate": 1e-06,
"loss": 0.5406,
"num_tokens": 874897139.0,
"step": 623
},
{
"epoch": 1.242665340626554,
"grad_norm": 0.11793619394302368,
"learning_rate": 1e-06,
"loss": 0.5541,
"num_tokens": 876281923.0,
"step": 624
},
{
"epoch": 1.244654400795624,
"grad_norm": 0.11685628443956375,
"learning_rate": 1e-06,
"loss": 0.5419,
"num_tokens": 877695435.0,
"step": 625
},
{
"epoch": 1.2466434609646941,
"grad_norm": 0.12373646348714828,
"learning_rate": 1e-06,
"loss": 0.5372,
"num_tokens": 879058206.0,
"step": 626
},
{
"epoch": 1.2486325211337643,
"grad_norm": 0.11609544605016708,
"learning_rate": 1e-06,
"loss": 0.5497,
"num_tokens": 880465724.0,
"step": 627
},
{
"epoch": 1.2506215813028345,
"grad_norm": 0.11885792016983032,
"learning_rate": 1e-06,
"loss": 0.5458,
"num_tokens": 881863204.0,
"step": 628
},
{
"epoch": 1.2526106414719045,
"grad_norm": 0.1189800500869751,
"learning_rate": 1e-06,
"loss": 0.5425,
"num_tokens": 883235922.0,
"step": 629
},
{
"epoch": 1.2545997016409747,
"grad_norm": 0.11329221725463867,
"learning_rate": 1e-06,
"loss": 0.5371,
"num_tokens": 884644276.0,
"step": 630
},
{
"epoch": 1.2565887618100446,
"grad_norm": 0.11825796961784363,
"learning_rate": 1e-06,
"loss": 0.5351,
"num_tokens": 886073154.0,
"step": 631
},
{
"epoch": 1.2585778219791148,
"grad_norm": 0.114280566573143,
"learning_rate": 1e-06,
"loss": 0.541,
"num_tokens": 887514458.0,
"step": 632
},
{
"epoch": 1.260566882148185,
"grad_norm": 0.1187988743185997,
"learning_rate": 1e-06,
"loss": 0.5364,
"num_tokens": 888906987.0,
"step": 633
},
{
"epoch": 1.2625559423172552,
"grad_norm": 0.11506423354148865,
"learning_rate": 1e-06,
"loss": 0.5431,
"num_tokens": 890319605.0,
"step": 634
},
{
"epoch": 1.2645450024863252,
"grad_norm": 0.1173451617360115,
"learning_rate": 1e-06,
"loss": 0.5345,
"num_tokens": 891716052.0,
"step": 635
},
{
"epoch": 1.2665340626553954,
"grad_norm": 0.11930102109909058,
"learning_rate": 1e-06,
"loss": 0.5439,
"num_tokens": 893119175.0,
"step": 636
},
{
"epoch": 1.2685231228244653,
"grad_norm": 0.11887087672948837,
"learning_rate": 1e-06,
"loss": 0.5348,
"num_tokens": 894490469.0,
"step": 637
},
{
"epoch": 1.2705121829935355,
"grad_norm": 0.11899300664663315,
"learning_rate": 1e-06,
"loss": 0.5491,
"num_tokens": 895868778.0,
"step": 638
},
{
"epoch": 1.2725012431626057,
"grad_norm": 0.116294264793396,
"learning_rate": 1e-06,
"loss": 0.5537,
"num_tokens": 897281587.0,
"step": 639
},
{
"epoch": 1.274490303331676,
"grad_norm": 0.12177430093288422,
"learning_rate": 1e-06,
"loss": 0.5369,
"num_tokens": 898705618.0,
"step": 640
},
{
"epoch": 1.2764793635007459,
"grad_norm": 0.11538566648960114,
"learning_rate": 1e-06,
"loss": 0.5387,
"num_tokens": 900118795.0,
"step": 641
},
{
"epoch": 1.278468423669816,
"grad_norm": 0.11888190358877182,
"learning_rate": 1e-06,
"loss": 0.5448,
"num_tokens": 901549973.0,
"step": 642
},
{
"epoch": 1.280457483838886,
"grad_norm": 0.11358219385147095,
"learning_rate": 1e-06,
"loss": 0.5403,
"num_tokens": 902947794.0,
"step": 643
},
{
"epoch": 1.2824465440079562,
"grad_norm": 0.11417380720376968,
"learning_rate": 1e-06,
"loss": 0.5332,
"num_tokens": 904349508.0,
"step": 644
},
{
"epoch": 1.2844356041770264,
"grad_norm": 0.11959497630596161,
"learning_rate": 1e-06,
"loss": 0.5406,
"num_tokens": 905747046.0,
"step": 645
},
{
"epoch": 1.2864246643460966,
"grad_norm": 0.11622175574302673,
"learning_rate": 1e-06,
"loss": 0.5415,
"num_tokens": 907173072.0,
"step": 646
},
{
"epoch": 1.2884137245151666,
"grad_norm": 0.11988142877817154,
"learning_rate": 1e-06,
"loss": 0.5451,
"num_tokens": 908584591.0,
"step": 647
},
{
"epoch": 1.2904027846842367,
"grad_norm": 0.1185469850897789,
"learning_rate": 1e-06,
"loss": 0.5351,
"num_tokens": 910011116.0,
"step": 648
},
{
"epoch": 1.2923918448533067,
"grad_norm": 0.11444271355867386,
"learning_rate": 1e-06,
"loss": 0.5384,
"num_tokens": 911427909.0,
"step": 649
},
{
"epoch": 1.294380905022377,
"grad_norm": 0.12033736705780029,
"learning_rate": 1e-06,
"loss": 0.5498,
"num_tokens": 912835863.0,
"step": 650
},
{
"epoch": 1.296369965191447,
"grad_norm": 0.1319677233695984,
"learning_rate": 1e-06,
"loss": 0.5296,
"num_tokens": 914238947.0,
"step": 651
},
{
"epoch": 1.2983590253605173,
"grad_norm": 0.12392336130142212,
"learning_rate": 1e-06,
"loss": 0.5341,
"num_tokens": 915655787.0,
"step": 652
},
{
"epoch": 1.3003480855295872,
"grad_norm": 0.11796288192272186,
"learning_rate": 1e-06,
"loss": 0.547,
"num_tokens": 917089247.0,
"step": 653
},
{
"epoch": 1.3023371456986574,
"grad_norm": 0.12152981013059616,
"learning_rate": 1e-06,
"loss": 0.5544,
"num_tokens": 918505085.0,
"step": 654
},
{
"epoch": 1.3043262058677274,
"grad_norm": 0.11925685405731201,
"learning_rate": 1e-06,
"loss": 0.5304,
"num_tokens": 919892998.0,
"step": 655
},
{
"epoch": 1.3063152660367976,
"grad_norm": 0.11711208522319794,
"learning_rate": 1e-06,
"loss": 0.5426,
"num_tokens": 921240479.0,
"step": 656
},
{
"epoch": 1.3083043262058678,
"grad_norm": 0.12039055675268173,
"learning_rate": 1e-06,
"loss": 0.5368,
"num_tokens": 922632985.0,
"step": 657
},
{
"epoch": 1.310293386374938,
"grad_norm": 0.11820589005947113,
"learning_rate": 1e-06,
"loss": 0.5324,
"num_tokens": 924054020.0,
"step": 658
},
{
"epoch": 1.312282446544008,
"grad_norm": 0.11549760401248932,
"learning_rate": 1e-06,
"loss": 0.5442,
"num_tokens": 925466685.0,
"step": 659
},
{
"epoch": 1.3142715067130781,
"grad_norm": 0.12046794593334198,
"learning_rate": 1e-06,
"loss": 0.5491,
"num_tokens": 926874635.0,
"step": 660
},
{
"epoch": 1.316260566882148,
"grad_norm": 0.1153668761253357,
"learning_rate": 1e-06,
"loss": 0.5371,
"num_tokens": 928294778.0,
"step": 661
},
{
"epoch": 1.3182496270512183,
"grad_norm": 0.11516553908586502,
"learning_rate": 1e-06,
"loss": 0.5405,
"num_tokens": 929703109.0,
"step": 662
},
{
"epoch": 1.3202386872202885,
"grad_norm": 0.11781197786331177,
"learning_rate": 1e-06,
"loss": 0.5342,
"num_tokens": 931098455.0,
"step": 663
},
{
"epoch": 1.3222277473893587,
"grad_norm": 0.11899585276842117,
"learning_rate": 1e-06,
"loss": 0.546,
"num_tokens": 932511314.0,
"step": 664
},
{
"epoch": 1.3242168075584286,
"grad_norm": 0.11900392174720764,
"learning_rate": 1e-06,
"loss": 0.5404,
"num_tokens": 933900286.0,
"step": 665
},
{
"epoch": 1.3262058677274988,
"grad_norm": 0.12140580266714096,
"learning_rate": 1e-06,
"loss": 0.5355,
"num_tokens": 935253652.0,
"step": 666
},
{
"epoch": 1.3281949278965688,
"grad_norm": 0.12140516191720963,
"learning_rate": 1e-06,
"loss": 0.5394,
"num_tokens": 936636184.0,
"step": 667
},
{
"epoch": 1.330183988065639,
"grad_norm": 0.11907900869846344,
"learning_rate": 1e-06,
"loss": 0.527,
"num_tokens": 938023738.0,
"step": 668
},
{
"epoch": 1.3321730482347092,
"grad_norm": 0.12013056874275208,
"learning_rate": 1e-06,
"loss": 0.5444,
"num_tokens": 939396704.0,
"step": 669
},
{
"epoch": 1.3341621084037791,
"grad_norm": 0.11747721582651138,
"learning_rate": 1e-06,
"loss": 0.534,
"num_tokens": 940820588.0,
"step": 670
},
{
"epoch": 1.3361511685728493,
"grad_norm": 0.11940892785787582,
"learning_rate": 1e-06,
"loss": 0.5323,
"num_tokens": 942230394.0,
"step": 671
},
{
"epoch": 1.3381402287419195,
"grad_norm": 0.12076081335544586,
"learning_rate": 1e-06,
"loss": 0.5395,
"num_tokens": 943640623.0,
"step": 672
},
{
"epoch": 1.3401292889109895,
"grad_norm": 0.11554915457963943,
"learning_rate": 1e-06,
"loss": 0.5423,
"num_tokens": 945055848.0,
"step": 673
},
{
"epoch": 1.3421183490800597,
"grad_norm": 0.11654532700777054,
"learning_rate": 1e-06,
"loss": 0.5235,
"num_tokens": 946461742.0,
"step": 674
},
{
"epoch": 1.3441074092491299,
"grad_norm": 0.11917490512132645,
"learning_rate": 1e-06,
"loss": 0.5389,
"num_tokens": 947841726.0,
"step": 675
},
{
"epoch": 1.3460964694181998,
"grad_norm": 0.1211070865392685,
"learning_rate": 1e-06,
"loss": 0.5347,
"num_tokens": 949278490.0,
"step": 676
},
{
"epoch": 1.34808552958727,
"grad_norm": 0.12035378068685532,
"learning_rate": 1e-06,
"loss": 0.5334,
"num_tokens": 950642424.0,
"step": 677
},
{
"epoch": 1.3500745897563402,
"grad_norm": 0.11970808357000351,
"learning_rate": 1e-06,
"loss": 0.5304,
"num_tokens": 952042727.0,
"step": 678
},
{
"epoch": 1.3520636499254102,
"grad_norm": 0.12441671639680862,
"learning_rate": 1e-06,
"loss": 0.5429,
"num_tokens": 953444541.0,
"step": 679
},
{
"epoch": 1.3540527100944804,
"grad_norm": 0.11760767549276352,
"learning_rate": 1e-06,
"loss": 0.5433,
"num_tokens": 954885585.0,
"step": 680
},
{
"epoch": 1.3560417702635505,
"grad_norm": 0.12732824683189392,
"learning_rate": 1e-06,
"loss": 0.5339,
"num_tokens": 956271933.0,
"step": 681
},
{
"epoch": 1.3580308304326205,
"grad_norm": 0.11609245091676712,
"learning_rate": 1e-06,
"loss": 0.5398,
"num_tokens": 957653773.0,
"step": 682
},
{
"epoch": 1.3600198906016907,
"grad_norm": 0.11394883692264557,
"learning_rate": 1e-06,
"loss": 0.5357,
"num_tokens": 959087783.0,
"step": 683
},
{
"epoch": 1.362008950770761,
"grad_norm": 0.13362184166908264,
"learning_rate": 1e-06,
"loss": 0.5404,
"num_tokens": 960495424.0,
"step": 684
},
{
"epoch": 1.3639980109398309,
"grad_norm": 0.11702502518892288,
"learning_rate": 1e-06,
"loss": 0.5325,
"num_tokens": 961901643.0,
"step": 685
},
{
"epoch": 1.365987071108901,
"grad_norm": 0.11658236384391785,
"learning_rate": 1e-06,
"loss": 0.5322,
"num_tokens": 963283706.0,
"step": 686
},
{
"epoch": 1.3679761312779712,
"grad_norm": 0.12017067521810532,
"learning_rate": 1e-06,
"loss": 0.5338,
"num_tokens": 964734481.0,
"step": 687
},
{
"epoch": 1.3699651914470412,
"grad_norm": 0.12005714327096939,
"learning_rate": 1e-06,
"loss": 0.5423,
"num_tokens": 966125592.0,
"step": 688
},
{
"epoch": 1.3719542516161114,
"grad_norm": 0.12015294283628464,
"learning_rate": 1e-06,
"loss": 0.5513,
"num_tokens": 967523933.0,
"step": 689
},
{
"epoch": 1.3739433117851814,
"grad_norm": 0.11920958757400513,
"learning_rate": 1e-06,
"loss": 0.5319,
"num_tokens": 968897975.0,
"step": 690
},
{
"epoch": 1.3759323719542516,
"grad_norm": 0.12052245438098907,
"learning_rate": 1e-06,
"loss": 0.5333,
"num_tokens": 970267864.0,
"step": 691
},
{
"epoch": 1.3779214321233217,
"grad_norm": 0.11884114146232605,
"learning_rate": 1e-06,
"loss": 0.5336,
"num_tokens": 971706591.0,
"step": 692
},
{
"epoch": 1.379910492292392,
"grad_norm": 0.11437772214412689,
"learning_rate": 1e-06,
"loss": 0.552,
"num_tokens": 973155693.0,
"step": 693
},
{
"epoch": 1.381899552461462,
"grad_norm": 0.12122377008199692,
"learning_rate": 1e-06,
"loss": 0.5444,
"num_tokens": 974547211.0,
"step": 694
},
{
"epoch": 1.383888612630532,
"grad_norm": 0.11425941437482834,
"learning_rate": 1e-06,
"loss": 0.5415,
"num_tokens": 976010063.0,
"step": 695
},
{
"epoch": 1.385877672799602,
"grad_norm": 0.11765948683023453,
"learning_rate": 1e-06,
"loss": 0.5284,
"num_tokens": 977411227.0,
"step": 696
},
{
"epoch": 1.3878667329686722,
"grad_norm": 0.11854742467403412,
"learning_rate": 1e-06,
"loss": 0.5338,
"num_tokens": 978786923.0,
"step": 697
},
{
"epoch": 1.3898557931377424,
"grad_norm": 0.12211066484451294,
"learning_rate": 1e-06,
"loss": 0.5319,
"num_tokens": 980164053.0,
"step": 698
},
{
"epoch": 1.3918448533068126,
"grad_norm": 0.1181558147072792,
"learning_rate": 1e-06,
"loss": 0.5233,
"num_tokens": 981562290.0,
"step": 699
},
{
"epoch": 1.3938339134758826,
"grad_norm": 0.12071076780557632,
"learning_rate": 1e-06,
"loss": 0.5394,
"num_tokens": 982998912.0,
"step": 700
},
{
"epoch": 1.3958229736449528,
"grad_norm": 0.11735861748456955,
"learning_rate": 1e-06,
"loss": 0.5456,
"num_tokens": 984410558.0,
"step": 701
},
{
"epoch": 1.3978120338140227,
"grad_norm": 0.11743367463350296,
"learning_rate": 1e-06,
"loss": 0.5396,
"num_tokens": 985836461.0,
"step": 702
},
{
"epoch": 1.399801093983093,
"grad_norm": 0.11885622888803482,
"learning_rate": 1e-06,
"loss": 0.5424,
"num_tokens": 987230105.0,
"step": 703
},
{
"epoch": 1.4017901541521631,
"grad_norm": 0.11783410608768463,
"learning_rate": 1e-06,
"loss": 0.5352,
"num_tokens": 988630556.0,
"step": 704
},
{
"epoch": 1.4037792143212333,
"grad_norm": 0.11775646358728409,
"learning_rate": 1e-06,
"loss": 0.5239,
"num_tokens": 989996092.0,
"step": 705
},
{
"epoch": 1.4057682744903033,
"grad_norm": 0.11850979924201965,
"learning_rate": 1e-06,
"loss": 0.5292,
"num_tokens": 991410237.0,
"step": 706
},
{
"epoch": 1.4077573346593735,
"grad_norm": 0.13466332852840424,
"learning_rate": 1e-06,
"loss": 0.5229,
"num_tokens": 992831683.0,
"step": 707
},
{
"epoch": 1.4097463948284434,
"grad_norm": 0.1183917373418808,
"learning_rate": 1e-06,
"loss": 0.5306,
"num_tokens": 994220085.0,
"step": 708
},
{
"epoch": 1.4117354549975136,
"grad_norm": 0.11719982326030731,
"learning_rate": 1e-06,
"loss": 0.5347,
"num_tokens": 995634427.0,
"step": 709
},
{
"epoch": 1.4137245151665838,
"grad_norm": 0.1188247948884964,
"learning_rate": 1e-06,
"loss": 0.5214,
"num_tokens": 996991797.0,
"step": 710
},
{
"epoch": 1.415713575335654,
"grad_norm": 0.12354787439107895,
"learning_rate": 1e-06,
"loss": 0.5394,
"num_tokens": 998387297.0,
"step": 711
},
{
"epoch": 1.417702635504724,
"grad_norm": 0.11752255260944366,
"learning_rate": 1e-06,
"loss": 0.5289,
"num_tokens": 999796509.0,
"step": 712
},
{
"epoch": 1.4196916956737942,
"grad_norm": 0.11573730409145355,
"learning_rate": 1e-06,
"loss": 0.5306,
"num_tokens": 1001189936.0,
"step": 713
},
{
"epoch": 1.4216807558428641,
"grad_norm": 0.12337598204612732,
"learning_rate": 1e-06,
"loss": 0.5368,
"num_tokens": 1002569771.0,
"step": 714
},
{
"epoch": 1.4236698160119343,
"grad_norm": 0.1165643110871315,
"learning_rate": 1e-06,
"loss": 0.5284,
"num_tokens": 1003987924.0,
"step": 715
},
{
"epoch": 1.4256588761810045,
"grad_norm": 0.12802375853061676,
"learning_rate": 1e-06,
"loss": 0.5404,
"num_tokens": 1005393054.0,
"step": 716
},
{
"epoch": 1.4276479363500747,
"grad_norm": 0.1228770762681961,
"learning_rate": 1e-06,
"loss": 0.5448,
"num_tokens": 1006837132.0,
"step": 717
},
{
"epoch": 1.4296369965191447,
"grad_norm": 0.12197130918502808,
"learning_rate": 1e-06,
"loss": 0.5514,
"num_tokens": 1008250141.0,
"step": 718
},
{
"epoch": 1.4316260566882149,
"grad_norm": 0.11660060286521912,
"learning_rate": 1e-06,
"loss": 0.5501,
"num_tokens": 1009672430.0,
"step": 719
},
{
"epoch": 1.4336151168572848,
"grad_norm": 0.13146308064460754,
"learning_rate": 1e-06,
"loss": 0.5526,
"num_tokens": 1011105282.0,
"step": 720
},
{
"epoch": 1.435604177026355,
"grad_norm": 0.12230085581541061,
"learning_rate": 1e-06,
"loss": 0.5388,
"num_tokens": 1012519871.0,
"step": 721
},
{
"epoch": 1.4375932371954252,
"grad_norm": 0.11886520683765411,
"learning_rate": 1e-06,
"loss": 0.5512,
"num_tokens": 1013919180.0,
"step": 722
},
{
"epoch": 1.4395822973644954,
"grad_norm": 0.12054796516895294,
"learning_rate": 1e-06,
"loss": 0.5418,
"num_tokens": 1015311974.0,
"step": 723
},
{
"epoch": 1.4415713575335654,
"grad_norm": 0.1221555769443512,
"learning_rate": 1e-06,
"loss": 0.5315,
"num_tokens": 1016710513.0,
"step": 724
},
{
"epoch": 1.4435604177026355,
"grad_norm": 0.12085101753473282,
"learning_rate": 1e-06,
"loss": 0.5267,
"num_tokens": 1018108542.0,
"step": 725
},
{
"epoch": 1.4455494778717055,
"grad_norm": 0.11824904382228851,
"learning_rate": 1e-06,
"loss": 0.5275,
"num_tokens": 1019503680.0,
"step": 726
},
{
"epoch": 1.4475385380407757,
"grad_norm": 0.11636096239089966,
"learning_rate": 1e-06,
"loss": 0.5326,
"num_tokens": 1020904351.0,
"step": 727
},
{
"epoch": 1.449527598209846,
"grad_norm": 0.12087342143058777,
"learning_rate": 1e-06,
"loss": 0.5418,
"num_tokens": 1022300450.0,
"step": 728
},
{
"epoch": 1.451516658378916,
"grad_norm": 0.11914920806884766,
"learning_rate": 1e-06,
"loss": 0.547,
"num_tokens": 1023750162.0,
"step": 729
},
{
"epoch": 1.453505718547986,
"grad_norm": 0.11899517476558685,
"learning_rate": 1e-06,
"loss": 0.5452,
"num_tokens": 1025196694.0,
"step": 730
},
{
"epoch": 1.4554947787170562,
"grad_norm": 0.1218375712633133,
"learning_rate": 1e-06,
"loss": 0.5321,
"num_tokens": 1026597406.0,
"step": 731
},
{
"epoch": 1.4574838388861262,
"grad_norm": 0.11746956408023834,
"learning_rate": 1e-06,
"loss": 0.5341,
"num_tokens": 1027971754.0,
"step": 732
},
{
"epoch": 1.4594728990551964,
"grad_norm": 0.12773922085762024,
"learning_rate": 1e-06,
"loss": 0.547,
"num_tokens": 1029364538.0,
"step": 733
},
{
"epoch": 1.4614619592242666,
"grad_norm": 0.1228381097316742,
"learning_rate": 1e-06,
"loss": 0.5403,
"num_tokens": 1030782739.0,
"step": 734
},
{
"epoch": 1.4634510193933368,
"grad_norm": 0.11855144798755646,
"learning_rate": 1e-06,
"loss": 0.5345,
"num_tokens": 1032154592.0,
"step": 735
},
{
"epoch": 1.4654400795624067,
"grad_norm": 0.12030474096536636,
"learning_rate": 1e-06,
"loss": 0.5314,
"num_tokens": 1033543123.0,
"step": 736
},
{
"epoch": 1.467429139731477,
"grad_norm": 0.12364325672388077,
"learning_rate": 1e-06,
"loss": 0.5422,
"num_tokens": 1034938165.0,
"step": 737
},
{
"epoch": 1.469418199900547,
"grad_norm": 0.12098907679319382,
"learning_rate": 1e-06,
"loss": 0.5375,
"num_tokens": 1036314909.0,
"step": 738
},
{
"epoch": 1.471407260069617,
"grad_norm": 0.11932458728551865,
"learning_rate": 1e-06,
"loss": 0.5347,
"num_tokens": 1037727924.0,
"step": 739
},
{
"epoch": 1.4733963202386873,
"grad_norm": 0.11801986396312714,
"learning_rate": 1e-06,
"loss": 0.5248,
"num_tokens": 1039111581.0,
"step": 740
},
{
"epoch": 1.4753853804077575,
"grad_norm": 0.11731583625078201,
"learning_rate": 1e-06,
"loss": 0.5345,
"num_tokens": 1040546305.0,
"step": 741
},
{
"epoch": 1.4773744405768274,
"grad_norm": 0.11801919341087341,
"learning_rate": 1e-06,
"loss": 0.5336,
"num_tokens": 1041939972.0,
"step": 742
},
{
"epoch": 1.4793635007458976,
"grad_norm": 0.12367312610149384,
"learning_rate": 1e-06,
"loss": 0.5389,
"num_tokens": 1043334299.0,
"step": 743
},
{
"epoch": 1.4813525609149676,
"grad_norm": 0.12621083855628967,
"learning_rate": 1e-06,
"loss": 0.5411,
"num_tokens": 1044732856.0,
"step": 744
},
{
"epoch": 1.4833416210840378,
"grad_norm": 0.1213909313082695,
"learning_rate": 1e-06,
"loss": 0.5339,
"num_tokens": 1046119537.0,
"step": 745
},
{
"epoch": 1.485330681253108,
"grad_norm": 0.12064032256603241,
"learning_rate": 1e-06,
"loss": 0.5501,
"num_tokens": 1047526533.0,
"step": 746
},
{
"epoch": 1.4873197414221782,
"grad_norm": 0.11744097620248795,
"learning_rate": 1e-06,
"loss": 0.5323,
"num_tokens": 1048909786.0,
"step": 747
},
{
"epoch": 1.4893088015912481,
"grad_norm": 0.11546076834201813,
"learning_rate": 1e-06,
"loss": 0.5396,
"num_tokens": 1050351234.0,
"step": 748
},
{
"epoch": 1.4912978617603183,
"grad_norm": 0.12047351151704788,
"learning_rate": 1e-06,
"loss": 0.5439,
"num_tokens": 1051747737.0,
"step": 749
},
{
"epoch": 1.4932869219293883,
"grad_norm": 0.11532466858625412,
"learning_rate": 1e-06,
"loss": 0.5267,
"num_tokens": 1053183317.0,
"step": 750
},
{
"epoch": 1.4952759820984585,
"grad_norm": 0.1169387549161911,
"learning_rate": 1e-06,
"loss": 0.526,
"num_tokens": 1054557285.0,
"step": 751
},
{
"epoch": 1.4972650422675287,
"grad_norm": 0.1165088340640068,
"learning_rate": 1e-06,
"loss": 0.5308,
"num_tokens": 1056007559.0,
"step": 752
},
{
"epoch": 1.4992541024365988,
"grad_norm": 0.11395064741373062,
"learning_rate": 1e-06,
"loss": 0.5408,
"num_tokens": 1057438934.0,
"step": 753
},
{
"epoch": 1.5012431626056688,
"grad_norm": 0.12597793340682983,
"learning_rate": 1e-06,
"loss": 0.54,
"num_tokens": 1058838684.0,
"step": 754
},
{
"epoch": 1.5032322227747388,
"grad_norm": 0.12164228409528732,
"learning_rate": 1e-06,
"loss": 0.538,
"num_tokens": 1060278255.0,
"step": 755
},
{
"epoch": 1.505221282943809,
"grad_norm": 0.11811868101358414,
"learning_rate": 1e-06,
"loss": 0.531,
"num_tokens": 1061695693.0,
"step": 756
},
{
"epoch": 1.5072103431128792,
"grad_norm": 0.1173863634467125,
"learning_rate": 1e-06,
"loss": 0.529,
"num_tokens": 1063113004.0,
"step": 757
},
{
"epoch": 1.5091994032819493,
"grad_norm": 0.1195053681731224,
"learning_rate": 1e-06,
"loss": 0.5467,
"num_tokens": 1064537749.0,
"step": 758
},
{
"epoch": 1.5111884634510195,
"grad_norm": 0.12815718352794647,
"learning_rate": 1e-06,
"loss": 0.5202,
"num_tokens": 1065931962.0,
"step": 759
},
{
"epoch": 1.5131775236200895,
"grad_norm": 0.12516118586063385,
"learning_rate": 1e-06,
"loss": 0.517,
"num_tokens": 1067328106.0,
"step": 760
},
{
"epoch": 1.5151665837891595,
"grad_norm": 0.11947084218263626,
"learning_rate": 1e-06,
"loss": 0.5198,
"num_tokens": 1068712446.0,
"step": 761
},
{
"epoch": 1.5171556439582297,
"grad_norm": 0.1181473359465599,
"learning_rate": 1e-06,
"loss": 0.5399,
"num_tokens": 1070129632.0,
"step": 762
},
{
"epoch": 1.5191447041272998,
"grad_norm": 0.12384405732154846,
"learning_rate": 1e-06,
"loss": 0.5369,
"num_tokens": 1071485504.0,
"step": 763
},
{
"epoch": 1.52113376429637,
"grad_norm": 0.12462117522954941,
"learning_rate": 1e-06,
"loss": 0.5313,
"num_tokens": 1072876547.0,
"step": 764
},
{
"epoch": 1.5231228244654402,
"grad_norm": 0.11820020526647568,
"learning_rate": 1e-06,
"loss": 0.5282,
"num_tokens": 1074262233.0,
"step": 765
},
{
"epoch": 1.5251118846345102,
"grad_norm": 0.11712965369224548,
"learning_rate": 1e-06,
"loss": 0.5286,
"num_tokens": 1075681885.0,
"step": 766
},
{
"epoch": 1.5271009448035802,
"grad_norm": 0.11938031017780304,
"learning_rate": 1e-06,
"loss": 0.524,
"num_tokens": 1077086623.0,
"step": 767
},
{
"epoch": 1.5290900049726504,
"grad_norm": 0.11355356127023697,
"learning_rate": 1e-06,
"loss": 0.5357,
"num_tokens": 1078502366.0,
"step": 768
},
{
"epoch": 1.5310790651417205,
"grad_norm": 0.12431412935256958,
"learning_rate": 1e-06,
"loss": 0.5317,
"num_tokens": 1079851354.0,
"step": 769
},
{
"epoch": 1.5330681253107907,
"grad_norm": 0.12124991416931152,
"learning_rate": 1e-06,
"loss": 0.5452,
"num_tokens": 1081246250.0,
"step": 770
},
{
"epoch": 1.535057185479861,
"grad_norm": 0.11724445968866348,
"learning_rate": 1e-06,
"loss": 0.53,
"num_tokens": 1082635196.0,
"step": 771
},
{
"epoch": 1.5370462456489309,
"grad_norm": 0.12210683524608612,
"learning_rate": 1e-06,
"loss": 0.5291,
"num_tokens": 1084011995.0,
"step": 772
},
{
"epoch": 1.5390353058180009,
"grad_norm": 0.11981118470430374,
"learning_rate": 1e-06,
"loss": 0.5438,
"num_tokens": 1085412000.0,
"step": 773
},
{
"epoch": 1.541024365987071,
"grad_norm": 0.12824386358261108,
"learning_rate": 1e-06,
"loss": 0.5393,
"num_tokens": 1086812231.0,
"step": 774
},
{
"epoch": 1.5430134261561412,
"grad_norm": 0.1165238469839096,
"learning_rate": 1e-06,
"loss": 0.5468,
"num_tokens": 1088217759.0,
"step": 775
},
{
"epoch": 1.5450024863252114,
"grad_norm": 0.11545863002538681,
"learning_rate": 1e-06,
"loss": 0.522,
"num_tokens": 1089622076.0,
"step": 776
},
{
"epoch": 1.5469915464942816,
"grad_norm": 0.11776294559240341,
"learning_rate": 1e-06,
"loss": 0.5439,
"num_tokens": 1091049994.0,
"step": 777
},
{
"epoch": 1.5489806066633516,
"grad_norm": 0.11632154881954193,
"learning_rate": 1e-06,
"loss": 0.5078,
"num_tokens": 1092400841.0,
"step": 778
},
{
"epoch": 1.5509696668324215,
"grad_norm": 0.1186751276254654,
"learning_rate": 1e-06,
"loss": 0.5372,
"num_tokens": 1093805680.0,
"step": 779
},
{
"epoch": 1.5529587270014917,
"grad_norm": 0.11867798119783401,
"learning_rate": 1e-06,
"loss": 0.5334,
"num_tokens": 1095211670.0,
"step": 780
},
{
"epoch": 1.554947787170562,
"grad_norm": 0.11884911358356476,
"learning_rate": 1e-06,
"loss": 0.5355,
"num_tokens": 1096608638.0,
"step": 781
},
{
"epoch": 1.5569368473396321,
"grad_norm": 0.12014549970626831,
"learning_rate": 1e-06,
"loss": 0.5485,
"num_tokens": 1098024501.0,
"step": 782
},
{
"epoch": 1.5589259075087023,
"grad_norm": 0.115864098072052,
"learning_rate": 1e-06,
"loss": 0.5372,
"num_tokens": 1099418318.0,
"step": 783
},
{
"epoch": 1.5609149676777723,
"grad_norm": 0.12087985128164291,
"learning_rate": 1e-06,
"loss": 0.5331,
"num_tokens": 1100818699.0,
"step": 784
},
{
"epoch": 1.5629040278468422,
"grad_norm": 0.11292678117752075,
"learning_rate": 1e-06,
"loss": 0.5345,
"num_tokens": 1102255753.0,
"step": 785
},
{
"epoch": 1.5648930880159124,
"grad_norm": 0.11706893891096115,
"learning_rate": 1e-06,
"loss": 0.5211,
"num_tokens": 1103629573.0,
"step": 786
},
{
"epoch": 1.5668821481849826,
"grad_norm": 0.11923506110906601,
"learning_rate": 1e-06,
"loss": 0.5309,
"num_tokens": 1104985171.0,
"step": 787
},
{
"epoch": 1.5688712083540528,
"grad_norm": 0.12054122984409332,
"learning_rate": 1e-06,
"loss": 0.5395,
"num_tokens": 1106405658.0,
"step": 788
},
{
"epoch": 1.5708602685231228,
"grad_norm": 0.11507654935121536,
"learning_rate": 1e-06,
"loss": 0.5414,
"num_tokens": 1107830805.0,
"step": 789
},
{
"epoch": 1.572849328692193,
"grad_norm": 0.11983779072761536,
"learning_rate": 1e-06,
"loss": 0.5314,
"num_tokens": 1109163312.0,
"step": 790
},
{
"epoch": 1.574838388861263,
"grad_norm": 0.11855509877204895,
"learning_rate": 1e-06,
"loss": 0.5279,
"num_tokens": 1110582152.0,
"step": 791
},
{
"epoch": 1.5768274490303331,
"grad_norm": 0.11437905579805374,
"learning_rate": 1e-06,
"loss": 0.5335,
"num_tokens": 1112007797.0,
"step": 792
},
{
"epoch": 1.5788165091994033,
"grad_norm": 0.11613345891237259,
"learning_rate": 1e-06,
"loss": 0.5364,
"num_tokens": 1113366919.0,
"step": 793
},
{
"epoch": 1.5808055693684735,
"grad_norm": 0.1167130321264267,
"learning_rate": 1e-06,
"loss": 0.5295,
"num_tokens": 1114808855.0,
"step": 794
},
{
"epoch": 1.5827946295375435,
"grad_norm": 0.11898983269929886,
"learning_rate": 1e-06,
"loss": 0.5426,
"num_tokens": 1116215942.0,
"step": 795
},
{
"epoch": 1.5847836897066137,
"grad_norm": 0.11814821511507034,
"learning_rate": 1e-06,
"loss": 0.5249,
"num_tokens": 1117635329.0,
"step": 796
},
{
"epoch": 1.5867727498756836,
"grad_norm": 0.11845888942480087,
"learning_rate": 1e-06,
"loss": 0.5347,
"num_tokens": 1119076550.0,
"step": 797
},
{
"epoch": 1.5887618100447538,
"grad_norm": 0.11748501658439636,
"learning_rate": 1e-06,
"loss": 0.5376,
"num_tokens": 1120476445.0,
"step": 798
},
{
"epoch": 1.590750870213824,
"grad_norm": 0.11534599214792252,
"learning_rate": 1e-06,
"loss": 0.5204,
"num_tokens": 1121859435.0,
"step": 799
},
{
"epoch": 1.5927399303828942,
"grad_norm": 0.11943424493074417,
"learning_rate": 1e-06,
"loss": 0.526,
"num_tokens": 1123242470.0,
"step": 800
},
{
"epoch": 1.5947289905519642,
"grad_norm": 0.12252327799797058,
"learning_rate": 1e-06,
"loss": 0.5278,
"num_tokens": 1124644409.0,
"step": 801
},
{
"epoch": 1.5967180507210343,
"grad_norm": 0.11914427578449249,
"learning_rate": 1e-06,
"loss": 0.5308,
"num_tokens": 1126067895.0,
"step": 802
},
{
"epoch": 1.5987071108901043,
"grad_norm": 0.12473994493484497,
"learning_rate": 1e-06,
"loss": 0.53,
"num_tokens": 1127456815.0,
"step": 803
},
{
"epoch": 1.6006961710591745,
"grad_norm": 0.11551981419324875,
"learning_rate": 1e-06,
"loss": 0.5255,
"num_tokens": 1128878563.0,
"step": 804
},
{
"epoch": 1.6026852312282447,
"grad_norm": 0.11678687483072281,
"learning_rate": 1e-06,
"loss": 0.5325,
"num_tokens": 1130317088.0,
"step": 805
},
{
"epoch": 1.6046742913973149,
"grad_norm": 0.1221092939376831,
"learning_rate": 1e-06,
"loss": 0.5404,
"num_tokens": 1131770565.0,
"step": 806
},
{
"epoch": 1.6066633515663848,
"grad_norm": 0.11658436805009842,
"learning_rate": 1e-06,
"loss": 0.5345,
"num_tokens": 1133178633.0,
"step": 807
},
{
"epoch": 1.608652411735455,
"grad_norm": 0.11696770042181015,
"learning_rate": 1e-06,
"loss": 0.5275,
"num_tokens": 1134574443.0,
"step": 808
},
{
"epoch": 1.610641471904525,
"grad_norm": 0.11893412470817566,
"learning_rate": 1e-06,
"loss": 0.5349,
"num_tokens": 1135948785.0,
"step": 809
},
{
"epoch": 1.6126305320735952,
"grad_norm": 0.12174445390701294,
"learning_rate": 1e-06,
"loss": 0.5292,
"num_tokens": 1137342427.0,
"step": 810
},
{
"epoch": 1.6146195922426654,
"grad_norm": 0.12794137001037598,
"learning_rate": 1e-06,
"loss": 0.5418,
"num_tokens": 1138767131.0,
"step": 811
},
{
"epoch": 1.6166086524117356,
"grad_norm": 0.11655872315168381,
"learning_rate": 1e-06,
"loss": 0.525,
"num_tokens": 1140181396.0,
"step": 812
},
{
"epoch": 1.6185977125808055,
"grad_norm": 0.11739625781774521,
"learning_rate": 1e-06,
"loss": 0.5265,
"num_tokens": 1141585956.0,
"step": 813
},
{
"epoch": 1.6205867727498757,
"grad_norm": 0.11966431885957718,
"learning_rate": 1e-06,
"loss": 0.5439,
"num_tokens": 1142972552.0,
"step": 814
},
{
"epoch": 1.6225758329189457,
"grad_norm": 0.1153174564242363,
"learning_rate": 1e-06,
"loss": 0.5269,
"num_tokens": 1144360432.0,
"step": 815
},
{
"epoch": 1.6245648930880159,
"grad_norm": 0.11621485650539398,
"learning_rate": 1e-06,
"loss": 0.5274,
"num_tokens": 1145763084.0,
"step": 816
},
{
"epoch": 1.626553953257086,
"grad_norm": 0.12078936398029327,
"learning_rate": 1e-06,
"loss": 0.5408,
"num_tokens": 1147181036.0,
"step": 817
},
{
"epoch": 1.6285430134261563,
"grad_norm": 0.11993258446455002,
"learning_rate": 1e-06,
"loss": 0.5371,
"num_tokens": 1148568031.0,
"step": 818
},
{
"epoch": 1.6305320735952262,
"grad_norm": 0.12087948620319366,
"learning_rate": 1e-06,
"loss": 0.5239,
"num_tokens": 1149971169.0,
"step": 819
},
{
"epoch": 1.6325211337642964,
"grad_norm": 0.12193674594163895,
"learning_rate": 1e-06,
"loss": 0.532,
"num_tokens": 1151375502.0,
"step": 820
},
{
"epoch": 1.6345101939333664,
"grad_norm": 0.11705011874437332,
"learning_rate": 1e-06,
"loss": 0.5374,
"num_tokens": 1152788267.0,
"step": 821
},
{
"epoch": 1.6364992541024366,
"grad_norm": 0.11758929491043091,
"learning_rate": 1e-06,
"loss": 0.5321,
"num_tokens": 1154203978.0,
"step": 822
},
{
"epoch": 1.6384883142715068,
"grad_norm": 0.11853373050689697,
"learning_rate": 1e-06,
"loss": 0.537,
"num_tokens": 1155628446.0,
"step": 823
},
{
"epoch": 1.640477374440577,
"grad_norm": 0.11777627468109131,
"learning_rate": 1e-06,
"loss": 0.5272,
"num_tokens": 1157039859.0,
"step": 824
},
{
"epoch": 1.642466434609647,
"grad_norm": 0.12647390365600586,
"learning_rate": 1e-06,
"loss": 0.5326,
"num_tokens": 1158461070.0,
"step": 825
},
{
"epoch": 1.644455494778717,
"grad_norm": 0.11644790321588516,
"learning_rate": 1e-06,
"loss": 0.5176,
"num_tokens": 1159857529.0,
"step": 826
},
{
"epoch": 1.646444554947787,
"grad_norm": 0.11872653663158417,
"learning_rate": 1e-06,
"loss": 0.5255,
"num_tokens": 1161253129.0,
"step": 827
},
{
"epoch": 1.6484336151168573,
"grad_norm": 0.1227663904428482,
"learning_rate": 1e-06,
"loss": 0.5217,
"num_tokens": 1162695252.0,
"step": 828
},
{
"epoch": 1.6504226752859275,
"grad_norm": 0.12286870181560516,
"learning_rate": 1e-06,
"loss": 0.5252,
"num_tokens": 1164073116.0,
"step": 829
},
{
"epoch": 1.6524117354549976,
"grad_norm": 0.11983044445514679,
"learning_rate": 1e-06,
"loss": 0.5277,
"num_tokens": 1165473708.0,
"step": 830
},
{
"epoch": 1.6544007956240676,
"grad_norm": 0.11557100713253021,
"learning_rate": 1e-06,
"loss": 0.5383,
"num_tokens": 1166861777.0,
"step": 831
},
{
"epoch": 1.6563898557931376,
"grad_norm": 0.12076076120138168,
"learning_rate": 1e-06,
"loss": 0.5438,
"num_tokens": 1168281147.0,
"step": 832
},
{
"epoch": 1.6583789159622078,
"grad_norm": 0.11843711882829666,
"learning_rate": 1e-06,
"loss": 0.5263,
"num_tokens": 1169644876.0,
"step": 833
},
{
"epoch": 1.660367976131278,
"grad_norm": 0.11820250749588013,
"learning_rate": 1e-06,
"loss": 0.5311,
"num_tokens": 1171071751.0,
"step": 834
},
{
"epoch": 1.6623570363003481,
"grad_norm": 0.11654365807771683,
"learning_rate": 1e-06,
"loss": 0.5307,
"num_tokens": 1172492906.0,
"step": 835
},
{
"epoch": 1.6643460964694183,
"grad_norm": 0.12128648906946182,
"learning_rate": 1e-06,
"loss": 0.5255,
"num_tokens": 1173918229.0,
"step": 836
},
{
"epoch": 1.6663351566384883,
"grad_norm": 0.1194423958659172,
"learning_rate": 1e-06,
"loss": 0.5265,
"num_tokens": 1175294915.0,
"step": 837
},
{
"epoch": 1.6683242168075583,
"grad_norm": 0.11887000501155853,
"learning_rate": 1e-06,
"loss": 0.5413,
"num_tokens": 1176717797.0,
"step": 838
},
{
"epoch": 1.6703132769766285,
"grad_norm": 0.1198115423321724,
"learning_rate": 1e-06,
"loss": 0.5251,
"num_tokens": 1178114954.0,
"step": 839
},
{
"epoch": 1.6723023371456986,
"grad_norm": 0.12133830040693283,
"learning_rate": 1e-06,
"loss": 0.5311,
"num_tokens": 1179495561.0,
"step": 840
},
{
"epoch": 1.6742913973147688,
"grad_norm": 0.12414910644292831,
"learning_rate": 1e-06,
"loss": 0.5354,
"num_tokens": 1180900637.0,
"step": 841
},
{
"epoch": 1.676280457483839,
"grad_norm": 0.11752628535032272,
"learning_rate": 1e-06,
"loss": 0.5302,
"num_tokens": 1182291450.0,
"step": 842
},
{
"epoch": 1.678269517652909,
"grad_norm": 0.11616742610931396,
"learning_rate": 1e-06,
"loss": 0.5253,
"num_tokens": 1183704362.0,
"step": 843
},
{
"epoch": 1.680258577821979,
"grad_norm": 0.11694184690713882,
"learning_rate": 1e-06,
"loss": 0.5365,
"num_tokens": 1185133642.0,
"step": 844
},
{
"epoch": 1.6822476379910491,
"grad_norm": 0.17413176596164703,
"learning_rate": 1e-06,
"loss": 0.5194,
"num_tokens": 1186533182.0,
"step": 845
},
{
"epoch": 1.6842366981601193,
"grad_norm": 0.11673513799905777,
"learning_rate": 1e-06,
"loss": 0.5238,
"num_tokens": 1187935618.0,
"step": 846
},
{
"epoch": 1.6862257583291895,
"grad_norm": 0.11706209182739258,
"learning_rate": 1e-06,
"loss": 0.5274,
"num_tokens": 1189355590.0,
"step": 847
},
{
"epoch": 1.6882148184982597,
"grad_norm": 0.12071144580841064,
"learning_rate": 1e-06,
"loss": 0.522,
"num_tokens": 1190723818.0,
"step": 848
},
{
"epoch": 1.6902038786673297,
"grad_norm": 0.11622277647256851,
"learning_rate": 1e-06,
"loss": 0.5284,
"num_tokens": 1192146190.0,
"step": 849
},
{
"epoch": 1.6921929388363997,
"grad_norm": 0.11853674054145813,
"learning_rate": 1e-06,
"loss": 0.5294,
"num_tokens": 1193541029.0,
"step": 850
},
{
"epoch": 1.6941819990054698,
"grad_norm": 0.11840229481458664,
"learning_rate": 1e-06,
"loss": 0.5146,
"num_tokens": 1194973011.0,
"step": 851
},
{
"epoch": 1.69617105917454,
"grad_norm": 0.11261092871427536,
"learning_rate": 1e-06,
"loss": 0.5263,
"num_tokens": 1196379352.0,
"step": 852
},
{
"epoch": 1.6981601193436102,
"grad_norm": 0.11384103447198868,
"learning_rate": 1e-06,
"loss": 0.5203,
"num_tokens": 1197758489.0,
"step": 853
},
{
"epoch": 1.7001491795126804,
"grad_norm": 0.12079748511314392,
"learning_rate": 1e-06,
"loss": 0.5315,
"num_tokens": 1199163292.0,
"step": 854
},
{
"epoch": 1.7021382396817504,
"grad_norm": 0.11787824332714081,
"learning_rate": 1e-06,
"loss": 0.516,
"num_tokens": 1200558367.0,
"step": 855
},
{
"epoch": 1.7041272998508203,
"grad_norm": 0.11851559579372406,
"learning_rate": 1e-06,
"loss": 0.5208,
"num_tokens": 1201948798.0,
"step": 856
},
{
"epoch": 1.7061163600198905,
"grad_norm": 0.11743155121803284,
"learning_rate": 1e-06,
"loss": 0.5213,
"num_tokens": 1203317747.0,
"step": 857
},
{
"epoch": 1.7081054201889607,
"grad_norm": 0.11653459817171097,
"learning_rate": 1e-06,
"loss": 0.5292,
"num_tokens": 1204751593.0,
"step": 858
},
{
"epoch": 1.710094480358031,
"grad_norm": 0.11759098619222641,
"learning_rate": 1e-06,
"loss": 0.5272,
"num_tokens": 1206123965.0,
"step": 859
},
{
"epoch": 1.712083540527101,
"grad_norm": 0.11782211810350418,
"learning_rate": 1e-06,
"loss": 0.525,
"num_tokens": 1207504339.0,
"step": 860
},
{
"epoch": 1.714072600696171,
"grad_norm": 0.11687052994966507,
"learning_rate": 1e-06,
"loss": 0.5321,
"num_tokens": 1208933928.0,
"step": 861
},
{
"epoch": 1.716061660865241,
"grad_norm": 0.11712006479501724,
"learning_rate": 1e-06,
"loss": 0.5311,
"num_tokens": 1210329028.0,
"step": 862
},
{
"epoch": 1.7180507210343112,
"grad_norm": 0.11513438820838928,
"learning_rate": 1e-06,
"loss": 0.5298,
"num_tokens": 1211754806.0,
"step": 863
},
{
"epoch": 1.7200397812033814,
"grad_norm": 0.11873895674943924,
"learning_rate": 1e-06,
"loss": 0.5211,
"num_tokens": 1213112342.0,
"step": 864
},
{
"epoch": 1.7220288413724516,
"grad_norm": 0.11382027715444565,
"learning_rate": 1e-06,
"loss": 0.5227,
"num_tokens": 1214524561.0,
"step": 865
},
{
"epoch": 1.7240179015415218,
"grad_norm": 0.11664129793643951,
"learning_rate": 1e-06,
"loss": 0.5255,
"num_tokens": 1215928909.0,
"step": 866
},
{
"epoch": 1.7260069617105918,
"grad_norm": 0.11998264491558075,
"learning_rate": 1e-06,
"loss": 0.5195,
"num_tokens": 1217291462.0,
"step": 867
},
{
"epoch": 1.7279960218796617,
"grad_norm": 0.11994168907403946,
"learning_rate": 1e-06,
"loss": 0.5359,
"num_tokens": 1218671826.0,
"step": 868
},
{
"epoch": 1.729985082048732,
"grad_norm": 0.1149284616112709,
"learning_rate": 1e-06,
"loss": 0.5305,
"num_tokens": 1220083806.0,
"step": 869
},
{
"epoch": 1.731974142217802,
"grad_norm": 0.11925540119409561,
"learning_rate": 1e-06,
"loss": 0.5365,
"num_tokens": 1221495745.0,
"step": 870
},
{
"epoch": 1.7339632023868723,
"grad_norm": 0.11679849773645401,
"learning_rate": 1e-06,
"loss": 0.5227,
"num_tokens": 1222941705.0,
"step": 871
},
{
"epoch": 1.7359522625559425,
"grad_norm": 0.1146961897611618,
"learning_rate": 1e-06,
"loss": 0.5359,
"num_tokens": 1224378188.0,
"step": 872
},
{
"epoch": 1.7379413227250124,
"grad_norm": 0.11416348069906235,
"learning_rate": 1e-06,
"loss": 0.5365,
"num_tokens": 1225807121.0,
"step": 873
},
{
"epoch": 1.7399303828940824,
"grad_norm": 0.12208687514066696,
"learning_rate": 1e-06,
"loss": 0.5266,
"num_tokens": 1227205689.0,
"step": 874
},
{
"epoch": 1.7419194430631526,
"grad_norm": 0.11736491322517395,
"learning_rate": 1e-06,
"loss": 0.5315,
"num_tokens": 1228632893.0,
"step": 875
},
{
"epoch": 1.7439085032322228,
"grad_norm": 0.11757368594408035,
"learning_rate": 1e-06,
"loss": 0.5324,
"num_tokens": 1230044870.0,
"step": 876
},
{
"epoch": 1.745897563401293,
"grad_norm": 0.1128118559718132,
"learning_rate": 1e-06,
"loss": 0.5284,
"num_tokens": 1231458999.0,
"step": 877
},
{
"epoch": 1.747886623570363,
"grad_norm": 0.11604047566652298,
"learning_rate": 1e-06,
"loss": 0.5246,
"num_tokens": 1232886812.0,
"step": 878
},
{
"epoch": 1.7498756837394331,
"grad_norm": 0.11722288280725479,
"learning_rate": 1e-06,
"loss": 0.5205,
"num_tokens": 1234263247.0,
"step": 879
},
{
"epoch": 1.751864743908503,
"grad_norm": 0.11631619185209274,
"learning_rate": 1e-06,
"loss": 0.5271,
"num_tokens": 1235684949.0,
"step": 880
},
{
"epoch": 1.7538538040775733,
"grad_norm": 0.11910022050142288,
"learning_rate": 1e-06,
"loss": 0.5174,
"num_tokens": 1237092421.0,
"step": 881
},
{
"epoch": 1.7558428642466435,
"grad_norm": 0.37577947974205017,
"learning_rate": 1e-06,
"loss": 0.5305,
"num_tokens": 1238488077.0,
"step": 882
},
{
"epoch": 1.7578319244157137,
"grad_norm": 0.11624370515346527,
"learning_rate": 1e-06,
"loss": 0.5257,
"num_tokens": 1239858200.0,
"step": 883
},
{
"epoch": 1.7598209845847836,
"grad_norm": 0.11999819427728653,
"learning_rate": 1e-06,
"loss": 0.5296,
"num_tokens": 1241270454.0,
"step": 884
},
{
"epoch": 1.7618100447538538,
"grad_norm": 0.11568225175142288,
"learning_rate": 1e-06,
"loss": 0.5204,
"num_tokens": 1242689791.0,
"step": 885
},
{
"epoch": 1.7637991049229238,
"grad_norm": 0.1157977357506752,
"learning_rate": 1e-06,
"loss": 0.5273,
"num_tokens": 1244076494.0,
"step": 886
},
{
"epoch": 1.765788165091994,
"grad_norm": 0.12077594548463821,
"learning_rate": 1e-06,
"loss": 0.5352,
"num_tokens": 1245450599.0,
"step": 887
},
{
"epoch": 1.7677772252610642,
"grad_norm": 0.11732426285743713,
"learning_rate": 1e-06,
"loss": 0.518,
"num_tokens": 1246866283.0,
"step": 888
},
{
"epoch": 1.7697662854301344,
"grad_norm": 0.11936353892087936,
"learning_rate": 1e-06,
"loss": 0.5211,
"num_tokens": 1248255137.0,
"step": 889
},
{
"epoch": 1.7717553455992043,
"grad_norm": 0.11886170506477356,
"learning_rate": 1e-06,
"loss": 0.5257,
"num_tokens": 1249659564.0,
"step": 890
},
{
"epoch": 1.7737444057682745,
"grad_norm": 0.11828939616680145,
"learning_rate": 1e-06,
"loss": 0.5405,
"num_tokens": 1251058792.0,
"step": 891
},
{
"epoch": 1.7757334659373445,
"grad_norm": 0.11856890469789505,
"learning_rate": 1e-06,
"loss": 0.5306,
"num_tokens": 1252485161.0,
"step": 892
},
{
"epoch": 1.7777225261064147,
"grad_norm": 0.11919775605201721,
"learning_rate": 1e-06,
"loss": 0.5263,
"num_tokens": 1253864701.0,
"step": 893
},
{
"epoch": 1.7797115862754849,
"grad_norm": 0.11722150444984436,
"learning_rate": 1e-06,
"loss": 0.5319,
"num_tokens": 1255315675.0,
"step": 894
},
{
"epoch": 1.781700646444555,
"grad_norm": 0.11736007779836655,
"learning_rate": 1e-06,
"loss": 0.5212,
"num_tokens": 1256727422.0,
"step": 895
},
{
"epoch": 1.783689706613625,
"grad_norm": 0.11714823544025421,
"learning_rate": 1e-06,
"loss": 0.5154,
"num_tokens": 1258118035.0,
"step": 896
},
{
"epoch": 1.7856787667826952,
"grad_norm": 0.11619334667921066,
"learning_rate": 1e-06,
"loss": 0.5354,
"num_tokens": 1259569135.0,
"step": 897
},
{
"epoch": 1.7876678269517652,
"grad_norm": 0.12298526614904404,
"learning_rate": 1e-06,
"loss": 0.5241,
"num_tokens": 1260962943.0,
"step": 898
},
{
"epoch": 1.7896568871208354,
"grad_norm": 0.11889567226171494,
"learning_rate": 1e-06,
"loss": 0.5156,
"num_tokens": 1262364349.0,
"step": 899
},
{
"epoch": 1.7916459472899056,
"grad_norm": 0.11857149749994278,
"learning_rate": 1e-06,
"loss": 0.5338,
"num_tokens": 1263806716.0,
"step": 900
},
{
"epoch": 1.7936350074589757,
"grad_norm": 0.11797276884317398,
"learning_rate": 1e-06,
"loss": 0.5207,
"num_tokens": 1265169481.0,
"step": 901
},
{
"epoch": 1.7956240676280457,
"grad_norm": 0.11892815679311752,
"learning_rate": 1e-06,
"loss": 0.5293,
"num_tokens": 1266576993.0,
"step": 902
},
{
"epoch": 1.797613127797116,
"grad_norm": 0.12395410984754562,
"learning_rate": 1e-06,
"loss": 0.5366,
"num_tokens": 1267993556.0,
"step": 903
},
{
"epoch": 1.7996021879661859,
"grad_norm": 0.11953330039978027,
"learning_rate": 1e-06,
"loss": 0.5368,
"num_tokens": 1269389274.0,
"step": 904
},
{
"epoch": 1.801591248135256,
"grad_norm": 0.12015491724014282,
"learning_rate": 1e-06,
"loss": 0.5221,
"num_tokens": 1270797211.0,
"step": 905
},
{
"epoch": 1.8035803083043263,
"grad_norm": 0.12275838851928711,
"learning_rate": 1e-06,
"loss": 0.516,
"num_tokens": 1272228896.0,
"step": 906
},
{
"epoch": 1.8055693684733964,
"grad_norm": 0.11699802428483963,
"learning_rate": 1e-06,
"loss": 0.5261,
"num_tokens": 1273621370.0,
"step": 907
},
{
"epoch": 1.8075584286424664,
"grad_norm": 0.1161780133843422,
"learning_rate": 1e-06,
"loss": 0.5274,
"num_tokens": 1275017133.0,
"step": 908
},
{
"epoch": 1.8095474888115366,
"grad_norm": 0.11965856701135635,
"learning_rate": 1e-06,
"loss": 0.5337,
"num_tokens": 1276448660.0,
"step": 909
},
{
"epoch": 1.8115365489806066,
"grad_norm": 0.12168006598949432,
"learning_rate": 1e-06,
"loss": 0.5175,
"num_tokens": 1277862125.0,
"step": 910
},
{
"epoch": 1.8135256091496768,
"grad_norm": 0.11938372254371643,
"learning_rate": 1e-06,
"loss": 0.5216,
"num_tokens": 1279248819.0,
"step": 911
},
{
"epoch": 1.815514669318747,
"grad_norm": 0.11735393106937408,
"learning_rate": 1e-06,
"loss": 0.5233,
"num_tokens": 1280646759.0,
"step": 912
},
{
"epoch": 1.8175037294878171,
"grad_norm": 0.11623270809650421,
"learning_rate": 1e-06,
"loss": 0.5233,
"num_tokens": 1282057498.0,
"step": 913
},
{
"epoch": 1.819492789656887,
"grad_norm": 0.11557810753583908,
"learning_rate": 1e-06,
"loss": 0.5153,
"num_tokens": 1283431495.0,
"step": 914
},
{
"epoch": 1.8214818498259573,
"grad_norm": 0.1188741996884346,
"learning_rate": 1e-06,
"loss": 0.5046,
"num_tokens": 1284807817.0,
"step": 915
},
{
"epoch": 1.8234709099950273,
"grad_norm": 0.12421073764562607,
"learning_rate": 1e-06,
"loss": 0.529,
"num_tokens": 1286222594.0,
"step": 916
},
{
"epoch": 1.8254599701640974,
"grad_norm": 0.11472687870264053,
"learning_rate": 1e-06,
"loss": 0.5189,
"num_tokens": 1287660647.0,
"step": 917
},
{
"epoch": 1.8274490303331676,
"grad_norm": 0.12024683505296707,
"learning_rate": 1e-06,
"loss": 0.5327,
"num_tokens": 1289025502.0,
"step": 918
},
{
"epoch": 1.8294380905022378,
"grad_norm": 0.11754554510116577,
"learning_rate": 1e-06,
"loss": 0.5266,
"num_tokens": 1290429204.0,
"step": 919
},
{
"epoch": 1.8314271506713078,
"grad_norm": 0.11749599874019623,
"learning_rate": 1e-06,
"loss": 0.5346,
"num_tokens": 1291835199.0,
"step": 920
},
{
"epoch": 1.8334162108403778,
"grad_norm": 0.12334319949150085,
"learning_rate": 1e-06,
"loss": 0.5286,
"num_tokens": 1293232429.0,
"step": 921
},
{
"epoch": 1.835405271009448,
"grad_norm": 0.1195298433303833,
"learning_rate": 1e-06,
"loss": 0.5342,
"num_tokens": 1294636234.0,
"step": 922
},
{
"epoch": 1.8373943311785181,
"grad_norm": 0.12667156755924225,
"learning_rate": 1e-06,
"loss": 0.5329,
"num_tokens": 1296037959.0,
"step": 923
},
{
"epoch": 1.8393833913475883,
"grad_norm": 0.11679953336715698,
"learning_rate": 1e-06,
"loss": 0.521,
"num_tokens": 1297473035.0,
"step": 924
},
{
"epoch": 1.8413724515166585,
"grad_norm": 0.12583783268928528,
"learning_rate": 1e-06,
"loss": 0.5276,
"num_tokens": 1298890907.0,
"step": 925
},
{
"epoch": 1.8433615116857285,
"grad_norm": 0.11728479713201523,
"learning_rate": 1e-06,
"loss": 0.5395,
"num_tokens": 1300299527.0,
"step": 926
},
{
"epoch": 1.8453505718547984,
"grad_norm": 0.12031624466180801,
"learning_rate": 1e-06,
"loss": 0.5223,
"num_tokens": 1301707639.0,
"step": 927
},
{
"epoch": 1.8473396320238686,
"grad_norm": 0.11883748322725296,
"learning_rate": 1e-06,
"loss": 0.5314,
"num_tokens": 1303102646.0,
"step": 928
},
{
"epoch": 1.8493286921929388,
"grad_norm": 0.11844377219676971,
"learning_rate": 1e-06,
"loss": 0.5239,
"num_tokens": 1304494162.0,
"step": 929
},
{
"epoch": 1.851317752362009,
"grad_norm": 0.11900133639574051,
"learning_rate": 1e-06,
"loss": 0.5417,
"num_tokens": 1305913481.0,
"step": 930
},
{
"epoch": 1.8533068125310792,
"grad_norm": 0.11836715042591095,
"learning_rate": 1e-06,
"loss": 0.5233,
"num_tokens": 1307345252.0,
"step": 931
},
{
"epoch": 1.8552958727001492,
"grad_norm": 0.11962825059890747,
"learning_rate": 1e-06,
"loss": 0.5189,
"num_tokens": 1308721678.0,
"step": 932
},
{
"epoch": 1.8572849328692191,
"grad_norm": 0.11557050049304962,
"learning_rate": 1e-06,
"loss": 0.534,
"num_tokens": 1310185671.0,
"step": 933
},
{
"epoch": 1.8592739930382893,
"grad_norm": 0.11940222978591919,
"learning_rate": 1e-06,
"loss": 0.5201,
"num_tokens": 1311578400.0,
"step": 934
},
{
"epoch": 1.8612630532073595,
"grad_norm": 0.11809618771076202,
"learning_rate": 1e-06,
"loss": 0.5302,
"num_tokens": 1313012059.0,
"step": 935
},
{
"epoch": 1.8632521133764297,
"grad_norm": 0.11569247394800186,
"learning_rate": 1e-06,
"loss": 0.5261,
"num_tokens": 1314429380.0,
"step": 936
},
{
"epoch": 1.8652411735455,
"grad_norm": 0.11870964616537094,
"learning_rate": 1e-06,
"loss": 0.531,
"num_tokens": 1315859666.0,
"step": 937
},
{
"epoch": 1.8672302337145699,
"grad_norm": 0.11804142594337463,
"learning_rate": 1e-06,
"loss": 0.524,
"num_tokens": 1317284109.0,
"step": 938
},
{
"epoch": 1.8692192938836398,
"grad_norm": 0.12137165665626526,
"learning_rate": 1e-06,
"loss": 0.5172,
"num_tokens": 1318645618.0,
"step": 939
},
{
"epoch": 1.87120835405271,
"grad_norm": 0.1159721091389656,
"learning_rate": 1e-06,
"loss": 0.5197,
"num_tokens": 1320005960.0,
"step": 940
},
{
"epoch": 1.8731974142217802,
"grad_norm": 0.1175520122051239,
"learning_rate": 1e-06,
"loss": 0.5141,
"num_tokens": 1321343874.0,
"step": 941
},
{
"epoch": 1.8751864743908504,
"grad_norm": 0.12161525338888168,
"learning_rate": 1e-06,
"loss": 0.5283,
"num_tokens": 1322745862.0,
"step": 942
},
{
"epoch": 1.8771755345599206,
"grad_norm": 0.11651836335659027,
"learning_rate": 1e-06,
"loss": 0.5166,
"num_tokens": 1324173893.0,
"step": 943
},
{
"epoch": 1.8791645947289906,
"grad_norm": 0.11678607016801834,
"learning_rate": 1e-06,
"loss": 0.5157,
"num_tokens": 1325528425.0,
"step": 944
},
{
"epoch": 1.8811536548980605,
"grad_norm": 0.11988682299852371,
"learning_rate": 1e-06,
"loss": 0.5241,
"num_tokens": 1326934542.0,
"step": 945
},
{
"epoch": 1.8831427150671307,
"grad_norm": 0.1179330125451088,
"learning_rate": 1e-06,
"loss": 0.5278,
"num_tokens": 1328329326.0,
"step": 946
},
{
"epoch": 1.885131775236201,
"grad_norm": 0.12124811857938766,
"learning_rate": 1e-06,
"loss": 0.5121,
"num_tokens": 1329690107.0,
"step": 947
},
{
"epoch": 1.887120835405271,
"grad_norm": 0.11679516732692719,
"learning_rate": 1e-06,
"loss": 0.5123,
"num_tokens": 1331099443.0,
"step": 948
},
{
"epoch": 1.8891098955743413,
"grad_norm": 0.12094994634389877,
"learning_rate": 1e-06,
"loss": 0.5294,
"num_tokens": 1332514974.0,
"step": 949
},
{
"epoch": 1.8910989557434112,
"grad_norm": 0.11741626262664795,
"learning_rate": 1e-06,
"loss": 0.5221,
"num_tokens": 1333908900.0,
"step": 950
},
{
"epoch": 1.8930880159124812,
"grad_norm": 0.11530331522226334,
"learning_rate": 1e-06,
"loss": 0.5253,
"num_tokens": 1335304709.0,
"step": 951
},
{
"epoch": 1.8950770760815514,
"grad_norm": 0.1158575564622879,
"learning_rate": 1e-06,
"loss": 0.5285,
"num_tokens": 1336705867.0,
"step": 952
},
{
"epoch": 1.8970661362506216,
"grad_norm": 0.12077206373214722,
"learning_rate": 1e-06,
"loss": 0.5282,
"num_tokens": 1338134157.0,
"step": 953
},
{
"epoch": 1.8990551964196918,
"grad_norm": 0.11674216389656067,
"learning_rate": 1e-06,
"loss": 0.5256,
"num_tokens": 1339570572.0,
"step": 954
},
{
"epoch": 1.901044256588762,
"grad_norm": 0.11999034881591797,
"learning_rate": 1e-06,
"loss": 0.5217,
"num_tokens": 1340963763.0,
"step": 955
},
{
"epoch": 1.903033316757832,
"grad_norm": 0.1159081682562828,
"learning_rate": 1e-06,
"loss": 0.5193,
"num_tokens": 1342343659.0,
"step": 956
},
{
"epoch": 1.905022376926902,
"grad_norm": 0.12077668309211731,
"learning_rate": 1e-06,
"loss": 0.5258,
"num_tokens": 1343740689.0,
"step": 957
},
{
"epoch": 1.907011437095972,
"grad_norm": 0.1166224554181099,
"learning_rate": 1e-06,
"loss": 0.5292,
"num_tokens": 1345132334.0,
"step": 958
},
{
"epoch": 1.9090004972650423,
"grad_norm": 0.1241859421133995,
"learning_rate": 1e-06,
"loss": 0.5148,
"num_tokens": 1346524317.0,
"step": 959
},
{
"epoch": 1.9109895574341125,
"grad_norm": 0.11995749920606613,
"learning_rate": 1e-06,
"loss": 0.5299,
"num_tokens": 1347941197.0,
"step": 960
},
{
"epoch": 1.9129786176031827,
"grad_norm": 0.12070658802986145,
"learning_rate": 1e-06,
"loss": 0.521,
"num_tokens": 1349348423.0,
"step": 961
},
{
"epoch": 1.9149676777722526,
"grad_norm": 0.11713531613349915,
"learning_rate": 1e-06,
"loss": 0.5194,
"num_tokens": 1350775976.0,
"step": 962
},
{
"epoch": 1.9169567379413226,
"grad_norm": 0.11523653566837311,
"learning_rate": 1e-06,
"loss": 0.5187,
"num_tokens": 1352185197.0,
"step": 963
},
{
"epoch": 1.9189457981103928,
"grad_norm": 0.1193997859954834,
"learning_rate": 1e-06,
"loss": 0.521,
"num_tokens": 1353553440.0,
"step": 964
},
{
"epoch": 1.920934858279463,
"grad_norm": 0.11642030626535416,
"learning_rate": 1e-06,
"loss": 0.5187,
"num_tokens": 1354944439.0,
"step": 965
},
{
"epoch": 1.9229239184485332,
"grad_norm": 0.12482885271310806,
"learning_rate": 1e-06,
"loss": 0.515,
"num_tokens": 1356336096.0,
"step": 966
},
{
"epoch": 1.9249129786176031,
"grad_norm": 0.12160996347665787,
"learning_rate": 1e-06,
"loss": 0.5118,
"num_tokens": 1357735474.0,
"step": 967
},
{
"epoch": 1.9269020387866733,
"grad_norm": 0.11489357799291611,
"learning_rate": 1e-06,
"loss": 0.5202,
"num_tokens": 1359150923.0,
"step": 968
},
{
"epoch": 1.9288910989557433,
"grad_norm": 0.1207101047039032,
"learning_rate": 1e-06,
"loss": 0.5185,
"num_tokens": 1360546317.0,
"step": 969
},
{
"epoch": 1.9308801591248135,
"grad_norm": 0.12328807264566422,
"learning_rate": 1e-06,
"loss": 0.5359,
"num_tokens": 1361956243.0,
"step": 970
},
{
"epoch": 1.9328692192938837,
"grad_norm": 0.11625958234071732,
"learning_rate": 1e-06,
"loss": 0.5302,
"num_tokens": 1363369844.0,
"step": 971
},
{
"epoch": 1.9348582794629539,
"grad_norm": 0.11894381046295166,
"learning_rate": 1e-06,
"loss": 0.5152,
"num_tokens": 1364788358.0,
"step": 972
},
{
"epoch": 1.9368473396320238,
"grad_norm": 0.11918807029724121,
"learning_rate": 1e-06,
"loss": 0.5294,
"num_tokens": 1366212166.0,
"step": 973
},
{
"epoch": 1.938836399801094,
"grad_norm": 0.1214541345834732,
"learning_rate": 1e-06,
"loss": 0.5279,
"num_tokens": 1367620280.0,
"step": 974
},
{
"epoch": 1.940825459970164,
"grad_norm": 0.12185929715633392,
"learning_rate": 1e-06,
"loss": 0.5411,
"num_tokens": 1369018460.0,
"step": 975
},
{
"epoch": 1.9428145201392342,
"grad_norm": 0.12074369192123413,
"learning_rate": 1e-06,
"loss": 0.5169,
"num_tokens": 1370404156.0,
"step": 976
},
{
"epoch": 1.9448035803083044,
"grad_norm": 0.11974059790372849,
"learning_rate": 1e-06,
"loss": 0.5189,
"num_tokens": 1371779401.0,
"step": 977
},
{
"epoch": 1.9467926404773745,
"grad_norm": 0.1180194690823555,
"learning_rate": 1e-06,
"loss": 0.5255,
"num_tokens": 1373167544.0,
"step": 978
},
{
"epoch": 1.9487817006464445,
"grad_norm": 0.11891929805278778,
"learning_rate": 1e-06,
"loss": 0.5181,
"num_tokens": 1374625457.0,
"step": 979
},
{
"epoch": 1.9507707608155147,
"grad_norm": 0.11925170570611954,
"learning_rate": 1e-06,
"loss": 0.5386,
"num_tokens": 1376040593.0,
"step": 980
},
{
"epoch": 1.9527598209845847,
"grad_norm": 0.11748997122049332,
"learning_rate": 1e-06,
"loss": 0.5292,
"num_tokens": 1377439737.0,
"step": 981
},
{
"epoch": 1.9547488811536549,
"grad_norm": 0.12130767852067947,
"learning_rate": 1e-06,
"loss": 0.5283,
"num_tokens": 1378841802.0,
"step": 982
},
{
"epoch": 1.956737941322725,
"grad_norm": 0.117047019302845,
"learning_rate": 1e-06,
"loss": 0.5307,
"num_tokens": 1380279333.0,
"step": 983
},
{
"epoch": 1.9587270014917952,
"grad_norm": 0.11402452737092972,
"learning_rate": 1e-06,
"loss": 0.5252,
"num_tokens": 1381666440.0,
"step": 984
},
{
"epoch": 1.9607160616608652,
"grad_norm": 0.11929241567850113,
"learning_rate": 1e-06,
"loss": 0.5267,
"num_tokens": 1383062810.0,
"step": 985
},
{
"epoch": 1.9627051218299354,
"grad_norm": 0.13006067276000977,
"learning_rate": 1e-06,
"loss": 0.5245,
"num_tokens": 1384441301.0,
"step": 986
},
{
"epoch": 1.9646941819990054,
"grad_norm": 0.11855417490005493,
"learning_rate": 1e-06,
"loss": 0.5241,
"num_tokens": 1385828740.0,
"step": 987
},
{
"epoch": 1.9666832421680756,
"grad_norm": 0.11694888770580292,
"learning_rate": 1e-06,
"loss": 0.5198,
"num_tokens": 1387217686.0,
"step": 988
},
{
"epoch": 1.9686723023371457,
"grad_norm": 0.12000932544469833,
"learning_rate": 1e-06,
"loss": 0.5167,
"num_tokens": 1388611363.0,
"step": 989
},
{
"epoch": 1.970661362506216,
"grad_norm": 0.11440891027450562,
"learning_rate": 1e-06,
"loss": 0.5149,
"num_tokens": 1390014972.0,
"step": 990
},
{
"epoch": 1.972650422675286,
"grad_norm": 0.11770491302013397,
"learning_rate": 1e-06,
"loss": 0.5254,
"num_tokens": 1391404923.0,
"step": 991
},
{
"epoch": 1.974639482844356,
"grad_norm": 0.1166594997048378,
"learning_rate": 1e-06,
"loss": 0.5316,
"num_tokens": 1392816272.0,
"step": 992
},
{
"epoch": 1.976628543013426,
"grad_norm": 0.12553872168064117,
"learning_rate": 1e-06,
"loss": 0.5168,
"num_tokens": 1394201525.0,
"step": 993
},
{
"epoch": 1.9786176031824962,
"grad_norm": 0.11989990621805191,
"learning_rate": 1e-06,
"loss": 0.5317,
"num_tokens": 1395588841.0,
"step": 994
},
{
"epoch": 1.9806066633515664,
"grad_norm": 0.1259329915046692,
"learning_rate": 1e-06,
"loss": 0.5367,
"num_tokens": 1397008721.0,
"step": 995
},
{
"epoch": 1.9825957235206366,
"grad_norm": 0.11585794389247894,
"learning_rate": 1e-06,
"loss": 0.5216,
"num_tokens": 1398471728.0,
"step": 996
},
{
"epoch": 1.9845847836897066,
"grad_norm": 0.12714508175849915,
"learning_rate": 1e-06,
"loss": 0.5278,
"num_tokens": 1399889254.0,
"step": 997
},
{
"epoch": 1.9865738438587768,
"grad_norm": 0.12047947198152542,
"learning_rate": 1e-06,
"loss": 0.5258,
"num_tokens": 1401259409.0,
"step": 998
},
{
"epoch": 1.9885629040278467,
"grad_norm": 0.12171699851751328,
"learning_rate": 1e-06,
"loss": 0.5198,
"num_tokens": 1402651874.0,
"step": 999
},
{
"epoch": 1.990551964196917,
"grad_norm": 0.11678752303123474,
"learning_rate": 1e-06,
"loss": 0.5278,
"num_tokens": 1404075471.0,
"step": 1000
},
{
"epoch": 1.9925410243659871,
"grad_norm": 0.1172628328204155,
"learning_rate": 1e-06,
"loss": 0.5231,
"num_tokens": 1405486757.0,
"step": 1001
},
{
"epoch": 1.9945300845350573,
"grad_norm": 0.11984940618276596,
"learning_rate": 1e-06,
"loss": 0.522,
"num_tokens": 1406892660.0,
"step": 1002
},
{
"epoch": 1.9965191447041273,
"grad_norm": 0.11809879541397095,
"learning_rate": 1e-06,
"loss": 0.516,
"num_tokens": 1408310027.0,
"step": 1003
},
{
"epoch": 1.9985082048731975,
"grad_norm": 0.11894188821315765,
"learning_rate": 1e-06,
"loss": 0.5125,
"num_tokens": 1409683792.0,
"step": 1004
},
{
"epoch": 1.9985082048731975,
"step": 1004,
"total_flos": 5.825582167854206e+19,
"train_loss": 0.5989120793770034,
"train_runtime": 31705.2652,
"train_samples_per_second": 14.201,
"train_steps_per_second": 0.032
}
],
"logging_steps": 1,
"max_steps": 1004,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 51,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.825582167854206e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}