|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9985082048731975, |
|
"eval_steps": 500, |
|
"global_step": 1004, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.001989060169070114, |
|
"grad_norm": 4.783787250518799, |
|
"learning_rate": 0.0, |
|
"loss": 1.1878, |
|
"num_tokens": 1379928.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.003978120338140228, |
|
"grad_norm": 4.881955623626709, |
|
"learning_rate": 9.9009900990099e-09, |
|
"loss": 1.2018, |
|
"num_tokens": 2791003.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.005967180507210343, |
|
"grad_norm": 4.82396125793457, |
|
"learning_rate": 1.98019801980198e-08, |
|
"loss": 1.1913, |
|
"num_tokens": 4194463.0, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.007956240676280457, |
|
"grad_norm": 4.832489967346191, |
|
"learning_rate": 2.97029702970297e-08, |
|
"loss": 1.1974, |
|
"num_tokens": 5584143.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.009945300845350571, |
|
"grad_norm": 4.7777581214904785, |
|
"learning_rate": 3.96039603960396e-08, |
|
"loss": 1.1893, |
|
"num_tokens": 7031244.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.011934361014420686, |
|
"grad_norm": 4.757320880889893, |
|
"learning_rate": 4.950495049504951e-08, |
|
"loss": 1.1874, |
|
"num_tokens": 8436181.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0139234211834908, |
|
"grad_norm": 4.777390480041504, |
|
"learning_rate": 5.94059405940594e-08, |
|
"loss": 1.1855, |
|
"num_tokens": 9836769.0, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.015912481352560914, |
|
"grad_norm": 4.798787593841553, |
|
"learning_rate": 6.930693069306931e-08, |
|
"loss": 1.1791, |
|
"num_tokens": 11244836.0, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.01790154152163103, |
|
"grad_norm": 4.807232856750488, |
|
"learning_rate": 7.92079207920792e-08, |
|
"loss": 1.181, |
|
"num_tokens": 12623567.0, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.019890601690701143, |
|
"grad_norm": 4.721550464630127, |
|
"learning_rate": 8.91089108910891e-08, |
|
"loss": 1.1763, |
|
"num_tokens": 14028360.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02187966185977126, |
|
"grad_norm": 4.797355651855469, |
|
"learning_rate": 9.900990099009901e-08, |
|
"loss": 1.1935, |
|
"num_tokens": 15405448.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.023868722028841372, |
|
"grad_norm": 4.754054069519043, |
|
"learning_rate": 1.089108910891089e-07, |
|
"loss": 1.1914, |
|
"num_tokens": 16852417.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.025857782197911485, |
|
"grad_norm": 4.781761169433594, |
|
"learning_rate": 1.188118811881188e-07, |
|
"loss": 1.1883, |
|
"num_tokens": 18245229.0, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0278468423669816, |
|
"grad_norm": 4.794190406799316, |
|
"learning_rate": 1.2871287128712872e-07, |
|
"loss": 1.1829, |
|
"num_tokens": 19630250.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.029835902536051714, |
|
"grad_norm": 4.7866363525390625, |
|
"learning_rate": 1.3861386138613863e-07, |
|
"loss": 1.2058, |
|
"num_tokens": 21090140.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.03182496270512183, |
|
"grad_norm": 4.720057964324951, |
|
"learning_rate": 1.485148514851485e-07, |
|
"loss": 1.1942, |
|
"num_tokens": 22459305.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.03381402287419195, |
|
"grad_norm": 4.714616775512695, |
|
"learning_rate": 1.584158415841584e-07, |
|
"loss": 1.1765, |
|
"num_tokens": 23864152.0, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.03580308304326206, |
|
"grad_norm": 4.747438907623291, |
|
"learning_rate": 1.6831683168316832e-07, |
|
"loss": 1.1952, |
|
"num_tokens": 25269279.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.03779214321233217, |
|
"grad_norm": 4.7715654373168945, |
|
"learning_rate": 1.782178217821782e-07, |
|
"loss": 1.1815, |
|
"num_tokens": 26685193.0, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.039781203381402286, |
|
"grad_norm": 4.7065911293029785, |
|
"learning_rate": 1.8811881188118812e-07, |
|
"loss": 1.1685, |
|
"num_tokens": 28076540.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0417702635504724, |
|
"grad_norm": 4.74373197555542, |
|
"learning_rate": 1.9801980198019803e-07, |
|
"loss": 1.1932, |
|
"num_tokens": 29489172.0, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.04375932371954252, |
|
"grad_norm": 4.572362899780273, |
|
"learning_rate": 2.079207920792079e-07, |
|
"loss": 1.1591, |
|
"num_tokens": 30889595.0, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.04574838388861263, |
|
"grad_norm": 4.560796737670898, |
|
"learning_rate": 2.178217821782178e-07, |
|
"loss": 1.1752, |
|
"num_tokens": 32298146.0, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.047737444057682744, |
|
"grad_norm": 4.545941352844238, |
|
"learning_rate": 2.2772277227722772e-07, |
|
"loss": 1.1707, |
|
"num_tokens": 33729949.0, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.04972650422675286, |
|
"grad_norm": 4.548399448394775, |
|
"learning_rate": 2.376237623762376e-07, |
|
"loss": 1.1759, |
|
"num_tokens": 35146745.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.05171556439582297, |
|
"grad_norm": 4.558365345001221, |
|
"learning_rate": 2.475247524752475e-07, |
|
"loss": 1.1859, |
|
"num_tokens": 36534224.0, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.05370462456489309, |
|
"grad_norm": 4.54498291015625, |
|
"learning_rate": 2.5742574257425743e-07, |
|
"loss": 1.1644, |
|
"num_tokens": 37942016.0, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0556936847339632, |
|
"grad_norm": 4.540126323699951, |
|
"learning_rate": 2.673267326732673e-07, |
|
"loss": 1.1689, |
|
"num_tokens": 39362015.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.057682744903033316, |
|
"grad_norm": 4.505973815917969, |
|
"learning_rate": 2.7722772277227726e-07, |
|
"loss": 1.158, |
|
"num_tokens": 40746995.0, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.05967180507210343, |
|
"grad_norm": 4.145939350128174, |
|
"learning_rate": 2.871287128712871e-07, |
|
"loss": 1.147, |
|
"num_tokens": 42184610.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.06166086524117355, |
|
"grad_norm": 4.096794128417969, |
|
"learning_rate": 2.97029702970297e-07, |
|
"loss": 1.1433, |
|
"num_tokens": 43594850.0, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.06364992541024365, |
|
"grad_norm": 4.0818586349487305, |
|
"learning_rate": 3.069306930693069e-07, |
|
"loss": 1.1351, |
|
"num_tokens": 45000725.0, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.06563898557931377, |
|
"grad_norm": 4.103175640106201, |
|
"learning_rate": 3.168316831683168e-07, |
|
"loss": 1.1303, |
|
"num_tokens": 46389383.0, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.0676280457483839, |
|
"grad_norm": 4.075052738189697, |
|
"learning_rate": 3.2673267326732674e-07, |
|
"loss": 1.1427, |
|
"num_tokens": 47780652.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.069617105917454, |
|
"grad_norm": 4.112435340881348, |
|
"learning_rate": 3.3663366336633663e-07, |
|
"loss": 1.14, |
|
"num_tokens": 49201017.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.07160616608652412, |
|
"grad_norm": 4.016624450683594, |
|
"learning_rate": 3.465346534653465e-07, |
|
"loss": 1.142, |
|
"num_tokens": 50577829.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.07359522625559423, |
|
"grad_norm": 3.9627022743225098, |
|
"learning_rate": 3.564356435643564e-07, |
|
"loss": 1.1189, |
|
"num_tokens": 52003696.0, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.07558428642466435, |
|
"grad_norm": 4.006690502166748, |
|
"learning_rate": 3.663366336633663e-07, |
|
"loss": 1.1307, |
|
"num_tokens": 53420296.0, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.07757334659373447, |
|
"grad_norm": 3.9357409477233887, |
|
"learning_rate": 3.7623762376237623e-07, |
|
"loss": 1.1279, |
|
"num_tokens": 54812494.0, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.07956240676280457, |
|
"grad_norm": 3.903628349304199, |
|
"learning_rate": 3.861386138613861e-07, |
|
"loss": 1.1042, |
|
"num_tokens": 56180118.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08155146693187469, |
|
"grad_norm": 3.4105091094970703, |
|
"learning_rate": 3.9603960396039606e-07, |
|
"loss": 1.0558, |
|
"num_tokens": 57631209.0, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.0835405271009448, |
|
"grad_norm": 3.0491228103637695, |
|
"learning_rate": 4.0594059405940595e-07, |
|
"loss": 1.0404, |
|
"num_tokens": 59050835.0, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.08552958727001492, |
|
"grad_norm": 2.9168484210968018, |
|
"learning_rate": 4.158415841584158e-07, |
|
"loss": 1.033, |
|
"num_tokens": 60439622.0, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.08751864743908504, |
|
"grad_norm": 2.8887298107147217, |
|
"learning_rate": 4.257425742574257e-07, |
|
"loss": 1.026, |
|
"num_tokens": 61853701.0, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.08950770760815514, |
|
"grad_norm": 2.882795810699463, |
|
"learning_rate": 4.356435643564356e-07, |
|
"loss": 1.0189, |
|
"num_tokens": 63243921.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.09149676777722526, |
|
"grad_norm": 2.826184034347534, |
|
"learning_rate": 4.4554455445544555e-07, |
|
"loss": 1.0267, |
|
"num_tokens": 64692234.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.09348582794629537, |
|
"grad_norm": 2.837517261505127, |
|
"learning_rate": 4.5544554455445543e-07, |
|
"loss": 1.0188, |
|
"num_tokens": 66095339.0, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.09547488811536549, |
|
"grad_norm": 2.7744252681732178, |
|
"learning_rate": 4.6534653465346537e-07, |
|
"loss": 1.021, |
|
"num_tokens": 67470384.0, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.09746394828443561, |
|
"grad_norm": 2.7776126861572266, |
|
"learning_rate": 4.752475247524752e-07, |
|
"loss": 1.0072, |
|
"num_tokens": 68907983.0, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.09945300845350571, |
|
"grad_norm": 2.696685552597046, |
|
"learning_rate": 4.851485148514851e-07, |
|
"loss": 1.0018, |
|
"num_tokens": 70284975.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.10144206862257583, |
|
"grad_norm": 2.6469790935516357, |
|
"learning_rate": 4.95049504950495e-07, |
|
"loss": 0.999, |
|
"num_tokens": 71668994.0, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.10343112879164594, |
|
"grad_norm": 2.5766100883483887, |
|
"learning_rate": 5.04950495049505e-07, |
|
"loss": 0.9941, |
|
"num_tokens": 73087542.0, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.10542018896071606, |
|
"grad_norm": 2.5015177726745605, |
|
"learning_rate": 5.148514851485149e-07, |
|
"loss": 0.9808, |
|
"num_tokens": 74502903.0, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.10740924912978618, |
|
"grad_norm": 2.4191646575927734, |
|
"learning_rate": 5.247524752475247e-07, |
|
"loss": 0.9769, |
|
"num_tokens": 75902927.0, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.10939830929885629, |
|
"grad_norm": 2.2691502571105957, |
|
"learning_rate": 5.346534653465346e-07, |
|
"loss": 0.9584, |
|
"num_tokens": 77315581.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.1113873694679264, |
|
"grad_norm": 2.0384151935577393, |
|
"learning_rate": 5.445544554455445e-07, |
|
"loss": 0.9216, |
|
"num_tokens": 78698236.0, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.11337642963699653, |
|
"grad_norm": 1.7994420528411865, |
|
"learning_rate": 5.544554455445545e-07, |
|
"loss": 0.8971, |
|
"num_tokens": 80129837.0, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.11536548980606663, |
|
"grad_norm": 1.6083354949951172, |
|
"learning_rate": 5.643564356435643e-07, |
|
"loss": 0.887, |
|
"num_tokens": 81529934.0, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.11735454997513675, |
|
"grad_norm": 1.4472858905792236, |
|
"learning_rate": 5.742574257425742e-07, |
|
"loss": 0.8759, |
|
"num_tokens": 82926044.0, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.11934361014420686, |
|
"grad_norm": 1.3287198543548584, |
|
"learning_rate": 5.841584158415841e-07, |
|
"loss": 0.8572, |
|
"num_tokens": 84300935.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.12133267031327698, |
|
"grad_norm": 1.2509266138076782, |
|
"learning_rate": 5.94059405940594e-07, |
|
"loss": 0.8497, |
|
"num_tokens": 85709827.0, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.1233217304823471, |
|
"grad_norm": 1.2169718742370605, |
|
"learning_rate": 6.03960396039604e-07, |
|
"loss": 0.8503, |
|
"num_tokens": 87094581.0, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1253107906514172, |
|
"grad_norm": 1.1747347116470337, |
|
"learning_rate": 6.138613861386138e-07, |
|
"loss": 0.8499, |
|
"num_tokens": 88486857.0, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.1272998508204873, |
|
"grad_norm": 1.108124852180481, |
|
"learning_rate": 6.237623762376237e-07, |
|
"loss": 0.8385, |
|
"num_tokens": 89892391.0, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.12928891098955744, |
|
"grad_norm": 1.080589771270752, |
|
"learning_rate": 6.336633663366336e-07, |
|
"loss": 0.8415, |
|
"num_tokens": 91290866.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.13127797115862755, |
|
"grad_norm": 1.0345003604888916, |
|
"learning_rate": 6.435643564356436e-07, |
|
"loss": 0.8287, |
|
"num_tokens": 92700954.0, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.13326703132769765, |
|
"grad_norm": 0.9741297364234924, |
|
"learning_rate": 6.534653465346535e-07, |
|
"loss": 0.8193, |
|
"num_tokens": 94131765.0, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.1352560914967678, |
|
"grad_norm": 0.9597648978233337, |
|
"learning_rate": 6.633663366336634e-07, |
|
"loss": 0.8245, |
|
"num_tokens": 95516370.0, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.1372451516658379, |
|
"grad_norm": 0.9024690985679626, |
|
"learning_rate": 6.732673267326733e-07, |
|
"loss": 0.8203, |
|
"num_tokens": 96921506.0, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.139234211834908, |
|
"grad_norm": 0.8436392545700073, |
|
"learning_rate": 6.831683168316831e-07, |
|
"loss": 0.8005, |
|
"num_tokens": 98292636.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.14122327200397813, |
|
"grad_norm": 0.7839356660842896, |
|
"learning_rate": 6.93069306930693e-07, |
|
"loss": 0.832, |
|
"num_tokens": 99743934.0, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.14321233217304824, |
|
"grad_norm": 0.7109583616256714, |
|
"learning_rate": 7.029702970297029e-07, |
|
"loss": 0.794, |
|
"num_tokens": 101144081.0, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.14520139234211835, |
|
"grad_norm": 0.6535574793815613, |
|
"learning_rate": 7.128712871287128e-07, |
|
"loss": 0.8081, |
|
"num_tokens": 102546821.0, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.14719045251118845, |
|
"grad_norm": 0.5921033620834351, |
|
"learning_rate": 7.227722772277227e-07, |
|
"loss": 0.7886, |
|
"num_tokens": 103957723.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.14917951268025859, |
|
"grad_norm": 0.5575245022773743, |
|
"learning_rate": 7.326732673267326e-07, |
|
"loss": 0.783, |
|
"num_tokens": 105353702.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.1511685728493287, |
|
"grad_norm": 0.5279855132102966, |
|
"learning_rate": 7.425742574257426e-07, |
|
"loss": 0.7729, |
|
"num_tokens": 106790931.0, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.1531576330183988, |
|
"grad_norm": 0.5107793211936951, |
|
"learning_rate": 7.524752475247525e-07, |
|
"loss": 0.7583, |
|
"num_tokens": 108181851.0, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.15514669318746893, |
|
"grad_norm": 0.5129069089889526, |
|
"learning_rate": 7.623762376237624e-07, |
|
"loss": 0.7624, |
|
"num_tokens": 109558589.0, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.15713575335653904, |
|
"grad_norm": 0.479915589094162, |
|
"learning_rate": 7.722772277227722e-07, |
|
"loss": 0.7378, |
|
"num_tokens": 110915629.0, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.15912481352560914, |
|
"grad_norm": 0.4730769991874695, |
|
"learning_rate": 7.821782178217821e-07, |
|
"loss": 0.7619, |
|
"num_tokens": 112326351.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.16111387369467928, |
|
"grad_norm": 0.4519067406654358, |
|
"learning_rate": 7.920792079207921e-07, |
|
"loss": 0.7459, |
|
"num_tokens": 113740610.0, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.16310293386374938, |
|
"grad_norm": 0.4359944760799408, |
|
"learning_rate": 8.01980198019802e-07, |
|
"loss": 0.749, |
|
"num_tokens": 115148611.0, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.1650919940328195, |
|
"grad_norm": 0.4051918685436249, |
|
"learning_rate": 8.118811881188119e-07, |
|
"loss": 0.7447, |
|
"num_tokens": 116569175.0, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.1670810542018896, |
|
"grad_norm": 0.38103756308555603, |
|
"learning_rate": 8.217821782178217e-07, |
|
"loss": 0.7469, |
|
"num_tokens": 117975546.0, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.16907011437095973, |
|
"grad_norm": 0.360709011554718, |
|
"learning_rate": 8.316831683168316e-07, |
|
"loss": 0.722, |
|
"num_tokens": 119391367.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.17105917454002983, |
|
"grad_norm": 0.3431180417537689, |
|
"learning_rate": 8.415841584158416e-07, |
|
"loss": 0.7323, |
|
"num_tokens": 120769077.0, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.17304823470909994, |
|
"grad_norm": 0.32238441705703735, |
|
"learning_rate": 8.514851485148514e-07, |
|
"loss": 0.7254, |
|
"num_tokens": 122218630.0, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.17503729487817007, |
|
"grad_norm": 0.3144312798976898, |
|
"learning_rate": 8.613861386138613e-07, |
|
"loss": 0.741, |
|
"num_tokens": 123638408.0, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.17702635504724018, |
|
"grad_norm": 0.2969042658805847, |
|
"learning_rate": 8.712871287128712e-07, |
|
"loss": 0.707, |
|
"num_tokens": 125024280.0, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.1790154152163103, |
|
"grad_norm": 0.28798267245292664, |
|
"learning_rate": 8.811881188118812e-07, |
|
"loss": 0.7082, |
|
"num_tokens": 126462816.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.18100447538538042, |
|
"grad_norm": 0.2822662591934204, |
|
"learning_rate": 8.910891089108911e-07, |
|
"loss": 0.7139, |
|
"num_tokens": 127856241.0, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.18299353555445053, |
|
"grad_norm": 0.275879830121994, |
|
"learning_rate": 9.00990099009901e-07, |
|
"loss": 0.7113, |
|
"num_tokens": 129247744.0, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.18498259572352063, |
|
"grad_norm": 0.26023879647254944, |
|
"learning_rate": 9.108910891089109e-07, |
|
"loss": 0.7, |
|
"num_tokens": 130653045.0, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.18697165589259074, |
|
"grad_norm": 0.2475547045469284, |
|
"learning_rate": 9.207920792079208e-07, |
|
"loss": 0.7075, |
|
"num_tokens": 132075768.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.18896071606166087, |
|
"grad_norm": 0.23815281689167023, |
|
"learning_rate": 9.306930693069307e-07, |
|
"loss": 0.7034, |
|
"num_tokens": 133471787.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.19094977623073098, |
|
"grad_norm": 0.2257402390241623, |
|
"learning_rate": 9.405940594059405e-07, |
|
"loss": 0.7005, |
|
"num_tokens": 134884172.0, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.19293883639980108, |
|
"grad_norm": 0.21398675441741943, |
|
"learning_rate": 9.504950495049504e-07, |
|
"loss": 0.6954, |
|
"num_tokens": 136321122.0, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.19492789656887122, |
|
"grad_norm": 0.21331369876861572, |
|
"learning_rate": 9.603960396039604e-07, |
|
"loss": 0.6931, |
|
"num_tokens": 137714653.0, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.19691695673794132, |
|
"grad_norm": 0.21159325540065765, |
|
"learning_rate": 9.702970297029702e-07, |
|
"loss": 0.7108, |
|
"num_tokens": 139151591.0, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.19890601690701143, |
|
"grad_norm": 0.20192930102348328, |
|
"learning_rate": 9.801980198019802e-07, |
|
"loss": 0.6838, |
|
"num_tokens": 140552794.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.20089507707608156, |
|
"grad_norm": 0.20033682882785797, |
|
"learning_rate": 9.9009900990099e-07, |
|
"loss": 0.6839, |
|
"num_tokens": 141962296.0, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.20288413724515167, |
|
"grad_norm": 0.1956896036863327, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6853, |
|
"num_tokens": 143391533.0, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.20487319741422177, |
|
"grad_norm": 0.1973898708820343, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6867, |
|
"num_tokens": 144782734.0, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.20686225758329188, |
|
"grad_norm": 0.17975734174251556, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6773, |
|
"num_tokens": 146195534.0, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.20885131775236201, |
|
"grad_norm": 0.17711520195007324, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6582, |
|
"num_tokens": 147581967.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.21084037792143212, |
|
"grad_norm": 0.1741390824317932, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6852, |
|
"num_tokens": 148962092.0, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.21282943809050223, |
|
"grad_norm": 0.1642421931028366, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6722, |
|
"num_tokens": 150367148.0, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.21481849825957236, |
|
"grad_norm": 0.1667676866054535, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6628, |
|
"num_tokens": 151756604.0, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.21680755842864247, |
|
"grad_norm": 0.1586826741695404, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6622, |
|
"num_tokens": 153175572.0, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.21879661859771257, |
|
"grad_norm": 0.15848655998706818, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6775, |
|
"num_tokens": 154616863.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2207856787667827, |
|
"grad_norm": 0.15296334028244019, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6544, |
|
"num_tokens": 156003361.0, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.2227747389358528, |
|
"grad_norm": 0.1545649766921997, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6646, |
|
"num_tokens": 157395330.0, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.22476379910492292, |
|
"grad_norm": 0.15351980924606323, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6646, |
|
"num_tokens": 158812399.0, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.22675285927399305, |
|
"grad_norm": 0.14907206594944, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6508, |
|
"num_tokens": 160216957.0, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.22874191944306316, |
|
"grad_norm": 0.14644992351531982, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6644, |
|
"num_tokens": 161681970.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.23073097961213326, |
|
"grad_norm": 0.15030954778194427, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6432, |
|
"num_tokens": 163031303.0, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.23272003978120337, |
|
"grad_norm": 0.1434543877840042, |
|
"learning_rate": 1e-06, |
|
"loss": 0.647, |
|
"num_tokens": 164438690.0, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.2347090999502735, |
|
"grad_norm": 0.14714758098125458, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6547, |
|
"num_tokens": 165848650.0, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.2366981601193436, |
|
"grad_norm": 0.14545480906963348, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6666, |
|
"num_tokens": 167315232.0, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.23868722028841372, |
|
"grad_norm": 0.14221689105033875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6481, |
|
"num_tokens": 168709749.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.24067628045748385, |
|
"grad_norm": 0.14459247887134552, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6361, |
|
"num_tokens": 170093537.0, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.24266534062655395, |
|
"grad_norm": 0.14327335357666016, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6513, |
|
"num_tokens": 171464842.0, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.24465440079562406, |
|
"grad_norm": 0.13770653307437897, |
|
"learning_rate": 1e-06, |
|
"loss": 0.641, |
|
"num_tokens": 172860600.0, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.2466434609646942, |
|
"grad_norm": 0.1363484412431717, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6445, |
|
"num_tokens": 174305923.0, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.2486325211337643, |
|
"grad_norm": 0.1345747411251068, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6374, |
|
"num_tokens": 175699135.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.2506215813028344, |
|
"grad_norm": 0.13693904876708984, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6491, |
|
"num_tokens": 177132442.0, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.25261064147190454, |
|
"grad_norm": 0.1340012401342392, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6349, |
|
"num_tokens": 178530726.0, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.2545997016409746, |
|
"grad_norm": 0.13670295476913452, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6439, |
|
"num_tokens": 179929069.0, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.25658876181004475, |
|
"grad_norm": 0.13612930476665497, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6382, |
|
"num_tokens": 181362595.0, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.2585778219791149, |
|
"grad_norm": 0.13262143731117249, |
|
"learning_rate": 1e-06, |
|
"loss": 0.636, |
|
"num_tokens": 182758225.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.26056688214818496, |
|
"grad_norm": 0.13065095245838165, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6339, |
|
"num_tokens": 184186457.0, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.2625559423172551, |
|
"grad_norm": 0.13491177558898926, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6424, |
|
"num_tokens": 185597538.0, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.26454500248632523, |
|
"grad_norm": 0.13115911185741425, |
|
"learning_rate": 1e-06, |
|
"loss": 0.636, |
|
"num_tokens": 186999644.0, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.2665340626553953, |
|
"grad_norm": 0.13199158012866974, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6404, |
|
"num_tokens": 188383971.0, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.26852312282446544, |
|
"grad_norm": 0.1268879771232605, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6458, |
|
"num_tokens": 189800770.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.2705121829935356, |
|
"grad_norm": 0.12490399926900864, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6437, |
|
"num_tokens": 191189967.0, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.27250124316260566, |
|
"grad_norm": 0.1264086365699768, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6015, |
|
"num_tokens": 192578506.0, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.2744903033316758, |
|
"grad_norm": 0.1346651315689087, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6271, |
|
"num_tokens": 193939223.0, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.2764793635007459, |
|
"grad_norm": 0.12618686258792877, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6249, |
|
"num_tokens": 195333601.0, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.278468423669816, |
|
"grad_norm": 0.12344113737344742, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6314, |
|
"num_tokens": 196788664.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.28045748383888613, |
|
"grad_norm": 0.12506203353405, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6282, |
|
"num_tokens": 198163401.0, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.28244654400795627, |
|
"grad_norm": 0.12806549668312073, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6304, |
|
"num_tokens": 199567499.0, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.28443560417702635, |
|
"grad_norm": 0.12447736412286758, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6145, |
|
"num_tokens": 200961717.0, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.2864246643460965, |
|
"grad_norm": 0.123995341360569, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6296, |
|
"num_tokens": 202374356.0, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.28841372451516656, |
|
"grad_norm": 0.12191283702850342, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6255, |
|
"num_tokens": 203763067.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.2904027846842367, |
|
"grad_norm": 0.12908770143985748, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6134, |
|
"num_tokens": 205148329.0, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.2923918448533068, |
|
"grad_norm": 0.12782661616802216, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6366, |
|
"num_tokens": 206571749.0, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.2943809050223769, |
|
"grad_norm": 0.12458013743162155, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6274, |
|
"num_tokens": 207945787.0, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.29636996519144704, |
|
"grad_norm": 0.1228199154138565, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6126, |
|
"num_tokens": 209368496.0, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.29835902536051717, |
|
"grad_norm": 0.11965636909008026, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6222, |
|
"num_tokens": 210815947.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.30034808552958725, |
|
"grad_norm": 0.1323099434375763, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6242, |
|
"num_tokens": 212216246.0, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.3023371456986574, |
|
"grad_norm": 0.12547767162322998, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6129, |
|
"num_tokens": 213565534.0, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.3043262058677275, |
|
"grad_norm": 0.12437719106674194, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6268, |
|
"num_tokens": 215022812.0, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.3063152660367976, |
|
"grad_norm": 0.12172248214483261, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6158, |
|
"num_tokens": 216418038.0, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.30830432620586773, |
|
"grad_norm": 0.12287899106740952, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6165, |
|
"num_tokens": 217835190.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.31029338637493786, |
|
"grad_norm": 0.12283938378095627, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6088, |
|
"num_tokens": 219203881.0, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.31228244654400794, |
|
"grad_norm": 0.12894928455352783, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6245, |
|
"num_tokens": 220596713.0, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.3142715067130781, |
|
"grad_norm": 0.12768110632896423, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6148, |
|
"num_tokens": 221990986.0, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.3162605668821482, |
|
"grad_norm": 0.11902462691068649, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6134, |
|
"num_tokens": 223430785.0, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.3182496270512183, |
|
"grad_norm": 0.12414086610078812, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6236, |
|
"num_tokens": 224823735.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3202386872202884, |
|
"grad_norm": 0.12295661866664886, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6222, |
|
"num_tokens": 226213815.0, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.32222774738935855, |
|
"grad_norm": 0.12154286354780197, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6154, |
|
"num_tokens": 227589479.0, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.32421680755842863, |
|
"grad_norm": 0.12159918993711472, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6112, |
|
"num_tokens": 228978021.0, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.32620586772749877, |
|
"grad_norm": 0.1207566186785698, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6101, |
|
"num_tokens": 230373569.0, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.32819492789656884, |
|
"grad_norm": 0.1222677007317543, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6089, |
|
"num_tokens": 231771815.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.330183988065639, |
|
"grad_norm": 0.1245197132229805, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6095, |
|
"num_tokens": 233172925.0, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.3321730482347091, |
|
"grad_norm": 0.1191013976931572, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6138, |
|
"num_tokens": 234578536.0, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.3341621084037792, |
|
"grad_norm": 0.12282473593950272, |
|
"learning_rate": 1e-06, |
|
"loss": 0.605, |
|
"num_tokens": 235997856.0, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.3361511685728493, |
|
"grad_norm": 0.11851899325847626, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6224, |
|
"num_tokens": 237456733.0, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.33814022874191946, |
|
"grad_norm": 0.12077327817678452, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6026, |
|
"num_tokens": 238849801.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.34012928891098954, |
|
"grad_norm": 0.11882904917001724, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6126, |
|
"num_tokens": 240265062.0, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.34211834908005967, |
|
"grad_norm": 0.12201754748821259, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5968, |
|
"num_tokens": 241660845.0, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.3441074092491298, |
|
"grad_norm": 0.11851049214601517, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6045, |
|
"num_tokens": 243073416.0, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.3460964694181999, |
|
"grad_norm": 0.1223016306757927, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6081, |
|
"num_tokens": 244457775.0, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.34808552958727, |
|
"grad_norm": 0.11766277253627777, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6115, |
|
"num_tokens": 245880299.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.35007458975634015, |
|
"grad_norm": 0.12223277240991592, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6147, |
|
"num_tokens": 247285182.0, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.3520636499254102, |
|
"grad_norm": 0.12305691093206406, |
|
"learning_rate": 1e-06, |
|
"loss": 0.621, |
|
"num_tokens": 248698180.0, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.35405271009448036, |
|
"grad_norm": 0.12199797481298447, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6028, |
|
"num_tokens": 250069197.0, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.3560417702635505, |
|
"grad_norm": 0.12232685089111328, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6219, |
|
"num_tokens": 251503045.0, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.3580308304326206, |
|
"grad_norm": 0.12115549296140671, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6139, |
|
"num_tokens": 252942250.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.3600198906016907, |
|
"grad_norm": 0.12164245545864105, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6158, |
|
"num_tokens": 254386360.0, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.36200895077076084, |
|
"grad_norm": 0.12038590759038925, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6171, |
|
"num_tokens": 255791663.0, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.3639980109398309, |
|
"grad_norm": 0.12380865961313248, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6146, |
|
"num_tokens": 257180435.0, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.36598707110890105, |
|
"grad_norm": 0.12001994252204895, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5962, |
|
"num_tokens": 258575017.0, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.3679761312779712, |
|
"grad_norm": 0.12672315537929535, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5982, |
|
"num_tokens": 259971664.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.36996519144704126, |
|
"grad_norm": 0.12033303827047348, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5976, |
|
"num_tokens": 261365988.0, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.3719542516161114, |
|
"grad_norm": 0.12143931537866592, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5963, |
|
"num_tokens": 262783906.0, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.3739433117851815, |
|
"grad_norm": 0.12255272269248962, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6123, |
|
"num_tokens": 264206687.0, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.3759323719542516, |
|
"grad_norm": 0.1221979632973671, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5967, |
|
"num_tokens": 265600406.0, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.37792143212332174, |
|
"grad_norm": 0.11851304769515991, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5885, |
|
"num_tokens": 266998184.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3799104922923918, |
|
"grad_norm": 0.11754640191793442, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6109, |
|
"num_tokens": 268432018.0, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.38189955246146196, |
|
"grad_norm": 0.12079335004091263, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6018, |
|
"num_tokens": 269811989.0, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.3838886126305321, |
|
"grad_norm": 0.12268039584159851, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6076, |
|
"num_tokens": 271216935.0, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.38587767279960217, |
|
"grad_norm": 0.12338917702436447, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6127, |
|
"num_tokens": 272636458.0, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.3878667329686723, |
|
"grad_norm": 0.11832890659570694, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6117, |
|
"num_tokens": 274045339.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.38985579313774243, |
|
"grad_norm": 0.11602655798196793, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6099, |
|
"num_tokens": 275455795.0, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.3918448533068125, |
|
"grad_norm": 0.11740183085203171, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6065, |
|
"num_tokens": 276879525.0, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.39383391347588265, |
|
"grad_norm": 0.12076492607593536, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6082, |
|
"num_tokens": 278290807.0, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.3958229736449528, |
|
"grad_norm": 0.12188921868801117, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5984, |
|
"num_tokens": 279716188.0, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.39781203381402286, |
|
"grad_norm": 0.12162777781486511, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6029, |
|
"num_tokens": 281099544.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.399801093983093, |
|
"grad_norm": 0.11848218739032745, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5823, |
|
"num_tokens": 282469441.0, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.4017901541521631, |
|
"grad_norm": 0.1235736832022667, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5956, |
|
"num_tokens": 283884792.0, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.4037792143212332, |
|
"grad_norm": 0.1228078156709671, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6079, |
|
"num_tokens": 285286535.0, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.40576827449030334, |
|
"grad_norm": 0.12109891325235367, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6001, |
|
"num_tokens": 286679118.0, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.40775733465937347, |
|
"grad_norm": 0.12217779457569122, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6023, |
|
"num_tokens": 288108758.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.40974639482844355, |
|
"grad_norm": 0.12233875691890717, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5983, |
|
"num_tokens": 289524538.0, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.4117354549975137, |
|
"grad_norm": 0.12142330408096313, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6072, |
|
"num_tokens": 290911340.0, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.41372451516658376, |
|
"grad_norm": 0.12050288170576096, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5899, |
|
"num_tokens": 292301970.0, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.4157135753356539, |
|
"grad_norm": 0.12356843799352646, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5958, |
|
"num_tokens": 293718750.0, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.41770263550472403, |
|
"grad_norm": 0.12285276502370834, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5915, |
|
"num_tokens": 295137029.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.4196916956737941, |
|
"grad_norm": 0.12178198248147964, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6015, |
|
"num_tokens": 296516937.0, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.42168075584286424, |
|
"grad_norm": 0.12042722851037979, |
|
"learning_rate": 1e-06, |
|
"loss": 0.592, |
|
"num_tokens": 297914166.0, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.4236698160119344, |
|
"grad_norm": 0.12015018612146378, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5917, |
|
"num_tokens": 299291477.0, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.42565887618100445, |
|
"grad_norm": 0.11729878187179565, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5902, |
|
"num_tokens": 300690831.0, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.4276479363500746, |
|
"grad_norm": 0.11877918988466263, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5837, |
|
"num_tokens": 302125996.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.4296369965191447, |
|
"grad_norm": 0.12102558463811874, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6027, |
|
"num_tokens": 303541215.0, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.4316260566882148, |
|
"grad_norm": 0.11814971268177032, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5912, |
|
"num_tokens": 304932140.0, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.43361511685728493, |
|
"grad_norm": 0.11896595358848572, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6004, |
|
"num_tokens": 306387713.0, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.43560417702635507, |
|
"grad_norm": 0.12220481783151627, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6155, |
|
"num_tokens": 307820317.0, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.43759323719542514, |
|
"grad_norm": 0.11805558949708939, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5929, |
|
"num_tokens": 309219520.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.4395822973644953, |
|
"grad_norm": 0.12132007628679276, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5837, |
|
"num_tokens": 310591459.0, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.4415713575335654, |
|
"grad_norm": 0.11762286722660065, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5861, |
|
"num_tokens": 312008252.0, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.4435604177026355, |
|
"grad_norm": 0.11986927688121796, |
|
"learning_rate": 1e-06, |
|
"loss": 0.594, |
|
"num_tokens": 313425810.0, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.4455494778717056, |
|
"grad_norm": 0.11796683073043823, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5866, |
|
"num_tokens": 314853879.0, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.44753853804077576, |
|
"grad_norm": 0.12010174244642258, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6066, |
|
"num_tokens": 316294854.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.44952759820984584, |
|
"grad_norm": 0.11594483256340027, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5822, |
|
"num_tokens": 317700098.0, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.45151665837891597, |
|
"grad_norm": 0.11875201016664505, |
|
"learning_rate": 1e-06, |
|
"loss": 0.6061, |
|
"num_tokens": 319128053.0, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.4535057185479861, |
|
"grad_norm": 0.1269533336162567, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5777, |
|
"num_tokens": 320515582.0, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.4554947787170562, |
|
"grad_norm": 0.11610081046819687, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5855, |
|
"num_tokens": 321928550.0, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.4574838388861263, |
|
"grad_norm": 0.12167216837406158, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5834, |
|
"num_tokens": 323331818.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.4594728990551964, |
|
"grad_norm": 0.1215236485004425, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5922, |
|
"num_tokens": 324737162.0, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.4614619592242665, |
|
"grad_norm": 0.12563477456569672, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5898, |
|
"num_tokens": 326131119.0, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.46345101939333666, |
|
"grad_norm": 0.12811586260795593, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5749, |
|
"num_tokens": 327476246.0, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.46544007956240674, |
|
"grad_norm": 0.12052274495363235, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5946, |
|
"num_tokens": 328874811.0, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.4674291397314769, |
|
"grad_norm": 0.11987277865409851, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5886, |
|
"num_tokens": 330249034.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.469418199900547, |
|
"grad_norm": 0.11991394311189651, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5796, |
|
"num_tokens": 331639806.0, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.4714072600696171, |
|
"grad_norm": 0.11994659900665283, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5989, |
|
"num_tokens": 333038051.0, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.4733963202386872, |
|
"grad_norm": 0.12097202241420746, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5783, |
|
"num_tokens": 334431046.0, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.47538538040775735, |
|
"grad_norm": 0.12271056324243546, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5896, |
|
"num_tokens": 335827008.0, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.47737444057682743, |
|
"grad_norm": 0.12297134101390839, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5999, |
|
"num_tokens": 337226457.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.47936350074589756, |
|
"grad_norm": 0.1218937486410141, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5926, |
|
"num_tokens": 338666075.0, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.4813525609149677, |
|
"grad_norm": 0.12227629870176315, |
|
"learning_rate": 1e-06, |
|
"loss": 0.58, |
|
"num_tokens": 340097602.0, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.4833416210840378, |
|
"grad_norm": 0.11952868849039078, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5948, |
|
"num_tokens": 341495194.0, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.4853306812531079, |
|
"grad_norm": 0.11985652893781662, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5774, |
|
"num_tokens": 342859196.0, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.48731974142217804, |
|
"grad_norm": 0.11957862228155136, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5755, |
|
"num_tokens": 344277995.0, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.4893088015912481, |
|
"grad_norm": 0.12266214936971664, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5877, |
|
"num_tokens": 345669645.0, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.49129786176031826, |
|
"grad_norm": 0.11971444636583328, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5962, |
|
"num_tokens": 347043955.0, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.4932869219293884, |
|
"grad_norm": 0.12104374915361404, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5878, |
|
"num_tokens": 348465321.0, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.49527598209845847, |
|
"grad_norm": 0.12102899700403214, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5904, |
|
"num_tokens": 349862835.0, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.4972650422675286, |
|
"grad_norm": 0.11829105764627457, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5826, |
|
"num_tokens": 351283798.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.49925410243659873, |
|
"grad_norm": 0.11999989300966263, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5983, |
|
"num_tokens": 352684836.0, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.5012431626056688, |
|
"grad_norm": 0.11723977327346802, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5951, |
|
"num_tokens": 354084748.0, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.5032322227747389, |
|
"grad_norm": 0.12956112623214722, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5898, |
|
"num_tokens": 355478595.0, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.5052212829438091, |
|
"grad_norm": 0.11831249296665192, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5901, |
|
"num_tokens": 356886933.0, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.5072103431128792, |
|
"grad_norm": 0.11731645464897156, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5787, |
|
"num_tokens": 358295994.0, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.5091994032819492, |
|
"grad_norm": 0.12075242400169373, |
|
"learning_rate": 1e-06, |
|
"loss": 0.591, |
|
"num_tokens": 359683170.0, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.5111884634510194, |
|
"grad_norm": 0.12081367522478104, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5903, |
|
"num_tokens": 361079150.0, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.5131775236200895, |
|
"grad_norm": 0.11680179834365845, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5706, |
|
"num_tokens": 362477549.0, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.5151665837891596, |
|
"grad_norm": 0.11828526854515076, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5811, |
|
"num_tokens": 363848018.0, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.5171556439582298, |
|
"grad_norm": 0.1175350472331047, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5771, |
|
"num_tokens": 365236527.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5191447041272998, |
|
"grad_norm": 0.1198112890124321, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5787, |
|
"num_tokens": 366640485.0, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.5211337642963699, |
|
"grad_norm": 0.11720699816942215, |
|
"learning_rate": 1e-06, |
|
"loss": 0.584, |
|
"num_tokens": 368033122.0, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.5231228244654401, |
|
"grad_norm": 0.12382423877716064, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5866, |
|
"num_tokens": 369417377.0, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.5251118846345102, |
|
"grad_norm": 0.12134955078363419, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5757, |
|
"num_tokens": 370794212.0, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.5271009448035803, |
|
"grad_norm": 0.12391626089811325, |
|
"learning_rate": 1e-06, |
|
"loss": 0.601, |
|
"num_tokens": 372225021.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.5290900049726505, |
|
"grad_norm": 0.11900907754898071, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5909, |
|
"num_tokens": 373610685.0, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.5310790651417205, |
|
"grad_norm": 0.11934248358011246, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5876, |
|
"num_tokens": 375021896.0, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.5330681253107906, |
|
"grad_norm": 0.12139896303415298, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5821, |
|
"num_tokens": 376456216.0, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.5350571854798608, |
|
"grad_norm": 0.12349140644073486, |
|
"learning_rate": 1e-06, |
|
"loss": 0.58, |
|
"num_tokens": 377828715.0, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.5370462456489309, |
|
"grad_norm": 0.12981721758842468, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5782, |
|
"num_tokens": 379262595.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.539035305818001, |
|
"grad_norm": 0.12098333984613419, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5906, |
|
"num_tokens": 380657341.0, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.5410243659870712, |
|
"grad_norm": 0.1278562843799591, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5891, |
|
"num_tokens": 382061345.0, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.5430134261561412, |
|
"grad_norm": 0.11872877925634384, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5765, |
|
"num_tokens": 383439874.0, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.5450024863252113, |
|
"grad_norm": 0.11846951395273209, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5803, |
|
"num_tokens": 384849495.0, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.5469915464942815, |
|
"grad_norm": 0.11676832288503647, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5859, |
|
"num_tokens": 386272096.0, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.5489806066633516, |
|
"grad_norm": 0.118569515645504, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5706, |
|
"num_tokens": 387673261.0, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.5509696668324217, |
|
"grad_norm": 0.11762821674346924, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5849, |
|
"num_tokens": 389067728.0, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.5529587270014918, |
|
"grad_norm": 0.11877725273370743, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5909, |
|
"num_tokens": 390485805.0, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.5549477871705619, |
|
"grad_norm": 0.11775851249694824, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5836, |
|
"num_tokens": 391908547.0, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.556936847339632, |
|
"grad_norm": 0.11834586411714554, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5788, |
|
"num_tokens": 393310396.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5589259075087022, |
|
"grad_norm": 0.1218111664056778, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5927, |
|
"num_tokens": 394698059.0, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.5609149676777723, |
|
"grad_norm": 0.12251269072294235, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5801, |
|
"num_tokens": 396077595.0, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.5629040278468423, |
|
"grad_norm": 0.11502744257450104, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5719, |
|
"num_tokens": 397494815.0, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.5648930880159125, |
|
"grad_norm": 0.11885383725166321, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5706, |
|
"num_tokens": 398915154.0, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.5668821481849826, |
|
"grad_norm": 0.1333908885717392, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5702, |
|
"num_tokens": 400321357.0, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.5688712083540527, |
|
"grad_norm": 0.12071363627910614, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5818, |
|
"num_tokens": 401692589.0, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.5708602685231228, |
|
"grad_norm": 0.12001436948776245, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5859, |
|
"num_tokens": 403117801.0, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.572849328692193, |
|
"grad_norm": 0.12118804454803467, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5738, |
|
"num_tokens": 404520032.0, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.574838388861263, |
|
"grad_norm": 0.12114690989255905, |
|
"learning_rate": 1e-06, |
|
"loss": 0.581, |
|
"num_tokens": 405931871.0, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.5768274490303331, |
|
"grad_norm": 0.11723317950963974, |
|
"learning_rate": 1e-06, |
|
"loss": 0.569, |
|
"num_tokens": 407337363.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5788165091994033, |
|
"grad_norm": 0.11783155053853989, |
|
"learning_rate": 1e-06, |
|
"loss": 0.58, |
|
"num_tokens": 408742361.0, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.5808055693684734, |
|
"grad_norm": 0.12249549478292465, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5759, |
|
"num_tokens": 410152716.0, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.5827946295375435, |
|
"grad_norm": 0.11785644292831421, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5719, |
|
"num_tokens": 411547709.0, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.5847836897066137, |
|
"grad_norm": 0.11994913220405579, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5839, |
|
"num_tokens": 412958152.0, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.5867727498756837, |
|
"grad_norm": 0.1180441826581955, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5874, |
|
"num_tokens": 414335814.0, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.5887618100447538, |
|
"grad_norm": 0.1199953630566597, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5892, |
|
"num_tokens": 415753335.0, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.590750870213824, |
|
"grad_norm": 0.12051574885845184, |
|
"learning_rate": 1e-06, |
|
"loss": 0.587, |
|
"num_tokens": 417123268.0, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.5927399303828941, |
|
"grad_norm": 0.12905830144882202, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5822, |
|
"num_tokens": 418534320.0, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.5947289905519642, |
|
"grad_norm": 0.11819034814834595, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5738, |
|
"num_tokens": 419934167.0, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.5967180507210343, |
|
"grad_norm": 0.1195630431175232, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5891, |
|
"num_tokens": 421330163.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5987071108901044, |
|
"grad_norm": 0.11934220045804977, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5681, |
|
"num_tokens": 422702201.0, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.6006961710591745, |
|
"grad_norm": 0.11755826324224472, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5843, |
|
"num_tokens": 424135521.0, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.6026852312282447, |
|
"grad_norm": 0.11706581711769104, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5748, |
|
"num_tokens": 425481108.0, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.6046742913973148, |
|
"grad_norm": 0.11658696830272675, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5702, |
|
"num_tokens": 426880860.0, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.6066633515663848, |
|
"grad_norm": 0.12044768035411835, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5723, |
|
"num_tokens": 428272765.0, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.608652411735455, |
|
"grad_norm": 0.12221231311559677, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5645, |
|
"num_tokens": 429650167.0, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.6106414719045251, |
|
"grad_norm": 0.11917625367641449, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5748, |
|
"num_tokens": 431055294.0, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.6126305320735952, |
|
"grad_norm": 0.11839272826910019, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5692, |
|
"num_tokens": 432471600.0, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.6146195922426654, |
|
"grad_norm": 0.1222674548625946, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5798, |
|
"num_tokens": 433875649.0, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.6166086524117355, |
|
"grad_norm": 0.11733808368444443, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5641, |
|
"num_tokens": 435283504.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.6185977125808055, |
|
"grad_norm": 0.1265021413564682, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5758, |
|
"num_tokens": 436661567.0, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.6205867727498757, |
|
"grad_norm": 0.11938843131065369, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5749, |
|
"num_tokens": 438042305.0, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.6225758329189458, |
|
"grad_norm": 0.11977506428956985, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5715, |
|
"num_tokens": 439453603.0, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.6245648930880159, |
|
"grad_norm": 0.1187228411436081, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5738, |
|
"num_tokens": 440857261.0, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.6265539532570861, |
|
"grad_norm": 0.11981964856386185, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5798, |
|
"num_tokens": 442261964.0, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.6285430134261561, |
|
"grad_norm": 0.11806244403123856, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5789, |
|
"num_tokens": 443693559.0, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.6305320735952262, |
|
"grad_norm": 0.11995609104633331, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5736, |
|
"num_tokens": 445134735.0, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.6325211337642964, |
|
"grad_norm": 0.11736578494310379, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5782, |
|
"num_tokens": 446570052.0, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.6345101939333665, |
|
"grad_norm": 0.11504673957824707, |
|
"learning_rate": 1e-06, |
|
"loss": 0.57, |
|
"num_tokens": 447974813.0, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.6364992541024366, |
|
"grad_norm": 0.11872579902410507, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5781, |
|
"num_tokens": 449374619.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.6384883142715068, |
|
"grad_norm": 0.11843977123498917, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5719, |
|
"num_tokens": 450791966.0, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.6404773744405768, |
|
"grad_norm": 0.11978229135274887, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5728, |
|
"num_tokens": 452194196.0, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.6424664346096469, |
|
"grad_norm": 0.11723372340202332, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5683, |
|
"num_tokens": 453551548.0, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.6444554947787171, |
|
"grad_norm": 0.11690861731767654, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5702, |
|
"num_tokens": 454937681.0, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.6464445549477872, |
|
"grad_norm": 0.11684879660606384, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5692, |
|
"num_tokens": 456361552.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.6484336151168573, |
|
"grad_norm": 0.11809241771697998, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5952, |
|
"num_tokens": 457751294.0, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.6504226752859275, |
|
"grad_norm": 0.11862190812826157, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5756, |
|
"num_tokens": 459155713.0, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.6524117354549975, |
|
"grad_norm": 0.12103772908449173, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5678, |
|
"num_tokens": 460560134.0, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.6544007956240676, |
|
"grad_norm": 0.1281164437532425, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5845, |
|
"num_tokens": 461962174.0, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.6563898557931377, |
|
"grad_norm": 0.1147463396191597, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5784, |
|
"num_tokens": 463406269.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6583789159622079, |
|
"grad_norm": 0.11639434099197388, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5762, |
|
"num_tokens": 464831720.0, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.660367976131278, |
|
"grad_norm": 0.12189288437366486, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5708, |
|
"num_tokens": 466236911.0, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.662357036300348, |
|
"grad_norm": 0.11594757437705994, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5785, |
|
"num_tokens": 467667060.0, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.6643460964694182, |
|
"grad_norm": 0.120713010430336, |
|
"learning_rate": 1e-06, |
|
"loss": 0.56, |
|
"num_tokens": 469076105.0, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.6663351566384883, |
|
"grad_norm": 0.1252930611371994, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5586, |
|
"num_tokens": 470479085.0, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.6683242168075584, |
|
"grad_norm": 0.12119864672422409, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5696, |
|
"num_tokens": 471878453.0, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.6703132769766286, |
|
"grad_norm": 0.11741020530462265, |
|
"learning_rate": 1e-06, |
|
"loss": 0.578, |
|
"num_tokens": 473317934.0, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.6723023371456986, |
|
"grad_norm": 0.11722774058580399, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5705, |
|
"num_tokens": 474718637.0, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.6742913973147687, |
|
"grad_norm": 0.1168806403875351, |
|
"learning_rate": 1e-06, |
|
"loss": 0.576, |
|
"num_tokens": 476109339.0, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.6762804574838389, |
|
"grad_norm": 0.11499282717704773, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5695, |
|
"num_tokens": 477530852.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.678269517652909, |
|
"grad_norm": 0.11914825439453125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5647, |
|
"num_tokens": 478878070.0, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.6802585778219791, |
|
"grad_norm": 0.11614906787872314, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5749, |
|
"num_tokens": 480284827.0, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.6822476379910493, |
|
"grad_norm": 0.11405870318412781, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5728, |
|
"num_tokens": 481705922.0, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.6842366981601193, |
|
"grad_norm": 0.12021178752183914, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5757, |
|
"num_tokens": 483096669.0, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.6862257583291894, |
|
"grad_norm": 0.11697462201118469, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5778, |
|
"num_tokens": 484497174.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.6882148184982596, |
|
"grad_norm": 0.11658283323049545, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5527, |
|
"num_tokens": 485910302.0, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.6902038786673297, |
|
"grad_norm": 0.11884041875600815, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5774, |
|
"num_tokens": 487314373.0, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.6921929388363998, |
|
"grad_norm": 0.11939556896686554, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5808, |
|
"num_tokens": 488749735.0, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.69418199900547, |
|
"grad_norm": 0.12483149766921997, |
|
"learning_rate": 1e-06, |
|
"loss": 0.558, |
|
"num_tokens": 490118122.0, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.69617105917454, |
|
"grad_norm": 0.12257801741361618, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5624, |
|
"num_tokens": 491510538.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6981601193436101, |
|
"grad_norm": 0.11767923086881638, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5718, |
|
"num_tokens": 492926246.0, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.7001491795126803, |
|
"grad_norm": 0.11635064333677292, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5803, |
|
"num_tokens": 494380244.0, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.7021382396817504, |
|
"grad_norm": 0.12177010625600815, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5652, |
|
"num_tokens": 495730583.0, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.7041272998508205, |
|
"grad_norm": 0.11822908371686935, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5663, |
|
"num_tokens": 497125075.0, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.7061163600198906, |
|
"grad_norm": 0.12682731449604034, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5658, |
|
"num_tokens": 498549275.0, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.7081054201889607, |
|
"grad_norm": 0.12045978009700775, |
|
"learning_rate": 1e-06, |
|
"loss": 0.57, |
|
"num_tokens": 499930592.0, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.7100944803580308, |
|
"grad_norm": 0.12545664608478546, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5652, |
|
"num_tokens": 501333957.0, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.712083540527101, |
|
"grad_norm": 0.11865141242742538, |
|
"learning_rate": 1e-06, |
|
"loss": 0.566, |
|
"num_tokens": 502737493.0, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.7140726006961711, |
|
"grad_norm": 0.12852944433689117, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5777, |
|
"num_tokens": 504130760.0, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.7160616608652411, |
|
"grad_norm": 0.11820497363805771, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5542, |
|
"num_tokens": 505509701.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.7180507210343113, |
|
"grad_norm": 0.1146169900894165, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5568, |
|
"num_tokens": 506911298.0, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.7200397812033814, |
|
"grad_norm": 0.11726175993680954, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5609, |
|
"num_tokens": 508293436.0, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.7220288413724515, |
|
"grad_norm": 0.12022433429956436, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5605, |
|
"num_tokens": 509696103.0, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.7240179015415217, |
|
"grad_norm": 0.11907174438238144, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5652, |
|
"num_tokens": 511108327.0, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.7260069617105918, |
|
"grad_norm": 0.11890577524900436, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5618, |
|
"num_tokens": 512511723.0, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.7279960218796618, |
|
"grad_norm": 0.12470176070928574, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5584, |
|
"num_tokens": 513907412.0, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.729985082048732, |
|
"grad_norm": 0.12026024609804153, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5774, |
|
"num_tokens": 515324644.0, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.7319741422178021, |
|
"grad_norm": 0.12734608352184296, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5739, |
|
"num_tokens": 516674022.0, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.7339632023868722, |
|
"grad_norm": 0.11902155727148056, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5717, |
|
"num_tokens": 518087720.0, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.7359522625559424, |
|
"grad_norm": 0.11661865562200546, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5528, |
|
"num_tokens": 519497699.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.7379413227250124, |
|
"grad_norm": 0.12583503127098083, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5767, |
|
"num_tokens": 520908702.0, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.7399303828940825, |
|
"grad_norm": 0.11799920350313187, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5692, |
|
"num_tokens": 522317938.0, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.7419194430631526, |
|
"grad_norm": 0.12218224257230759, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5642, |
|
"num_tokens": 523719142.0, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.7439085032322228, |
|
"grad_norm": 0.11948370188474655, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5702, |
|
"num_tokens": 525136176.0, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.7458975634012929, |
|
"grad_norm": 0.1194443628191948, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5472, |
|
"num_tokens": 526481164.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.747886623570363, |
|
"grad_norm": 0.12138593941926956, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5667, |
|
"num_tokens": 527888658.0, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.7498756837394331, |
|
"grad_norm": 0.12166504561901093, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5552, |
|
"num_tokens": 529287967.0, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.7518647439085032, |
|
"grad_norm": 0.1202344223856926, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5527, |
|
"num_tokens": 530654844.0, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.7538538040775733, |
|
"grad_norm": 0.11958526074886322, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5647, |
|
"num_tokens": 532043235.0, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.7558428642466435, |
|
"grad_norm": 0.11844973266124725, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5721, |
|
"num_tokens": 533462609.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7578319244157136, |
|
"grad_norm": 0.12316368520259857, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5659, |
|
"num_tokens": 534847372.0, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.7598209845847836, |
|
"grad_norm": 0.1315903514623642, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5636, |
|
"num_tokens": 536262717.0, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.7618100447538538, |
|
"grad_norm": 0.11977870017290115, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5708, |
|
"num_tokens": 537686599.0, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.7637991049229239, |
|
"grad_norm": 0.12060414999723434, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5605, |
|
"num_tokens": 539059998.0, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.765788165091994, |
|
"grad_norm": 0.11461476981639862, |
|
"learning_rate": 1e-06, |
|
"loss": 0.569, |
|
"num_tokens": 540488356.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.7677772252610642, |
|
"grad_norm": 0.11902087926864624, |
|
"learning_rate": 1e-06, |
|
"loss": 0.563, |
|
"num_tokens": 541895951.0, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.7697662854301343, |
|
"grad_norm": 0.11860883980989456, |
|
"learning_rate": 1e-06, |
|
"loss": 0.575, |
|
"num_tokens": 543281120.0, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.7717553455992043, |
|
"grad_norm": 0.12020647525787354, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5614, |
|
"num_tokens": 544646661.0, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.7737444057682745, |
|
"grad_norm": 0.11638668179512024, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5742, |
|
"num_tokens": 546097259.0, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.7757334659373446, |
|
"grad_norm": 0.11645980924367905, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5647, |
|
"num_tokens": 547518580.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.7777225261064147, |
|
"grad_norm": 0.12269024550914764, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5689, |
|
"num_tokens": 548932428.0, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.7797115862754849, |
|
"grad_norm": 0.11761284619569778, |
|
"learning_rate": 1e-06, |
|
"loss": 0.554, |
|
"num_tokens": 550354089.0, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.781700646444555, |
|
"grad_norm": 0.1194002628326416, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5691, |
|
"num_tokens": 551785808.0, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.783689706613625, |
|
"grad_norm": 0.11770683526992798, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5675, |
|
"num_tokens": 553166180.0, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.7856787667826952, |
|
"grad_norm": 0.12067251652479172, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5698, |
|
"num_tokens": 554595305.0, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.7876678269517653, |
|
"grad_norm": 0.1211480051279068, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5545, |
|
"num_tokens": 555975917.0, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.7896568871208354, |
|
"grad_norm": 0.12212938815355301, |
|
"learning_rate": 1e-06, |
|
"loss": 0.563, |
|
"num_tokens": 557357748.0, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.7916459472899056, |
|
"grad_norm": 0.12312401086091995, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5635, |
|
"num_tokens": 558758541.0, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.7936350074589756, |
|
"grad_norm": 0.11911605298519135, |
|
"learning_rate": 1e-06, |
|
"loss": 0.569, |
|
"num_tokens": 560135846.0, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.7956240676280457, |
|
"grad_norm": 0.8023229241371155, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5761, |
|
"num_tokens": 561561065.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7976131277971159, |
|
"grad_norm": 0.12671087682247162, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5718, |
|
"num_tokens": 562950768.0, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.799602187966186, |
|
"grad_norm": 0.12295536696910858, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5531, |
|
"num_tokens": 564325036.0, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.8015912481352561, |
|
"grad_norm": 0.11960072070360184, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5551, |
|
"num_tokens": 565703350.0, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.8035803083043263, |
|
"grad_norm": 0.11970996856689453, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5758, |
|
"num_tokens": 567129767.0, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.8055693684733963, |
|
"grad_norm": 0.11514374613761902, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5667, |
|
"num_tokens": 568533394.0, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.8075584286424664, |
|
"grad_norm": 0.12267459183931351, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5698, |
|
"num_tokens": 569966937.0, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.8095474888115366, |
|
"grad_norm": 0.1218687891960144, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5588, |
|
"num_tokens": 571338522.0, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.8115365489806067, |
|
"grad_norm": 0.12409517914056778, |
|
"learning_rate": 1e-06, |
|
"loss": 0.564, |
|
"num_tokens": 572731805.0, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.8135256091496768, |
|
"grad_norm": 0.11474578827619553, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5704, |
|
"num_tokens": 574159917.0, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.8155146693187469, |
|
"grad_norm": 0.11738289892673492, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5615, |
|
"num_tokens": 575571352.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.817503729487817, |
|
"grad_norm": 0.12023717910051346, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5569, |
|
"num_tokens": 576984870.0, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.8194927896568871, |
|
"grad_norm": 0.1194944977760315, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5666, |
|
"num_tokens": 578414303.0, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.8214818498259573, |
|
"grad_norm": 0.12020692229270935, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5478, |
|
"num_tokens": 579812487.0, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.8234709099950274, |
|
"grad_norm": 0.11791769415140152, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5681, |
|
"num_tokens": 581203528.0, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.8254599701640974, |
|
"grad_norm": 0.11593407392501831, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5573, |
|
"num_tokens": 582588268.0, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.8274490303331675, |
|
"grad_norm": 0.12016279250383377, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5777, |
|
"num_tokens": 583983078.0, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.8294380905022377, |
|
"grad_norm": 0.12653815746307373, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5839, |
|
"num_tokens": 585400140.0, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.8314271506713078, |
|
"grad_norm": 0.11608505994081497, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5575, |
|
"num_tokens": 586830320.0, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.8334162108403779, |
|
"grad_norm": 0.11865837126970291, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5744, |
|
"num_tokens": 588258999.0, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.8354052710094481, |
|
"grad_norm": 0.11725205928087234, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5693, |
|
"num_tokens": 589692364.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.8373943311785181, |
|
"grad_norm": 0.11638808995485306, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5597, |
|
"num_tokens": 591094415.0, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.8393833913475882, |
|
"grad_norm": 0.12141529470682144, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5559, |
|
"num_tokens": 592500735.0, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.8413724515166584, |
|
"grad_norm": 0.11554522812366486, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5666, |
|
"num_tokens": 593903743.0, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.8433615116857285, |
|
"grad_norm": 0.11890975385904312, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5543, |
|
"num_tokens": 595314514.0, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.8453505718547986, |
|
"grad_norm": 0.12231077998876572, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5615, |
|
"num_tokens": 596671023.0, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.8473396320238687, |
|
"grad_norm": 0.12027207762002945, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5583, |
|
"num_tokens": 598031581.0, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.8493286921929388, |
|
"grad_norm": 0.11685006320476532, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5628, |
|
"num_tokens": 599477745.0, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.8513177523620089, |
|
"grad_norm": 0.17819620668888092, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5714, |
|
"num_tokens": 600848881.0, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.8533068125310791, |
|
"grad_norm": 0.11678969860076904, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5534, |
|
"num_tokens": 602266449.0, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.8552958727001492, |
|
"grad_norm": 0.11610161513090134, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5609, |
|
"num_tokens": 603675715.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.8572849328692193, |
|
"grad_norm": 0.1225874274969101, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5554, |
|
"num_tokens": 605072725.0, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.8592739930382894, |
|
"grad_norm": 0.11319524794816971, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5538, |
|
"num_tokens": 606474929.0, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.8612630532073595, |
|
"grad_norm": 0.11506962776184082, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5531, |
|
"num_tokens": 607855829.0, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.8632521133764296, |
|
"grad_norm": 0.1158682256937027, |
|
"learning_rate": 1e-06, |
|
"loss": 0.567, |
|
"num_tokens": 609285755.0, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.8652411735454998, |
|
"grad_norm": 0.11761818826198578, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5671, |
|
"num_tokens": 610697282.0, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.8672302337145699, |
|
"grad_norm": 0.11838319152593613, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5534, |
|
"num_tokens": 612094535.0, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.8692192938836399, |
|
"grad_norm": 0.11570697277784348, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5632, |
|
"num_tokens": 613501408.0, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.8712083540527101, |
|
"grad_norm": 0.11999750137329102, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5594, |
|
"num_tokens": 614872296.0, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.8731974142217802, |
|
"grad_norm": 0.1152532622218132, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5573, |
|
"num_tokens": 616300717.0, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.8751864743908503, |
|
"grad_norm": 0.11520667374134064, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5441, |
|
"num_tokens": 617678025.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8771755345599205, |
|
"grad_norm": 0.11722690612077713, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5535, |
|
"num_tokens": 619082991.0, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.8791645947289906, |
|
"grad_norm": 0.11644702404737473, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5595, |
|
"num_tokens": 620470914.0, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.8811536548980606, |
|
"grad_norm": 0.11839190125465393, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5481, |
|
"num_tokens": 621862665.0, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.8831427150671308, |
|
"grad_norm": 0.1157626211643219, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5663, |
|
"num_tokens": 623310812.0, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.8851317752362009, |
|
"grad_norm": 0.11904613673686981, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5594, |
|
"num_tokens": 624718938.0, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.887120835405271, |
|
"grad_norm": 0.11730271577835083, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5515, |
|
"num_tokens": 626091716.0, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.8891098955743412, |
|
"grad_norm": 0.11457692086696625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5506, |
|
"num_tokens": 627487215.0, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.8910989557434112, |
|
"grad_norm": 0.11875864863395691, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5526, |
|
"num_tokens": 628850853.0, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.8930880159124813, |
|
"grad_norm": 0.11865832656621933, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5526, |
|
"num_tokens": 630249653.0, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.8950770760815515, |
|
"grad_norm": 0.11921685934066772, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5651, |
|
"num_tokens": 631626896.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.8970661362506216, |
|
"grad_norm": 0.11800325661897659, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5463, |
|
"num_tokens": 633018745.0, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.8990551964196917, |
|
"grad_norm": 0.11664669215679169, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5529, |
|
"num_tokens": 634434489.0, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.9010442565887619, |
|
"grad_norm": 0.11907332390546799, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5588, |
|
"num_tokens": 635836521.0, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.9030333167578319, |
|
"grad_norm": 0.119346983730793, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5605, |
|
"num_tokens": 637251420.0, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.905022376926902, |
|
"grad_norm": 0.11810777336359024, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5538, |
|
"num_tokens": 638673973.0, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.9070114370959722, |
|
"grad_norm": 0.12147301435470581, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5477, |
|
"num_tokens": 640053944.0, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.9090004972650423, |
|
"grad_norm": 0.11813312768936157, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5699, |
|
"num_tokens": 641459580.0, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.9109895574341124, |
|
"grad_norm": 0.1261916309595108, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5565, |
|
"num_tokens": 642848898.0, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.9129786176031826, |
|
"grad_norm": 0.11821988970041275, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5527, |
|
"num_tokens": 644312542.0, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.9149676777722526, |
|
"grad_norm": 0.11542089283466339, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5614, |
|
"num_tokens": 645735657.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.9169567379413227, |
|
"grad_norm": 0.131094828248024, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5682, |
|
"num_tokens": 647137018.0, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.9189457981103928, |
|
"grad_norm": 0.11712004989385605, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5578, |
|
"num_tokens": 648516983.0, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.920934858279463, |
|
"grad_norm": 0.11788227409124374, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5591, |
|
"num_tokens": 649953519.0, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.922923918448533, |
|
"grad_norm": 0.11795365810394287, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5463, |
|
"num_tokens": 651366264.0, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.9249129786176031, |
|
"grad_norm": 0.11824613064527512, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5664, |
|
"num_tokens": 652792598.0, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.9269020387866733, |
|
"grad_norm": 0.11839722841978073, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5493, |
|
"num_tokens": 654162858.0, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.9288910989557434, |
|
"grad_norm": 0.11836480349302292, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5514, |
|
"num_tokens": 655539001.0, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.9308801591248135, |
|
"grad_norm": 0.17381928861141205, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5526, |
|
"num_tokens": 656952170.0, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.9328692192938837, |
|
"grad_norm": 0.11770905554294586, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5615, |
|
"num_tokens": 658358118.0, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.9348582794629537, |
|
"grad_norm": 0.11960657685995102, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5495, |
|
"num_tokens": 659752280.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.9368473396320238, |
|
"grad_norm": 0.11758306622505188, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5592, |
|
"num_tokens": 661127510.0, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.938836399801094, |
|
"grad_norm": 0.1147281602025032, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5431, |
|
"num_tokens": 662539378.0, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.9408254599701641, |
|
"grad_norm": 0.11964991688728333, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5578, |
|
"num_tokens": 663935199.0, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.9428145201392342, |
|
"grad_norm": 0.12270357459783554, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5535, |
|
"num_tokens": 665328684.0, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.9448035803083044, |
|
"grad_norm": 0.11691749840974808, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5598, |
|
"num_tokens": 666742297.0, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.9467926404773744, |
|
"grad_norm": 0.11798378825187683, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5563, |
|
"num_tokens": 668121085.0, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.9487817006464445, |
|
"grad_norm": 0.11690951138734818, |
|
"learning_rate": 1e-06, |
|
"loss": 0.553, |
|
"num_tokens": 669537516.0, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.9507707608155147, |
|
"grad_norm": 0.12072078138589859, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5578, |
|
"num_tokens": 670932178.0, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.9527598209845848, |
|
"grad_norm": 0.11482840776443481, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5484, |
|
"num_tokens": 672306421.0, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.9547488811536549, |
|
"grad_norm": 0.12619654834270477, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5604, |
|
"num_tokens": 673688811.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.956737941322725, |
|
"grad_norm": 0.12017329037189484, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5599, |
|
"num_tokens": 675082703.0, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.9587270014917951, |
|
"grad_norm": 0.11547863483428955, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5475, |
|
"num_tokens": 676488414.0, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.9607160616608652, |
|
"grad_norm": 0.11614055931568146, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5621, |
|
"num_tokens": 677896011.0, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.9627051218299354, |
|
"grad_norm": 0.11469247937202454, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5503, |
|
"num_tokens": 679301564.0, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.9646941819990055, |
|
"grad_norm": 0.12310828268527985, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5526, |
|
"num_tokens": 680674072.0, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.9666832421680756, |
|
"grad_norm": 0.11646957695484161, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5505, |
|
"num_tokens": 682077773.0, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.9686723023371457, |
|
"grad_norm": 0.11596749722957611, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5596, |
|
"num_tokens": 683492461.0, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.9706613625062158, |
|
"grad_norm": 0.11795108765363693, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5568, |
|
"num_tokens": 684913267.0, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.9726504226752859, |
|
"grad_norm": 0.1201254203915596, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5645, |
|
"num_tokens": 686315028.0, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.9746394828443561, |
|
"grad_norm": 0.1162814274430275, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5494, |
|
"num_tokens": 687711185.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.9766285430134262, |
|
"grad_norm": 0.11750290542840958, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5628, |
|
"num_tokens": 689115928.0, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.9786176031824962, |
|
"grad_norm": 0.13300985097885132, |
|
"learning_rate": 1e-06, |
|
"loss": 0.563, |
|
"num_tokens": 690534590.0, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.9806066633515664, |
|
"grad_norm": 0.12720529735088348, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5567, |
|
"num_tokens": 691981545.0, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.9825957235206365, |
|
"grad_norm": 0.11822197586297989, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5659, |
|
"num_tokens": 693402676.0, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.9845847836897066, |
|
"grad_norm": 0.11941008269786835, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5568, |
|
"num_tokens": 694809843.0, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.9865738438587768, |
|
"grad_norm": 0.11860588937997818, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5531, |
|
"num_tokens": 696211950.0, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.9885629040278469, |
|
"grad_norm": 0.12157568335533142, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5569, |
|
"num_tokens": 697624687.0, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.9905519641969169, |
|
"grad_norm": 0.11806346476078033, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5743, |
|
"num_tokens": 699049049.0, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.9925410243659871, |
|
"grad_norm": 0.1187855452299118, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5558, |
|
"num_tokens": 700440889.0, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.9945300845350572, |
|
"grad_norm": 0.11523641645908356, |
|
"learning_rate": 1e-06, |
|
"loss": 0.56, |
|
"num_tokens": 701864042.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9965191447041273, |
|
"grad_norm": 0.11771584302186966, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5536, |
|
"num_tokens": 703255455.0, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.9985082048731975, |
|
"grad_norm": 0.11954519152641296, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5487, |
|
"num_tokens": 704646212.0, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 1.0019890601690702, |
|
"grad_norm": 0.24864843487739563, |
|
"learning_rate": 1e-06, |
|
"loss": 1.1087, |
|
"num_tokens": 706359781.0, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 1.0039781203381402, |
|
"grad_norm": 0.15665407478809357, |
|
"learning_rate": 1e-06, |
|
"loss": 0.542, |
|
"num_tokens": 707777523.0, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 1.0059671805072103, |
|
"grad_norm": 0.12076932191848755, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5393, |
|
"num_tokens": 709165414.0, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.0079562406762805, |
|
"grad_norm": 0.11568966507911682, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5503, |
|
"num_tokens": 710541219.0, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 1.0099453008453505, |
|
"grad_norm": 0.1189185082912445, |
|
"learning_rate": 1e-06, |
|
"loss": 0.555, |
|
"num_tokens": 711904899.0, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 1.0119343610144207, |
|
"grad_norm": 0.11912715435028076, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5399, |
|
"num_tokens": 713297626.0, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 1.0139234211834909, |
|
"grad_norm": 0.11964980512857437, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5585, |
|
"num_tokens": 714712902.0, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 1.0159124813525608, |
|
"grad_norm": 0.11817710846662521, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5518, |
|
"num_tokens": 716119424.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.017901541521631, |
|
"grad_norm": 0.11699645221233368, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5447, |
|
"num_tokens": 717518297.0, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 1.0198906016907012, |
|
"grad_norm": 0.11419110000133514, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5532, |
|
"num_tokens": 718934437.0, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 1.0218796618597712, |
|
"grad_norm": 0.11955999583005905, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5365, |
|
"num_tokens": 720296609.0, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 1.0238687220288414, |
|
"grad_norm": 0.11673971265554428, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5549, |
|
"num_tokens": 721713131.0, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 1.0258577821979116, |
|
"grad_norm": 0.11641617864370346, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5654, |
|
"num_tokens": 723093279.0, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.0278468423669815, |
|
"grad_norm": 0.11750617623329163, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5391, |
|
"num_tokens": 724491829.0, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 1.0298359025360517, |
|
"grad_norm": 0.1122995987534523, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5421, |
|
"num_tokens": 725927036.0, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 1.031824962705122, |
|
"grad_norm": 0.11935453861951828, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5522, |
|
"num_tokens": 727309822.0, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 1.0338140228741919, |
|
"grad_norm": 0.1215103417634964, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5543, |
|
"num_tokens": 728731842.0, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 1.035803083043262, |
|
"grad_norm": 0.12213215231895447, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5502, |
|
"num_tokens": 730121195.0, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.0377921432123323, |
|
"grad_norm": 0.11967755854129791, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5513, |
|
"num_tokens": 731512351.0, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 1.0397812033814022, |
|
"grad_norm": 0.11795569211244583, |
|
"learning_rate": 1e-06, |
|
"loss": 0.548, |
|
"num_tokens": 732931422.0, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 1.0417702635504724, |
|
"grad_norm": 0.11842501163482666, |
|
"learning_rate": 1e-06, |
|
"loss": 0.55, |
|
"num_tokens": 734353348.0, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 1.0437593237195426, |
|
"grad_norm": 0.1207478791475296, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5516, |
|
"num_tokens": 735742554.0, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 1.0457483838886126, |
|
"grad_norm": 0.11943433433771133, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5513, |
|
"num_tokens": 737158566.0, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.0477374440576828, |
|
"grad_norm": 0.12060469388961792, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5623, |
|
"num_tokens": 738552506.0, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 1.049726504226753, |
|
"grad_norm": 0.11873313784599304, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5502, |
|
"num_tokens": 739982404.0, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 1.051715564395823, |
|
"grad_norm": 0.11538344621658325, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5523, |
|
"num_tokens": 741379974.0, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.053704624564893, |
|
"grad_norm": 0.11816058307886124, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5625, |
|
"num_tokens": 742756391.0, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.0556936847339633, |
|
"grad_norm": 0.12244360148906708, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5433, |
|
"num_tokens": 744195716.0, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.0576827449030333, |
|
"grad_norm": 0.12584052979946136, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5553, |
|
"num_tokens": 745589558.0, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 1.0596718050721035, |
|
"grad_norm": 0.11298387497663498, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5477, |
|
"num_tokens": 746987877.0, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 1.0616608652411736, |
|
"grad_norm": 0.12312706559896469, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5701, |
|
"num_tokens": 748360690.0, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 1.0636499254102436, |
|
"grad_norm": 0.11732471734285355, |
|
"learning_rate": 1e-06, |
|
"loss": 0.545, |
|
"num_tokens": 749752632.0, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 1.0656389855793138, |
|
"grad_norm": 0.11663486808538437, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5556, |
|
"num_tokens": 751135143.0, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.067628045748384, |
|
"grad_norm": 0.11738405376672745, |
|
"learning_rate": 1e-06, |
|
"loss": 0.542, |
|
"num_tokens": 752504589.0, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 1.069617105917454, |
|
"grad_norm": 0.11650814861059189, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5501, |
|
"num_tokens": 753908272.0, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 1.0716061660865241, |
|
"grad_norm": 0.11730290204286575, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5525, |
|
"num_tokens": 755345567.0, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 1.0735952262555943, |
|
"grad_norm": 0.11732745170593262, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5471, |
|
"num_tokens": 756742209.0, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 1.0755842864246643, |
|
"grad_norm": 0.11920227110385895, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5453, |
|
"num_tokens": 758140909.0, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.0775733465937345, |
|
"grad_norm": 0.11768822371959686, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5427, |
|
"num_tokens": 759550869.0, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 1.0795624067628045, |
|
"grad_norm": 0.11679795384407043, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5615, |
|
"num_tokens": 760954672.0, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 1.0815514669318746, |
|
"grad_norm": 0.11920733749866486, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5505, |
|
"num_tokens": 762355822.0, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 1.0835405271009448, |
|
"grad_norm": 0.1188458576798439, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5678, |
|
"num_tokens": 763788308.0, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 1.085529587270015, |
|
"grad_norm": 0.11863641440868378, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5467, |
|
"num_tokens": 765177690.0, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.087518647439085, |
|
"grad_norm": 0.11603251844644547, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5369, |
|
"num_tokens": 766617783.0, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.0895077076081552, |
|
"grad_norm": 0.1217975988984108, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5431, |
|
"num_tokens": 768029229.0, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 1.0914967677772252, |
|
"grad_norm": 0.11564943194389343, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5438, |
|
"num_tokens": 769417054.0, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 1.0934858279462953, |
|
"grad_norm": 0.1208975538611412, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5502, |
|
"num_tokens": 770836479.0, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 1.0954748881153655, |
|
"grad_norm": 0.11667314916849136, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5547, |
|
"num_tokens": 772254674.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.0974639482844357, |
|
"grad_norm": 0.11516263335943222, |
|
"learning_rate": 1e-06, |
|
"loss": 0.55, |
|
"num_tokens": 773706020.0, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 1.0994530084535057, |
|
"grad_norm": 0.11822597682476044, |
|
"learning_rate": 1e-06, |
|
"loss": 0.54, |
|
"num_tokens": 775110778.0, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 1.1014420686225759, |
|
"grad_norm": 0.12346749752759933, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5471, |
|
"num_tokens": 776538512.0, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 1.1034311287916458, |
|
"grad_norm": 0.11938782781362534, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5472, |
|
"num_tokens": 777966414.0, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 1.105420188960716, |
|
"grad_norm": 0.14581921696662903, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5537, |
|
"num_tokens": 779354221.0, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.1074092491297862, |
|
"grad_norm": 0.11971963196992874, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5487, |
|
"num_tokens": 780779804.0, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 1.1093983092988562, |
|
"grad_norm": 0.11833789944648743, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5463, |
|
"num_tokens": 782198212.0, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 1.1113873694679264, |
|
"grad_norm": 0.12102148681879044, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5565, |
|
"num_tokens": 783630792.0, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 1.1133764296369966, |
|
"grad_norm": 0.11505177617073059, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5361, |
|
"num_tokens": 785041609.0, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 1.1153654898060665, |
|
"grad_norm": 0.12186893075704575, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5406, |
|
"num_tokens": 786417350.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.1173545499751367, |
|
"grad_norm": 0.12019500881433487, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5568, |
|
"num_tokens": 787852563.0, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 1.119343610144207, |
|
"grad_norm": 0.11836836487054825, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5338, |
|
"num_tokens": 789256403.0, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 1.1213326703132769, |
|
"grad_norm": 0.1224868968129158, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5363, |
|
"num_tokens": 790627959.0, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 1.123321730482347, |
|
"grad_norm": 0.1227540671825409, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5398, |
|
"num_tokens": 792047776.0, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 1.1253107906514173, |
|
"grad_norm": 0.12231338769197464, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5393, |
|
"num_tokens": 793437309.0, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.1272998508204872, |
|
"grad_norm": 0.11635982990264893, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5491, |
|
"num_tokens": 794832359.0, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 1.1292889109895574, |
|
"grad_norm": 0.11689839512109756, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5521, |
|
"num_tokens": 796221119.0, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 1.1312779711586276, |
|
"grad_norm": 0.11974216252565384, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5472, |
|
"num_tokens": 797660744.0, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 1.1332670313276976, |
|
"grad_norm": 0.1189328134059906, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5444, |
|
"num_tokens": 799060817.0, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 1.1352560914967678, |
|
"grad_norm": 0.1164221465587616, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5442, |
|
"num_tokens": 800454569.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.137245151665838, |
|
"grad_norm": 0.11653874069452286, |
|
"learning_rate": 1e-06, |
|
"loss": 0.55, |
|
"num_tokens": 801905348.0, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 1.139234211834908, |
|
"grad_norm": 0.11661481857299805, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5492, |
|
"num_tokens": 803300441.0, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 1.141223272003978, |
|
"grad_norm": 0.11541904509067535, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5299, |
|
"num_tokens": 804671703.0, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 1.1432123321730483, |
|
"grad_norm": 0.11833840608596802, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5442, |
|
"num_tokens": 806085229.0, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 1.1452013923421183, |
|
"grad_norm": 0.11650761216878891, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5363, |
|
"num_tokens": 807470114.0, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.1471904525111885, |
|
"grad_norm": 0.11970090866088867, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5517, |
|
"num_tokens": 808920938.0, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 1.1491795126802586, |
|
"grad_norm": 0.11860202997922897, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5531, |
|
"num_tokens": 810330812.0, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 1.1511685728493286, |
|
"grad_norm": 0.11822472512722015, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5386, |
|
"num_tokens": 811715751.0, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 1.1531576330183988, |
|
"grad_norm": 0.11776979267597198, |
|
"learning_rate": 1e-06, |
|
"loss": 0.538, |
|
"num_tokens": 813102121.0, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 1.155146693187469, |
|
"grad_norm": 0.11876077950000763, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5578, |
|
"num_tokens": 814501435.0, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.157135753356539, |
|
"grad_norm": 0.12163852900266647, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5301, |
|
"num_tokens": 815900526.0, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 1.1591248135256091, |
|
"grad_norm": 0.11880628764629364, |
|
"learning_rate": 1e-06, |
|
"loss": 0.553, |
|
"num_tokens": 817316569.0, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 1.1611138736946793, |
|
"grad_norm": 0.11747530102729797, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5407, |
|
"num_tokens": 818712696.0, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 1.1631029338637493, |
|
"grad_norm": 0.11508717387914658, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5525, |
|
"num_tokens": 820179550.0, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 1.1650919940328195, |
|
"grad_norm": 0.11923891305923462, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5299, |
|
"num_tokens": 821519204.0, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.1670810542018897, |
|
"grad_norm": 0.12130296975374222, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5584, |
|
"num_tokens": 822934109.0, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 1.1690701143709596, |
|
"grad_norm": 0.11868572235107422, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5237, |
|
"num_tokens": 824350114.0, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 1.1710591745400298, |
|
"grad_norm": 0.11723876744508743, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5535, |
|
"num_tokens": 825723811.0, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 1.1730482347091, |
|
"grad_norm": 0.11835741996765137, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5346, |
|
"num_tokens": 827152454.0, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 1.17503729487817, |
|
"grad_norm": 0.11868591606616974, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5351, |
|
"num_tokens": 828579537.0, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.1770263550472402, |
|
"grad_norm": 0.11987331509590149, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5367, |
|
"num_tokens": 829969684.0, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 1.1790154152163104, |
|
"grad_norm": 0.11743640154600143, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5548, |
|
"num_tokens": 831391395.0, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 1.1810044753853803, |
|
"grad_norm": 0.1182253286242485, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5516, |
|
"num_tokens": 832783491.0, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 1.1829935355544505, |
|
"grad_norm": 0.11603699624538422, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5356, |
|
"num_tokens": 834171588.0, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 1.1849825957235207, |
|
"grad_norm": 0.1183595210313797, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5511, |
|
"num_tokens": 835573502.0, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.1869716558925907, |
|
"grad_norm": 0.11468026787042618, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5509, |
|
"num_tokens": 837028240.0, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 1.1889607160616609, |
|
"grad_norm": 0.11567319929599762, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5472, |
|
"num_tokens": 838416832.0, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 1.190949776230731, |
|
"grad_norm": 0.11915028095245361, |
|
"learning_rate": 1e-06, |
|
"loss": 0.553, |
|
"num_tokens": 839812316.0, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 1.192938836399801, |
|
"grad_norm": 0.11828889697790146, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5442, |
|
"num_tokens": 841215463.0, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 1.1949278965688712, |
|
"grad_norm": 0.11770551651716232, |
|
"learning_rate": 1e-06, |
|
"loss": 0.547, |
|
"num_tokens": 842662727.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.1969169567379414, |
|
"grad_norm": 0.1193631961941719, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5452, |
|
"num_tokens": 844081793.0, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 1.1989060169070114, |
|
"grad_norm": 0.11217671632766724, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5401, |
|
"num_tokens": 845485094.0, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 1.2008950770760816, |
|
"grad_norm": 0.11600279062986374, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5565, |
|
"num_tokens": 846899288.0, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 1.2028841372451518, |
|
"grad_norm": 0.11796706914901733, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5434, |
|
"num_tokens": 848304697.0, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 1.2048731974142217, |
|
"grad_norm": 0.11813243478536606, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5535, |
|
"num_tokens": 849715462.0, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.206862257583292, |
|
"grad_norm": 0.1221594288945198, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5461, |
|
"num_tokens": 851059379.0, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 1.208851317752362, |
|
"grad_norm": 0.11475210636854172, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5394, |
|
"num_tokens": 852449620.0, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 1.210840377921432, |
|
"grad_norm": 0.1158720999956131, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5415, |
|
"num_tokens": 853876338.0, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 1.2128294380905023, |
|
"grad_norm": 0.11944854259490967, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5398, |
|
"num_tokens": 855326253.0, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 1.2148184982595724, |
|
"grad_norm": 0.11523836106061935, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5415, |
|
"num_tokens": 856726725.0, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.2168075584286424, |
|
"grad_norm": 0.11895252019166946, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5499, |
|
"num_tokens": 858107209.0, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 1.2187966185977126, |
|
"grad_norm": 0.11535745114088058, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5286, |
|
"num_tokens": 859540802.0, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 1.2207856787667828, |
|
"grad_norm": 0.1177186667919159, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5516, |
|
"num_tokens": 860956386.0, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 1.2227747389358528, |
|
"grad_norm": 0.11561235785484314, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5345, |
|
"num_tokens": 862351381.0, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 1.224763799104923, |
|
"grad_norm": 0.12278357893228531, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5436, |
|
"num_tokens": 863752120.0, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.2267528592739931, |
|
"grad_norm": 0.12000274658203125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5409, |
|
"num_tokens": 865134261.0, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 1.228741919443063, |
|
"grad_norm": 0.11960814148187637, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5421, |
|
"num_tokens": 866500301.0, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 1.2307309796121333, |
|
"grad_norm": 0.11419054120779037, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5407, |
|
"num_tokens": 867914829.0, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 1.2327200397812033, |
|
"grad_norm": 0.11924876272678375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.533, |
|
"num_tokens": 869296945.0, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 1.2347090999502734, |
|
"grad_norm": 0.12687626481056213, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5433, |
|
"num_tokens": 870720355.0, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.2366981601193436, |
|
"grad_norm": 0.11868540942668915, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5427, |
|
"num_tokens": 872090554.0, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 1.2386872202884138, |
|
"grad_norm": 0.11346638202667236, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5455, |
|
"num_tokens": 873519656.0, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 1.2406762804574838, |
|
"grad_norm": 0.12468260526657104, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5406, |
|
"num_tokens": 874897139.0, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 1.242665340626554, |
|
"grad_norm": 0.11793619394302368, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5541, |
|
"num_tokens": 876281923.0, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 1.244654400795624, |
|
"grad_norm": 0.11685628443956375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5419, |
|
"num_tokens": 877695435.0, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.2466434609646941, |
|
"grad_norm": 0.12373646348714828, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5372, |
|
"num_tokens": 879058206.0, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 1.2486325211337643, |
|
"grad_norm": 0.11609544605016708, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5497, |
|
"num_tokens": 880465724.0, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 1.2506215813028345, |
|
"grad_norm": 0.11885792016983032, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5458, |
|
"num_tokens": 881863204.0, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 1.2526106414719045, |
|
"grad_norm": 0.1189800500869751, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5425, |
|
"num_tokens": 883235922.0, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 1.2545997016409747, |
|
"grad_norm": 0.11329221725463867, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5371, |
|
"num_tokens": 884644276.0, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.2565887618100446, |
|
"grad_norm": 0.11825796961784363, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5351, |
|
"num_tokens": 886073154.0, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 1.2585778219791148, |
|
"grad_norm": 0.114280566573143, |
|
"learning_rate": 1e-06, |
|
"loss": 0.541, |
|
"num_tokens": 887514458.0, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 1.260566882148185, |
|
"grad_norm": 0.1187988743185997, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5364, |
|
"num_tokens": 888906987.0, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 1.2625559423172552, |
|
"grad_norm": 0.11506423354148865, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5431, |
|
"num_tokens": 890319605.0, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 1.2645450024863252, |
|
"grad_norm": 0.1173451617360115, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5345, |
|
"num_tokens": 891716052.0, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.2665340626553954, |
|
"grad_norm": 0.11930102109909058, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5439, |
|
"num_tokens": 893119175.0, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 1.2685231228244653, |
|
"grad_norm": 0.11887087672948837, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5348, |
|
"num_tokens": 894490469.0, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 1.2705121829935355, |
|
"grad_norm": 0.11899300664663315, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5491, |
|
"num_tokens": 895868778.0, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 1.2725012431626057, |
|
"grad_norm": 0.116294264793396, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5537, |
|
"num_tokens": 897281587.0, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 1.274490303331676, |
|
"grad_norm": 0.12177430093288422, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5369, |
|
"num_tokens": 898705618.0, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.2764793635007459, |
|
"grad_norm": 0.11538566648960114, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5387, |
|
"num_tokens": 900118795.0, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 1.278468423669816, |
|
"grad_norm": 0.11888190358877182, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5448, |
|
"num_tokens": 901549973.0, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 1.280457483838886, |
|
"grad_norm": 0.11358219385147095, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5403, |
|
"num_tokens": 902947794.0, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 1.2824465440079562, |
|
"grad_norm": 0.11417380720376968, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5332, |
|
"num_tokens": 904349508.0, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 1.2844356041770264, |
|
"grad_norm": 0.11959497630596161, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5406, |
|
"num_tokens": 905747046.0, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.2864246643460966, |
|
"grad_norm": 0.11622175574302673, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5415, |
|
"num_tokens": 907173072.0, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 1.2884137245151666, |
|
"grad_norm": 0.11988142877817154, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5451, |
|
"num_tokens": 908584591.0, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 1.2904027846842367, |
|
"grad_norm": 0.1185469850897789, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5351, |
|
"num_tokens": 910011116.0, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 1.2923918448533067, |
|
"grad_norm": 0.11444271355867386, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5384, |
|
"num_tokens": 911427909.0, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 1.294380905022377, |
|
"grad_norm": 0.12033736705780029, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5498, |
|
"num_tokens": 912835863.0, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.296369965191447, |
|
"grad_norm": 0.1319677233695984, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5296, |
|
"num_tokens": 914238947.0, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 1.2983590253605173, |
|
"grad_norm": 0.12392336130142212, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5341, |
|
"num_tokens": 915655787.0, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 1.3003480855295872, |
|
"grad_norm": 0.11796288192272186, |
|
"learning_rate": 1e-06, |
|
"loss": 0.547, |
|
"num_tokens": 917089247.0, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 1.3023371456986574, |
|
"grad_norm": 0.12152981013059616, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5544, |
|
"num_tokens": 918505085.0, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 1.3043262058677274, |
|
"grad_norm": 0.11925685405731201, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5304, |
|
"num_tokens": 919892998.0, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.3063152660367976, |
|
"grad_norm": 0.11711208522319794, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5426, |
|
"num_tokens": 921240479.0, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 1.3083043262058678, |
|
"grad_norm": 0.12039055675268173, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5368, |
|
"num_tokens": 922632985.0, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 1.310293386374938, |
|
"grad_norm": 0.11820589005947113, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5324, |
|
"num_tokens": 924054020.0, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 1.312282446544008, |
|
"grad_norm": 0.11549760401248932, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5442, |
|
"num_tokens": 925466685.0, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 1.3142715067130781, |
|
"grad_norm": 0.12046794593334198, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5491, |
|
"num_tokens": 926874635.0, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.316260566882148, |
|
"grad_norm": 0.1153668761253357, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5371, |
|
"num_tokens": 928294778.0, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 1.3182496270512183, |
|
"grad_norm": 0.11516553908586502, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5405, |
|
"num_tokens": 929703109.0, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 1.3202386872202885, |
|
"grad_norm": 0.11781197786331177, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5342, |
|
"num_tokens": 931098455.0, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 1.3222277473893587, |
|
"grad_norm": 0.11899585276842117, |
|
"learning_rate": 1e-06, |
|
"loss": 0.546, |
|
"num_tokens": 932511314.0, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 1.3242168075584286, |
|
"grad_norm": 0.11900392174720764, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5404, |
|
"num_tokens": 933900286.0, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.3262058677274988, |
|
"grad_norm": 0.12140580266714096, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5355, |
|
"num_tokens": 935253652.0, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 1.3281949278965688, |
|
"grad_norm": 0.12140516191720963, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5394, |
|
"num_tokens": 936636184.0, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 1.330183988065639, |
|
"grad_norm": 0.11907900869846344, |
|
"learning_rate": 1e-06, |
|
"loss": 0.527, |
|
"num_tokens": 938023738.0, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 1.3321730482347092, |
|
"grad_norm": 0.12013056874275208, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5444, |
|
"num_tokens": 939396704.0, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 1.3341621084037791, |
|
"grad_norm": 0.11747721582651138, |
|
"learning_rate": 1e-06, |
|
"loss": 0.534, |
|
"num_tokens": 940820588.0, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.3361511685728493, |
|
"grad_norm": 0.11940892785787582, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5323, |
|
"num_tokens": 942230394.0, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 1.3381402287419195, |
|
"grad_norm": 0.12076081335544586, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5395, |
|
"num_tokens": 943640623.0, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 1.3401292889109895, |
|
"grad_norm": 0.11554915457963943, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5423, |
|
"num_tokens": 945055848.0, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 1.3421183490800597, |
|
"grad_norm": 0.11654532700777054, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5235, |
|
"num_tokens": 946461742.0, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 1.3441074092491299, |
|
"grad_norm": 0.11917490512132645, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5389, |
|
"num_tokens": 947841726.0, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.3460964694181998, |
|
"grad_norm": 0.1211070865392685, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5347, |
|
"num_tokens": 949278490.0, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 1.34808552958727, |
|
"grad_norm": 0.12035378068685532, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5334, |
|
"num_tokens": 950642424.0, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 1.3500745897563402, |
|
"grad_norm": 0.11970808357000351, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5304, |
|
"num_tokens": 952042727.0, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 1.3520636499254102, |
|
"grad_norm": 0.12441671639680862, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5429, |
|
"num_tokens": 953444541.0, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 1.3540527100944804, |
|
"grad_norm": 0.11760767549276352, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5433, |
|
"num_tokens": 954885585.0, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.3560417702635505, |
|
"grad_norm": 0.12732824683189392, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5339, |
|
"num_tokens": 956271933.0, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 1.3580308304326205, |
|
"grad_norm": 0.11609245091676712, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5398, |
|
"num_tokens": 957653773.0, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 1.3600198906016907, |
|
"grad_norm": 0.11394883692264557, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5357, |
|
"num_tokens": 959087783.0, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 1.362008950770761, |
|
"grad_norm": 0.13362184166908264, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5404, |
|
"num_tokens": 960495424.0, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 1.3639980109398309, |
|
"grad_norm": 0.11702502518892288, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5325, |
|
"num_tokens": 961901643.0, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.365987071108901, |
|
"grad_norm": 0.11658236384391785, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5322, |
|
"num_tokens": 963283706.0, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 1.3679761312779712, |
|
"grad_norm": 0.12017067521810532, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5338, |
|
"num_tokens": 964734481.0, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 1.3699651914470412, |
|
"grad_norm": 0.12005714327096939, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5423, |
|
"num_tokens": 966125592.0, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 1.3719542516161114, |
|
"grad_norm": 0.12015294283628464, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5513, |
|
"num_tokens": 967523933.0, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 1.3739433117851814, |
|
"grad_norm": 0.11920958757400513, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5319, |
|
"num_tokens": 968897975.0, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.3759323719542516, |
|
"grad_norm": 0.12052245438098907, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5333, |
|
"num_tokens": 970267864.0, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 1.3779214321233217, |
|
"grad_norm": 0.11884114146232605, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5336, |
|
"num_tokens": 971706591.0, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 1.379910492292392, |
|
"grad_norm": 0.11437772214412689, |
|
"learning_rate": 1e-06, |
|
"loss": 0.552, |
|
"num_tokens": 973155693.0, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 1.381899552461462, |
|
"grad_norm": 0.12122377008199692, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5444, |
|
"num_tokens": 974547211.0, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 1.383888612630532, |
|
"grad_norm": 0.11425941437482834, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5415, |
|
"num_tokens": 976010063.0, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.385877672799602, |
|
"grad_norm": 0.11765948683023453, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5284, |
|
"num_tokens": 977411227.0, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 1.3878667329686722, |
|
"grad_norm": 0.11854742467403412, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5338, |
|
"num_tokens": 978786923.0, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 1.3898557931377424, |
|
"grad_norm": 0.12211066484451294, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5319, |
|
"num_tokens": 980164053.0, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 1.3918448533068126, |
|
"grad_norm": 0.1181558147072792, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5233, |
|
"num_tokens": 981562290.0, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 1.3938339134758826, |
|
"grad_norm": 0.12071076780557632, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5394, |
|
"num_tokens": 982998912.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.3958229736449528, |
|
"grad_norm": 0.11735861748456955, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5456, |
|
"num_tokens": 984410558.0, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 1.3978120338140227, |
|
"grad_norm": 0.11743367463350296, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5396, |
|
"num_tokens": 985836461.0, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 1.399801093983093, |
|
"grad_norm": 0.11885622888803482, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5424, |
|
"num_tokens": 987230105.0, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 1.4017901541521631, |
|
"grad_norm": 0.11783410608768463, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5352, |
|
"num_tokens": 988630556.0, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 1.4037792143212333, |
|
"grad_norm": 0.11775646358728409, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5239, |
|
"num_tokens": 989996092.0, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.4057682744903033, |
|
"grad_norm": 0.11850979924201965, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5292, |
|
"num_tokens": 991410237.0, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 1.4077573346593735, |
|
"grad_norm": 0.13466332852840424, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5229, |
|
"num_tokens": 992831683.0, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 1.4097463948284434, |
|
"grad_norm": 0.1183917373418808, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5306, |
|
"num_tokens": 994220085.0, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 1.4117354549975136, |
|
"grad_norm": 0.11719982326030731, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5347, |
|
"num_tokens": 995634427.0, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 1.4137245151665838, |
|
"grad_norm": 0.1188247948884964, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5214, |
|
"num_tokens": 996991797.0, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.415713575335654, |
|
"grad_norm": 0.12354787439107895, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5394, |
|
"num_tokens": 998387297.0, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 1.417702635504724, |
|
"grad_norm": 0.11752255260944366, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5289, |
|
"num_tokens": 999796509.0, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 1.4196916956737942, |
|
"grad_norm": 0.11573730409145355, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5306, |
|
"num_tokens": 1001189936.0, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 1.4216807558428641, |
|
"grad_norm": 0.12337598204612732, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5368, |
|
"num_tokens": 1002569771.0, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 1.4236698160119343, |
|
"grad_norm": 0.1165643110871315, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5284, |
|
"num_tokens": 1003987924.0, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.4256588761810045, |
|
"grad_norm": 0.12802375853061676, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5404, |
|
"num_tokens": 1005393054.0, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 1.4276479363500747, |
|
"grad_norm": 0.1228770762681961, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5448, |
|
"num_tokens": 1006837132.0, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 1.4296369965191447, |
|
"grad_norm": 0.12197130918502808, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5514, |
|
"num_tokens": 1008250141.0, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 1.4316260566882149, |
|
"grad_norm": 0.11660060286521912, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5501, |
|
"num_tokens": 1009672430.0, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 1.4336151168572848, |
|
"grad_norm": 0.13146308064460754, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5526, |
|
"num_tokens": 1011105282.0, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.435604177026355, |
|
"grad_norm": 0.12230085581541061, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5388, |
|
"num_tokens": 1012519871.0, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 1.4375932371954252, |
|
"grad_norm": 0.11886520683765411, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5512, |
|
"num_tokens": 1013919180.0, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 1.4395822973644954, |
|
"grad_norm": 0.12054796516895294, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5418, |
|
"num_tokens": 1015311974.0, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 1.4415713575335654, |
|
"grad_norm": 0.1221555769443512, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5315, |
|
"num_tokens": 1016710513.0, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 1.4435604177026355, |
|
"grad_norm": 0.12085101753473282, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5267, |
|
"num_tokens": 1018108542.0, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.4455494778717055, |
|
"grad_norm": 0.11824904382228851, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5275, |
|
"num_tokens": 1019503680.0, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 1.4475385380407757, |
|
"grad_norm": 0.11636096239089966, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5326, |
|
"num_tokens": 1020904351.0, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 1.449527598209846, |
|
"grad_norm": 0.12087342143058777, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5418, |
|
"num_tokens": 1022300450.0, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 1.451516658378916, |
|
"grad_norm": 0.11914920806884766, |
|
"learning_rate": 1e-06, |
|
"loss": 0.547, |
|
"num_tokens": 1023750162.0, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 1.453505718547986, |
|
"grad_norm": 0.11899517476558685, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5452, |
|
"num_tokens": 1025196694.0, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.4554947787170562, |
|
"grad_norm": 0.1218375712633133, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5321, |
|
"num_tokens": 1026597406.0, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 1.4574838388861262, |
|
"grad_norm": 0.11746956408023834, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5341, |
|
"num_tokens": 1027971754.0, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 1.4594728990551964, |
|
"grad_norm": 0.12773922085762024, |
|
"learning_rate": 1e-06, |
|
"loss": 0.547, |
|
"num_tokens": 1029364538.0, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 1.4614619592242666, |
|
"grad_norm": 0.1228381097316742, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5403, |
|
"num_tokens": 1030782739.0, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 1.4634510193933368, |
|
"grad_norm": 0.11855144798755646, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5345, |
|
"num_tokens": 1032154592.0, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 1.4654400795624067, |
|
"grad_norm": 0.12030474096536636, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5314, |
|
"num_tokens": 1033543123.0, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 1.467429139731477, |
|
"grad_norm": 0.12364325672388077, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5422, |
|
"num_tokens": 1034938165.0, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 1.469418199900547, |
|
"grad_norm": 0.12098907679319382, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5375, |
|
"num_tokens": 1036314909.0, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 1.471407260069617, |
|
"grad_norm": 0.11932458728551865, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5347, |
|
"num_tokens": 1037727924.0, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 1.4733963202386873, |
|
"grad_norm": 0.11801986396312714, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5248, |
|
"num_tokens": 1039111581.0, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.4753853804077575, |
|
"grad_norm": 0.11731583625078201, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5345, |
|
"num_tokens": 1040546305.0, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 1.4773744405768274, |
|
"grad_norm": 0.11801919341087341, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5336, |
|
"num_tokens": 1041939972.0, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 1.4793635007458976, |
|
"grad_norm": 0.12367312610149384, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5389, |
|
"num_tokens": 1043334299.0, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 1.4813525609149676, |
|
"grad_norm": 0.12621083855628967, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5411, |
|
"num_tokens": 1044732856.0, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 1.4833416210840378, |
|
"grad_norm": 0.1213909313082695, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5339, |
|
"num_tokens": 1046119537.0, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 1.485330681253108, |
|
"grad_norm": 0.12064032256603241, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5501, |
|
"num_tokens": 1047526533.0, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 1.4873197414221782, |
|
"grad_norm": 0.11744097620248795, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5323, |
|
"num_tokens": 1048909786.0, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 1.4893088015912481, |
|
"grad_norm": 0.11546076834201813, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5396, |
|
"num_tokens": 1050351234.0, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 1.4912978617603183, |
|
"grad_norm": 0.12047351151704788, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5439, |
|
"num_tokens": 1051747737.0, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 1.4932869219293883, |
|
"grad_norm": 0.11532466858625412, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5267, |
|
"num_tokens": 1053183317.0, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.4952759820984585, |
|
"grad_norm": 0.1169387549161911, |
|
"learning_rate": 1e-06, |
|
"loss": 0.526, |
|
"num_tokens": 1054557285.0, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 1.4972650422675287, |
|
"grad_norm": 0.1165088340640068, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5308, |
|
"num_tokens": 1056007559.0, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 1.4992541024365988, |
|
"grad_norm": 0.11395064741373062, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5408, |
|
"num_tokens": 1057438934.0, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 1.5012431626056688, |
|
"grad_norm": 0.12597793340682983, |
|
"learning_rate": 1e-06, |
|
"loss": 0.54, |
|
"num_tokens": 1058838684.0, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 1.5032322227747388, |
|
"grad_norm": 0.12164228409528732, |
|
"learning_rate": 1e-06, |
|
"loss": 0.538, |
|
"num_tokens": 1060278255.0, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 1.505221282943809, |
|
"grad_norm": 0.11811868101358414, |
|
"learning_rate": 1e-06, |
|
"loss": 0.531, |
|
"num_tokens": 1061695693.0, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 1.5072103431128792, |
|
"grad_norm": 0.1173863634467125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.529, |
|
"num_tokens": 1063113004.0, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 1.5091994032819493, |
|
"grad_norm": 0.1195053681731224, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5467, |
|
"num_tokens": 1064537749.0, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 1.5111884634510195, |
|
"grad_norm": 0.12815718352794647, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5202, |
|
"num_tokens": 1065931962.0, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 1.5131775236200895, |
|
"grad_norm": 0.12516118586063385, |
|
"learning_rate": 1e-06, |
|
"loss": 0.517, |
|
"num_tokens": 1067328106.0, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.5151665837891595, |
|
"grad_norm": 0.11947084218263626, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5198, |
|
"num_tokens": 1068712446.0, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 1.5171556439582297, |
|
"grad_norm": 0.1181473359465599, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5399, |
|
"num_tokens": 1070129632.0, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 1.5191447041272998, |
|
"grad_norm": 0.12384405732154846, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5369, |
|
"num_tokens": 1071485504.0, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 1.52113376429637, |
|
"grad_norm": 0.12462117522954941, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5313, |
|
"num_tokens": 1072876547.0, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 1.5231228244654402, |
|
"grad_norm": 0.11820020526647568, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5282, |
|
"num_tokens": 1074262233.0, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 1.5251118846345102, |
|
"grad_norm": 0.11712965369224548, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5286, |
|
"num_tokens": 1075681885.0, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 1.5271009448035802, |
|
"grad_norm": 0.11938031017780304, |
|
"learning_rate": 1e-06, |
|
"loss": 0.524, |
|
"num_tokens": 1077086623.0, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 1.5290900049726504, |
|
"grad_norm": 0.11355356127023697, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5357, |
|
"num_tokens": 1078502366.0, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 1.5310790651417205, |
|
"grad_norm": 0.12431412935256958, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5317, |
|
"num_tokens": 1079851354.0, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 1.5330681253107907, |
|
"grad_norm": 0.12124991416931152, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5452, |
|
"num_tokens": 1081246250.0, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.535057185479861, |
|
"grad_norm": 0.11724445968866348, |
|
"learning_rate": 1e-06, |
|
"loss": 0.53, |
|
"num_tokens": 1082635196.0, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 1.5370462456489309, |
|
"grad_norm": 0.12210683524608612, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5291, |
|
"num_tokens": 1084011995.0, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 1.5390353058180009, |
|
"grad_norm": 0.11981118470430374, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5438, |
|
"num_tokens": 1085412000.0, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 1.541024365987071, |
|
"grad_norm": 0.12824386358261108, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5393, |
|
"num_tokens": 1086812231.0, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 1.5430134261561412, |
|
"grad_norm": 0.1165238469839096, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5468, |
|
"num_tokens": 1088217759.0, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.5450024863252114, |
|
"grad_norm": 0.11545863002538681, |
|
"learning_rate": 1e-06, |
|
"loss": 0.522, |
|
"num_tokens": 1089622076.0, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 1.5469915464942816, |
|
"grad_norm": 0.11776294559240341, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5439, |
|
"num_tokens": 1091049994.0, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 1.5489806066633516, |
|
"grad_norm": 0.11632154881954193, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5078, |
|
"num_tokens": 1092400841.0, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 1.5509696668324215, |
|
"grad_norm": 0.1186751276254654, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5372, |
|
"num_tokens": 1093805680.0, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 1.5529587270014917, |
|
"grad_norm": 0.11867798119783401, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5334, |
|
"num_tokens": 1095211670.0, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.554947787170562, |
|
"grad_norm": 0.11884911358356476, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5355, |
|
"num_tokens": 1096608638.0, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 1.5569368473396321, |
|
"grad_norm": 0.12014549970626831, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5485, |
|
"num_tokens": 1098024501.0, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 1.5589259075087023, |
|
"grad_norm": 0.115864098072052, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5372, |
|
"num_tokens": 1099418318.0, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 1.5609149676777723, |
|
"grad_norm": 0.12087985128164291, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5331, |
|
"num_tokens": 1100818699.0, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 1.5629040278468422, |
|
"grad_norm": 0.11292678117752075, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5345, |
|
"num_tokens": 1102255753.0, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 1.5648930880159124, |
|
"grad_norm": 0.11706893891096115, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5211, |
|
"num_tokens": 1103629573.0, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 1.5668821481849826, |
|
"grad_norm": 0.11923506110906601, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5309, |
|
"num_tokens": 1104985171.0, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 1.5688712083540528, |
|
"grad_norm": 0.12054122984409332, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5395, |
|
"num_tokens": 1106405658.0, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 1.5708602685231228, |
|
"grad_norm": 0.11507654935121536, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5414, |
|
"num_tokens": 1107830805.0, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 1.572849328692193, |
|
"grad_norm": 0.11983779072761536, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5314, |
|
"num_tokens": 1109163312.0, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.574838388861263, |
|
"grad_norm": 0.11855509877204895, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5279, |
|
"num_tokens": 1110582152.0, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 1.5768274490303331, |
|
"grad_norm": 0.11437905579805374, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5335, |
|
"num_tokens": 1112007797.0, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 1.5788165091994033, |
|
"grad_norm": 0.11613345891237259, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5364, |
|
"num_tokens": 1113366919.0, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 1.5808055693684735, |
|
"grad_norm": 0.1167130321264267, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5295, |
|
"num_tokens": 1114808855.0, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 1.5827946295375435, |
|
"grad_norm": 0.11898983269929886, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5426, |
|
"num_tokens": 1116215942.0, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 1.5847836897066137, |
|
"grad_norm": 0.11814821511507034, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5249, |
|
"num_tokens": 1117635329.0, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 1.5867727498756836, |
|
"grad_norm": 0.11845888942480087, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5347, |
|
"num_tokens": 1119076550.0, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 1.5887618100447538, |
|
"grad_norm": 0.11748501658439636, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5376, |
|
"num_tokens": 1120476445.0, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 1.590750870213824, |
|
"grad_norm": 0.11534599214792252, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5204, |
|
"num_tokens": 1121859435.0, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 1.5927399303828942, |
|
"grad_norm": 0.11943424493074417, |
|
"learning_rate": 1e-06, |
|
"loss": 0.526, |
|
"num_tokens": 1123242470.0, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.5947289905519642, |
|
"grad_norm": 0.12252327799797058, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5278, |
|
"num_tokens": 1124644409.0, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 1.5967180507210343, |
|
"grad_norm": 0.11914427578449249, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5308, |
|
"num_tokens": 1126067895.0, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 1.5987071108901043, |
|
"grad_norm": 0.12473994493484497, |
|
"learning_rate": 1e-06, |
|
"loss": 0.53, |
|
"num_tokens": 1127456815.0, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 1.6006961710591745, |
|
"grad_norm": 0.11551981419324875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5255, |
|
"num_tokens": 1128878563.0, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 1.6026852312282447, |
|
"grad_norm": 0.11678687483072281, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5325, |
|
"num_tokens": 1130317088.0, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 1.6046742913973149, |
|
"grad_norm": 0.1221092939376831, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5404, |
|
"num_tokens": 1131770565.0, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 1.6066633515663848, |
|
"grad_norm": 0.11658436805009842, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5345, |
|
"num_tokens": 1133178633.0, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 1.608652411735455, |
|
"grad_norm": 0.11696770042181015, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5275, |
|
"num_tokens": 1134574443.0, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 1.610641471904525, |
|
"grad_norm": 0.11893412470817566, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5349, |
|
"num_tokens": 1135948785.0, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 1.6126305320735952, |
|
"grad_norm": 0.12174445390701294, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5292, |
|
"num_tokens": 1137342427.0, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.6146195922426654, |
|
"grad_norm": 0.12794137001037598, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5418, |
|
"num_tokens": 1138767131.0, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 1.6166086524117356, |
|
"grad_norm": 0.11655872315168381, |
|
"learning_rate": 1e-06, |
|
"loss": 0.525, |
|
"num_tokens": 1140181396.0, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 1.6185977125808055, |
|
"grad_norm": 0.11739625781774521, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5265, |
|
"num_tokens": 1141585956.0, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 1.6205867727498757, |
|
"grad_norm": 0.11966431885957718, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5439, |
|
"num_tokens": 1142972552.0, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 1.6225758329189457, |
|
"grad_norm": 0.1153174564242363, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5269, |
|
"num_tokens": 1144360432.0, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 1.6245648930880159, |
|
"grad_norm": 0.11621485650539398, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5274, |
|
"num_tokens": 1145763084.0, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 1.626553953257086, |
|
"grad_norm": 0.12078936398029327, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5408, |
|
"num_tokens": 1147181036.0, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 1.6285430134261563, |
|
"grad_norm": 0.11993258446455002, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5371, |
|
"num_tokens": 1148568031.0, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 1.6305320735952262, |
|
"grad_norm": 0.12087948620319366, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5239, |
|
"num_tokens": 1149971169.0, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 1.6325211337642964, |
|
"grad_norm": 0.12193674594163895, |
|
"learning_rate": 1e-06, |
|
"loss": 0.532, |
|
"num_tokens": 1151375502.0, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.6345101939333664, |
|
"grad_norm": 0.11705011874437332, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5374, |
|
"num_tokens": 1152788267.0, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 1.6364992541024366, |
|
"grad_norm": 0.11758929491043091, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5321, |
|
"num_tokens": 1154203978.0, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 1.6384883142715068, |
|
"grad_norm": 0.11853373050689697, |
|
"learning_rate": 1e-06, |
|
"loss": 0.537, |
|
"num_tokens": 1155628446.0, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 1.640477374440577, |
|
"grad_norm": 0.11777627468109131, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5272, |
|
"num_tokens": 1157039859.0, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 1.642466434609647, |
|
"grad_norm": 0.12647390365600586, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5326, |
|
"num_tokens": 1158461070.0, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.644455494778717, |
|
"grad_norm": 0.11644790321588516, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5176, |
|
"num_tokens": 1159857529.0, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 1.646444554947787, |
|
"grad_norm": 0.11872653663158417, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5255, |
|
"num_tokens": 1161253129.0, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 1.6484336151168573, |
|
"grad_norm": 0.1227663904428482, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5217, |
|
"num_tokens": 1162695252.0, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 1.6504226752859275, |
|
"grad_norm": 0.12286870181560516, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5252, |
|
"num_tokens": 1164073116.0, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 1.6524117354549976, |
|
"grad_norm": 0.11983044445514679, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5277, |
|
"num_tokens": 1165473708.0, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.6544007956240676, |
|
"grad_norm": 0.11557100713253021, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5383, |
|
"num_tokens": 1166861777.0, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 1.6563898557931376, |
|
"grad_norm": 0.12076076120138168, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5438, |
|
"num_tokens": 1168281147.0, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 1.6583789159622078, |
|
"grad_norm": 0.11843711882829666, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5263, |
|
"num_tokens": 1169644876.0, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 1.660367976131278, |
|
"grad_norm": 0.11820250749588013, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5311, |
|
"num_tokens": 1171071751.0, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 1.6623570363003481, |
|
"grad_norm": 0.11654365807771683, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5307, |
|
"num_tokens": 1172492906.0, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 1.6643460964694183, |
|
"grad_norm": 0.12128648906946182, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5255, |
|
"num_tokens": 1173918229.0, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 1.6663351566384883, |
|
"grad_norm": 0.1194423958659172, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5265, |
|
"num_tokens": 1175294915.0, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 1.6683242168075583, |
|
"grad_norm": 0.11887000501155853, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5413, |
|
"num_tokens": 1176717797.0, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 1.6703132769766285, |
|
"grad_norm": 0.1198115423321724, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5251, |
|
"num_tokens": 1178114954.0, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 1.6723023371456986, |
|
"grad_norm": 0.12133830040693283, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5311, |
|
"num_tokens": 1179495561.0, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.6742913973147688, |
|
"grad_norm": 0.12414910644292831, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5354, |
|
"num_tokens": 1180900637.0, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 1.676280457483839, |
|
"grad_norm": 0.11752628535032272, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5302, |
|
"num_tokens": 1182291450.0, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 1.678269517652909, |
|
"grad_norm": 0.11616742610931396, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5253, |
|
"num_tokens": 1183704362.0, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 1.680258577821979, |
|
"grad_norm": 0.11694184690713882, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5365, |
|
"num_tokens": 1185133642.0, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 1.6822476379910491, |
|
"grad_norm": 0.17413176596164703, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5194, |
|
"num_tokens": 1186533182.0, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 1.6842366981601193, |
|
"grad_norm": 0.11673513799905777, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5238, |
|
"num_tokens": 1187935618.0, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 1.6862257583291895, |
|
"grad_norm": 0.11706209182739258, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5274, |
|
"num_tokens": 1189355590.0, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 1.6882148184982597, |
|
"grad_norm": 0.12071144580841064, |
|
"learning_rate": 1e-06, |
|
"loss": 0.522, |
|
"num_tokens": 1190723818.0, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 1.6902038786673297, |
|
"grad_norm": 0.11622277647256851, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5284, |
|
"num_tokens": 1192146190.0, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 1.6921929388363997, |
|
"grad_norm": 0.11853674054145813, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5294, |
|
"num_tokens": 1193541029.0, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.6941819990054698, |
|
"grad_norm": 0.11840229481458664, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5146, |
|
"num_tokens": 1194973011.0, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 1.69617105917454, |
|
"grad_norm": 0.11261092871427536, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5263, |
|
"num_tokens": 1196379352.0, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 1.6981601193436102, |
|
"grad_norm": 0.11384103447198868, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5203, |
|
"num_tokens": 1197758489.0, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 1.7001491795126804, |
|
"grad_norm": 0.12079748511314392, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5315, |
|
"num_tokens": 1199163292.0, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 1.7021382396817504, |
|
"grad_norm": 0.11787824332714081, |
|
"learning_rate": 1e-06, |
|
"loss": 0.516, |
|
"num_tokens": 1200558367.0, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 1.7041272998508203, |
|
"grad_norm": 0.11851559579372406, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5208, |
|
"num_tokens": 1201948798.0, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 1.7061163600198905, |
|
"grad_norm": 0.11743155121803284, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5213, |
|
"num_tokens": 1203317747.0, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 1.7081054201889607, |
|
"grad_norm": 0.11653459817171097, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5292, |
|
"num_tokens": 1204751593.0, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 1.710094480358031, |
|
"grad_norm": 0.11759098619222641, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5272, |
|
"num_tokens": 1206123965.0, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 1.712083540527101, |
|
"grad_norm": 0.11782211810350418, |
|
"learning_rate": 1e-06, |
|
"loss": 0.525, |
|
"num_tokens": 1207504339.0, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.714072600696171, |
|
"grad_norm": 0.11687052994966507, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5321, |
|
"num_tokens": 1208933928.0, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 1.716061660865241, |
|
"grad_norm": 0.11712006479501724, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5311, |
|
"num_tokens": 1210329028.0, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 1.7180507210343112, |
|
"grad_norm": 0.11513438820838928, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5298, |
|
"num_tokens": 1211754806.0, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 1.7200397812033814, |
|
"grad_norm": 0.11873895674943924, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5211, |
|
"num_tokens": 1213112342.0, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 1.7220288413724516, |
|
"grad_norm": 0.11382027715444565, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5227, |
|
"num_tokens": 1214524561.0, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 1.7240179015415218, |
|
"grad_norm": 0.11664129793643951, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5255, |
|
"num_tokens": 1215928909.0, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 1.7260069617105918, |
|
"grad_norm": 0.11998264491558075, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5195, |
|
"num_tokens": 1217291462.0, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 1.7279960218796617, |
|
"grad_norm": 0.11994168907403946, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5359, |
|
"num_tokens": 1218671826.0, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 1.729985082048732, |
|
"grad_norm": 0.1149284616112709, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5305, |
|
"num_tokens": 1220083806.0, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 1.731974142217802, |
|
"grad_norm": 0.11925540119409561, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5365, |
|
"num_tokens": 1221495745.0, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.7339632023868723, |
|
"grad_norm": 0.11679849773645401, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5227, |
|
"num_tokens": 1222941705.0, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 1.7359522625559425, |
|
"grad_norm": 0.1146961897611618, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5359, |
|
"num_tokens": 1224378188.0, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 1.7379413227250124, |
|
"grad_norm": 0.11416348069906235, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5365, |
|
"num_tokens": 1225807121.0, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 1.7399303828940824, |
|
"grad_norm": 0.12208687514066696, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5266, |
|
"num_tokens": 1227205689.0, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 1.7419194430631526, |
|
"grad_norm": 0.11736491322517395, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5315, |
|
"num_tokens": 1228632893.0, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.7439085032322228, |
|
"grad_norm": 0.11757368594408035, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5324, |
|
"num_tokens": 1230044870.0, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 1.745897563401293, |
|
"grad_norm": 0.1128118559718132, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5284, |
|
"num_tokens": 1231458999.0, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 1.747886623570363, |
|
"grad_norm": 0.11604047566652298, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5246, |
|
"num_tokens": 1232886812.0, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 1.7498756837394331, |
|
"grad_norm": 0.11722288280725479, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5205, |
|
"num_tokens": 1234263247.0, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 1.751864743908503, |
|
"grad_norm": 0.11631619185209274, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5271, |
|
"num_tokens": 1235684949.0, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.7538538040775733, |
|
"grad_norm": 0.11910022050142288, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5174, |
|
"num_tokens": 1237092421.0, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 1.7558428642466435, |
|
"grad_norm": 0.37577947974205017, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5305, |
|
"num_tokens": 1238488077.0, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 1.7578319244157137, |
|
"grad_norm": 0.11624370515346527, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5257, |
|
"num_tokens": 1239858200.0, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 1.7598209845847836, |
|
"grad_norm": 0.11999819427728653, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5296, |
|
"num_tokens": 1241270454.0, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 1.7618100447538538, |
|
"grad_norm": 0.11568225175142288, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5204, |
|
"num_tokens": 1242689791.0, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 1.7637991049229238, |
|
"grad_norm": 0.1157977357506752, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5273, |
|
"num_tokens": 1244076494.0, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 1.765788165091994, |
|
"grad_norm": 0.12077594548463821, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5352, |
|
"num_tokens": 1245450599.0, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 1.7677772252610642, |
|
"grad_norm": 0.11732426285743713, |
|
"learning_rate": 1e-06, |
|
"loss": 0.518, |
|
"num_tokens": 1246866283.0, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 1.7697662854301344, |
|
"grad_norm": 0.11936353892087936, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5211, |
|
"num_tokens": 1248255137.0, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 1.7717553455992043, |
|
"grad_norm": 0.11886170506477356, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5257, |
|
"num_tokens": 1249659564.0, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.7737444057682745, |
|
"grad_norm": 0.11828939616680145, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5405, |
|
"num_tokens": 1251058792.0, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 1.7757334659373445, |
|
"grad_norm": 0.11856890469789505, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5306, |
|
"num_tokens": 1252485161.0, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 1.7777225261064147, |
|
"grad_norm": 0.11919775605201721, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5263, |
|
"num_tokens": 1253864701.0, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 1.7797115862754849, |
|
"grad_norm": 0.11722150444984436, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5319, |
|
"num_tokens": 1255315675.0, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 1.781700646444555, |
|
"grad_norm": 0.11736007779836655, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5212, |
|
"num_tokens": 1256727422.0, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 1.783689706613625, |
|
"grad_norm": 0.11714823544025421, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5154, |
|
"num_tokens": 1258118035.0, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 1.7856787667826952, |
|
"grad_norm": 0.11619334667921066, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5354, |
|
"num_tokens": 1259569135.0, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 1.7876678269517652, |
|
"grad_norm": 0.12298526614904404, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5241, |
|
"num_tokens": 1260962943.0, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 1.7896568871208354, |
|
"grad_norm": 0.11889567226171494, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5156, |
|
"num_tokens": 1262364349.0, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 1.7916459472899056, |
|
"grad_norm": 0.11857149749994278, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5338, |
|
"num_tokens": 1263806716.0, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.7936350074589757, |
|
"grad_norm": 0.11797276884317398, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5207, |
|
"num_tokens": 1265169481.0, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 1.7956240676280457, |
|
"grad_norm": 0.11892815679311752, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5293, |
|
"num_tokens": 1266576993.0, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 1.797613127797116, |
|
"grad_norm": 0.12395410984754562, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5366, |
|
"num_tokens": 1267993556.0, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 1.7996021879661859, |
|
"grad_norm": 0.11953330039978027, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5368, |
|
"num_tokens": 1269389274.0, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 1.801591248135256, |
|
"grad_norm": 0.12015491724014282, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5221, |
|
"num_tokens": 1270797211.0, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 1.8035803083043263, |
|
"grad_norm": 0.12275838851928711, |
|
"learning_rate": 1e-06, |
|
"loss": 0.516, |
|
"num_tokens": 1272228896.0, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 1.8055693684733964, |
|
"grad_norm": 0.11699802428483963, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5261, |
|
"num_tokens": 1273621370.0, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 1.8075584286424664, |
|
"grad_norm": 0.1161780133843422, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5274, |
|
"num_tokens": 1275017133.0, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 1.8095474888115366, |
|
"grad_norm": 0.11965856701135635, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5337, |
|
"num_tokens": 1276448660.0, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 1.8115365489806066, |
|
"grad_norm": 0.12168006598949432, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5175, |
|
"num_tokens": 1277862125.0, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.8135256091496768, |
|
"grad_norm": 0.11938372254371643, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5216, |
|
"num_tokens": 1279248819.0, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 1.815514669318747, |
|
"grad_norm": 0.11735393106937408, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5233, |
|
"num_tokens": 1280646759.0, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 1.8175037294878171, |
|
"grad_norm": 0.11623270809650421, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5233, |
|
"num_tokens": 1282057498.0, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 1.819492789656887, |
|
"grad_norm": 0.11557810753583908, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5153, |
|
"num_tokens": 1283431495.0, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 1.8214818498259573, |
|
"grad_norm": 0.1188741996884346, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5046, |
|
"num_tokens": 1284807817.0, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 1.8234709099950273, |
|
"grad_norm": 0.12421073764562607, |
|
"learning_rate": 1e-06, |
|
"loss": 0.529, |
|
"num_tokens": 1286222594.0, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 1.8254599701640974, |
|
"grad_norm": 0.11472687870264053, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5189, |
|
"num_tokens": 1287660647.0, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 1.8274490303331676, |
|
"grad_norm": 0.12024683505296707, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5327, |
|
"num_tokens": 1289025502.0, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 1.8294380905022378, |
|
"grad_norm": 0.11754554510116577, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5266, |
|
"num_tokens": 1290429204.0, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 1.8314271506713078, |
|
"grad_norm": 0.11749599874019623, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5346, |
|
"num_tokens": 1291835199.0, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.8334162108403778, |
|
"grad_norm": 0.12334319949150085, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5286, |
|
"num_tokens": 1293232429.0, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 1.835405271009448, |
|
"grad_norm": 0.1195298433303833, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5342, |
|
"num_tokens": 1294636234.0, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 1.8373943311785181, |
|
"grad_norm": 0.12667156755924225, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5329, |
|
"num_tokens": 1296037959.0, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 1.8393833913475883, |
|
"grad_norm": 0.11679953336715698, |
|
"learning_rate": 1e-06, |
|
"loss": 0.521, |
|
"num_tokens": 1297473035.0, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 1.8413724515166585, |
|
"grad_norm": 0.12583783268928528, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5276, |
|
"num_tokens": 1298890907.0, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.8433615116857285, |
|
"grad_norm": 0.11728479713201523, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5395, |
|
"num_tokens": 1300299527.0, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 1.8453505718547984, |
|
"grad_norm": 0.12031624466180801, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5223, |
|
"num_tokens": 1301707639.0, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 1.8473396320238686, |
|
"grad_norm": 0.11883748322725296, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5314, |
|
"num_tokens": 1303102646.0, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 1.8493286921929388, |
|
"grad_norm": 0.11844377219676971, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5239, |
|
"num_tokens": 1304494162.0, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 1.851317752362009, |
|
"grad_norm": 0.11900133639574051, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5417, |
|
"num_tokens": 1305913481.0, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.8533068125310792, |
|
"grad_norm": 0.11836715042591095, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5233, |
|
"num_tokens": 1307345252.0, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 1.8552958727001492, |
|
"grad_norm": 0.11962825059890747, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5189, |
|
"num_tokens": 1308721678.0, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 1.8572849328692191, |
|
"grad_norm": 0.11557050049304962, |
|
"learning_rate": 1e-06, |
|
"loss": 0.534, |
|
"num_tokens": 1310185671.0, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 1.8592739930382893, |
|
"grad_norm": 0.11940222978591919, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5201, |
|
"num_tokens": 1311578400.0, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 1.8612630532073595, |
|
"grad_norm": 0.11809618771076202, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5302, |
|
"num_tokens": 1313012059.0, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 1.8632521133764297, |
|
"grad_norm": 0.11569247394800186, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5261, |
|
"num_tokens": 1314429380.0, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 1.8652411735455, |
|
"grad_norm": 0.11870964616537094, |
|
"learning_rate": 1e-06, |
|
"loss": 0.531, |
|
"num_tokens": 1315859666.0, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 1.8672302337145699, |
|
"grad_norm": 0.11804142594337463, |
|
"learning_rate": 1e-06, |
|
"loss": 0.524, |
|
"num_tokens": 1317284109.0, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 1.8692192938836398, |
|
"grad_norm": 0.12137165665626526, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5172, |
|
"num_tokens": 1318645618.0, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 1.87120835405271, |
|
"grad_norm": 0.1159721091389656, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5197, |
|
"num_tokens": 1320005960.0, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.8731974142217802, |
|
"grad_norm": 0.1175520122051239, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5141, |
|
"num_tokens": 1321343874.0, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 1.8751864743908504, |
|
"grad_norm": 0.12161525338888168, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5283, |
|
"num_tokens": 1322745862.0, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 1.8771755345599206, |
|
"grad_norm": 0.11651836335659027, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5166, |
|
"num_tokens": 1324173893.0, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 1.8791645947289906, |
|
"grad_norm": 0.11678607016801834, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5157, |
|
"num_tokens": 1325528425.0, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 1.8811536548980605, |
|
"grad_norm": 0.11988682299852371, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5241, |
|
"num_tokens": 1326934542.0, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 1.8831427150671307, |
|
"grad_norm": 0.1179330125451088, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5278, |
|
"num_tokens": 1328329326.0, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 1.885131775236201, |
|
"grad_norm": 0.12124811857938766, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5121, |
|
"num_tokens": 1329690107.0, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 1.887120835405271, |
|
"grad_norm": 0.11679516732692719, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5123, |
|
"num_tokens": 1331099443.0, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 1.8891098955743413, |
|
"grad_norm": 0.12094994634389877, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5294, |
|
"num_tokens": 1332514974.0, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 1.8910989557434112, |
|
"grad_norm": 0.11741626262664795, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5221, |
|
"num_tokens": 1333908900.0, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.8930880159124812, |
|
"grad_norm": 0.11530331522226334, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5253, |
|
"num_tokens": 1335304709.0, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 1.8950770760815514, |
|
"grad_norm": 0.1158575564622879, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5285, |
|
"num_tokens": 1336705867.0, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 1.8970661362506216, |
|
"grad_norm": 0.12077206373214722, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5282, |
|
"num_tokens": 1338134157.0, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 1.8990551964196918, |
|
"grad_norm": 0.11674216389656067, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5256, |
|
"num_tokens": 1339570572.0, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 1.901044256588762, |
|
"grad_norm": 0.11999034881591797, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5217, |
|
"num_tokens": 1340963763.0, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 1.903033316757832, |
|
"grad_norm": 0.1159081682562828, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5193, |
|
"num_tokens": 1342343659.0, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 1.905022376926902, |
|
"grad_norm": 0.12077668309211731, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5258, |
|
"num_tokens": 1343740689.0, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 1.907011437095972, |
|
"grad_norm": 0.1166224554181099, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5292, |
|
"num_tokens": 1345132334.0, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 1.9090004972650423, |
|
"grad_norm": 0.1241859421133995, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5148, |
|
"num_tokens": 1346524317.0, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 1.9109895574341125, |
|
"grad_norm": 0.11995749920606613, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5299, |
|
"num_tokens": 1347941197.0, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.9129786176031827, |
|
"grad_norm": 0.12070658802986145, |
|
"learning_rate": 1e-06, |
|
"loss": 0.521, |
|
"num_tokens": 1349348423.0, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 1.9149676777722526, |
|
"grad_norm": 0.11713531613349915, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5194, |
|
"num_tokens": 1350775976.0, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 1.9169567379413226, |
|
"grad_norm": 0.11523653566837311, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5187, |
|
"num_tokens": 1352185197.0, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 1.9189457981103928, |
|
"grad_norm": 0.1193997859954834, |
|
"learning_rate": 1e-06, |
|
"loss": 0.521, |
|
"num_tokens": 1353553440.0, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 1.920934858279463, |
|
"grad_norm": 0.11642030626535416, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5187, |
|
"num_tokens": 1354944439.0, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 1.9229239184485332, |
|
"grad_norm": 0.12482885271310806, |
|
"learning_rate": 1e-06, |
|
"loss": 0.515, |
|
"num_tokens": 1356336096.0, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 1.9249129786176031, |
|
"grad_norm": 0.12160996347665787, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5118, |
|
"num_tokens": 1357735474.0, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 1.9269020387866733, |
|
"grad_norm": 0.11489357799291611, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5202, |
|
"num_tokens": 1359150923.0, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 1.9288910989557433, |
|
"grad_norm": 0.1207101047039032, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5185, |
|
"num_tokens": 1360546317.0, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 1.9308801591248135, |
|
"grad_norm": 0.12328807264566422, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5359, |
|
"num_tokens": 1361956243.0, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.9328692192938837, |
|
"grad_norm": 0.11625958234071732, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5302, |
|
"num_tokens": 1363369844.0, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 1.9348582794629539, |
|
"grad_norm": 0.11894381046295166, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5152, |
|
"num_tokens": 1364788358.0, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 1.9368473396320238, |
|
"grad_norm": 0.11918807029724121, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5294, |
|
"num_tokens": 1366212166.0, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 1.938836399801094, |
|
"grad_norm": 0.1214541345834732, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5279, |
|
"num_tokens": 1367620280.0, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 1.940825459970164, |
|
"grad_norm": 0.12185929715633392, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5411, |
|
"num_tokens": 1369018460.0, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.9428145201392342, |
|
"grad_norm": 0.12074369192123413, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5169, |
|
"num_tokens": 1370404156.0, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 1.9448035803083044, |
|
"grad_norm": 0.11974059790372849, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5189, |
|
"num_tokens": 1371779401.0, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 1.9467926404773745, |
|
"grad_norm": 0.1180194690823555, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5255, |
|
"num_tokens": 1373167544.0, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 1.9487817006464445, |
|
"grad_norm": 0.11891929805278778, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5181, |
|
"num_tokens": 1374625457.0, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 1.9507707608155147, |
|
"grad_norm": 0.11925170570611954, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5386, |
|
"num_tokens": 1376040593.0, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.9527598209845847, |
|
"grad_norm": 0.11748997122049332, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5292, |
|
"num_tokens": 1377439737.0, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 1.9547488811536549, |
|
"grad_norm": 0.12130767852067947, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5283, |
|
"num_tokens": 1378841802.0, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 1.956737941322725, |
|
"grad_norm": 0.117047019302845, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5307, |
|
"num_tokens": 1380279333.0, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 1.9587270014917952, |
|
"grad_norm": 0.11402452737092972, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5252, |
|
"num_tokens": 1381666440.0, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 1.9607160616608652, |
|
"grad_norm": 0.11929241567850113, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5267, |
|
"num_tokens": 1383062810.0, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 1.9627051218299354, |
|
"grad_norm": 0.13006067276000977, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5245, |
|
"num_tokens": 1384441301.0, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 1.9646941819990054, |
|
"grad_norm": 0.11855417490005493, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5241, |
|
"num_tokens": 1385828740.0, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 1.9666832421680756, |
|
"grad_norm": 0.11694888770580292, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5198, |
|
"num_tokens": 1387217686.0, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 1.9686723023371457, |
|
"grad_norm": 0.12000932544469833, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5167, |
|
"num_tokens": 1388611363.0, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 1.970661362506216, |
|
"grad_norm": 0.11440891027450562, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5149, |
|
"num_tokens": 1390014972.0, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.972650422675286, |
|
"grad_norm": 0.11770491302013397, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5254, |
|
"num_tokens": 1391404923.0, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 1.974639482844356, |
|
"grad_norm": 0.1166594997048378, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5316, |
|
"num_tokens": 1392816272.0, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 1.976628543013426, |
|
"grad_norm": 0.12553872168064117, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5168, |
|
"num_tokens": 1394201525.0, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 1.9786176031824962, |
|
"grad_norm": 0.11989990621805191, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5317, |
|
"num_tokens": 1395588841.0, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 1.9806066633515664, |
|
"grad_norm": 0.1259329915046692, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5367, |
|
"num_tokens": 1397008721.0, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 1.9825957235206366, |
|
"grad_norm": 0.11585794389247894, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5216, |
|
"num_tokens": 1398471728.0, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 1.9845847836897066, |
|
"grad_norm": 0.12714508175849915, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5278, |
|
"num_tokens": 1399889254.0, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 1.9865738438587768, |
|
"grad_norm": 0.12047947198152542, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5258, |
|
"num_tokens": 1401259409.0, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 1.9885629040278467, |
|
"grad_norm": 0.12171699851751328, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5198, |
|
"num_tokens": 1402651874.0, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 1.990551964196917, |
|
"grad_norm": 0.11678752303123474, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5278, |
|
"num_tokens": 1404075471.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.9925410243659871, |
|
"grad_norm": 0.1172628328204155, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5231, |
|
"num_tokens": 1405486757.0, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 1.9945300845350573, |
|
"grad_norm": 0.11984940618276596, |
|
"learning_rate": 1e-06, |
|
"loss": 0.522, |
|
"num_tokens": 1406892660.0, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 1.9965191447041273, |
|
"grad_norm": 0.11809879541397095, |
|
"learning_rate": 1e-06, |
|
"loss": 0.516, |
|
"num_tokens": 1408310027.0, |
|
"step": 1003 |
|
}, |
|
{ |
|
"epoch": 1.9985082048731975, |
|
"grad_norm": 0.11894188821315765, |
|
"learning_rate": 1e-06, |
|
"loss": 0.5125, |
|
"num_tokens": 1409683792.0, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 1.9985082048731975, |
|
"step": 1004, |
|
"total_flos": 5.825582167854206e+19, |
|
"train_loss": 0.5989120793770034, |
|
"train_runtime": 31705.2652, |
|
"train_samples_per_second": 14.201, |
|
"train_steps_per_second": 0.032 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1004, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 51, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.825582167854206e+19, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|