|
{ |
|
"best_metric": 0.15992629528045654, |
|
"best_model_checkpoint": "checkpoints/star_plus-llama-3.1-8b-math50k/math50k/finetune-llama-3.1-8b-math-step-1/checkpoint-3456", |
|
"epoch": 1.0, |
|
"eval_steps": 384, |
|
"global_step": 3837, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0013031013812874641, |
|
"grad_norm": 21.375, |
|
"learning_rate": 1.3020833333333334e-07, |
|
"loss": 0.5553, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0026062027625749283, |
|
"grad_norm": 19.125, |
|
"learning_rate": 2.604166666666667e-07, |
|
"loss": 0.5275, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.003909304143862392, |
|
"grad_norm": 22.5, |
|
"learning_rate": 3.90625e-07, |
|
"loss": 0.5672, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.005212405525149857, |
|
"grad_norm": 20.125, |
|
"learning_rate": 5.208333333333334e-07, |
|
"loss": 0.5298, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.006515506906437321, |
|
"grad_norm": 17.375, |
|
"learning_rate": 6.510416666666668e-07, |
|
"loss": 0.5131, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.007818608287724784, |
|
"grad_norm": 16.75, |
|
"learning_rate": 7.8125e-07, |
|
"loss": 0.4643, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.009121709669012249, |
|
"grad_norm": 14.9375, |
|
"learning_rate": 9.114583333333333e-07, |
|
"loss": 0.4317, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.010424811050299713, |
|
"grad_norm": 15.8125, |
|
"learning_rate": 1.0416666666666667e-06, |
|
"loss": 0.3643, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.011727912431587178, |
|
"grad_norm": 11.375, |
|
"learning_rate": 1.1718750000000001e-06, |
|
"loss": 0.2948, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.013031013812874642, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 1.3020833333333335e-06, |
|
"loss": 0.2206, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.014334115194162106, |
|
"grad_norm": 5.0, |
|
"learning_rate": 1.4322916666666667e-06, |
|
"loss": 0.1895, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.01563721657544957, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 1.5625e-06, |
|
"loss": 0.1681, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.016940317956737033, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.6927083333333335e-06, |
|
"loss": 0.155, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.018243419338024498, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.8229166666666666e-06, |
|
"loss": 0.1643, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.019546520719311962, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.953125e-06, |
|
"loss": 0.157, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.020849622100599426, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 2.0833333333333334e-06, |
|
"loss": 0.1477, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.02215272348188689, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 2.213541666666667e-06, |
|
"loss": 0.1519, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.023455824863174355, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 2.3437500000000002e-06, |
|
"loss": 0.148, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.02475892624446182, |
|
"grad_norm": 2.5, |
|
"learning_rate": 2.4739583333333336e-06, |
|
"loss": 0.1546, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.026062027625749284, |
|
"grad_norm": 2.25, |
|
"learning_rate": 2.604166666666667e-06, |
|
"loss": 0.1564, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.027365129007036748, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 2.7343750000000004e-06, |
|
"loss": 0.1493, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.028668230388324212, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 2.8645833333333334e-06, |
|
"loss": 0.1525, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.029971331769611677, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 2.994791666666667e-06, |
|
"loss": 0.1582, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.03127443315089914, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 3.125e-06, |
|
"loss": 0.1559, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.032577534532186606, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 3.2552083333333335e-06, |
|
"loss": 0.1514, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.033880635913474066, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 3.385416666666667e-06, |
|
"loss": 0.1528, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.035183737294761534, |
|
"grad_norm": 2.5, |
|
"learning_rate": 3.5156250000000003e-06, |
|
"loss": 0.1613, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.036486838676048995, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 3.6458333333333333e-06, |
|
"loss": 0.1503, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.03778994005733646, |
|
"grad_norm": 2.75, |
|
"learning_rate": 3.776041666666667e-06, |
|
"loss": 0.1454, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.039093041438623924, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 3.90625e-06, |
|
"loss": 0.1506, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04039614281991139, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 4.0364583333333335e-06, |
|
"loss": 0.1523, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.04169924420119885, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 4.166666666666667e-06, |
|
"loss": 0.1519, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.04300234558248632, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 4.296875e-06, |
|
"loss": 0.16, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.04430544696377378, |
|
"grad_norm": 3.0, |
|
"learning_rate": 4.427083333333334e-06, |
|
"loss": 0.1495, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.04560854834506125, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 4.557291666666667e-06, |
|
"loss": 0.145, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.04691164972634871, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 4.6875000000000004e-06, |
|
"loss": 0.1398, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.04821475110763617, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 4.817708333333334e-06, |
|
"loss": 0.1562, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.04951785248892364, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 4.947916666666667e-06, |
|
"loss": 0.1523, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.0508209538702111, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 5.078125000000001e-06, |
|
"loss": 0.1494, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.05212405525149857, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 5.208333333333334e-06, |
|
"loss": 0.1432, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05342715663278603, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 5.3385416666666666e-06, |
|
"loss": 0.1472, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.054730258014073496, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 5.468750000000001e-06, |
|
"loss": 0.1471, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.05603335939536096, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 5.598958333333334e-06, |
|
"loss": 0.1511, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.057336460776648425, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 5.729166666666667e-06, |
|
"loss": 0.1449, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.058639562157935886, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 5.859375e-06, |
|
"loss": 0.1502, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.059942663539223354, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 5.989583333333334e-06, |
|
"loss": 0.1438, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.061245764920510815, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 6.119791666666667e-06, |
|
"loss": 0.1584, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.06254886630179828, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 6.25e-06, |
|
"loss": 0.1572, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.06385196768308575, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 6.3802083333333345e-06, |
|
"loss": 0.1516, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.06515506906437321, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 6.510416666666667e-06, |
|
"loss": 0.152, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06645817044566067, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 6.6406250000000005e-06, |
|
"loss": 0.1508, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.06776127182694813, |
|
"grad_norm": 2.75, |
|
"learning_rate": 6.770833333333334e-06, |
|
"loss": 0.1569, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.06906437320823561, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 6.901041666666667e-06, |
|
"loss": 0.1426, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.07036747458952307, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 7.031250000000001e-06, |
|
"loss": 0.1595, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.07167057597081053, |
|
"grad_norm": 2.5, |
|
"learning_rate": 7.161458333333334e-06, |
|
"loss": 0.1603, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.07297367735209799, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 7.291666666666667e-06, |
|
"loss": 0.1533, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.07427677873338545, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 7.421875000000001e-06, |
|
"loss": 0.1505, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.07557988011467293, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 7.552083333333334e-06, |
|
"loss": 0.1499, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.07688298149596039, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 7.682291666666668e-06, |
|
"loss": 0.1544, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.07818608287724785, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 7.8125e-06, |
|
"loss": 0.1535, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07948918425853531, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 7.942708333333334e-06, |
|
"loss": 0.1497, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.08079228563982278, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 8.072916666666667e-06, |
|
"loss": 0.1568, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.08209538702111024, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 8.203125000000001e-06, |
|
"loss": 0.1525, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.0833984884023977, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.15, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.08470158978368517, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 8.463541666666666e-06, |
|
"loss": 0.1583, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.08600469116497264, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 8.59375e-06, |
|
"loss": 0.1555, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.0873077925462601, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 8.723958333333335e-06, |
|
"loss": 0.1534, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.08861089392754756, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 8.854166666666667e-06, |
|
"loss": 0.1565, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.08991399530883502, |
|
"grad_norm": 2.5, |
|
"learning_rate": 8.984375000000002e-06, |
|
"loss": 0.1544, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.0912170966901225, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 9.114583333333334e-06, |
|
"loss": 0.1543, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.09252019807140996, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 9.244791666666667e-06, |
|
"loss": 0.1536, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.09382329945269742, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 9.375000000000001e-06, |
|
"loss": 0.1606, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.09512640083398488, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 9.505208333333335e-06, |
|
"loss": 0.1533, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.09642950221527234, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 9.635416666666668e-06, |
|
"loss": 0.1616, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.09773260359655982, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 9.765625e-06, |
|
"loss": 0.1703, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.09903570497784728, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 9.895833333333334e-06, |
|
"loss": 0.1659, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.10007818608287725, |
|
"eval_loss": 0.18054868280887604, |
|
"eval_runtime": 49.2875, |
|
"eval_samples_per_second": 298.25, |
|
"eval_steps_per_second": 9.333, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.10033880635913474, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 9.997103967564437e-06, |
|
"loss": 0.1524, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.1016419077404222, |
|
"grad_norm": 3.25, |
|
"learning_rate": 9.98262380538662e-06, |
|
"loss": 0.1655, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.10294500912170967, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 9.968143643208805e-06, |
|
"loss": 0.1657, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.10424811050299713, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 9.953663481030988e-06, |
|
"loss": 0.1566, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1055512118842846, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 9.939183318853171e-06, |
|
"loss": 0.162, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.10685431326557206, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 9.924703156675356e-06, |
|
"loss": 0.156, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.10815741464685953, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 9.910222994497538e-06, |
|
"loss": 0.164, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.10946051602814699, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 9.895742832319723e-06, |
|
"loss": 0.1557, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.11076361740943445, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 9.881262670141906e-06, |
|
"loss": 0.1643, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.11206671879072191, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 9.86678250796409e-06, |
|
"loss": 0.1624, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.11336982017200939, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 9.852302345786274e-06, |
|
"loss": 0.1536, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.11467292155329685, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 9.837822183608458e-06, |
|
"loss": 0.1638, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.11597602293458431, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 9.823342021430641e-06, |
|
"loss": 0.1587, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.11727912431587177, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 9.808861859252824e-06, |
|
"loss": 0.1611, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.11858222569715923, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 9.794381697075009e-06, |
|
"loss": 0.1539, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.11988532707844671, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 9.779901534897192e-06, |
|
"loss": 0.1607, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.12118842845973417, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 9.765421372719375e-06, |
|
"loss": 0.1598, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.12249152984102163, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 9.75094121054156e-06, |
|
"loss": 0.1531, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.12379463122230909, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 9.736461048363742e-06, |
|
"loss": 0.16, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.12509773260359655, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 9.721980886185927e-06, |
|
"loss": 0.1572, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.12640083398488403, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 9.70750072400811e-06, |
|
"loss": 0.1581, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.1277039353661715, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 9.693020561830293e-06, |
|
"loss": 0.1599, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.12900703674745895, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 9.678540399652477e-06, |
|
"loss": 0.1632, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.13031013812874642, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 9.66406023747466e-06, |
|
"loss": 0.1602, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.13161323951003387, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 9.649580075296843e-06, |
|
"loss": 0.1551, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.13291634089132134, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 9.635099913119028e-06, |
|
"loss": 0.1571, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.13421944227260882, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 9.620619750941211e-06, |
|
"loss": 0.1544, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.13552254365389627, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 9.606139588763394e-06, |
|
"loss": 0.1594, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.13682564503518374, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 9.591659426585578e-06, |
|
"loss": 0.1597, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.13812874641647122, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 9.577179264407761e-06, |
|
"loss": 0.1568, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.13943184779775866, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 9.562699102229946e-06, |
|
"loss": 0.1501, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.14073494917904614, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 9.548218940052129e-06, |
|
"loss": 0.154, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.14203805056033358, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 9.533738777874314e-06, |
|
"loss": 0.1579, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.14334115194162106, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 9.519258615696497e-06, |
|
"loss": 0.1582, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.14464425332290853, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 9.504778453518681e-06, |
|
"loss": 0.1622, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.14594735470419598, |
|
"grad_norm": 2.625, |
|
"learning_rate": 9.490298291340864e-06, |
|
"loss": 0.1558, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.14725045608548346, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 9.475818129163047e-06, |
|
"loss": 0.1533, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.1485535574667709, |
|
"grad_norm": 2.5, |
|
"learning_rate": 9.461337966985232e-06, |
|
"loss": 0.1531, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.14985665884805838, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 9.446857804807415e-06, |
|
"loss": 0.1585, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.15115976022934585, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 9.4323776426296e-06, |
|
"loss": 0.1522, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.1524628616106333, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 9.417897480451782e-06, |
|
"loss": 0.1572, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.15376596299192077, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 9.403417318273965e-06, |
|
"loss": 0.1553, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.15506906437320825, |
|
"grad_norm": 3.125, |
|
"learning_rate": 9.38893715609615e-06, |
|
"loss": 0.1664, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.1563721657544957, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 9.374456993918333e-06, |
|
"loss": 0.1469, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.15767526713578317, |
|
"grad_norm": 2.5, |
|
"learning_rate": 9.359976831740516e-06, |
|
"loss": 0.16, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.15897836851707062, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 9.3454966695627e-06, |
|
"loss": 0.1546, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.1602814698983581, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 9.331016507384883e-06, |
|
"loss": 0.1491, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.16158457127964557, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 9.316536345207066e-06, |
|
"loss": 0.1484, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.16288767266093301, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 9.30205618302925e-06, |
|
"loss": 0.1665, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.1641907740422205, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 9.287576020851434e-06, |
|
"loss": 0.1523, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.16549387542350794, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 9.273095858673617e-06, |
|
"loss": 0.1597, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.1667969768047954, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 9.258615696495801e-06, |
|
"loss": 0.1563, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.16810007818608289, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 9.244135534317984e-06, |
|
"loss": 0.1578, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.16940317956737033, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 9.229655372140169e-06, |
|
"loss": 0.1505, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.1707062809486578, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 9.215175209962352e-06, |
|
"loss": 0.1563, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.17200938232994528, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 9.200695047784537e-06, |
|
"loss": 0.1584, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.17331248371123273, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 9.18621488560672e-06, |
|
"loss": 0.1513, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.1746155850925202, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 9.171734723428904e-06, |
|
"loss": 0.1553, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.17591868647380765, |
|
"grad_norm": 2.375, |
|
"learning_rate": 9.157254561251087e-06, |
|
"loss": 0.1547, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.17722178785509513, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 9.142774399073272e-06, |
|
"loss": 0.1553, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.1785248892363826, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 9.128294236895455e-06, |
|
"loss": 0.1527, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.17982799061767005, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 9.113814074717638e-06, |
|
"loss": 0.1545, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.18113109199895752, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 9.099333912539822e-06, |
|
"loss": 0.1503, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.182434193380245, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 9.084853750362005e-06, |
|
"loss": 0.1546, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.18373729476153244, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 9.070373588184188e-06, |
|
"loss": 0.1533, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.18504039614281992, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 9.055893426006373e-06, |
|
"loss": 0.1499, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.18634349752410737, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 9.041413263828556e-06, |
|
"loss": 0.1561, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.18764659890539484, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 9.026933101650739e-06, |
|
"loss": 0.161, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.18894970028668231, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 9.012452939472923e-06, |
|
"loss": 0.1602, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.19025280166796976, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 8.997972777295106e-06, |
|
"loss": 0.1579, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.19155590304925724, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 8.983492615117289e-06, |
|
"loss": 0.1537, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.19285900443054468, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 8.969012452939474e-06, |
|
"loss": 0.1508, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.19416210581183216, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 8.954532290761657e-06, |
|
"loss": 0.1525, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.19546520719311963, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 8.94005212858384e-06, |
|
"loss": 0.1562, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.19676830857440708, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 8.925571966406024e-06, |
|
"loss": 0.1566, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.19807140995569456, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 8.911091804228207e-06, |
|
"loss": 0.1541, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.19937451133698203, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 8.896611642050392e-06, |
|
"loss": 0.1484, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.2001563721657545, |
|
"eval_loss": 0.17529802024364471, |
|
"eval_runtime": 49.2622, |
|
"eval_samples_per_second": 298.403, |
|
"eval_steps_per_second": 9.338, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.20067761271826948, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 8.882131479872575e-06, |
|
"loss": 0.1509, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.20198071409955695, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 8.86765131769476e-06, |
|
"loss": 0.1499, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.2032838154808444, |
|
"grad_norm": 2.25, |
|
"learning_rate": 8.853171155516942e-06, |
|
"loss": 0.1563, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.20458691686213187, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 8.838690993339127e-06, |
|
"loss": 0.1493, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.20589001824341935, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 8.82421083116131e-06, |
|
"loss": 0.1432, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.2071931196247068, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 8.809730668983495e-06, |
|
"loss": 0.1428, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.20849622100599427, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 8.795250506805678e-06, |
|
"loss": 0.1597, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.20979932238728172, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 8.78077034462786e-06, |
|
"loss": 0.1599, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.2111024237685692, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 8.766290182450045e-06, |
|
"loss": 0.1554, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.21240552514985667, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 8.751810020272228e-06, |
|
"loss": 0.1588, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.2137086265311441, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 8.737329858094411e-06, |
|
"loss": 0.1498, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.2150117279124316, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 8.722849695916596e-06, |
|
"loss": 0.1571, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.21631482929371906, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 8.708369533738779e-06, |
|
"loss": 0.149, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.2176179306750065, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 8.693889371560962e-06, |
|
"loss": 0.1558, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.21892103205629398, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 8.679409209383146e-06, |
|
"loss": 0.1543, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.22022413343758143, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 8.664929047205329e-06, |
|
"loss": 0.146, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.2215272348188689, |
|
"grad_norm": 2.5, |
|
"learning_rate": 8.650448885027512e-06, |
|
"loss": 0.1457, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.22283033620015638, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 8.635968722849697e-06, |
|
"loss": 0.156, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.22413343758144383, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 8.62148856067188e-06, |
|
"loss": 0.1496, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.2254365389627313, |
|
"grad_norm": 2.25, |
|
"learning_rate": 8.607008398494063e-06, |
|
"loss": 0.1495, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.22673964034401878, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 8.592528236316247e-06, |
|
"loss": 0.1501, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.22804274172530622, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 8.57804807413843e-06, |
|
"loss": 0.154, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.2293458431065937, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 8.563567911960615e-06, |
|
"loss": 0.1444, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.23064894448788115, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 8.549087749782798e-06, |
|
"loss": 0.1505, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.23195204586916862, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 8.534607587604982e-06, |
|
"loss": 0.1593, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.2332551472504561, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 8.520127425427165e-06, |
|
"loss": 0.1477, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.23455824863174354, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 8.50564726324935e-06, |
|
"loss": 0.1487, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.23586135001303102, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 8.491167101071533e-06, |
|
"loss": 0.1434, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.23716445139431847, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 8.476686938893718e-06, |
|
"loss": 0.1496, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.23846755277560594, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 8.4622067767159e-06, |
|
"loss": 0.1515, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.23977065415689341, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 8.447726614538083e-06, |
|
"loss": 0.155, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.24107375553818086, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 8.433246452360268e-06, |
|
"loss": 0.146, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.24237685691946834, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 8.418766290182451e-06, |
|
"loss": 0.1495, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.2436799583007558, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 8.404286128004634e-06, |
|
"loss": 0.1536, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.24498305968204326, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 8.389805965826819e-06, |
|
"loss": 0.1529, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.24628616106333073, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 8.375325803649002e-06, |
|
"loss": 0.157, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.24758926244461818, |
|
"grad_norm": 2.25, |
|
"learning_rate": 8.360845641471184e-06, |
|
"loss": 0.1486, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.24889236382590565, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 8.346365479293369e-06, |
|
"loss": 0.1495, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.2501954652071931, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 8.331885317115552e-06, |
|
"loss": 0.1415, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.2514985665884806, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 8.317405154937735e-06, |
|
"loss": 0.1478, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.25280166796976805, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 8.30292499275992e-06, |
|
"loss": 0.1531, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.2541047693510555, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 8.288444830582103e-06, |
|
"loss": 0.1456, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.255407870732343, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 8.273964668404287e-06, |
|
"loss": 0.149, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.2567109721136304, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 8.25948450622647e-06, |
|
"loss": 0.1565, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.2580140734949179, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 8.245004344048653e-06, |
|
"loss": 0.1463, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.25931717487620537, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 8.230524181870838e-06, |
|
"loss": 0.1466, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.26062027625749284, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 8.21604401969302e-06, |
|
"loss": 0.1492, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2619233776387803, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 8.201563857515205e-06, |
|
"loss": 0.1538, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.26322647902006774, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 8.187083695337388e-06, |
|
"loss": 0.1436, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.2645295804013552, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 8.172603533159573e-06, |
|
"loss": 0.1543, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.2658326817826427, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 8.158123370981756e-06, |
|
"loss": 0.1461, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.26713578316393016, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 8.14364320880394e-06, |
|
"loss": 0.1554, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.26843888454521764, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 8.129163046626123e-06, |
|
"loss": 0.1517, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.26974198592650506, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 8.114682884448306e-06, |
|
"loss": 0.1467, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.27104508730779253, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 8.100202722270491e-06, |
|
"loss": 0.1444, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.27234818868908, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 8.085722560092674e-06, |
|
"loss": 0.1499, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.2736512900703675, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 8.071242397914857e-06, |
|
"loss": 0.1494, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.27495439145165496, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 8.056762235737041e-06, |
|
"loss": 0.1397, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.27625749283294243, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 8.042282073559224e-06, |
|
"loss": 0.146, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.27756059421422985, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 8.027801911381407e-06, |
|
"loss": 0.1477, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.2788636955955173, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 8.013321749203592e-06, |
|
"loss": 0.1517, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.2801667969768048, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 7.998841587025775e-06, |
|
"loss": 0.1522, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.2814698983580923, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 7.984361424847958e-06, |
|
"loss": 0.1571, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.28277299973937975, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 7.969881262670143e-06, |
|
"loss": 0.1424, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.28407610112066717, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 7.955401100492325e-06, |
|
"loss": 0.1458, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.28537920250195464, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 7.94092093831451e-06, |
|
"loss": 0.1484, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.2866823038832421, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 7.926440776136693e-06, |
|
"loss": 0.1493, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.2879854052645296, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 7.911960613958876e-06, |
|
"loss": 0.1471, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.28928850664581707, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 7.89748045178106e-06, |
|
"loss": 0.1472, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.2905916080271045, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 7.883000289603244e-06, |
|
"loss": 0.1443, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.29189470940839196, |
|
"grad_norm": 2.0, |
|
"learning_rate": 7.868520127425428e-06, |
|
"loss": 0.1387, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.29319781078967944, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 7.854039965247611e-06, |
|
"loss": 0.1372, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.2945009121709669, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 7.839559803069796e-06, |
|
"loss": 0.1472, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.2958040135522544, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 7.825079640891979e-06, |
|
"loss": 0.1519, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.2971071149335418, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 7.810599478714163e-06, |
|
"loss": 0.1448, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.2984102163148293, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 7.796119316536346e-06, |
|
"loss": 0.1482, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.29971331769611675, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 7.78163915435853e-06, |
|
"loss": 0.1409, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.30023455824863177, |
|
"eval_loss": 0.16887266933918, |
|
"eval_runtime": 49.2745, |
|
"eval_samples_per_second": 298.329, |
|
"eval_steps_per_second": 9.335, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 0.30101641907740423, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 7.767158992180714e-06, |
|
"loss": 0.1418, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.3023195204586917, |
|
"grad_norm": 2.375, |
|
"learning_rate": 7.752678830002897e-06, |
|
"loss": 0.1511, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.3036226218399791, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 7.73819866782508e-06, |
|
"loss": 0.1356, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.3049257232212666, |
|
"grad_norm": 2.375, |
|
"learning_rate": 7.723718505647264e-06, |
|
"loss": 0.1461, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.3062288246025541, |
|
"grad_norm": 2.25, |
|
"learning_rate": 7.709238343469447e-06, |
|
"loss": 0.1415, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.30753192598384155, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 7.69475818129163e-06, |
|
"loss": 0.1401, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.308835027365129, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 7.680278019113815e-06, |
|
"loss": 0.1477, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.3101381287464165, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 7.665797856935998e-06, |
|
"loss": 0.1526, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.3114412301277039, |
|
"grad_norm": 2.375, |
|
"learning_rate": 7.651317694758183e-06, |
|
"loss": 0.143, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.3127443315089914, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 7.636837532580365e-06, |
|
"loss": 0.1538, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.31404743289027887, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 7.622357370402549e-06, |
|
"loss": 0.1497, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.31535053427156634, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 7.607877208224733e-06, |
|
"loss": 0.1461, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.3166536356528538, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 7.593397046046917e-06, |
|
"loss": 0.147, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.31795673703414123, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 7.5789168838691e-06, |
|
"loss": 0.1453, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.3192598384154287, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 7.5644367216912836e-06, |
|
"loss": 0.146, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.3205629397967162, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 7.549956559513467e-06, |
|
"loss": 0.1533, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.32186604117800366, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 7.53547639733565e-06, |
|
"loss": 0.1475, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.32316914255929113, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 7.520996235157835e-06, |
|
"loss": 0.1522, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.32447224394057855, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 7.506516072980018e-06, |
|
"loss": 0.1496, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.32577534532186603, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 7.492035910802201e-06, |
|
"loss": 0.1426, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.3270784467031535, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 7.4775557486243854e-06, |
|
"loss": 0.1515, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.328381548084441, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 7.463075586446568e-06, |
|
"loss": 0.1374, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.32968464946572845, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 7.448595424268752e-06, |
|
"loss": 0.1424, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.33098775084701587, |
|
"grad_norm": 2.375, |
|
"learning_rate": 7.434115262090936e-06, |
|
"loss": 0.1434, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.33229085222830335, |
|
"grad_norm": 2.25, |
|
"learning_rate": 7.41963509991312e-06, |
|
"loss": 0.1441, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.3335939536095908, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 7.405154937735303e-06, |
|
"loss": 0.1419, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.3348970549908783, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 7.390674775557487e-06, |
|
"loss": 0.1471, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.33620015637216577, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 7.37619461337967e-06, |
|
"loss": 0.1478, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.33750325775345325, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 7.361714451201855e-06, |
|
"loss": 0.1432, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.33880635913474066, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 7.347234289024038e-06, |
|
"loss": 0.1434, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.34010946051602814, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 7.332754126846221e-06, |
|
"loss": 0.1441, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.3414125618973156, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 7.3182739646684054e-06, |
|
"loss": 0.1445, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.3427156632786031, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 7.303793802490588e-06, |
|
"loss": 0.1419, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.34401876465989056, |
|
"grad_norm": 2.125, |
|
"learning_rate": 7.289313640312772e-06, |
|
"loss": 0.1426, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.345321866041178, |
|
"grad_norm": 2.5, |
|
"learning_rate": 7.274833478134956e-06, |
|
"loss": 0.1477, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.34662496742246546, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 7.26035331595714e-06, |
|
"loss": 0.1391, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.34792806880375293, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 7.245873153779323e-06, |
|
"loss": 0.1377, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.3492311701850404, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 7.2313929916015065e-06, |
|
"loss": 0.1455, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.3505342715663279, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 7.21691282942369e-06, |
|
"loss": 0.1449, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.3518373729476153, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 7.202432667245873e-06, |
|
"loss": 0.15, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.3531404743289028, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 7.187952505068058e-06, |
|
"loss": 0.1456, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.35444357571019025, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 7.173472342890241e-06, |
|
"loss": 0.146, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.3557466770914777, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 7.158992180712424e-06, |
|
"loss": 0.1426, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.3570497784727652, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 7.144512018534608e-06, |
|
"loss": 0.139, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.3583528798540526, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 7.130031856356791e-06, |
|
"loss": 0.1387, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.3596559812353401, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 7.115551694178975e-06, |
|
"loss": 0.1463, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.36095908261662757, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 7.101071532001159e-06, |
|
"loss": 0.1407, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.36226218399791504, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 7.086591369823343e-06, |
|
"loss": 0.14, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.3635652853792025, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 7.0721112076455265e-06, |
|
"loss": 0.1442, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.36486838676049, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 7.05763104546771e-06, |
|
"loss": 0.1497, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.3661714881417774, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 7.043150883289893e-06, |
|
"loss": 0.1404, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.3674745895230649, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 7.028670721112078e-06, |
|
"loss": 0.1327, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.36877769090435236, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 7.014190558934261e-06, |
|
"loss": 0.1447, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.37008079228563984, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 6.999710396756444e-06, |
|
"loss": 0.1513, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.3713838936669273, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 6.985230234578628e-06, |
|
"loss": 0.1429, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.37268699504821473, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 6.970750072400811e-06, |
|
"loss": 0.1424, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.3739900964295022, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 6.956269910222995e-06, |
|
"loss": 0.1382, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.3752931978107897, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 6.941789748045179e-06, |
|
"loss": 0.1472, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.37659629919207716, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 6.927309585867362e-06, |
|
"loss": 0.1379, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.37789940057336463, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 6.912829423689546e-06, |
|
"loss": 0.1397, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.37920250195465205, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 6.898349261511729e-06, |
|
"loss": 0.1443, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.3805056033359395, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 6.883869099333913e-06, |
|
"loss": 0.14, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.381808704717227, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 6.869388937156096e-06, |
|
"loss": 0.1392, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.3831118060985145, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 6.854908774978281e-06, |
|
"loss": 0.1447, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.38441490747980195, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 6.840428612800464e-06, |
|
"loss": 0.1419, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.38571800886108937, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 6.825948450622647e-06, |
|
"loss": 0.1408, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.38702111024237684, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 6.811468288444831e-06, |
|
"loss": 0.1482, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.3883242116236643, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 6.796988126267014e-06, |
|
"loss": 0.139, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.3896273130049518, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 6.782507964089199e-06, |
|
"loss": 0.1472, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.39093041438623927, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 6.768027801911382e-06, |
|
"loss": 0.1412, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3922335157675267, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 6.753547639733566e-06, |
|
"loss": 0.14, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.39353661714881416, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 6.739067477555749e-06, |
|
"loss": 0.145, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.39483971853010164, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 6.724587315377933e-06, |
|
"loss": 0.1397, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.3961428199113891, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 6.710107153200116e-06, |
|
"loss": 0.15, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.3974459212926766, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 6.695626991022301e-06, |
|
"loss": 0.1401, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.39874902267396406, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 6.681146828844484e-06, |
|
"loss": 0.1353, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.4000521240552515, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.1436, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.400312744331509, |
|
"eval_loss": 0.16504338383674622, |
|
"eval_runtime": 49.3084, |
|
"eval_samples_per_second": 298.123, |
|
"eval_steps_per_second": 9.329, |
|
"step": 1536 |
|
}, |
|
{ |
|
"epoch": 0.40135522543653895, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 6.652186504488851e-06, |
|
"loss": 0.1405, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.40265832681782643, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 6.637706342311034e-06, |
|
"loss": 0.1397, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.4039614281991139, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 6.623226180133218e-06, |
|
"loss": 0.1429, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.4052645295804014, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 6.608746017955402e-06, |
|
"loss": 0.1476, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.4065676309616888, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 6.594265855777585e-06, |
|
"loss": 0.1463, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.40787073234297627, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 6.5797856935997685e-06, |
|
"loss": 0.1373, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.40917383372426375, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 6.565305531421952e-06, |
|
"loss": 0.1356, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.4104769351055512, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 6.550825369244136e-06, |
|
"loss": 0.1422, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.4117800364868387, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 6.536345207066319e-06, |
|
"loss": 0.1392, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.4130831378681261, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 6.521865044888504e-06, |
|
"loss": 0.1446, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.4143862392494136, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 6.507384882710687e-06, |
|
"loss": 0.1353, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.41568934063070107, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 6.492904720532871e-06, |
|
"loss": 0.1417, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.41699244201198854, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 6.478424558355054e-06, |
|
"loss": 0.1359, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.418295543393276, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 6.463944396177237e-06, |
|
"loss": 0.141, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.41959864477456343, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 6.449464233999422e-06, |
|
"loss": 0.144, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.4209017461558509, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 6.434984071821605e-06, |
|
"loss": 0.1424, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.4222048475371384, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 6.4205039096437885e-06, |
|
"loss": 0.1374, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.42350794891842586, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 6.406023747465972e-06, |
|
"loss": 0.1332, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.42481105029971333, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 6.391543585288156e-06, |
|
"loss": 0.1363, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.4261141516810008, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 6.377063423110339e-06, |
|
"loss": 0.1446, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.4274172530622882, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 6.362583260932524e-06, |
|
"loss": 0.1428, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.4287203544435757, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 6.348103098754707e-06, |
|
"loss": 0.1331, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.4300234558248632, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 6.3336229365768896e-06, |
|
"loss": 0.1429, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.43132655720615065, |
|
"grad_norm": 2.125, |
|
"learning_rate": 6.319142774399074e-06, |
|
"loss": 0.1396, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.4326296585874381, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 6.304662612221257e-06, |
|
"loss": 0.1464, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.43393275996872555, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 6.290182450043441e-06, |
|
"loss": 0.1417, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.435235861350013, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 6.275702287865625e-06, |
|
"loss": 0.141, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.4365389627313005, |
|
"grad_norm": 2.875, |
|
"learning_rate": 6.261222125687808e-06, |
|
"loss": 0.1432, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.43784206411258797, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 6.2467419635099915e-06, |
|
"loss": 0.135, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.43914516549387544, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 6.232261801332175e-06, |
|
"loss": 0.1428, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.44044826687516286, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 6.217781639154359e-06, |
|
"loss": 0.1415, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.44175136825645034, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 6.203301476976543e-06, |
|
"loss": 0.13, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.4430544696377378, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 6.188821314798727e-06, |
|
"loss": 0.1388, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.4443575710190253, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 6.1743411526209096e-06, |
|
"loss": 0.1409, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.44566067240031276, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 6.159860990443094e-06, |
|
"loss": 0.1308, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.4469637737816002, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 6.145380828265277e-06, |
|
"loss": 0.1417, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.44826687516288766, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 6.13090066608746e-06, |
|
"loss": 0.1456, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.44956997654417513, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 6.116420503909645e-06, |
|
"loss": 0.1413, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.4508730779254626, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 6.101940341731828e-06, |
|
"loss": 0.1384, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.4521761793067501, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 6.0874601795540115e-06, |
|
"loss": 0.1406, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.45347928068803756, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 6.072980017376195e-06, |
|
"loss": 0.141, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.454782382069325, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 6.058499855198379e-06, |
|
"loss": 0.1478, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.45608548345061245, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 6.044019693020562e-06, |
|
"loss": 0.1412, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.4573885848318999, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 6.029539530842747e-06, |
|
"loss": 0.1407, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.4586916862131874, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 6.0150593686649296e-06, |
|
"loss": 0.1398, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.4599947875944749, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 6.0005792064871125e-06, |
|
"loss": 0.1397, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.4612978889757623, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 5.986099044309297e-06, |
|
"loss": 0.1458, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.46260099035704977, |
|
"grad_norm": 2.25, |
|
"learning_rate": 5.97161888213148e-06, |
|
"loss": 0.1421, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.46390409173833724, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 5.957138719953663e-06, |
|
"loss": 0.1394, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.4652071931196247, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 5.942658557775848e-06, |
|
"loss": 0.1426, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.4665102945009122, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 5.928178395598031e-06, |
|
"loss": 0.1427, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.4678133958821996, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 5.913698233420215e-06, |
|
"loss": 0.14, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.4691164972634871, |
|
"grad_norm": 2.0, |
|
"learning_rate": 5.899218071242398e-06, |
|
"loss": 0.1403, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.47041959864477456, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 5.884737909064582e-06, |
|
"loss": 0.1472, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.47172270002606204, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 5.870257746886766e-06, |
|
"loss": 0.1431, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.4730258014073495, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 5.8557775847089495e-06, |
|
"loss": 0.1376, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.47432890278863693, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 5.8412974225311325e-06, |
|
"loss": 0.1437, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.4756320041699244, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 5.826817260353317e-06, |
|
"loss": 0.1352, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.4769351055512119, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 5.8123370981755e-06, |
|
"loss": 0.1385, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.47823820693249935, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 5.797856935997683e-06, |
|
"loss": 0.1408, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.47954130831378683, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 5.783376773819868e-06, |
|
"loss": 0.1452, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.4808444096950743, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 5.768896611642051e-06, |
|
"loss": 0.1372, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.4821475110763617, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 5.754416449464234e-06, |
|
"loss": 0.1416, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.4834506124576492, |
|
"grad_norm": 2.125, |
|
"learning_rate": 5.739936287286418e-06, |
|
"loss": 0.1502, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.4847537138389367, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 5.725456125108602e-06, |
|
"loss": 0.1389, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.48605681522022415, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 5.710975962930785e-06, |
|
"loss": 0.1355, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.4873599166015116, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 5.6964958007529695e-06, |
|
"loss": 0.1404, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.48866301798279904, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 5.6820156385751525e-06, |
|
"loss": 0.1399, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.4899661193640865, |
|
"grad_norm": 2.0, |
|
"learning_rate": 5.667535476397335e-06, |
|
"loss": 0.1404, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.491269220745374, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 5.65305531421952e-06, |
|
"loss": 0.1399, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.49257232212666147, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 5.638575152041703e-06, |
|
"loss": 0.1385, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.49387542350794894, |
|
"grad_norm": 2.125, |
|
"learning_rate": 5.624094989863888e-06, |
|
"loss": 0.1362, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.49517852488923636, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 5.609614827686071e-06, |
|
"loss": 0.1394, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.49648162627052383, |
|
"grad_norm": 2.125, |
|
"learning_rate": 5.5951346655082535e-06, |
|
"loss": 0.1378, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.4977847276518113, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 5.580654503330438e-06, |
|
"loss": 0.1399, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.4990878290330988, |
|
"grad_norm": 2.125, |
|
"learning_rate": 5.566174341152621e-06, |
|
"loss": 0.1367, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.5003909304143862, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 5.551694178974805e-06, |
|
"loss": 0.1381, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.5003909304143862, |
|
"eval_loss": 0.16250096261501312, |
|
"eval_runtime": 49.2724, |
|
"eval_samples_per_second": 298.342, |
|
"eval_steps_per_second": 9.336, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.5016940317956737, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 5.537214016796989e-06, |
|
"loss": 0.1415, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.5029971331769612, |
|
"grad_norm": 2.375, |
|
"learning_rate": 5.5227338546191725e-06, |
|
"loss": 0.1413, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.5043002345582487, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 5.508253692441355e-06, |
|
"loss": 0.1418, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.5056033359395361, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 5.49377353026354e-06, |
|
"loss": 0.1403, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.5069064373208235, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 5.479293368085723e-06, |
|
"loss": 0.1344, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.508209538702111, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 5.464813205907906e-06, |
|
"loss": 0.1362, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.5095126400833985, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 5.4503330437300906e-06, |
|
"loss": 0.1374, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.510815741464686, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 5.4358528815522735e-06, |
|
"loss": 0.1389, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.5121188428459734, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 5.421372719374457e-06, |
|
"loss": 0.1414, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.5134219442272608, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 5.406892557196641e-06, |
|
"loss": 0.1366, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.5147250456085484, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 5.392412395018825e-06, |
|
"loss": 0.1424, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.5160281469898358, |
|
"grad_norm": 2.125, |
|
"learning_rate": 5.377932232841008e-06, |
|
"loss": 0.1387, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.5173312483711233, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 5.3634520706631925e-06, |
|
"loss": 0.1393, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.5186343497524107, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 5.348971908485375e-06, |
|
"loss": 0.1339, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.5199374511336982, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 5.33449174630756e-06, |
|
"loss": 0.1366, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.5212405525149857, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 5.320011584129743e-06, |
|
"loss": 0.1333, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5225436538962731, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 5.305531421951926e-06, |
|
"loss": 0.1352, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.5238467552775606, |
|
"grad_norm": 2.0, |
|
"learning_rate": 5.2910512597741106e-06, |
|
"loss": 0.1384, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.5251498566588481, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 5.2765710975962935e-06, |
|
"loss": 0.137, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.5264529580401355, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 5.2620909354184764e-06, |
|
"loss": 0.132, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.527756059421423, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 5.247610773240661e-06, |
|
"loss": 0.1414, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.5290591608027104, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 5.233130611062844e-06, |
|
"loss": 0.1389, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.530362262183998, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 5.218650448885028e-06, |
|
"loss": 0.1405, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.5316653635652854, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 5.204170286707212e-06, |
|
"loss": 0.138, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.5329684649465728, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 5.189690124529395e-06, |
|
"loss": 0.1382, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.5342715663278603, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 5.175209962351578e-06, |
|
"loss": 0.1332, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.5355746677091477, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 5.160729800173763e-06, |
|
"loss": 0.1303, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.5368777690904353, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 5.146249637995946e-06, |
|
"loss": 0.1356, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.5381808704717227, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 5.131769475818129e-06, |
|
"loss": 0.1397, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.5394839718530101, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 5.1172893136403135e-06, |
|
"loss": 0.1384, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.5407870732342976, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 5.1028091514624964e-06, |
|
"loss": 0.1385, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.5420901746155851, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 5.08832898928468e-06, |
|
"loss": 0.1349, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.5433932759968726, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 5.073848827106864e-06, |
|
"loss": 0.1324, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.54469637737816, |
|
"grad_norm": 2.0, |
|
"learning_rate": 5.059368664929048e-06, |
|
"loss": 0.1352, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.5459994787594474, |
|
"grad_norm": 2.125, |
|
"learning_rate": 5.044888502751232e-06, |
|
"loss": 0.1326, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.547302580140735, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 5.030408340573415e-06, |
|
"loss": 0.1363, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.5486056815220224, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 5.015928178395598e-06, |
|
"loss": 0.1309, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.5499087829033099, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 5.001448016217782e-06, |
|
"loss": 0.1398, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.5512118842845973, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 4.986967854039966e-06, |
|
"loss": 0.1307, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.5525149856658849, |
|
"grad_norm": 2.0, |
|
"learning_rate": 4.97248769186215e-06, |
|
"loss": 0.1334, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.5538180870471723, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 4.958007529684333e-06, |
|
"loss": 0.1336, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.5551211884284597, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 4.9435273675065164e-06, |
|
"loss": 0.1363, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.5564242898097472, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 4.9290472053287e-06, |
|
"loss": 0.144, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.5577273911910346, |
|
"grad_norm": 2.0, |
|
"learning_rate": 4.914567043150883e-06, |
|
"loss": 0.1312, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.5590304925723222, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 4.900086880973067e-06, |
|
"loss": 0.1339, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.5603335939536096, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 4.885606718795251e-06, |
|
"loss": 0.1363, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.561636695334897, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 4.8711265566174345e-06, |
|
"loss": 0.141, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.5629397967161845, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 4.856646394439618e-06, |
|
"loss": 0.1312, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.564242898097472, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.842166232261802e-06, |
|
"loss": 0.1418, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.5655459994787595, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 4.827686070083986e-06, |
|
"loss": 0.1391, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.5668491008600469, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 4.813205907906169e-06, |
|
"loss": 0.1379, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.5681522022413343, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 4.798725745728353e-06, |
|
"loss": 0.1337, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.5694553036226219, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 4.784245583550536e-06, |
|
"loss": 0.1339, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.5707584050039093, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 4.769765421372719e-06, |
|
"loss": 0.1413, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.5720615063851968, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.755285259194903e-06, |
|
"loss": 0.1394, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.5733646077664842, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 4.740805097017087e-06, |
|
"loss": 0.1383, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.5746677091477717, |
|
"grad_norm": 2.125, |
|
"learning_rate": 4.726324934839271e-06, |
|
"loss": 0.1409, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.5759708105290592, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.7118447726614545e-06, |
|
"loss": 0.1359, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.5772739119103466, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 4.697364610483638e-06, |
|
"loss": 0.1325, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.5785770132916341, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 4.682884448305822e-06, |
|
"loss": 0.1383, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.5798801146729216, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 4.668404286128005e-06, |
|
"loss": 0.138, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.581183216054209, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 4.653924123950189e-06, |
|
"loss": 0.1275, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.5824863174354965, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 4.639443961772373e-06, |
|
"loss": 0.1408, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.5837894188167839, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 4.6249637995945556e-06, |
|
"loss": 0.1286, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.5850925201980715, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.610483637416739e-06, |
|
"loss": 0.1424, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.5863956215793589, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 4.596003475238923e-06, |
|
"loss": 0.1355, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.5876987229606463, |
|
"grad_norm": 2.25, |
|
"learning_rate": 4.581523313061106e-06, |
|
"loss": 0.1409, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.5890018243419338, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 4.56704315088329e-06, |
|
"loss": 0.1419, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.5903049257232212, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 4.552562988705474e-06, |
|
"loss": 0.1317, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.5916080271045088, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 4.5380828265276575e-06, |
|
"loss": 0.1379, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.5929111284857962, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 4.523602664349841e-06, |
|
"loss": 0.1337, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.5942142298670836, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 4.509122502172025e-06, |
|
"loss": 0.1293, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.5955173312483711, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.494642339994209e-06, |
|
"loss": 0.1414, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.5968204326296586, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 4.480162177816392e-06, |
|
"loss": 0.1327, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.5981235340109461, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 4.4656820156385756e-06, |
|
"loss": 0.1383, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.5994266353922335, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 4.451201853460759e-06, |
|
"loss": 0.1349, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.6004691164972635, |
|
"eval_loss": 0.1611909121274948, |
|
"eval_runtime": 49.2444, |
|
"eval_samples_per_second": 298.511, |
|
"eval_steps_per_second": 9.341, |
|
"step": 2304 |
|
}, |
|
{ |
|
"epoch": 0.6007297367735209, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 4.436721691282942e-06, |
|
"loss": 0.1374, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.6020328381548085, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 4.422241529105126e-06, |
|
"loss": 0.1303, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.6033359395360959, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 4.40776136692731e-06, |
|
"loss": 0.1292, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.6046390409173834, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 4.393281204749494e-06, |
|
"loss": 0.1412, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.6059421422986708, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 4.3788010425716774e-06, |
|
"loss": 0.1396, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.6072452436799582, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.364320880393861e-06, |
|
"loss": 0.1325, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.6085483450612458, |
|
"grad_norm": 2.125, |
|
"learning_rate": 4.349840718216044e-06, |
|
"loss": 0.1413, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.6098514464425332, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 4.335360556038228e-06, |
|
"loss": 0.138, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.6111545478238207, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 4.320880393860412e-06, |
|
"loss": 0.1325, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.6124576492051081, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 4.3064002316825955e-06, |
|
"loss": 0.1382, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.6137607505863957, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 4.2919200695047785e-06, |
|
"loss": 0.1403, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.6150638519676831, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 4.277439907326962e-06, |
|
"loss": 0.1308, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.6163669533489705, |
|
"grad_norm": 2.125, |
|
"learning_rate": 4.262959745149146e-06, |
|
"loss": 0.138, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.617670054730258, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 4.24847958297133e-06, |
|
"loss": 0.1346, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.6189731561115455, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 4.233999420793513e-06, |
|
"loss": 0.1342, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.620276257492833, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 4.219519258615697e-06, |
|
"loss": 0.1367, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.6215793588741204, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 4.20503909643788e-06, |
|
"loss": 0.1313, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.6228824602554078, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 4.190558934260064e-06, |
|
"loss": 0.1299, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.6241855616366954, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.176078772082248e-06, |
|
"loss": 0.1286, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.6254886630179828, |
|
"grad_norm": 2.25, |
|
"learning_rate": 4.161598609904432e-06, |
|
"loss": 0.1368, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.6267917643992703, |
|
"grad_norm": 2.0, |
|
"learning_rate": 4.147118447726615e-06, |
|
"loss": 0.1316, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.6280948657805577, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 4.1326382855487985e-06, |
|
"loss": 0.1346, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.6293979671618452, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 4.118158123370982e-06, |
|
"loss": 0.1375, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.6307010685431327, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 4.103677961193166e-06, |
|
"loss": 0.1322, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.6320041699244201, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 4.089197799015349e-06, |
|
"loss": 0.1275, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.6333072713057076, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 4.074717636837533e-06, |
|
"loss": 0.1364, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.634610372686995, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 4.0602374746597166e-06, |
|
"loss": 0.1381, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.6359134740682825, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 4.0457573124819e-06, |
|
"loss": 0.1428, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.63721657544957, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.031277150304083e-06, |
|
"loss": 0.1371, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.6385196768308574, |
|
"grad_norm": 2.125, |
|
"learning_rate": 4.016796988126267e-06, |
|
"loss": 0.1384, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.639822778212145, |
|
"grad_norm": 2.0, |
|
"learning_rate": 4.002316825948451e-06, |
|
"loss": 0.1389, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.6411258795934324, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 3.987836663770635e-06, |
|
"loss": 0.1322, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.6424289809747198, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 3.9733565015928185e-06, |
|
"loss": 0.1454, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.6437320823560073, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 3.958876339415002e-06, |
|
"loss": 0.1276, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.6450351837372947, |
|
"grad_norm": 2.375, |
|
"learning_rate": 3.944396177237185e-06, |
|
"loss": 0.1334, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.6463382851185823, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 3.929916015059369e-06, |
|
"loss": 0.1355, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.6476413864998697, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 3.915435852881553e-06, |
|
"loss": 0.1338, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.6489444878811571, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 3.900955690703736e-06, |
|
"loss": 0.1335, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.6502475892624446, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 3.8864755285259195e-06, |
|
"loss": 0.1353, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.6515506906437321, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 3.871995366348103e-06, |
|
"loss": 0.1358, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.6528537920250196, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 3.857515204170287e-06, |
|
"loss": 0.1299, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.654156893406307, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 3.843035041992471e-06, |
|
"loss": 0.1346, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.6554599947875944, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 3.828554879814655e-06, |
|
"loss": 0.1322, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.656763096168882, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 3.814074717636838e-06, |
|
"loss": 0.1324, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.6580661975501694, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 3.7995945554590214e-06, |
|
"loss": 0.1301, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.6593692989314569, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 3.785114393281205e-06, |
|
"loss": 0.1365, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.6606724003127443, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 3.770634231103389e-06, |
|
"loss": 0.1357, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.6619755016940317, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 3.756154068925572e-06, |
|
"loss": 0.1352, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.6632786030753193, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 3.7416739067477557e-06, |
|
"loss": 0.1278, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.6645817044566067, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 3.7271937445699395e-06, |
|
"loss": 0.139, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.6658848058378942, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 3.712713582392123e-06, |
|
"loss": 0.1453, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.6671879072191816, |
|
"grad_norm": 2.0, |
|
"learning_rate": 3.6982334202143067e-06, |
|
"loss": 0.1366, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.6684910086004691, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 3.6837532580364904e-06, |
|
"loss": 0.1348, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.6697941099817566, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 3.6692730958586742e-06, |
|
"loss": 0.136, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.671097211363044, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 3.6547929336808576e-06, |
|
"loss": 0.1458, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.6724003127443315, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 3.6403127715030414e-06, |
|
"loss": 0.1393, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.673703414125619, |
|
"grad_norm": 2.125, |
|
"learning_rate": 3.6258326093252248e-06, |
|
"loss": 0.1326, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.6750065155069065, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 3.611352447147408e-06, |
|
"loss": 0.1305, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.6763096168881939, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 3.596872284969592e-06, |
|
"loss": 0.1385, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.6776127182694813, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 3.5823921227917757e-06, |
|
"loss": 0.1295, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.6789158196507689, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 3.567911960613959e-06, |
|
"loss": 0.1316, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.6802189210320563, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 3.553431798436143e-06, |
|
"loss": 0.1299, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.6815220224133438, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 3.5389516362583266e-06, |
|
"loss": 0.1307, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.6828251237946312, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 3.5244714740805096e-06, |
|
"loss": 0.1329, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.6841282251759186, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 3.5099913119026934e-06, |
|
"loss": 0.1314, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.6854313265572062, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 3.495511149724877e-06, |
|
"loss": 0.1286, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.6867344279384936, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 3.481030987547061e-06, |
|
"loss": 0.136, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 0.6880375293197811, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 3.4665508253692443e-06, |
|
"loss": 0.1382, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.6893406307010685, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 3.452070663191428e-06, |
|
"loss": 0.1378, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.690643732082356, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 3.437590501013612e-06, |
|
"loss": 0.1323, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.6919468334636435, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 3.423110338835795e-06, |
|
"loss": 0.1323, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.6932499348449309, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 3.4086301766579786e-06, |
|
"loss": 0.1302, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.6945530362262184, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 3.3941500144801624e-06, |
|
"loss": 0.1344, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 0.6958561376075059, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 3.379669852302346e-06, |
|
"loss": 0.1321, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.6971592389887933, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 3.3651896901245296e-06, |
|
"loss": 0.1349, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.6984623403700808, |
|
"grad_norm": 2.25, |
|
"learning_rate": 3.3507095279467134e-06, |
|
"loss": 0.1409, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.6997654417513682, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 3.336229365768897e-06, |
|
"loss": 0.1352, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.7005473025801408, |
|
"eval_loss": 0.16054348647594452, |
|
"eval_runtime": 49.2246, |
|
"eval_samples_per_second": 298.631, |
|
"eval_steps_per_second": 9.345, |
|
"step": 2688 |
|
}, |
|
{ |
|
"epoch": 0.7010685431326558, |
|
"grad_norm": 1.875, |
|
"learning_rate": 3.3217492035910805e-06, |
|
"loss": 0.1307, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.7023716445139432, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 3.3072690414132643e-06, |
|
"loss": 0.1413, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.7036747458952306, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 3.2927888792354477e-06, |
|
"loss": 0.1426, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.7049778472765181, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 3.278308717057631e-06, |
|
"loss": 0.1334, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 0.7062809486578056, |
|
"grad_norm": 1.875, |
|
"learning_rate": 3.263828554879815e-06, |
|
"loss": 0.1329, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.7075840500390931, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 3.2493483927019986e-06, |
|
"loss": 0.1359, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 0.7088871514203805, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 3.234868230524182e-06, |
|
"loss": 0.1361, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.7101902528016679, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 3.2203880683463658e-06, |
|
"loss": 0.1377, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.7114933541829555, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 3.2059079061685496e-06, |
|
"loss": 0.1301, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.7127964555642429, |
|
"grad_norm": 2.0, |
|
"learning_rate": 3.1914277439907334e-06, |
|
"loss": 0.1315, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 0.7140995569455304, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 3.1769475818129163e-06, |
|
"loss": 0.1309, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.7154026583268178, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 3.1624674196351e-06, |
|
"loss": 0.1359, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 0.7167057597081052, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 3.147987257457284e-06, |
|
"loss": 0.148, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.7180088610893928, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 3.1335070952794672e-06, |
|
"loss": 0.1287, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 0.7193119624706802, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 3.119026933101651e-06, |
|
"loss": 0.137, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.7206150638519677, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 3.104546770923835e-06, |
|
"loss": 0.1322, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 0.7219181652332551, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 3.0900666087460178e-06, |
|
"loss": 0.1354, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.7232212666145426, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 3.0755864465682016e-06, |
|
"loss": 0.1353, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.7245243679958301, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 3.0611062843903853e-06, |
|
"loss": 0.1287, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.7258274693771175, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 3.046626122212569e-06, |
|
"loss": 0.1334, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 0.727130570758405, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 3.0321459600347525e-06, |
|
"loss": 0.1314, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.7284336721396925, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 3.0176657978569363e-06, |
|
"loss": 0.1381, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 0.72973677352098, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 3.00318563567912e-06, |
|
"loss": 0.1377, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.7310398749022674, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 2.9887054735013034e-06, |
|
"loss": 0.1361, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 0.7323429762835548, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 2.974225311323487e-06, |
|
"loss": 0.1361, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.7336460776648424, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 2.9597451491456706e-06, |
|
"loss": 0.1273, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 0.7349491790461298, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 2.945264986967854e-06, |
|
"loss": 0.1328, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.7362522804274173, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 2.9307848247900378e-06, |
|
"loss": 0.1322, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.7375553818087047, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 2.9163046626122215e-06, |
|
"loss": 0.1322, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.7388584831899921, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 2.9018245004344053e-06, |
|
"loss": 0.1296, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.7401615845712797, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 2.8873443382565887e-06, |
|
"loss": 0.1352, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.7414646859525671, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 2.8728641760787725e-06, |
|
"loss": 0.1346, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 0.7427677873338546, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 2.8583840139009563e-06, |
|
"loss": 0.1325, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.744070888715142, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 2.8439038517231392e-06, |
|
"loss": 0.1311, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 0.7453739900964295, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 2.829423689545323e-06, |
|
"loss": 0.1342, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.746677091477717, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 2.814943527367507e-06, |
|
"loss": 0.1369, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 0.7479801928590044, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 2.80046336518969e-06, |
|
"loss": 0.1378, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.7492832942402919, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 2.785983203011874e-06, |
|
"loss": 0.1246, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.7505863956215794, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 2.7715030408340578e-06, |
|
"loss": 0.1377, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.7518894970028668, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 2.7570228786562415e-06, |
|
"loss": 0.1345, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 0.7531925983841543, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 2.7425427164784245e-06, |
|
"loss": 0.1344, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.7544956997654417, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 2.7280625543006083e-06, |
|
"loss": 0.1319, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 0.7557988011467293, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 2.713582392122792e-06, |
|
"loss": 0.1286, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.7571019025280167, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 2.6991022299449754e-06, |
|
"loss": 0.1281, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 0.7584050039093041, |
|
"grad_norm": 2.0, |
|
"learning_rate": 2.6846220677671592e-06, |
|
"loss": 0.1318, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.7597081052905916, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 2.670141905589343e-06, |
|
"loss": 0.1317, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 0.761011206671879, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 2.655661743411526e-06, |
|
"loss": 0.1419, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.7623143080531666, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 2.6411815812337097e-06, |
|
"loss": 0.1354, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.763617409434454, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 2.6267014190558935e-06, |
|
"loss": 0.1425, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.7649205108157414, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 2.6122212568780773e-06, |
|
"loss": 0.133, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 0.766223612197029, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 2.5977410947002607e-06, |
|
"loss": 0.1315, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.7675267135783164, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 2.5832609325224445e-06, |
|
"loss": 0.1355, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 0.7688298149596039, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 2.5687807703446283e-06, |
|
"loss": 0.139, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.7701329163408913, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 2.5543006081668116e-06, |
|
"loss": 0.1372, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 0.7714360177221787, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 2.5398204459889954e-06, |
|
"loss": 0.1359, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.7727391191034663, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 2.525340283811179e-06, |
|
"loss": 0.1332, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 0.7740422204847537, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 2.510860121633362e-06, |
|
"loss": 0.1375, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.7753453218660412, |
|
"grad_norm": 2.0, |
|
"learning_rate": 2.496379959455546e-06, |
|
"loss": 0.1336, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.7766484232473286, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 2.4818997972777297e-06, |
|
"loss": 0.1326, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.777951524628616, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 2.467419635099913e-06, |
|
"loss": 0.131, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 0.7792546260099036, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 2.452939472922097e-06, |
|
"loss": 0.1254, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.780557727391191, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 2.4384593107442807e-06, |
|
"loss": 0.1367, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 0.7818608287724785, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 2.423979148566464e-06, |
|
"loss": 0.1301, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.783163930153766, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 2.409498986388648e-06, |
|
"loss": 0.1346, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 0.7844670315350534, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 2.395018824210831e-06, |
|
"loss": 0.1285, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.7857701329163409, |
|
"grad_norm": 2.375, |
|
"learning_rate": 2.380538662033015e-06, |
|
"loss": 0.1374, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 0.7870732342976283, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.3660584998551988e-06, |
|
"loss": 0.1382, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.7883763356789159, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 2.351578337677382e-06, |
|
"loss": 0.1389, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.7896794370602033, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.337098175499566e-06, |
|
"loss": 0.1273, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.7909825384414908, |
|
"grad_norm": 2.125, |
|
"learning_rate": 2.3226180133217493e-06, |
|
"loss": 0.1323, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 0.7922856398227782, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 2.308137851143933e-06, |
|
"loss": 0.1342, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.7935887412040656, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 2.2936576889661165e-06, |
|
"loss": 0.133, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.7948918425853532, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 2.2791775267883002e-06, |
|
"loss": 0.1462, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.7961949439666406, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 2.264697364610484e-06, |
|
"loss": 0.1412, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 0.7974980453479281, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 2.2502172024326674e-06, |
|
"loss": 0.144, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.7988011467292155, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 2.235737040254851e-06, |
|
"loss": 0.1328, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 0.800104248110503, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 2.2212568780770346e-06, |
|
"loss": 0.1342, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.800625488663018, |
|
"eval_loss": 0.1600351482629776, |
|
"eval_runtime": 49.2423, |
|
"eval_samples_per_second": 298.524, |
|
"eval_steps_per_second": 9.342, |
|
"step": 3072 |
|
}, |
|
{ |
|
"epoch": 0.8014073494917905, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 2.2067767158992183e-06, |
|
"loss": 0.1334, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.8027104508730779, |
|
"grad_norm": 2.125, |
|
"learning_rate": 2.192296553721402e-06, |
|
"loss": 0.1283, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.8040135522543654, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 2.1778163915435855e-06, |
|
"loss": 0.1341, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 0.8053166536356529, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 2.1633362293657693e-06, |
|
"loss": 0.1435, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.8066197550169403, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 2.1488560671879527e-06, |
|
"loss": 0.1358, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 0.8079228563982278, |
|
"grad_norm": 2.0, |
|
"learning_rate": 2.134375905010136e-06, |
|
"loss": 0.128, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.8092259577795152, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 2.11989574283232e-06, |
|
"loss": 0.1376, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 0.8105290591608028, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 2.1054155806545036e-06, |
|
"loss": 0.1307, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.8118321605420902, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 2.0909354184766874e-06, |
|
"loss": 0.1371, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 0.8131352619233776, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 2.0764552562988708e-06, |
|
"loss": 0.1276, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.8144383633046651, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 2.061975094121054e-06, |
|
"loss": 0.1335, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.8157414646859525, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 2.047494931943238e-06, |
|
"loss": 0.1319, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.8170445660672401, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 2.0330147697654217e-06, |
|
"loss": 0.1343, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 0.8183476674485275, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 2.018534607587605e-06, |
|
"loss": 0.1317, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.8196507688298149, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 2.004054445409789e-06, |
|
"loss": 0.1276, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 0.8209538702111024, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 1.9895742832319722e-06, |
|
"loss": 0.1308, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.8222569715923899, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.975094121054156e-06, |
|
"loss": 0.1354, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 0.8235600729736774, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.9606139588763394e-06, |
|
"loss": 0.1256, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.8248631743549648, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.946133796698523e-06, |
|
"loss": 0.1308, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 0.8261662757362522, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.931653634520707e-06, |
|
"loss": 0.1347, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.8274693771175398, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.9171734723428903e-06, |
|
"loss": 0.1348, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.8287724784988272, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.9026933101650741e-06, |
|
"loss": 0.1355, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.8300755798801147, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.8882131479872575e-06, |
|
"loss": 0.1356, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 0.8313786812614021, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.8737329858094413e-06, |
|
"loss": 0.1331, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.8326817826426895, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.8592528236316248e-06, |
|
"loss": 0.1335, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 0.8339848840239771, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.8447726614538084e-06, |
|
"loss": 0.1325, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.8352879854052645, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.8302924992759922e-06, |
|
"loss": 0.1282, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 0.836591086786552, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.8158123370981756e-06, |
|
"loss": 0.1385, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.8378941881678394, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.8013321749203594e-06, |
|
"loss": 0.1377, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 0.8391972895491269, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.786852012742543e-06, |
|
"loss": 0.1344, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.8405003909304144, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.7723718505647263e-06, |
|
"loss": 0.1362, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.8418034923117018, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.75789168838691e-06, |
|
"loss": 0.1292, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.8431065936929893, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.7434115262090937e-06, |
|
"loss": 0.1394, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 0.8444096950742768, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.7289313640312775e-06, |
|
"loss": 0.1246, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.8457127964555643, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.7144512018534608e-06, |
|
"loss": 0.1382, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 0.8470158978368517, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.6999710396756444e-06, |
|
"loss": 0.1289, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.8483189992181391, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.6854908774978282e-06, |
|
"loss": 0.1302, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 0.8496221005994267, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.6710107153200118e-06, |
|
"loss": 0.1298, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.8509252019807141, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.6565305531421956e-06, |
|
"loss": 0.1345, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 0.8522283033620016, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.642050390964379e-06, |
|
"loss": 0.1278, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.853531404743289, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.6275702287865625e-06, |
|
"loss": 0.1346, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.8548345061245765, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.6130900666087463e-06, |
|
"loss": 0.1381, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.856137607505864, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.5986099044309297e-06, |
|
"loss": 0.1323, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 0.8574407088871514, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.5841297422531135e-06, |
|
"loss": 0.1279, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.8587438102684389, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.569649580075297e-06, |
|
"loss": 0.1309, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 0.8600469116497264, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 1.5551694178974804e-06, |
|
"loss": 0.1343, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.8613500130310138, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.5406892557196642e-06, |
|
"loss": 0.1292, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 0.8626531144123013, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.5262090935418478e-06, |
|
"loss": 0.1388, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.8639562157935887, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.5117289313640316e-06, |
|
"loss": 0.1333, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 0.8652593171748763, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.4972487691862151e-06, |
|
"loss": 0.136, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.8665624185561637, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.4827686070083985e-06, |
|
"loss": 0.1324, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.8678655199374511, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.4682884448305823e-06, |
|
"loss": 0.1288, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.8691686213187386, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.4538082826527659e-06, |
|
"loss": 0.1311, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 0.870471722700026, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.4393281204749497e-06, |
|
"loss": 0.1431, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.8717748240813136, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.424847958297133e-06, |
|
"loss": 0.1322, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 0.873077925462601, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.4103677961193166e-06, |
|
"loss": 0.1348, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.8743810268438884, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.3958876339415004e-06, |
|
"loss": 0.1336, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 0.8756841282251759, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.3814074717636838e-06, |
|
"loss": 0.132, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.8769872296064634, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.3669273095858675e-06, |
|
"loss": 0.1257, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 0.8782903309877509, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.3524471474080511e-06, |
|
"loss": 0.1399, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.8795934323690383, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.3379669852302347e-06, |
|
"loss": 0.1376, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.8808965337503257, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.3234868230524183e-06, |
|
"loss": 0.126, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.8821996351316133, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.3090066608746019e-06, |
|
"loss": 0.1342, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 0.8835027365129007, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.2945264986967854e-06, |
|
"loss": 0.1317, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.8848058378941882, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.2800463365189692e-06, |
|
"loss": 0.1317, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 0.8861089392754756, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.2655661743411526e-06, |
|
"loss": 0.1353, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.887412040656763, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.2510860121633364e-06, |
|
"loss": 0.1363, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 0.8887151420380506, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.23660584998552e-06, |
|
"loss": 0.1385, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.890018243419338, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.2221256878077035e-06, |
|
"loss": 0.1245, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 0.8913213448006255, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.2076455256298871e-06, |
|
"loss": 0.1322, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.892624446181913, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.193165363452071e-06, |
|
"loss": 0.1378, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 0.8939275475632004, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.1786852012742543e-06, |
|
"loss": 0.1324, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.8952306489444879, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.1642050390964378e-06, |
|
"loss": 0.1369, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 0.8965337503257753, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 1.1497248769186216e-06, |
|
"loss": 0.1333, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.8978368517070628, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.1352447147408052e-06, |
|
"loss": 0.1281, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 0.8991399530883503, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 1.1207645525629888e-06, |
|
"loss": 0.1431, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.9004430544696377, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.1062843903851724e-06, |
|
"loss": 0.1332, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 0.9007036747458952, |
|
"eval_loss": 0.15992629528045654, |
|
"eval_runtime": 49.2173, |
|
"eval_samples_per_second": 298.675, |
|
"eval_steps_per_second": 9.346, |
|
"step": 3456 |
|
}, |
|
{ |
|
"epoch": 0.9017461558509252, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.091804228207356e-06, |
|
"loss": 0.1307, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.9030492572322126, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.0773240660295395e-06, |
|
"loss": 0.1314, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 0.9043523586135002, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.0628439038517233e-06, |
|
"loss": 0.1371, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.9056554599947876, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.0483637416739069e-06, |
|
"loss": 0.1296, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 0.9069585613760751, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.0338835794960905e-06, |
|
"loss": 0.126, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.9082616627573625, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.019403417318274e-06, |
|
"loss": 0.1379, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 0.90956476413865, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.0049232551404576e-06, |
|
"loss": 0.1267, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.9108678655199375, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 9.904430929626412e-07, |
|
"loss": 0.1342, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 0.9121709669012249, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 9.75962930784825e-07, |
|
"loss": 0.1278, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.9134740682825124, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 9.614827686070084e-07, |
|
"loss": 0.1326, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 0.9147771696637998, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 9.47002606429192e-07, |
|
"loss": 0.1331, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.9160802710450873, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 9.325224442513757e-07, |
|
"loss": 0.1308, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 0.9173833724263748, |
|
"grad_norm": 2.125, |
|
"learning_rate": 9.180422820735593e-07, |
|
"loss": 0.1354, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.9186864738076622, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 9.03562119895743e-07, |
|
"loss": 0.1276, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 0.9199895751889497, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 8.890819577179265e-07, |
|
"loss": 0.1336, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.9212926765702372, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 8.746017955401101e-07, |
|
"loss": 0.1311, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 0.9225957779515246, |
|
"grad_norm": 2.125, |
|
"learning_rate": 8.601216333622937e-07, |
|
"loss": 0.1289, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.9238988793328121, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 8.456414711844774e-07, |
|
"loss": 0.1339, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 0.9252019807140995, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 8.31161309006661e-07, |
|
"loss": 0.1354, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.9265050820953871, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 8.166811468288445e-07, |
|
"loss": 0.1395, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 0.9278081834766745, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 8.022009846510281e-07, |
|
"loss": 0.1365, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.9291112848579619, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 7.877208224732118e-07, |
|
"loss": 0.1335, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 0.9304143862392494, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 7.732406602953954e-07, |
|
"loss": 0.1387, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.9317174876205369, |
|
"grad_norm": 2.25, |
|
"learning_rate": 7.587604981175791e-07, |
|
"loss": 0.1338, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 0.9330205890018244, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 7.442803359397626e-07, |
|
"loss": 0.1328, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.9343236903831118, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 7.298001737619461e-07, |
|
"loss": 0.1293, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 0.9356267917643992, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 7.153200115841298e-07, |
|
"loss": 0.1384, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.9369298931456868, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 7.008398494063134e-07, |
|
"loss": 0.1228, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 0.9382329945269742, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 6.863596872284971e-07, |
|
"loss": 0.1286, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.9395360959082617, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 6.718795250506806e-07, |
|
"loss": 0.1314, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 0.9408391972895491, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 6.573993628728642e-07, |
|
"loss": 0.1359, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.9421422986708365, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 6.429192006950478e-07, |
|
"loss": 0.1332, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 0.9434454000521241, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 6.284390385172315e-07, |
|
"loss": 0.1312, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.9447485014334115, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 6.139588763394151e-07, |
|
"loss": 0.1248, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 0.946051602814699, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 5.994787141615987e-07, |
|
"loss": 0.133, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.9473547041959864, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 5.849985519837822e-07, |
|
"loss": 0.1249, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 0.9486578055772739, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 5.705183898059659e-07, |
|
"loss": 0.1331, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.9499609069585614, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 5.560382276281495e-07, |
|
"loss": 0.1279, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 0.9512640083398488, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 5.415580654503331e-07, |
|
"loss": 0.1303, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.9525671097211363, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 5.270779032725168e-07, |
|
"loss": 0.1343, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 0.9538702111024238, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 5.125977410947003e-07, |
|
"loss": 0.1325, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.9551733124837112, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.981175789168839e-07, |
|
"loss": 0.1268, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 0.9564764138649987, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 4.836374167390675e-07, |
|
"loss": 0.1324, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.9577795152462861, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 4.691572545612511e-07, |
|
"loss": 0.1284, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 0.9590826166275737, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.5467709238343475e-07, |
|
"loss": 0.1338, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.9603857180088611, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 4.401969302056183e-07, |
|
"loss": 0.1334, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 0.9616888193901486, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 4.2571676802780196e-07, |
|
"loss": 0.1302, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.962991920771436, |
|
"grad_norm": 2.125, |
|
"learning_rate": 4.1123660584998553e-07, |
|
"loss": 0.1298, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 0.9642950221527234, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 3.9675644367216916e-07, |
|
"loss": 0.1301, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.965598123534011, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 3.822762814943528e-07, |
|
"loss": 0.1336, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 0.9669012249152984, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 3.6779611931653637e-07, |
|
"loss": 0.1421, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.9682043262965859, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 3.5331595713872e-07, |
|
"loss": 0.1362, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 0.9695074276778733, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 3.388357949609036e-07, |
|
"loss": 0.1281, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.9708105290591608, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 3.243556327830872e-07, |
|
"loss": 0.1343, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 0.9721136304404483, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 3.098754706052708e-07, |
|
"loss": 0.1373, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.9734167318217357, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 2.953953084274544e-07, |
|
"loss": 0.1359, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 0.9747198332030232, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 2.8091514624963805e-07, |
|
"loss": 0.1399, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.9760229345843107, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 2.6643498407182163e-07, |
|
"loss": 0.1358, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 0.9773260359655981, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 2.519548218940052e-07, |
|
"loss": 0.1323, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.9786291373468856, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 2.3747465971618884e-07, |
|
"loss": 0.1328, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 0.979932238728173, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 2.2299449753837244e-07, |
|
"loss": 0.1325, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.9812353401094606, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 2.0851433536055607e-07, |
|
"loss": 0.1329, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 0.982538441490748, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.9403417318273968e-07, |
|
"loss": 0.1266, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.9838415428720354, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.7955401100492328e-07, |
|
"loss": 0.1333, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 0.9851446442533229, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.6507384882710686e-07, |
|
"loss": 0.1293, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.9864477456346104, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.505936866492905e-07, |
|
"loss": 0.127, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 0.9877508470158979, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.361135244714741e-07, |
|
"loss": 0.1404, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.9890539483971853, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.216333622936577e-07, |
|
"loss": 0.1283, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 0.9903570497784727, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.071532001158413e-07, |
|
"loss": 0.1372, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.9916601511597603, |
|
"grad_norm": 2.0, |
|
"learning_rate": 9.26730379380249e-08, |
|
"loss": 0.1318, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 0.9929632525410477, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 7.819287576020852e-08, |
|
"loss": 0.1273, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.9942663539223352, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 6.371271358239213e-08, |
|
"loss": 0.1377, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 0.9955694553036226, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 4.923255140457574e-08, |
|
"loss": 0.1361, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.99687255668491, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 3.475238922675934e-08, |
|
"loss": 0.1349, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 0.9981756580661976, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 2.027222704894295e-08, |
|
"loss": 0.1376, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.999478759447485, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 5.792064871126557e-09, |
|
"loss": 0.1353, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 3837, |
|
"total_flos": 3.632852192307708e+18, |
|
"train_loss": 0.14543206988600105, |
|
"train_runtime": 4317.1955, |
|
"train_samples_per_second": 56.879, |
|
"train_steps_per_second": 0.889 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 3837, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 384, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.632852192307708e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|