|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.13344644508202191, |
|
"eval_steps": 1000, |
|
"global_step": 14000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0009531888934430136, |
|
"grad_norm": 1.0701079368591309, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.2617, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0019063777868860272, |
|
"grad_norm": 1.0461440086364746, |
|
"learning_rate": 3e-06, |
|
"loss": 0.2595, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.002859566680329041, |
|
"grad_norm": 1.0249755382537842, |
|
"learning_rate": 4.5e-06, |
|
"loss": 0.2595, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0038127555737720543, |
|
"grad_norm": 0.9327605366706848, |
|
"learning_rate": 6e-06, |
|
"loss": 0.2563, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.004765944467215068, |
|
"grad_norm": 0.9439413547515869, |
|
"learning_rate": 7.5e-06, |
|
"loss": 0.2589, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.005719133360658082, |
|
"grad_norm": 0.8729381561279297, |
|
"learning_rate": 9e-06, |
|
"loss": 0.2617, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.006672322254101095, |
|
"grad_norm": 0.9562346935272217, |
|
"learning_rate": 1.05e-05, |
|
"loss": 0.259, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.007625511147544109, |
|
"grad_norm": 1.7502244710922241, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.2551, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.008578700040987123, |
|
"grad_norm": 0.8447253704071045, |
|
"learning_rate": 1.3500000000000001e-05, |
|
"loss": 0.2555, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.009531888934430136, |
|
"grad_norm": 0.9096837043762207, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.2637, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.009531888934430136, |
|
"eval_loss": 0.22202371060848236, |
|
"eval_runtime": 24.6656, |
|
"eval_samples_per_second": 608.134, |
|
"eval_steps_per_second": 9.527, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.01048507782787315, |
|
"grad_norm": 0.9705513715744019, |
|
"learning_rate": 1.65e-05, |
|
"loss": 0.2614, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.011438266721316164, |
|
"grad_norm": 0.9748035669326782, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.2648, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.012391455614759177, |
|
"grad_norm": 2.0027875900268555, |
|
"learning_rate": 1.95e-05, |
|
"loss": 0.2605, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.01334464450820219, |
|
"grad_norm": 1.203764796257019, |
|
"learning_rate": 2.1e-05, |
|
"loss": 0.2645, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.014297833401645204, |
|
"grad_norm": 1.2857439517974854, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.2622, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.015251022295088217, |
|
"grad_norm": 0.969646692276001, |
|
"learning_rate": 2.4e-05, |
|
"loss": 0.2604, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.016204211188531232, |
|
"grad_norm": 0.8485471606254578, |
|
"learning_rate": 2.55e-05, |
|
"loss": 0.2628, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.017157400081974247, |
|
"grad_norm": 1.1885377168655396, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 0.2665, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.018110588975417258, |
|
"grad_norm": 1.98976469039917, |
|
"learning_rate": 2.8499999999999998e-05, |
|
"loss": 0.2723, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.019063777868860272, |
|
"grad_norm": 1.0017362833023071, |
|
"learning_rate": 3e-05, |
|
"loss": 0.2645, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.019063777868860272, |
|
"eval_loss": 0.2264958769083023, |
|
"eval_runtime": 24.3618, |
|
"eval_samples_per_second": 615.718, |
|
"eval_steps_per_second": 9.646, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.020016966762303287, |
|
"grad_norm": 1.3095935583114624, |
|
"learning_rate": 2.9970848597331675e-05, |
|
"loss": 0.2735, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.0209701556557463, |
|
"grad_norm": 1.0084208250045776, |
|
"learning_rate": 2.9941697194663354e-05, |
|
"loss": 0.2697, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.021923344549189313, |
|
"grad_norm": 0.9595718383789062, |
|
"learning_rate": 2.9912545791995025e-05, |
|
"loss": 0.2706, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.022876533442632328, |
|
"grad_norm": 1.156947374343872, |
|
"learning_rate": 2.98833943893267e-05, |
|
"loss": 0.2664, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.02382972233607534, |
|
"grad_norm": 0.9906996488571167, |
|
"learning_rate": 2.9854242986658374e-05, |
|
"loss": 0.267, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.024782911229518353, |
|
"grad_norm": 1.133239507675171, |
|
"learning_rate": 2.9825091583990053e-05, |
|
"loss": 0.2697, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.025736100122961368, |
|
"grad_norm": 1.1839542388916016, |
|
"learning_rate": 2.9795940181321727e-05, |
|
"loss": 0.2644, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.02668928901640438, |
|
"grad_norm": 1.1177607774734497, |
|
"learning_rate": 2.97667887786534e-05, |
|
"loss": 0.2649, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.027642477909847394, |
|
"grad_norm": 1.0634980201721191, |
|
"learning_rate": 2.9737637375985073e-05, |
|
"loss": 0.273, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.02859566680329041, |
|
"grad_norm": 1.141790747642517, |
|
"learning_rate": 2.970848597331675e-05, |
|
"loss": 0.2717, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.02859566680329041, |
|
"eval_loss": 0.23009072244167328, |
|
"eval_runtime": 25.5443, |
|
"eval_samples_per_second": 587.214, |
|
"eval_steps_per_second": 9.2, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.029548855696733423, |
|
"grad_norm": 0.8992202281951904, |
|
"learning_rate": 2.9679334570648426e-05, |
|
"loss": 0.272, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.030502044590176434, |
|
"grad_norm": 1.1783612966537476, |
|
"learning_rate": 2.96501831679801e-05, |
|
"loss": 0.2705, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.03145523348361945, |
|
"grad_norm": 1.516988754272461, |
|
"learning_rate": 2.9621031765311772e-05, |
|
"loss": 0.2696, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.032408422377062464, |
|
"grad_norm": 0.9750285148620605, |
|
"learning_rate": 2.959188036264345e-05, |
|
"loss": 0.2661, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.03336161127050548, |
|
"grad_norm": 1.0874147415161133, |
|
"learning_rate": 2.9562728959975125e-05, |
|
"loss": 0.2713, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.03431480016394849, |
|
"grad_norm": 1.2503632307052612, |
|
"learning_rate": 2.95335775573068e-05, |
|
"loss": 0.2694, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.0352679890573915, |
|
"grad_norm": 2.1983683109283447, |
|
"learning_rate": 2.9504426154638478e-05, |
|
"loss": 0.2715, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.036221177950834516, |
|
"grad_norm": 1.0884830951690674, |
|
"learning_rate": 2.947527475197015e-05, |
|
"loss": 0.2671, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.03717436684427753, |
|
"grad_norm": 0.9805251955986023, |
|
"learning_rate": 2.9446123349301824e-05, |
|
"loss": 0.2705, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.038127555737720545, |
|
"grad_norm": 1.0471646785736084, |
|
"learning_rate": 2.94169719466335e-05, |
|
"loss": 0.2657, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.038127555737720545, |
|
"eval_loss": 0.22619383037090302, |
|
"eval_runtime": 24.3251, |
|
"eval_samples_per_second": 616.647, |
|
"eval_steps_per_second": 9.661, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.03908074463116356, |
|
"grad_norm": 1.080304503440857, |
|
"learning_rate": 2.9387820543965177e-05, |
|
"loss": 0.2755, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.040033933524606574, |
|
"grad_norm": 1.2072677612304688, |
|
"learning_rate": 2.935866914129685e-05, |
|
"loss": 0.2666, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.04098712241804958, |
|
"grad_norm": 1.1678977012634277, |
|
"learning_rate": 2.9329517738628523e-05, |
|
"loss": 0.2708, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.0419403113114926, |
|
"grad_norm": 0.9155502319335938, |
|
"learning_rate": 2.9300366335960198e-05, |
|
"loss": 0.2701, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.04289350020493561, |
|
"grad_norm": 1.022687315940857, |
|
"learning_rate": 2.9271214933291876e-05, |
|
"loss": 0.276, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.043846689098378626, |
|
"grad_norm": 1.0507577657699585, |
|
"learning_rate": 2.924206353062355e-05, |
|
"loss": 0.2695, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.04479987799182164, |
|
"grad_norm": 0.9346485137939453, |
|
"learning_rate": 2.9212912127955225e-05, |
|
"loss": 0.2715, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.045753066885264655, |
|
"grad_norm": 1.0042835474014282, |
|
"learning_rate": 2.9183760725286897e-05, |
|
"loss": 0.2671, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.04670625577870767, |
|
"grad_norm": 1.106454610824585, |
|
"learning_rate": 2.9154609322618575e-05, |
|
"loss": 0.2666, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.04765944467215068, |
|
"grad_norm": 0.911589503288269, |
|
"learning_rate": 2.912545791995025e-05, |
|
"loss": 0.264, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.04765944467215068, |
|
"eval_loss": 0.22571362555027008, |
|
"eval_runtime": 24.0986, |
|
"eval_samples_per_second": 622.442, |
|
"eval_steps_per_second": 9.752, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.04861263356559369, |
|
"grad_norm": 0.8723756670951843, |
|
"learning_rate": 2.9096306517281924e-05, |
|
"loss": 0.264, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.04956582245903671, |
|
"grad_norm": 1.034590482711792, |
|
"learning_rate": 2.90671551146136e-05, |
|
"loss": 0.2767, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.05051901135247972, |
|
"grad_norm": 1.0665106773376465, |
|
"learning_rate": 2.9038003711945274e-05, |
|
"loss": 0.2676, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.051472200245922736, |
|
"grad_norm": 0.9242556095123291, |
|
"learning_rate": 2.900885230927695e-05, |
|
"loss": 0.2699, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.05242538913936575, |
|
"grad_norm": 1.1992926597595215, |
|
"learning_rate": 2.8979700906608623e-05, |
|
"loss": 0.2682, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.05337857803280876, |
|
"grad_norm": 0.9543828964233398, |
|
"learning_rate": 2.89505495039403e-05, |
|
"loss": 0.2713, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.05433176692625177, |
|
"grad_norm": 0.9702574014663696, |
|
"learning_rate": 2.8921398101271973e-05, |
|
"loss": 0.2663, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.05528495581969479, |
|
"grad_norm": 0.9306678175926208, |
|
"learning_rate": 2.8892246698603647e-05, |
|
"loss": 0.2712, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.0562381447131378, |
|
"grad_norm": 1.2940869331359863, |
|
"learning_rate": 2.8863095295935322e-05, |
|
"loss": 0.2732, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.05719133360658082, |
|
"grad_norm": 0.8944372534751892, |
|
"learning_rate": 2.8833943893267e-05, |
|
"loss": 0.2675, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.05719133360658082, |
|
"eval_loss": 0.22631041705608368, |
|
"eval_runtime": 24.2322, |
|
"eval_samples_per_second": 619.011, |
|
"eval_steps_per_second": 9.698, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.05814452250002383, |
|
"grad_norm": 1.1152732372283936, |
|
"learning_rate": 2.8804792490598675e-05, |
|
"loss": 0.2632, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.05909771139346685, |
|
"grad_norm": 0.90058833360672, |
|
"learning_rate": 2.8775641087930346e-05, |
|
"loss": 0.2677, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.060050900286909854, |
|
"grad_norm": 0.9290627241134644, |
|
"learning_rate": 2.874648968526202e-05, |
|
"loss": 0.2667, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.06100408918035287, |
|
"grad_norm": 1.0167937278747559, |
|
"learning_rate": 2.87173382825937e-05, |
|
"loss": 0.2658, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.061957278073795884, |
|
"grad_norm": 1.0440782308578491, |
|
"learning_rate": 2.8688186879925374e-05, |
|
"loss": 0.2672, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.0629104669672389, |
|
"grad_norm": 1.0155839920043945, |
|
"learning_rate": 2.865903547725705e-05, |
|
"loss": 0.2657, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.0638636558606819, |
|
"grad_norm": 0.879859209060669, |
|
"learning_rate": 2.862988407458872e-05, |
|
"loss": 0.2674, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.06481684475412493, |
|
"grad_norm": 0.9081212878227234, |
|
"learning_rate": 2.8600732671920398e-05, |
|
"loss": 0.2644, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.06577003364756794, |
|
"grad_norm": 1.1635853052139282, |
|
"learning_rate": 2.8571581269252073e-05, |
|
"loss": 0.2609, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.06672322254101096, |
|
"grad_norm": 1.0756968259811401, |
|
"learning_rate": 2.8542429866583747e-05, |
|
"loss": 0.2682, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.06672322254101096, |
|
"eval_loss": 0.22241491079330444, |
|
"eval_runtime": 25.327, |
|
"eval_samples_per_second": 592.253, |
|
"eval_steps_per_second": 9.279, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.06767641143445396, |
|
"grad_norm": 1.0364997386932373, |
|
"learning_rate": 2.8513278463915425e-05, |
|
"loss": 0.2651, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.06862960032789699, |
|
"grad_norm": 1.0817292928695679, |
|
"learning_rate": 2.8484127061247097e-05, |
|
"loss": 0.2634, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.06958278922134, |
|
"grad_norm": 1.052465796470642, |
|
"learning_rate": 2.845497565857877e-05, |
|
"loss": 0.2672, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.070535978114783, |
|
"grad_norm": 0.8442723155021667, |
|
"learning_rate": 2.8425824255910446e-05, |
|
"loss": 0.2709, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.07148916700822602, |
|
"grad_norm": 1.104926347732544, |
|
"learning_rate": 2.8396672853242124e-05, |
|
"loss": 0.2617, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.07244235590166903, |
|
"grad_norm": 1.0135023593902588, |
|
"learning_rate": 2.83675214505738e-05, |
|
"loss": 0.2625, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.07339554479511205, |
|
"grad_norm": 0.9307543039321899, |
|
"learning_rate": 2.833837004790547e-05, |
|
"loss": 0.2671, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.07434873368855506, |
|
"grad_norm": 1.5013054609298706, |
|
"learning_rate": 2.8309218645237145e-05, |
|
"loss": 0.2656, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.07530192258199807, |
|
"grad_norm": 0.923324465751648, |
|
"learning_rate": 2.8280067242568823e-05, |
|
"loss": 0.2607, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.07625511147544109, |
|
"grad_norm": 1.065769076347351, |
|
"learning_rate": 2.8250915839900498e-05, |
|
"loss": 0.2641, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.07625511147544109, |
|
"eval_loss": 0.22064544260501862, |
|
"eval_runtime": 25.6245, |
|
"eval_samples_per_second": 585.378, |
|
"eval_steps_per_second": 9.171, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.0772083003688841, |
|
"grad_norm": 1.053281545639038, |
|
"learning_rate": 2.8221764437232173e-05, |
|
"loss": 0.2633, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.07816148926232712, |
|
"grad_norm": 1.0560704469680786, |
|
"learning_rate": 2.8192613034563844e-05, |
|
"loss": 0.2602, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.07911467815577013, |
|
"grad_norm": 1.0632127523422241, |
|
"learning_rate": 2.8163461631895522e-05, |
|
"loss": 0.2647, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.08006786704921315, |
|
"grad_norm": 1.0002626180648804, |
|
"learning_rate": 2.8134310229227197e-05, |
|
"loss": 0.2654, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.08102105594265616, |
|
"grad_norm": 1.1899933815002441, |
|
"learning_rate": 2.8105158826558872e-05, |
|
"loss": 0.2631, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.08197424483609916, |
|
"grad_norm": 0.9177943468093872, |
|
"learning_rate": 2.807600742389055e-05, |
|
"loss": 0.264, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.08292743372954219, |
|
"grad_norm": 1.0969672203063965, |
|
"learning_rate": 2.804685602122222e-05, |
|
"loss": 0.2663, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.0838806226229852, |
|
"grad_norm": 0.9465392231941223, |
|
"learning_rate": 2.8017704618553896e-05, |
|
"loss": 0.2599, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.08483381151642821, |
|
"grad_norm": 1.1491124629974365, |
|
"learning_rate": 2.798855321588557e-05, |
|
"loss": 0.2616, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.08578700040987122, |
|
"grad_norm": 1.040123701095581, |
|
"learning_rate": 2.795940181321725e-05, |
|
"loss": 0.2611, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.08578700040987122, |
|
"eval_loss": 0.22252394258975983, |
|
"eval_runtime": 24.4254, |
|
"eval_samples_per_second": 614.114, |
|
"eval_steps_per_second": 9.621, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.08674018930331424, |
|
"grad_norm": 0.8041715621948242, |
|
"learning_rate": 2.7930250410548923e-05, |
|
"loss": 0.2597, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.08769337819675725, |
|
"grad_norm": 1.2013587951660156, |
|
"learning_rate": 2.7901099007880595e-05, |
|
"loss": 0.2627, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.08864656709020026, |
|
"grad_norm": 0.8449276089668274, |
|
"learning_rate": 2.787194760521227e-05, |
|
"loss": 0.2694, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.08959975598364328, |
|
"grad_norm": 0.957938015460968, |
|
"learning_rate": 2.7842796202543948e-05, |
|
"loss": 0.2646, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.09055294487708629, |
|
"grad_norm": 0.9442753195762634, |
|
"learning_rate": 2.7813644799875622e-05, |
|
"loss": 0.2618, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.09150613377052931, |
|
"grad_norm": 1.0630254745483398, |
|
"learning_rate": 2.7784493397207297e-05, |
|
"loss": 0.267, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.09245932266397232, |
|
"grad_norm": 0.9763880372047424, |
|
"learning_rate": 2.775534199453897e-05, |
|
"loss": 0.2631, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.09341251155741534, |
|
"grad_norm": 1.059673547744751, |
|
"learning_rate": 2.7726190591870647e-05, |
|
"loss": 0.264, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.09436570045085835, |
|
"grad_norm": 1.0772706270217896, |
|
"learning_rate": 2.769703918920232e-05, |
|
"loss": 0.26, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.09531888934430136, |
|
"grad_norm": 0.9500916600227356, |
|
"learning_rate": 2.7667887786533996e-05, |
|
"loss": 0.2603, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.09531888934430136, |
|
"eval_loss": 0.22107724845409393, |
|
"eval_runtime": 24.1253, |
|
"eval_samples_per_second": 621.753, |
|
"eval_steps_per_second": 9.741, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.09627207823774438, |
|
"grad_norm": 0.7942706346511841, |
|
"learning_rate": 2.7639027897892354e-05, |
|
"loss": 0.258, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.09722526713118738, |
|
"grad_norm": 1.1196712255477905, |
|
"learning_rate": 2.7610168009250712e-05, |
|
"loss": 0.2594, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.0981784560246304, |
|
"grad_norm": 0.9647284746170044, |
|
"learning_rate": 2.7581016606582387e-05, |
|
"loss": 0.2645, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.09913164491807341, |
|
"grad_norm": 1.0983389616012573, |
|
"learning_rate": 2.7551865203914065e-05, |
|
"loss": 0.2589, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.10008483381151642, |
|
"grad_norm": 0.8184943795204163, |
|
"learning_rate": 2.7522713801245736e-05, |
|
"loss": 0.2604, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.10103802270495944, |
|
"grad_norm": 1.0684343576431274, |
|
"learning_rate": 2.749356239857741e-05, |
|
"loss": 0.2602, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.10199121159840245, |
|
"grad_norm": 0.9852308034896851, |
|
"learning_rate": 2.7464410995909086e-05, |
|
"loss": 0.2688, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.10294440049184547, |
|
"grad_norm": 0.8270373940467834, |
|
"learning_rate": 2.7435259593240764e-05, |
|
"loss": 0.2601, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.10389758938528848, |
|
"grad_norm": 0.9181864857673645, |
|
"learning_rate": 2.740610819057244e-05, |
|
"loss": 0.259, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.1048507782787315, |
|
"grad_norm": 0.8947911858558655, |
|
"learning_rate": 2.737695678790411e-05, |
|
"loss": 0.2616, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.1048507782787315, |
|
"eval_loss": 0.22300027310848236, |
|
"eval_runtime": 26.4519, |
|
"eval_samples_per_second": 567.068, |
|
"eval_steps_per_second": 8.884, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.10580396717217451, |
|
"grad_norm": 1.19639253616333, |
|
"learning_rate": 2.7347805385235785e-05, |
|
"loss": 0.2624, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.10675715606561752, |
|
"grad_norm": 1.3614460229873657, |
|
"learning_rate": 2.7318653982567463e-05, |
|
"loss": 0.2578, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.10771034495906054, |
|
"grad_norm": 0.8842675089836121, |
|
"learning_rate": 2.7289502579899138e-05, |
|
"loss": 0.259, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.10866353385250355, |
|
"grad_norm": 1.1543840169906616, |
|
"learning_rate": 2.7260351177230812e-05, |
|
"loss": 0.2594, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.10961672274594657, |
|
"grad_norm": 1.1461540460586548, |
|
"learning_rate": 2.7231199774562484e-05, |
|
"loss": 0.2576, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.11056991163938958, |
|
"grad_norm": 0.9683176279067993, |
|
"learning_rate": 2.7202048371894162e-05, |
|
"loss": 0.2597, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.1115231005328326, |
|
"grad_norm": 1.1039471626281738, |
|
"learning_rate": 2.7172896969225837e-05, |
|
"loss": 0.2586, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.1124762894262756, |
|
"grad_norm": 0.9412834644317627, |
|
"learning_rate": 2.714374556655751e-05, |
|
"loss": 0.2573, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.11342947831971861, |
|
"grad_norm": 1.1193273067474365, |
|
"learning_rate": 2.711459416388919e-05, |
|
"loss": 0.2564, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.11438266721316163, |
|
"grad_norm": 0.9070214033126831, |
|
"learning_rate": 2.708544276122086e-05, |
|
"loss": 0.2598, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.11438266721316163, |
|
"eval_loss": 0.21802841126918793, |
|
"eval_runtime": 24.3781, |
|
"eval_samples_per_second": 615.305, |
|
"eval_steps_per_second": 9.64, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.11533585610660464, |
|
"grad_norm": 0.9957073330879211, |
|
"learning_rate": 2.7056291358552536e-05, |
|
"loss": 0.2582, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.11628904500004766, |
|
"grad_norm": 0.9560794234275818, |
|
"learning_rate": 2.702713995588421e-05, |
|
"loss": 0.2624, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.11724223389349067, |
|
"grad_norm": 1.0625020265579224, |
|
"learning_rate": 2.699798855321589e-05, |
|
"loss": 0.2606, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.1181954227869337, |
|
"grad_norm": 1.2022795677185059, |
|
"learning_rate": 2.6968837150547563e-05, |
|
"loss": 0.2577, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.1191486116803767, |
|
"grad_norm": 1.005925178527832, |
|
"learning_rate": 2.6939685747879234e-05, |
|
"loss": 0.2609, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.12010180057381971, |
|
"grad_norm": 1.0519824028015137, |
|
"learning_rate": 2.691053434521091e-05, |
|
"loss": 0.2653, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.12105498946726273, |
|
"grad_norm": 1.0782413482666016, |
|
"learning_rate": 2.6881382942542587e-05, |
|
"loss": 0.2537, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.12200817836070574, |
|
"grad_norm": 0.9406309723854065, |
|
"learning_rate": 2.6852231539874262e-05, |
|
"loss": 0.262, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.12296136725414876, |
|
"grad_norm": 0.922545850276947, |
|
"learning_rate": 2.682337165123262e-05, |
|
"loss": 0.2581, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.12391455614759177, |
|
"grad_norm": 0.8488488793373108, |
|
"learning_rate": 2.6794220248564295e-05, |
|
"loss": 0.2611, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.12391455614759177, |
|
"eval_loss": 0.22087305784225464, |
|
"eval_runtime": 23.9914, |
|
"eval_samples_per_second": 625.224, |
|
"eval_steps_per_second": 9.795, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.12486774504103478, |
|
"grad_norm": 0.9024129509925842, |
|
"learning_rate": 2.6765360359922653e-05, |
|
"loss": 0.2604, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.1258209339344778, |
|
"grad_norm": 0.9496759176254272, |
|
"learning_rate": 2.6736208957254328e-05, |
|
"loss": 0.2552, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.1267741228279208, |
|
"grad_norm": 1.0905983448028564, |
|
"learning_rate": 2.6707057554586002e-05, |
|
"loss": 0.2538, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 0.1277273117213638, |
|
"grad_norm": 1.1556366682052612, |
|
"learning_rate": 2.6677906151917677e-05, |
|
"loss": 0.2585, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.12868050061480685, |
|
"grad_norm": 1.0274028778076172, |
|
"learning_rate": 2.6648754749249352e-05, |
|
"loss": 0.2546, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.12963368950824986, |
|
"grad_norm": 0.9366750717163086, |
|
"learning_rate": 2.6619603346581027e-05, |
|
"loss": 0.2562, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.13058687840169286, |
|
"grad_norm": 0.9076129198074341, |
|
"learning_rate": 2.65904519439127e-05, |
|
"loss": 0.2567, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 0.13154006729513587, |
|
"grad_norm": 0.9610471725463867, |
|
"learning_rate": 2.6561300541244376e-05, |
|
"loss": 0.2528, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.13249325618857888, |
|
"grad_norm": 1.2852675914764404, |
|
"learning_rate": 2.653214913857605e-05, |
|
"loss": 0.2511, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 0.13344644508202191, |
|
"grad_norm": 0.8626914024353027, |
|
"learning_rate": 2.6502997735907726e-05, |
|
"loss": 0.2507, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.13344644508202191, |
|
"eval_loss": 0.21873866021633148, |
|
"eval_runtime": 23.5023, |
|
"eval_samples_per_second": 638.235, |
|
"eval_steps_per_second": 9.999, |
|
"step": 14000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 104911, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.96629250834432e+16, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|