{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.00444780500822844,
  "eval_steps": 10,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 4.447805008228439e-05, "grad_norm": 0.5683791637420654, "learning_rate": 9.99995774416091e-07, "loss": 2.5379, "step": 1 },
    { "epoch": 4.447805008228439e-05, "eval_loss": 2.2479803562164307, "eval_runtime": 228.3596, "eval_samples_per_second": 0.28, "eval_steps_per_second": 0.07, "step": 1 },
    { "epoch": 8.895610016456878e-05, "grad_norm": 0.48047935962677, "learning_rate": 9.999915484938526e-07, "loss": 2.7954, "step": 2 },
    { "epoch": 0.0001334341502468532, "grad_norm": 0.41253846883773804, "learning_rate": 9.999873222332442e-07, "loss": 2.5331, "step": 3 },
    { "epoch": 0.00017791220032913757, "grad_norm": 0.5433268547058105, "learning_rate": 9.999830956342252e-07, "loss": 2.6313, "step": 4 },
    { "epoch": 0.00022239025041142198, "grad_norm": 0.4853809177875519, "learning_rate": 9.999788686967546e-07, "loss": 3.0209, "step": 5 },
    { "epoch": 0.0002668683004937064, "grad_norm": 0.41539642214775085, "learning_rate": 9.999746414207922e-07, "loss": 2.1449, "step": 6 },
    { "epoch": 0.00031134635057599076, "grad_norm": 0.38092610239982605, "learning_rate": 9.999704138062971e-07, "loss": 2.7408, "step": 7 },
    { "epoch": 0.00035582440065827514, "grad_norm": 0.42732885479927063, "learning_rate": 9.999661858532288e-07, "loss": 2.6069, "step": 8 },
    { "epoch": 0.0004003024507405595, "grad_norm": 0.385442852973938, "learning_rate": 9.999619575615465e-07, "loss": 2.4027, "step": 9 },
    { "epoch": 0.00044478050082284395, "grad_norm": 0.5769224762916565, "learning_rate": 9.999577289312093e-07, "loss": 2.7377, "step": 10 },
    { "epoch": 0.00044478050082284395, "eval_loss": 2.127828598022461, "eval_runtime": 240.4406, "eval_samples_per_second": 0.266, "eval_steps_per_second": 0.067, "step": 10 },
    { "epoch": 0.0004892585509051283, "grad_norm": 0.3700040578842163, "learning_rate": 9.999534999621769e-07, "loss": 2.4942, "step": 11 },
    { "epoch": 0.0005337366009874128, "grad_norm": 0.564141571521759, "learning_rate": 9.999492706544085e-07, "loss": 2.199, "step": 12 },
    { "epoch": 0.0005782146510696971, "grad_norm": 0.38583478331565857, "learning_rate": 9.999450410078634e-07, "loss": 2.5461, "step": 13 },
    { "epoch": 0.0006226927011519815, "grad_norm": 0.28498372435569763, "learning_rate": 9.999408110225007e-07, "loss": 2.1158, "step": 14 },
    { "epoch": 0.0006671707512342659, "grad_norm": 0.3804119825363159, "learning_rate": 9.999365806982799e-07, "loss": 2.2961, "step": 15 },
    { "epoch": 0.0007116488013165503, "grad_norm": 0.3127712607383728, "learning_rate": 9.999323500351601e-07, "loss": 2.1634, "step": 16 },
    { "epoch": 0.0007561268513988347, "grad_norm": 0.3194904625415802, "learning_rate": 9.999281190331008e-07, "loss": 2.323, "step": 17 },
    { "epoch": 0.000800604901481119, "grad_norm": 0.3155025541782379, "learning_rate": 9.999238876920612e-07, "loss": 2.198, "step": 18 },
    { "epoch": 0.0008450829515634034, "grad_norm": 0.5667710304260254, "learning_rate": 9.999196560120004e-07, "loss": 2.2118, "step": 19 },
    { "epoch": 0.0008895610016456879, "grad_norm": 0.5151445269584656, "learning_rate": 9.999154239928777e-07, "loss": 2.3158, "step": 20 },
    { "epoch": 0.0008895610016456879, "eval_loss": 2.0555460453033447, "eval_runtime": 242.0578, "eval_samples_per_second": 0.264, "eval_steps_per_second": 0.066, "step": 20 },
    { "epoch": 0.0009340390517279723, "grad_norm": 0.39977356791496277, "learning_rate": 9.999111916346526e-07, "loss": 2.0448, "step": 21 },
    { "epoch": 0.0009785171018102566, "grad_norm": 0.3716548979282379, "learning_rate": 9.99906958937284e-07, "loss": 2.1979, "step": 22 },
    { "epoch": 0.001022995151892541, "grad_norm": 0.48981648683547974, "learning_rate": 9.999027259007314e-07, "loss": 1.9194, "step": 23 },
    { "epoch": 0.0010674732019748255, "grad_norm": 0.38807594776153564, "learning_rate": 9.99898492524954e-07, "loss": 1.9887, "step": 24 },
    { "epoch": 0.00111195125205711, "grad_norm": 0.9124931693077087, "learning_rate": 9.998942588099108e-07, "loss": 2.2736, "step": 25 },
    { "epoch": 0.0011564293021393943, "grad_norm": 0.3414762020111084, "learning_rate": 9.99890024755561e-07, "loss": 2.0791, "step": 26 },
    { "epoch": 0.0012009073522216787, "grad_norm": 0.5927892327308655, "learning_rate": 9.998857903618642e-07, "loss": 2.4562, "step": 27 },
    { "epoch": 0.001245385402303963, "grad_norm": 0.31966015696525574, "learning_rate": 9.998815556287793e-07, "loss": 1.9254, "step": 28 },
    { "epoch": 0.0012898634523862474, "grad_norm": 0.444794237613678, "learning_rate": 9.998773205562656e-07, "loss": 2.1384, "step": 29 },
    { "epoch": 0.0013343415024685318, "grad_norm": 0.5257573127746582, "learning_rate": 9.998730851442821e-07, "loss": 2.2903, "step": 30 },
    { "epoch": 0.0013343415024685318, "eval_loss": 2.0174190998077393, "eval_runtime": 244.9536, "eval_samples_per_second": 0.261, "eval_steps_per_second": 0.065, "step": 30 },
    { "epoch": 0.0013788195525508162, "grad_norm": 0.2638561427593231, "learning_rate": 9.998688493927882e-07, "loss": 2.0546, "step": 31 },
    { "epoch": 0.0014232976026331006, "grad_norm": 0.45275381207466125, "learning_rate": 9.99864613301743e-07, "loss": 2.064, "step": 32 },
    { "epoch": 0.001467775652715385, "grad_norm": 0.45670074224472046, "learning_rate": 9.998603768711058e-07, "loss": 2.2222, "step": 33 },
    { "epoch": 0.0015122537027976693, "grad_norm": 0.7620413303375244, "learning_rate": 9.998561401008355e-07, "loss": 2.0651, "step": 34 },
    { "epoch": 0.0015567317528799537, "grad_norm": 0.42548975348472595, "learning_rate": 9.998519029908915e-07, "loss": 2.2957, "step": 35 },
    { "epoch": 0.001601209802962238, "grad_norm": 0.31207627058029175, "learning_rate": 9.998476655412328e-07, "loss": 2.2575, "step": 36 },
    { "epoch": 0.0016456878530445225, "grad_norm": 0.2575569450855255, "learning_rate": 9.998434277518184e-07, "loss": 2.1558, "step": 37 },
    { "epoch": 0.0016901659031268068, "grad_norm": 0.5504156947135925, "learning_rate": 9.998391896226079e-07, "loss": 2.4939, "step": 38 },
    { "epoch": 0.0017346439532090912, "grad_norm": 0.36677634716033936, "learning_rate": 9.998349511535599e-07, "loss": 1.8096, "step": 39 },
    { "epoch": 0.0017791220032913758, "grad_norm": 0.45712989568710327, "learning_rate": 9.99830712344634e-07, "loss": 1.8726, "step": 40 },
    { "epoch": 0.0017791220032913758, "eval_loss": 1.9996732473373413, "eval_runtime": 234.0331, "eval_samples_per_second": 0.273, "eval_steps_per_second": 0.068, "step": 40 },
    { "epoch": 0.0018236000533736602, "grad_norm": 0.45847609639167786, "learning_rate": 9.998264731957889e-07, "loss": 1.8686, "step": 41 },
    { "epoch": 0.0018680781034559446, "grad_norm": 0.33932119607925415, "learning_rate": 9.998222337069841e-07, "loss": 2.2302, "step": 42 },
    { "epoch": 0.001912556153538229, "grad_norm": 1.3315902948379517, "learning_rate": 9.998179938781784e-07, "loss": 1.8721, "step": 43 },
    { "epoch": 0.001957034203620513, "grad_norm": 1.2163060903549194, "learning_rate": 9.99813753709331e-07, "loss": 2.1642, "step": 44 },
    { "epoch": 0.0020015122537027975, "grad_norm": 0.4775317907333374, "learning_rate": 9.99809513200401e-07, "loss": 2.4106, "step": 45 },
    { "epoch": 0.002045990303785082, "grad_norm": 0.28207871317863464, "learning_rate": 9.998052723513476e-07, "loss": 2.058, "step": 46 },
    { "epoch": 0.0020904683538673662, "grad_norm": 0.5639588236808777, "learning_rate": 9.998010311621295e-07, "loss": 2.2522, "step": 47 },
    { "epoch": 0.002134946403949651, "grad_norm": 0.45129290223121643, "learning_rate": 9.997967896327061e-07, "loss": 2.3281, "step": 48 },
    { "epoch": 0.0021794244540319354, "grad_norm": 0.4590243101119995, "learning_rate": 9.997925477630364e-07, "loss": 2.1991, "step": 49 },
    { "epoch": 0.00222390250411422, "grad_norm": 0.3881557881832123, "learning_rate": 9.997883055530797e-07, "loss": 2.2259, "step": 50 },
    { "epoch": 0.00222390250411422, "eval_loss": 1.9868642091751099, "eval_runtime": 233.6732, "eval_samples_per_second": 0.274, "eval_steps_per_second": 0.068, "step": 50 },
    { "epoch": 0.002268380554196504, "grad_norm": 0.4352044463157654, "learning_rate": 9.997840630027944e-07, "loss": 2.2623, "step": 51 },
    { "epoch": 0.0023128586042787886, "grad_norm": 0.43397316336631775, "learning_rate": 9.997798201121402e-07, "loss": 1.8368, "step": 52 },
    { "epoch": 0.002357336654361073, "grad_norm": 0.38807961344718933, "learning_rate": 9.99775576881076e-07, "loss": 2.0101, "step": 53 },
    { "epoch": 0.0024018147044433573, "grad_norm": 0.6237647533416748, "learning_rate": 9.997713333095603e-07, "loss": 2.0238, "step": 54 },
    { "epoch": 0.0024462927545256417, "grad_norm": 0.34039369225502014, "learning_rate": 9.997670893975529e-07, "loss": 2.1, "step": 55 },
    { "epoch": 0.002490770804607926, "grad_norm": 0.4221118986606598, "learning_rate": 9.997628451450122e-07, "loss": 2.1649, "step": 56 },
    { "epoch": 0.0025352488546902105, "grad_norm": 0.38328638672828674, "learning_rate": 9.997586005518976e-07, "loss": 2.1151, "step": 57 },
    { "epoch": 0.002579726904772495, "grad_norm": 0.3308090567588806, "learning_rate": 9.997543556181679e-07, "loss": 2.1499, "step": 58 },
    { "epoch": 0.0026242049548547792, "grad_norm": 0.3598516881465912, "learning_rate": 9.99750110343782e-07, "loss": 2.1146, "step": 59 },
    { "epoch": 0.0026686830049370636, "grad_norm": 0.38582974672317505, "learning_rate": 9.997458647286993e-07, "loss": 1.8236, "step": 60 },
    { "epoch": 0.0026686830049370636, "eval_loss": 1.9794503450393677, "eval_runtime": 312.5608, "eval_samples_per_second": 0.205, "eval_steps_per_second": 0.051, "step": 60 },
    { "epoch": 0.002713161055019348, "grad_norm": 0.5031485557556152, "learning_rate": 9.997416187728787e-07, "loss": 2.3825, "step": 61 },
    { "epoch": 0.0027576391051016324, "grad_norm": 0.40436115860939026, "learning_rate": 9.997373724762788e-07, "loss": 1.9051, "step": 62 },
    { "epoch": 0.0028021171551839167, "grad_norm": 0.3216610252857208, "learning_rate": 9.997331258388588e-07, "loss": 1.7655, "step": 63 },
    { "epoch": 0.002846595205266201, "grad_norm": 0.30226317048072815, "learning_rate": 9.997288788605777e-07, "loss": 2.215, "step": 64 },
    { "epoch": 0.0028910732553484855, "grad_norm": 0.34857413172721863, "learning_rate": 9.997246315413945e-07, "loss": 2.1704, "step": 65 },
    { "epoch": 0.00293555130543077, "grad_norm": 0.4939625859260559, "learning_rate": 9.99720383881268e-07, "loss": 2.1591, "step": 66 },
    { "epoch": 0.0029800293555130542, "grad_norm": 0.6396478414535522, "learning_rate": 9.997161358801571e-07, "loss": 2.4183, "step": 67 },
    { "epoch": 0.0030245074055953386, "grad_norm": 0.3547438681125641, "learning_rate": 9.99711887538021e-07, "loss": 2.3338, "step": 68 },
    { "epoch": 0.003068985455677623, "grad_norm": 0.455522745847702, "learning_rate": 9.997076388548186e-07, "loss": 2.2559, "step": 69 },
    { "epoch": 0.0031134635057599074, "grad_norm": 0.5139729976654053, "learning_rate": 9.997033898305084e-07, "loss": 2.4271, "step": 70 },
    { "epoch": 0.0031134635057599074, "eval_loss": 1.974100947380066, "eval_runtime": 231.1208, "eval_samples_per_second": 0.277, "eval_steps_per_second": 0.069, "step": 70 },
    { "epoch": 0.0031579415558421918, "grad_norm": 0.3868389427661896, "learning_rate": 9.996991404650499e-07, "loss": 1.8754, "step": 71 },
    { "epoch": 0.003202419605924476, "grad_norm": 0.4664241373538971, "learning_rate": 9.996948907584016e-07, "loss": 1.9934, "step": 72 },
    { "epoch": 0.0032468976560067605, "grad_norm": 0.3952767848968506, "learning_rate": 9.996906407105226e-07, "loss": 2.0883, "step": 73 },
    { "epoch": 0.003291375706089045, "grad_norm": 0.4785691797733307, "learning_rate": 9.996863903213718e-07, "loss": 2.3203, "step": 74 },
    { "epoch": 0.0033358537561713293, "grad_norm": 0.4103385806083679, "learning_rate": 9.99682139590908e-07, "loss": 2.0406, "step": 75 },
    { "epoch": 0.0033803318062536137, "grad_norm": 0.45813262462615967, "learning_rate": 9.996778885190904e-07, "loss": 2.1745, "step": 76 },
    { "epoch": 0.003424809856335898, "grad_norm": 0.34197866916656494, "learning_rate": 9.996736371058771e-07, "loss": 2.0839, "step": 77 },
    { "epoch": 0.0034692879064181824, "grad_norm": 0.4917107820510864, "learning_rate": 9.996693853512279e-07, "loss": 1.9646, "step": 78 },
    { "epoch": 0.0035137659565004672, "grad_norm": 0.570755124092102, "learning_rate": 9.99665133255101e-07, "loss": 2.4128, "step": 79 },
    { "epoch": 0.0035582440065827516, "grad_norm": 0.6334550380706787, "learning_rate": 9.996608808174557e-07, "loss": 2.3972, "step": 80 },
    { "epoch": 0.0035582440065827516, "eval_loss": 1.9703996181488037, "eval_runtime": 236.153, "eval_samples_per_second": 0.271, "eval_steps_per_second": 0.068, "step": 80 },
    { "epoch": 0.003602722056665036, "grad_norm": 0.44049328565597534, "learning_rate": 9.996566280382507e-07, "loss": 2.389, "step": 81 },
    { "epoch": 0.0036472001067473204, "grad_norm": 0.5198694467544556, "learning_rate": 9.996523749174444e-07, "loss": 1.8092, "step": 82 },
    { "epoch": 0.0036916781568296047, "grad_norm": 0.4297351837158203, "learning_rate": 9.996481214549966e-07, "loss": 1.9158, "step": 83 },
    { "epoch": 0.003736156206911889, "grad_norm": 0.5207564234733582, "learning_rate": 9.996438676508653e-07, "loss": 2.3368, "step": 84 },
    { "epoch": 0.0037806342569941735, "grad_norm": 3.4639275074005127, "learning_rate": 9.996396135050097e-07, "loss": 2.0157, "step": 85 },
    { "epoch": 0.003825112307076458, "grad_norm": 0.5132240056991577, "learning_rate": 9.996353590173885e-07, "loss": 2.1738, "step": 86 },
    { "epoch": 0.0038695903571587423, "grad_norm": 0.559355616569519, "learning_rate": 9.996311041879605e-07, "loss": 2.2668, "step": 87 },
    { "epoch": 0.003914068407241026, "grad_norm": 0.40078872442245483, "learning_rate": 9.996268490166847e-07, "loss": 2.0339, "step": 88 },
    { "epoch": 0.003958546457323311, "grad_norm": 0.43362948298454285, "learning_rate": 9.996225935035196e-07, "loss": 2.3476, "step": 89 },
    { "epoch": 0.004003024507405595, "grad_norm": 0.6087605953216553, "learning_rate": 9.99618337648424e-07, "loss": 2.3268, "step": 90 },
    { "epoch": 0.004003024507405595, "eval_loss": 1.9674957990646362, "eval_runtime": 241.976, "eval_samples_per_second": 0.264, "eval_steps_per_second": 0.066, "step": 90 },
    { "epoch": 0.004047502557487879, "grad_norm": 3.6922800540924072, "learning_rate": 9.996140814513573e-07, "loss": 2.2244, "step": 91 },
    { "epoch": 0.004091980607570164, "grad_norm": 0.6901090145111084, "learning_rate": 9.996098249122776e-07, "loss": 2.127, "step": 92 },
    { "epoch": 0.004136458657652448, "grad_norm": 0.42567893862724304, "learning_rate": 9.99605568031144e-07, "loss": 1.99, "step": 93 },
    { "epoch": 0.0041809367077347325, "grad_norm": 0.37529805302619934, "learning_rate": 9.996013108079149e-07, "loss": 2.1369, "step": 94 },
    { "epoch": 0.004225414757817018, "grad_norm": 0.5616635084152222, "learning_rate": 9.995970532425493e-07, "loss": 1.8421, "step": 95 },
    { "epoch": 0.004269892807899302, "grad_norm": 0.3917858898639679, "learning_rate": 9.995927953350061e-07, "loss": 2.0905, "step": 96 },
    { "epoch": 0.0043143708579815865, "grad_norm": 0.3693113923072815, "learning_rate": 9.99588537085244e-07, "loss": 2.326, "step": 97 },
    { "epoch": 0.004358848908063871, "grad_norm": 0.4595705270767212, "learning_rate": 9.995842784932216e-07, "loss": 1.8433, "step": 98 },
    { "epoch": 0.004403326958146155, "grad_norm": 0.46681633591651917, "learning_rate": 9.995800195588977e-07, "loss": 2.21, "step": 99 },
    { "epoch": 0.00444780500822844, "grad_norm": 0.5430231690406799, "learning_rate": 9.99575760282231e-07, "loss": 2.4849, "step": 100 },
    { "epoch": 0.00444780500822844, "eval_loss": 1.964964509010315, "eval_runtime": 231.4209, "eval_samples_per_second": 0.277, "eval_steps_per_second": 0.069, "step": 100 }
  ],
  "logging_steps": 1,
  "max_steps": 22483,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.248725545091072e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}