|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 9280, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.021551724137931036, |
|
"grad_norm": 20.742843627929688, |
|
"learning_rate": 1.002155172413793e-06, |
|
"loss": 2.152, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04310344827586207, |
|
"grad_norm": 7.82145357131958, |
|
"learning_rate": 2.079741379310345e-06, |
|
"loss": 0.677, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.06465517241379311, |
|
"grad_norm": 51.326385498046875, |
|
"learning_rate": 3.157327586206897e-06, |
|
"loss": 0.5383, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.08620689655172414, |
|
"grad_norm": 22.487550735473633, |
|
"learning_rate": 4.234913793103448e-06, |
|
"loss": 0.481, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.10775862068965517, |
|
"grad_norm": 37.75484085083008, |
|
"learning_rate": 5.3125e-06, |
|
"loss": 0.4856, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.12931034482758622, |
|
"grad_norm": 27.280712127685547, |
|
"learning_rate": 6.3900862068965515e-06, |
|
"loss": 0.4785, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.15086206896551724, |
|
"grad_norm": 38.151546478271484, |
|
"learning_rate": 7.467672413793104e-06, |
|
"loss": 0.4623, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.1724137931034483, |
|
"grad_norm": 25.77330780029297, |
|
"learning_rate": 8.545258620689656e-06, |
|
"loss": 0.4584, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.1939655172413793, |
|
"grad_norm": 22.47431755065918, |
|
"learning_rate": 9.622844827586207e-06, |
|
"loss": 0.4572, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.21551724137931033, |
|
"grad_norm": 9.425881385803223, |
|
"learning_rate": 9.922174329501917e-06, |
|
"loss": 0.4247, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.23706896551724138, |
|
"grad_norm": 19.910789489746094, |
|
"learning_rate": 9.802442528735633e-06, |
|
"loss": 0.4208, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.25862068965517243, |
|
"grad_norm": 16.591716766357422, |
|
"learning_rate": 9.68271072796935e-06, |
|
"loss": 0.4366, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.2801724137931034, |
|
"grad_norm": 20.817670822143555, |
|
"learning_rate": 9.562978927203065e-06, |
|
"loss": 0.3789, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.3017241379310345, |
|
"grad_norm": 18.30855369567871, |
|
"learning_rate": 9.443247126436782e-06, |
|
"loss": 0.4443, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.3232758620689655, |
|
"grad_norm": 26.79413414001465, |
|
"learning_rate": 9.3235153256705e-06, |
|
"loss": 0.4019, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3448275862068966, |
|
"grad_norm": 23.13565444946289, |
|
"learning_rate": 9.203783524904215e-06, |
|
"loss": 0.3181, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.36637931034482757, |
|
"grad_norm": 29.596229553222656, |
|
"learning_rate": 9.084051724137932e-06, |
|
"loss": 0.3621, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.3879310344827586, |
|
"grad_norm": 20.81353759765625, |
|
"learning_rate": 8.964319923371648e-06, |
|
"loss": 0.3861, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.40948275862068967, |
|
"grad_norm": 46.100711822509766, |
|
"learning_rate": 8.844588122605365e-06, |
|
"loss": 0.407, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.43103448275862066, |
|
"grad_norm": 5.473584175109863, |
|
"learning_rate": 8.724856321839082e-06, |
|
"loss": 0.4148, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.4525862068965517, |
|
"grad_norm": 23.936071395874023, |
|
"learning_rate": 8.605124521072798e-06, |
|
"loss": 0.3606, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.47413793103448276, |
|
"grad_norm": 12.36728572845459, |
|
"learning_rate": 8.485392720306515e-06, |
|
"loss": 0.3567, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.4956896551724138, |
|
"grad_norm": 17.580957412719727, |
|
"learning_rate": 8.36566091954023e-06, |
|
"loss": 0.3277, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.5172413793103449, |
|
"grad_norm": 11.993826866149902, |
|
"learning_rate": 8.245929118773946e-06, |
|
"loss": 0.3921, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.5387931034482759, |
|
"grad_norm": 20.464040756225586, |
|
"learning_rate": 8.126197318007663e-06, |
|
"loss": 0.3395, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.5603448275862069, |
|
"grad_norm": 14.831501960754395, |
|
"learning_rate": 8.00646551724138e-06, |
|
"loss": 0.3643, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.5818965517241379, |
|
"grad_norm": 10.325621604919434, |
|
"learning_rate": 7.886733716475098e-06, |
|
"loss": 0.3484, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.603448275862069, |
|
"grad_norm": 33.5569953918457, |
|
"learning_rate": 7.767001915708813e-06, |
|
"loss": 0.3196, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 17.34543228149414, |
|
"learning_rate": 7.64727011494253e-06, |
|
"loss": 0.3094, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.646551724137931, |
|
"grad_norm": 15.79505443572998, |
|
"learning_rate": 7.527538314176246e-06, |
|
"loss": 0.3489, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.6681034482758621, |
|
"grad_norm": 15.39349365234375, |
|
"learning_rate": 7.407806513409962e-06, |
|
"loss": 0.3054, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 20.706342697143555, |
|
"learning_rate": 7.288074712643679e-06, |
|
"loss": 0.3013, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.7112068965517241, |
|
"grad_norm": 19.69810676574707, |
|
"learning_rate": 7.168342911877395e-06, |
|
"loss": 0.3408, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.7327586206896551, |
|
"grad_norm": 12.575122833251953, |
|
"learning_rate": 7.048611111111112e-06, |
|
"loss": 0.3463, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.7543103448275862, |
|
"grad_norm": 13.906023979187012, |
|
"learning_rate": 6.928879310344828e-06, |
|
"loss": 0.3005, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.7758620689655172, |
|
"grad_norm": 23.61775016784668, |
|
"learning_rate": 6.809147509578544e-06, |
|
"loss": 0.3072, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.7974137931034483, |
|
"grad_norm": 6.779231548309326, |
|
"learning_rate": 6.689415708812261e-06, |
|
"loss": 0.3368, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.8189655172413793, |
|
"grad_norm": 30.881776809692383, |
|
"learning_rate": 6.569683908045977e-06, |
|
"loss": 0.2608, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.8405172413793104, |
|
"grad_norm": 20.54507064819336, |
|
"learning_rate": 6.449952107279695e-06, |
|
"loss": 0.3227, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.8620689655172413, |
|
"grad_norm": 28.014013290405273, |
|
"learning_rate": 6.33022030651341e-06, |
|
"loss": 0.2993, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.8836206896551724, |
|
"grad_norm": 10.269726753234863, |
|
"learning_rate": 6.2104885057471265e-06, |
|
"loss": 0.298, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.9051724137931034, |
|
"grad_norm": 25.653640747070312, |
|
"learning_rate": 6.090756704980844e-06, |
|
"loss": 0.2789, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.9267241379310345, |
|
"grad_norm": 10.718826293945312, |
|
"learning_rate": 5.97102490421456e-06, |
|
"loss": 0.3076, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.9482758620689655, |
|
"grad_norm": 19.0034236907959, |
|
"learning_rate": 5.851293103448276e-06, |
|
"loss": 0.298, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.9698275862068966, |
|
"grad_norm": 7.18696928024292, |
|
"learning_rate": 5.731561302681993e-06, |
|
"loss": 0.2953, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.9913793103448276, |
|
"grad_norm": 15.798299789428711, |
|
"learning_rate": 5.613026819923372e-06, |
|
"loss": 0.2845, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.0129310344827587, |
|
"grad_norm": 25.310443878173828, |
|
"learning_rate": 5.4932950191570884e-06, |
|
"loss": 0.2819, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.0344827586206897, |
|
"grad_norm": 1.2570937871932983, |
|
"learning_rate": 5.373563218390805e-06, |
|
"loss": 0.2247, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.0560344827586208, |
|
"grad_norm": 17.156612396240234, |
|
"learning_rate": 5.253831417624522e-06, |
|
"loss": 0.2716, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.0775862068965518, |
|
"grad_norm": 27.063459396362305, |
|
"learning_rate": 5.134099616858238e-06, |
|
"loss": 0.2198, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.0991379310344827, |
|
"grad_norm": 16.36822509765625, |
|
"learning_rate": 5.014367816091954e-06, |
|
"loss": 0.2677, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.1206896551724137, |
|
"grad_norm": 23.680177688598633, |
|
"learning_rate": 4.894636015325671e-06, |
|
"loss": 0.2522, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.1422413793103448, |
|
"grad_norm": 4.396277904510498, |
|
"learning_rate": 4.774904214559387e-06, |
|
"loss": 0.2101, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.1637931034482758, |
|
"grad_norm": 3.8707330226898193, |
|
"learning_rate": 4.655172413793104e-06, |
|
"loss": 0.2532, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.1853448275862069, |
|
"grad_norm": 22.42334747314453, |
|
"learning_rate": 4.53544061302682e-06, |
|
"loss": 0.2386, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.206896551724138, |
|
"grad_norm": 1.4298897981643677, |
|
"learning_rate": 4.4157088122605364e-06, |
|
"loss": 0.2426, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.228448275862069, |
|
"grad_norm": 4.422305107116699, |
|
"learning_rate": 4.295977011494254e-06, |
|
"loss": 0.2399, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 30.610410690307617, |
|
"learning_rate": 4.17624521072797e-06, |
|
"loss": 0.2517, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.271551724137931, |
|
"grad_norm": 24.947818756103516, |
|
"learning_rate": 4.056513409961686e-06, |
|
"loss": 0.2367, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.293103448275862, |
|
"grad_norm": 15.14891529083252, |
|
"learning_rate": 3.936781609195403e-06, |
|
"loss": 0.2224, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.3146551724137931, |
|
"grad_norm": 6.019184112548828, |
|
"learning_rate": 3.817049808429119e-06, |
|
"loss": 0.246, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.3362068965517242, |
|
"grad_norm": 5.349549293518066, |
|
"learning_rate": 3.697318007662836e-06, |
|
"loss": 0.2203, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.3577586206896552, |
|
"grad_norm": 19.479036331176758, |
|
"learning_rate": 3.577586206896552e-06, |
|
"loss": 0.2262, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.3793103448275863, |
|
"grad_norm": 13.838797569274902, |
|
"learning_rate": 3.457854406130268e-06, |
|
"loss": 0.2064, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.4008620689655173, |
|
"grad_norm": 23.992177963256836, |
|
"learning_rate": 3.338122605363985e-06, |
|
"loss": 0.2557, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.4224137931034484, |
|
"grad_norm": 17.90095329284668, |
|
"learning_rate": 3.2183908045977012e-06, |
|
"loss": 0.2091, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.4439655172413794, |
|
"grad_norm": 6.963798999786377, |
|
"learning_rate": 3.098659003831418e-06, |
|
"loss": 0.1997, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.4655172413793103, |
|
"grad_norm": 17.81186866760254, |
|
"learning_rate": 2.9789272030651344e-06, |
|
"loss": 0.2302, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.4870689655172413, |
|
"grad_norm": 11.2035551071167, |
|
"learning_rate": 2.859195402298851e-06, |
|
"loss": 0.2031, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.5086206896551724, |
|
"grad_norm": 10.019887924194336, |
|
"learning_rate": 2.739463601532567e-06, |
|
"loss": 0.2459, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.5301724137931034, |
|
"grad_norm": 15.275938987731934, |
|
"learning_rate": 2.6197318007662834e-06, |
|
"loss": 0.2386, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.5517241379310345, |
|
"grad_norm": 16.73408317565918, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.2315, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.5732758620689655, |
|
"grad_norm": 7.654747486114502, |
|
"learning_rate": 2.380268199233717e-06, |
|
"loss": 0.2596, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.5948275862068966, |
|
"grad_norm": 18.679759979248047, |
|
"learning_rate": 2.260536398467433e-06, |
|
"loss": 0.2366, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.6163793103448276, |
|
"grad_norm": 9.820782661437988, |
|
"learning_rate": 2.1408045977011497e-06, |
|
"loss": 0.2145, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.6379310344827587, |
|
"grad_norm": 7.739869594573975, |
|
"learning_rate": 2.021072796934866e-06, |
|
"loss": 0.2167, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.6594827586206895, |
|
"grad_norm": 30.86256217956543, |
|
"learning_rate": 1.9013409961685824e-06, |
|
"loss": 0.2322, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.6810344827586206, |
|
"grad_norm": 13.007984161376953, |
|
"learning_rate": 1.781609195402299e-06, |
|
"loss": 0.308, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.7025862068965516, |
|
"grad_norm": 13.239990234375, |
|
"learning_rate": 1.6618773946360153e-06, |
|
"loss": 0.2166, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 1.7241379310344827, |
|
"grad_norm": 11.643211364746094, |
|
"learning_rate": 1.5421455938697319e-06, |
|
"loss": 0.2197, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.7456896551724137, |
|
"grad_norm": 23.515066146850586, |
|
"learning_rate": 1.4224137931034484e-06, |
|
"loss": 0.2014, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 1.7672413793103448, |
|
"grad_norm": 8.773682594299316, |
|
"learning_rate": 1.3026819923371648e-06, |
|
"loss": 0.2411, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 1.7887931034482758, |
|
"grad_norm": 16.361536026000977, |
|
"learning_rate": 1.1829501915708814e-06, |
|
"loss": 0.2072, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 1.8103448275862069, |
|
"grad_norm": 6.065272331237793, |
|
"learning_rate": 1.0632183908045977e-06, |
|
"loss": 0.2527, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.831896551724138, |
|
"grad_norm": 2.8060665130615234, |
|
"learning_rate": 9.434865900383143e-07, |
|
"loss": 0.2209, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.853448275862069, |
|
"grad_norm": 5.463712215423584, |
|
"learning_rate": 8.237547892720307e-07, |
|
"loss": 0.2367, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 6.7926836013793945, |
|
"learning_rate": 7.040229885057472e-07, |
|
"loss": 0.2142, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 1.896551724137931, |
|
"grad_norm": 8.841383934020996, |
|
"learning_rate": 5.842911877394636e-07, |
|
"loss": 0.2026, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 1.918103448275862, |
|
"grad_norm": 5.810873985290527, |
|
"learning_rate": 4.6455938697318016e-07, |
|
"loss": 0.2433, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 1.9396551724137931, |
|
"grad_norm": 21.808469772338867, |
|
"learning_rate": 3.4482758620689656e-07, |
|
"loss": 0.1935, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.9612068965517242, |
|
"grad_norm": 26.461162567138672, |
|
"learning_rate": 2.2509578544061305e-07, |
|
"loss": 0.2274, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 1.9827586206896552, |
|
"grad_norm": 13.109670639038086, |
|
"learning_rate": 1.0536398467432952e-07, |
|
"loss": 0.2177, |
|
"step": 9200 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 9280, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|