|
{ |
|
"best_metric": 2.126425266265869, |
|
"best_model_checkpoint": "/work/Ccp-OldNewsBERT_2024/modelling/checkpoint-95500", |
|
"epoch": 15.0, |
|
"eval_steps": 500, |
|
"global_step": 98640, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.07603406326034064, |
|
"grad_norm": 1.1528505086898804, |
|
"learning_rate": 1.25e-05, |
|
"loss": 8.5532, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.07603406326034064, |
|
"eval_loss": 7.3156208992004395, |
|
"eval_runtime": 392.108, |
|
"eval_samples_per_second": 1073.301, |
|
"eval_steps_per_second": 4.193, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.15206812652068127, |
|
"grad_norm": 1.3152525424957275, |
|
"learning_rate": 2.5e-05, |
|
"loss": 7.0493, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.15206812652068127, |
|
"eval_loss": 6.841813087463379, |
|
"eval_runtime": 392.0485, |
|
"eval_samples_per_second": 1073.464, |
|
"eval_steps_per_second": 4.193, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2281021897810219, |
|
"grad_norm": 1.7870614528656006, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 6.7805, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.2281021897810219, |
|
"eval_loss": 6.62256383895874, |
|
"eval_runtime": 391.9186, |
|
"eval_samples_per_second": 1073.82, |
|
"eval_steps_per_second": 4.195, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.30413625304136255, |
|
"grad_norm": 1.1754438877105713, |
|
"learning_rate": 5e-05, |
|
"loss": 6.5763, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.30413625304136255, |
|
"eval_loss": 6.442608833312988, |
|
"eval_runtime": 391.8632, |
|
"eval_samples_per_second": 1073.972, |
|
"eval_steps_per_second": 4.195, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.38017031630170317, |
|
"grad_norm": 1.4492470026016235, |
|
"learning_rate": 4.999669762518974e-05, |
|
"loss": 6.4084, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.38017031630170317, |
|
"eval_loss": 6.2790422439575195, |
|
"eval_runtime": 395.0617, |
|
"eval_samples_per_second": 1065.277, |
|
"eval_steps_per_second": 4.161, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.4562043795620438, |
|
"grad_norm": 1.4527273178100586, |
|
"learning_rate": 4.9986791373213283e-05, |
|
"loss": 6.2422, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.4562043795620438, |
|
"eval_loss": 6.124966621398926, |
|
"eval_runtime": 394.995, |
|
"eval_samples_per_second": 1065.457, |
|
"eval_steps_per_second": 4.162, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5322384428223844, |
|
"grad_norm": 1.8543823957443237, |
|
"learning_rate": 4.997028386120321e-05, |
|
"loss": 6.0635, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.5322384428223844, |
|
"eval_loss": 5.868374347686768, |
|
"eval_runtime": 394.8994, |
|
"eval_samples_per_second": 1065.715, |
|
"eval_steps_per_second": 4.163, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6082725060827251, |
|
"grad_norm": 1.973868489265442, |
|
"learning_rate": 4.994717945027886e-05, |
|
"loss": 5.7633, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.6082725060827251, |
|
"eval_loss": 5.493896961212158, |
|
"eval_runtime": 394.9221, |
|
"eval_samples_per_second": 1065.653, |
|
"eval_steps_per_second": 4.163, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.6843065693430657, |
|
"grad_norm": 1.8778235912322998, |
|
"learning_rate": 4.99174842443942e-05, |
|
"loss": 5.429, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.6843065693430657, |
|
"eval_loss": 5.116617679595947, |
|
"eval_runtime": 394.8716, |
|
"eval_samples_per_second": 1065.79, |
|
"eval_steps_per_second": 4.163, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.7603406326034063, |
|
"grad_norm": 2.0149049758911133, |
|
"learning_rate": 4.9881206088725227e-05, |
|
"loss": 5.0808, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.7603406326034063, |
|
"eval_loss": 4.774472713470459, |
|
"eval_runtime": 395.1882, |
|
"eval_samples_per_second": 1064.936, |
|
"eval_steps_per_second": 4.16, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.8363746958637469, |
|
"grad_norm": 1.7959963083267212, |
|
"learning_rate": 4.983835456759734e-05, |
|
"loss": 4.7725, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.8363746958637469, |
|
"eval_loss": 4.4951066970825195, |
|
"eval_runtime": 395.052, |
|
"eval_samples_per_second": 1065.303, |
|
"eval_steps_per_second": 4.161, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.9124087591240876, |
|
"grad_norm": 1.6965287923812866, |
|
"learning_rate": 4.978894100195325e-05, |
|
"loss": 4.5344, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.9124087591240876, |
|
"eval_loss": 4.28698205947876, |
|
"eval_runtime": 395.1764, |
|
"eval_samples_per_second": 1064.967, |
|
"eval_steps_per_second": 4.16, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.9884428223844283, |
|
"grad_norm": 1.6758971214294434, |
|
"learning_rate": 4.973297844636212e-05, |
|
"loss": 4.3347, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.9884428223844283, |
|
"eval_loss": 4.129937648773193, |
|
"eval_runtime": 395.104, |
|
"eval_samples_per_second": 1065.163, |
|
"eval_steps_per_second": 4.161, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.0644768856447688, |
|
"grad_norm": 1.7391337156295776, |
|
"learning_rate": 4.9670481685570645e-05, |
|
"loss": 4.1883, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.0644768856447688, |
|
"eval_loss": 4.002706050872803, |
|
"eval_runtime": 395.1014, |
|
"eval_samples_per_second": 1065.17, |
|
"eval_steps_per_second": 4.161, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.1405109489051095, |
|
"grad_norm": 1.592909812927246, |
|
"learning_rate": 4.960146723059713e-05, |
|
"loss": 4.0579, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.1405109489051095, |
|
"eval_loss": 3.8906095027923584, |
|
"eval_runtime": 395.0202, |
|
"eval_samples_per_second": 1065.389, |
|
"eval_steps_per_second": 4.162, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.2165450121654502, |
|
"grad_norm": 1.7625865936279297, |
|
"learning_rate": 4.952595331436939e-05, |
|
"loss": 3.9484, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.2165450121654502, |
|
"eval_loss": 3.793649673461914, |
|
"eval_runtime": 395.0939, |
|
"eval_samples_per_second": 1065.19, |
|
"eval_steps_per_second": 4.161, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.2925790754257909, |
|
"grad_norm": 1.5408483743667603, |
|
"learning_rate": 4.9443959886907786e-05, |
|
"loss": 3.8541, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.2925790754257909, |
|
"eval_loss": 3.707909107208252, |
|
"eval_runtime": 395.0107, |
|
"eval_samples_per_second": 1065.414, |
|
"eval_steps_per_second": 4.162, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.3686131386861313, |
|
"grad_norm": 1.5377788543701172, |
|
"learning_rate": 4.935550861005469e-05, |
|
"loss": 3.7751, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.3686131386861313, |
|
"eval_loss": 3.6381478309631348, |
|
"eval_runtime": 395.0196, |
|
"eval_samples_per_second": 1065.39, |
|
"eval_steps_per_second": 4.162, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.444647201946472, |
|
"grad_norm": 1.5185712575912476, |
|
"learning_rate": 4.926062285175158e-05, |
|
"loss": 3.7, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.444647201946472, |
|
"eval_loss": 3.56645131111145, |
|
"eval_runtime": 395.0168, |
|
"eval_samples_per_second": 1065.398, |
|
"eval_steps_per_second": 4.162, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.5206812652068127, |
|
"grad_norm": 1.4640849828720093, |
|
"learning_rate": 4.9159536649297986e-05, |
|
"loss": 3.6397, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.5206812652068127, |
|
"eval_loss": 3.5038576126098633, |
|
"eval_runtime": 395.2609, |
|
"eval_samples_per_second": 1064.74, |
|
"eval_steps_per_second": 4.159, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.5967153284671531, |
|
"grad_norm": 1.7205146551132202, |
|
"learning_rate": 4.9051871562474056e-05, |
|
"loss": 3.5783, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.5967153284671531, |
|
"eval_loss": 3.4472110271453857, |
|
"eval_runtime": 395.1986, |
|
"eval_samples_per_second": 1064.908, |
|
"eval_steps_per_second": 4.16, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.672749391727494, |
|
"grad_norm": 1.605870008468628, |
|
"learning_rate": 4.8937852212067106e-05, |
|
"loss": 3.5196, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.672749391727494, |
|
"eval_loss": 3.3966190814971924, |
|
"eval_runtime": 395.1285, |
|
"eval_samples_per_second": 1065.097, |
|
"eval_steps_per_second": 4.161, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.7487834549878345, |
|
"grad_norm": 1.6770403385162354, |
|
"learning_rate": 4.8817508720847596e-05, |
|
"loss": 3.4701, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.7487834549878345, |
|
"eval_loss": 3.34128999710083, |
|
"eval_runtime": 395.1091, |
|
"eval_samples_per_second": 1065.149, |
|
"eval_steps_per_second": 4.161, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.8248175182481752, |
|
"grad_norm": 1.5218740701675415, |
|
"learning_rate": 4.869087288236064e-05, |
|
"loss": 3.4226, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.8248175182481752, |
|
"eval_loss": 3.301135540008545, |
|
"eval_runtime": 395.0668, |
|
"eval_samples_per_second": 1065.263, |
|
"eval_steps_per_second": 4.161, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.9008515815085159, |
|
"grad_norm": 1.528290867805481, |
|
"learning_rate": 4.855797815252648e-05, |
|
"loss": 3.3704, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.9008515815085159, |
|
"eval_loss": 3.255563735961914, |
|
"eval_runtime": 395.0617, |
|
"eval_samples_per_second": 1065.277, |
|
"eval_steps_per_second": 4.161, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.9768856447688563, |
|
"grad_norm": 1.4962824583053589, |
|
"learning_rate": 4.8418859640801796e-05, |
|
"loss": 3.3326, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.9768856447688563, |
|
"eval_loss": 3.2163586616516113, |
|
"eval_runtime": 395.1594, |
|
"eval_samples_per_second": 1065.013, |
|
"eval_steps_per_second": 4.16, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 2.052919708029197, |
|
"grad_norm": 1.5214394330978394, |
|
"learning_rate": 4.8273554100904066e-05, |
|
"loss": 3.2872, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 2.052919708029197, |
|
"eval_loss": 3.178077220916748, |
|
"eval_runtime": 395.023, |
|
"eval_samples_per_second": 1065.381, |
|
"eval_steps_per_second": 4.162, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 2.1289537712895377, |
|
"grad_norm": 1.6362810134887695, |
|
"learning_rate": 4.8122408939478185e-05, |
|
"loss": 3.2453, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 2.1289537712895377, |
|
"eval_loss": 3.1436197757720947, |
|
"eval_runtime": 395.163, |
|
"eval_samples_per_second": 1065.004, |
|
"eval_steps_per_second": 4.16, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 2.204987834549878, |
|
"grad_norm": 1.6314831972122192, |
|
"learning_rate": 4.79651794790509e-05, |
|
"loss": 3.2149, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 2.204987834549878, |
|
"eval_loss": 3.1076748371124268, |
|
"eval_runtime": 395.3195, |
|
"eval_samples_per_second": 1064.582, |
|
"eval_steps_per_second": 4.159, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 2.281021897810219, |
|
"grad_norm": 1.5647250413894653, |
|
"learning_rate": 4.7801573854264494e-05, |
|
"loss": 3.1836, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.281021897810219, |
|
"eval_loss": 3.081753969192505, |
|
"eval_runtime": 395.1551, |
|
"eval_samples_per_second": 1065.025, |
|
"eval_steps_per_second": 4.16, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.3570559610705595, |
|
"grad_norm": 1.559869408607483, |
|
"learning_rate": 4.763194428202762e-05, |
|
"loss": 3.1459, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 2.3570559610705595, |
|
"eval_loss": 3.044140100479126, |
|
"eval_runtime": 395.2791, |
|
"eval_samples_per_second": 1064.691, |
|
"eval_steps_per_second": 4.159, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 2.4330900243309004, |
|
"grad_norm": 1.669546365737915, |
|
"learning_rate": 4.745633557677441e-05, |
|
"loss": 3.1298, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.4330900243309004, |
|
"eval_loss": 3.015268325805664, |
|
"eval_runtime": 395.2158, |
|
"eval_samples_per_second": 1064.861, |
|
"eval_steps_per_second": 4.16, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.509124087591241, |
|
"grad_norm": 1.5877552032470703, |
|
"learning_rate": 4.727479413256602e-05, |
|
"loss": 3.0882, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 2.509124087591241, |
|
"eval_loss": 2.9866795539855957, |
|
"eval_runtime": 395.1307, |
|
"eval_samples_per_second": 1065.091, |
|
"eval_steps_per_second": 4.161, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 2.5851581508515817, |
|
"grad_norm": 1.6820305585861206, |
|
"learning_rate": 4.708736791083384e-05, |
|
"loss": 3.0738, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.5851581508515817, |
|
"eval_loss": 2.957209587097168, |
|
"eval_runtime": 395.2085, |
|
"eval_samples_per_second": 1064.881, |
|
"eval_steps_per_second": 4.16, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.661192214111922, |
|
"grad_norm": 1.4878249168395996, |
|
"learning_rate": 4.6894106427708574e-05, |
|
"loss": 3.0409, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.661192214111922, |
|
"eval_loss": 2.931816339492798, |
|
"eval_runtime": 395.1436, |
|
"eval_samples_per_second": 1065.056, |
|
"eval_steps_per_second": 4.161, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.7372262773722627, |
|
"grad_norm": 1.5256247520446777, |
|
"learning_rate": 4.669546457024816e-05, |
|
"loss": 3.0155, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.7372262773722627, |
|
"eval_loss": 2.9121601581573486, |
|
"eval_runtime": 395.308, |
|
"eval_samples_per_second": 1064.613, |
|
"eval_steps_per_second": 4.159, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.8132603406326036, |
|
"grad_norm": 1.6648399829864502, |
|
"learning_rate": 4.649069867545623e-05, |
|
"loss": 2.9909, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.8132603406326036, |
|
"eval_loss": 2.890857219696045, |
|
"eval_runtime": 395.436, |
|
"eval_samples_per_second": 1064.268, |
|
"eval_steps_per_second": 4.157, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.889294403892944, |
|
"grad_norm": 1.6078656911849976, |
|
"learning_rate": 4.628025515330744e-05, |
|
"loss": 2.9754, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.889294403892944, |
|
"eval_loss": 2.865665912628174, |
|
"eval_runtime": 395.2576, |
|
"eval_samples_per_second": 1064.749, |
|
"eval_steps_per_second": 4.159, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.9653284671532845, |
|
"grad_norm": 1.595712661743164, |
|
"learning_rate": 4.60641896008727e-05, |
|
"loss": 2.9512, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.9653284671532845, |
|
"eval_loss": 2.8427441120147705, |
|
"eval_runtime": 395.1474, |
|
"eval_samples_per_second": 1065.046, |
|
"eval_steps_per_second": 4.16, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 3.0413625304136254, |
|
"grad_norm": 1.5582592487335205, |
|
"learning_rate": 4.584255910050703e-05, |
|
"loss": 2.9132, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 3.0413625304136254, |
|
"eval_loss": 2.821183681488037, |
|
"eval_runtime": 395.3458, |
|
"eval_samples_per_second": 1064.511, |
|
"eval_steps_per_second": 4.158, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 3.117396593673966, |
|
"grad_norm": 1.6548606157302856, |
|
"learning_rate": 4.561588193429872e-05, |
|
"loss": 2.9021, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 3.117396593673966, |
|
"eval_loss": 2.802894115447998, |
|
"eval_runtime": 395.4185, |
|
"eval_samples_per_second": 1064.315, |
|
"eval_steps_per_second": 4.158, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 3.1934306569343067, |
|
"grad_norm": 1.6921550035476685, |
|
"learning_rate": 4.538330948241111e-05, |
|
"loss": 2.8889, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 3.1934306569343067, |
|
"eval_loss": 2.7827913761138916, |
|
"eval_runtime": 395.1602, |
|
"eval_samples_per_second": 1065.011, |
|
"eval_steps_per_second": 4.16, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 3.269464720194647, |
|
"grad_norm": 1.7307897806167603, |
|
"learning_rate": 4.514535196430073e-05, |
|
"loss": 2.8642, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 3.269464720194647, |
|
"eval_loss": 2.767017126083374, |
|
"eval_runtime": 395.2268, |
|
"eval_samples_per_second": 1064.832, |
|
"eval_steps_per_second": 4.16, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 3.345498783454988, |
|
"grad_norm": 1.7314034700393677, |
|
"learning_rate": 4.490207224596068e-05, |
|
"loss": 2.8517, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 3.345498783454988, |
|
"eval_loss": 2.747631311416626, |
|
"eval_runtime": 395.1865, |
|
"eval_samples_per_second": 1064.94, |
|
"eval_steps_per_second": 4.16, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 3.4215328467153285, |
|
"grad_norm": 1.7844088077545166, |
|
"learning_rate": 4.465353459945605e-05, |
|
"loss": 2.8341, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 3.4215328467153285, |
|
"eval_loss": 2.7319579124450684, |
|
"eval_runtime": 395.3244, |
|
"eval_samples_per_second": 1064.569, |
|
"eval_steps_per_second": 4.159, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 3.497566909975669, |
|
"grad_norm": 1.5570697784423828, |
|
"learning_rate": 4.43998046859439e-05, |
|
"loss": 2.8102, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 3.497566909975669, |
|
"eval_loss": 2.7134299278259277, |
|
"eval_runtime": 395.3813, |
|
"eval_samples_per_second": 1064.416, |
|
"eval_steps_per_second": 4.158, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 3.57360097323601, |
|
"grad_norm": 1.5903196334838867, |
|
"learning_rate": 4.414094953832625e-05, |
|
"loss": 2.7942, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 3.57360097323601, |
|
"eval_loss": 2.696880340576172, |
|
"eval_runtime": 395.3996, |
|
"eval_samples_per_second": 1064.366, |
|
"eval_steps_per_second": 4.158, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 3.6496350364963503, |
|
"grad_norm": 1.7155580520629883, |
|
"learning_rate": 4.387703754354059e-05, |
|
"loss": 2.7893, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 3.6496350364963503, |
|
"eval_loss": 2.6850531101226807, |
|
"eval_runtime": 395.3598, |
|
"eval_samples_per_second": 1064.473, |
|
"eval_steps_per_second": 4.158, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 3.725669099756691, |
|
"grad_norm": 1.5948296785354614, |
|
"learning_rate": 4.3608681152880126e-05, |
|
"loss": 2.7681, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 3.725669099756691, |
|
"eval_loss": 2.66740345954895, |
|
"eval_runtime": 395.2365, |
|
"eval_samples_per_second": 1064.805, |
|
"eval_steps_per_second": 4.16, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 3.8017031630170317, |
|
"grad_norm": 1.5723962783813477, |
|
"learning_rate": 4.333487571042728e-05, |
|
"loss": 2.7577, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 3.8017031630170317, |
|
"eval_loss": 2.654303789138794, |
|
"eval_runtime": 395.3645, |
|
"eval_samples_per_second": 1064.461, |
|
"eval_steps_per_second": 4.158, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 3.877737226277372, |
|
"grad_norm": 1.6151896715164185, |
|
"learning_rate": 4.3056226377438776e-05, |
|
"loss": 2.7427, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 3.877737226277372, |
|
"eval_loss": 2.643014669418335, |
|
"eval_runtime": 395.2931, |
|
"eval_samples_per_second": 1064.653, |
|
"eval_steps_per_second": 4.159, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 3.9537712895377126, |
|
"grad_norm": 1.670333743095398, |
|
"learning_rate": 4.27728067702777e-05, |
|
"loss": 2.7302, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 3.9537712895377126, |
|
"eval_loss": 2.6284077167510986, |
|
"eval_runtime": 395.1357, |
|
"eval_samples_per_second": 1065.077, |
|
"eval_steps_per_second": 4.161, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 4.0298053527980535, |
|
"grad_norm": 1.551099419593811, |
|
"learning_rate": 4.248469176556575e-05, |
|
"loss": 2.7106, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 4.0298053527980535, |
|
"eval_loss": 2.616875171661377, |
|
"eval_runtime": 395.4614, |
|
"eval_samples_per_second": 1064.2, |
|
"eval_steps_per_second": 4.157, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 4.105839416058394, |
|
"grad_norm": 1.6209259033203125, |
|
"learning_rate": 4.219313751705213e-05, |
|
"loss": 2.7007, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 4.105839416058394, |
|
"eval_loss": 2.6068313121795654, |
|
"eval_runtime": 395.5688, |
|
"eval_samples_per_second": 1063.911, |
|
"eval_steps_per_second": 4.156, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 4.181873479318734, |
|
"grad_norm": 1.616698980331421, |
|
"learning_rate": 4.189587930102075e-05, |
|
"loss": 2.6858, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 4.181873479318734, |
|
"eval_loss": 2.5947837829589844, |
|
"eval_runtime": 395.5316, |
|
"eval_samples_per_second": 1064.011, |
|
"eval_steps_per_second": 4.156, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 4.257907542579075, |
|
"grad_norm": 1.6252193450927734, |
|
"learning_rate": 4.1594157362893294e-05, |
|
"loss": 2.6748, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 4.257907542579075, |
|
"eval_loss": 2.5821821689605713, |
|
"eval_runtime": 395.6317, |
|
"eval_samples_per_second": 1063.742, |
|
"eval_steps_per_second": 4.155, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 4.333941605839416, |
|
"grad_norm": 1.5178853273391724, |
|
"learning_rate": 4.1288051414584004e-05, |
|
"loss": 2.672, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 4.333941605839416, |
|
"eval_loss": 2.566763162612915, |
|
"eval_runtime": 395.3024, |
|
"eval_samples_per_second": 1064.628, |
|
"eval_steps_per_second": 4.159, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 4.409975669099756, |
|
"grad_norm": 1.6428803205490112, |
|
"learning_rate": 4.097764232621873e-05, |
|
"loss": 2.6498, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 4.409975669099756, |
|
"eval_loss": 2.560192823410034, |
|
"eval_runtime": 395.2916, |
|
"eval_samples_per_second": 1064.657, |
|
"eval_steps_per_second": 4.159, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 4.486009732360097, |
|
"grad_norm": 1.546608805656433, |
|
"learning_rate": 4.066301210476981e-05, |
|
"loss": 2.6422, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 4.486009732360097, |
|
"eval_loss": 2.5504369735717773, |
|
"eval_runtime": 395.4025, |
|
"eval_samples_per_second": 1064.358, |
|
"eval_steps_per_second": 4.158, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 4.562043795620438, |
|
"grad_norm": 1.6463203430175781, |
|
"learning_rate": 4.034424387239068e-05, |
|
"loss": 2.6334, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 4.562043795620438, |
|
"eval_loss": 2.540264844894409, |
|
"eval_runtime": 395.609, |
|
"eval_samples_per_second": 1063.803, |
|
"eval_steps_per_second": 4.156, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 4.638077858880779, |
|
"grad_norm": 1.69281005859375, |
|
"learning_rate": 4.002142184445579e-05, |
|
"loss": 2.6246, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 4.638077858880779, |
|
"eval_loss": 2.529710292816162, |
|
"eval_runtime": 395.534, |
|
"eval_samples_per_second": 1064.005, |
|
"eval_steps_per_second": 4.156, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 4.714111922141119, |
|
"grad_norm": 1.4954875707626343, |
|
"learning_rate": 3.969594626065171e-05, |
|
"loss": 2.6194, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 4.714111922141119, |
|
"eval_loss": 2.5173487663269043, |
|
"eval_runtime": 395.5366, |
|
"eval_samples_per_second": 1063.998, |
|
"eval_steps_per_second": 4.156, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 4.79014598540146, |
|
"grad_norm": 1.586890459060669, |
|
"learning_rate": 3.936528890443755e-05, |
|
"loss": 2.6044, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 4.79014598540146, |
|
"eval_loss": 2.509347438812256, |
|
"eval_runtime": 395.6037, |
|
"eval_samples_per_second": 1063.817, |
|
"eval_steps_per_second": 4.156, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 4.866180048661801, |
|
"grad_norm": 1.4862339496612549, |
|
"learning_rate": 3.903083638276577e-05, |
|
"loss": 2.585, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 4.866180048661801, |
|
"eval_loss": 2.498917579650879, |
|
"eval_runtime": 395.4783, |
|
"eval_samples_per_second": 1064.154, |
|
"eval_steps_per_second": 4.157, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 4.942214111922141, |
|
"grad_norm": 1.6119396686553955, |
|
"learning_rate": 3.869267705464299e-05, |
|
"loss": 2.5825, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 4.942214111922141, |
|
"eval_loss": 2.4927380084991455, |
|
"eval_runtime": 395.5817, |
|
"eval_samples_per_second": 1063.876, |
|
"eval_steps_per_second": 4.156, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 5.018248175182482, |
|
"grad_norm": 1.5895634889602661, |
|
"learning_rate": 3.835090025837699e-05, |
|
"loss": 2.5708, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 5.018248175182482, |
|
"eval_loss": 2.4862186908721924, |
|
"eval_runtime": 395.6219, |
|
"eval_samples_per_second": 1063.768, |
|
"eval_steps_per_second": 4.155, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 5.094282238442823, |
|
"grad_norm": 1.6652857065200806, |
|
"learning_rate": 3.800559628797438e-05, |
|
"loss": 2.5612, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 5.094282238442823, |
|
"eval_loss": 2.475658416748047, |
|
"eval_runtime": 394.9698, |
|
"eval_samples_per_second": 1065.525, |
|
"eval_steps_per_second": 4.162, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 5.170316301703163, |
|
"grad_norm": 1.6712974309921265, |
|
"learning_rate": 3.765685636928585e-05, |
|
"loss": 2.5508, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 5.170316301703163, |
|
"eval_loss": 2.4684622287750244, |
|
"eval_runtime": 394.7029, |
|
"eval_samples_per_second": 1066.245, |
|
"eval_steps_per_second": 4.165, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 5.2463503649635035, |
|
"grad_norm": 1.7370678186416626, |
|
"learning_rate": 3.7305480078818275e-05, |
|
"loss": 2.5517, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 5.2463503649635035, |
|
"eval_loss": 2.4651219844818115, |
|
"eval_runtime": 395.1235, |
|
"eval_samples_per_second": 1065.11, |
|
"eval_steps_per_second": 4.161, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 5.322384428223844, |
|
"grad_norm": 1.6240907907485962, |
|
"learning_rate": 3.6950151955931227e-05, |
|
"loss": 2.536, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 5.322384428223844, |
|
"eval_loss": 2.4535937309265137, |
|
"eval_runtime": 394.9571, |
|
"eval_samples_per_second": 1065.559, |
|
"eval_steps_per_second": 4.162, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 5.398418491484185, |
|
"grad_norm": 1.8107973337173462, |
|
"learning_rate": 3.659166672258033e-05, |
|
"loss": 2.5362, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 5.398418491484185, |
|
"eval_loss": 2.4444773197174072, |
|
"eval_runtime": 395.3, |
|
"eval_samples_per_second": 1064.635, |
|
"eval_steps_per_second": 4.159, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 5.474452554744525, |
|
"grad_norm": 1.550801396369934, |
|
"learning_rate": 3.623011908697394e-05, |
|
"loss": 2.5267, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 5.474452554744525, |
|
"eval_loss": 2.4367120265960693, |
|
"eval_runtime": 395.2579, |
|
"eval_samples_per_second": 1064.748, |
|
"eval_steps_per_second": 4.159, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 5.550486618004866, |
|
"grad_norm": 1.4852931499481201, |
|
"learning_rate": 3.5866336492488555e-05, |
|
"loss": 2.5165, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 5.550486618004866, |
|
"eval_loss": 2.431751251220703, |
|
"eval_runtime": 395.2301, |
|
"eval_samples_per_second": 1064.823, |
|
"eval_steps_per_second": 4.16, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 5.626520681265207, |
|
"grad_norm": 1.603376865386963, |
|
"learning_rate": 3.5498957032536564e-05, |
|
"loss": 2.5194, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 5.626520681265207, |
|
"eval_loss": 2.4255075454711914, |
|
"eval_runtime": 395.4389, |
|
"eval_samples_per_second": 1064.26, |
|
"eval_steps_per_second": 4.157, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 5.702554744525547, |
|
"grad_norm": 1.6579174995422363, |
|
"learning_rate": 3.512880385328552e-05, |
|
"loss": 2.5063, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 5.702554744525547, |
|
"eval_loss": 2.4162917137145996, |
|
"eval_runtime": 395.2478, |
|
"eval_samples_per_second": 1064.775, |
|
"eval_steps_per_second": 4.159, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 5.778588807785888, |
|
"grad_norm": 1.6467429399490356, |
|
"learning_rate": 3.475597474549821e-05, |
|
"loss": 2.4969, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 5.778588807785888, |
|
"eval_loss": 2.4108052253723145, |
|
"eval_runtime": 395.1001, |
|
"eval_samples_per_second": 1065.173, |
|
"eval_steps_per_second": 4.161, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 5.854622871046229, |
|
"grad_norm": 1.6167348623275757, |
|
"learning_rate": 3.438056820689096e-05, |
|
"loss": 2.492, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 5.854622871046229, |
|
"eval_loss": 2.402526617050171, |
|
"eval_runtime": 395.2077, |
|
"eval_samples_per_second": 1064.883, |
|
"eval_steps_per_second": 4.16, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 5.930656934306569, |
|
"grad_norm": 1.7401496171951294, |
|
"learning_rate": 3.400344159273908e-05, |
|
"loss": 2.4729, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 5.930656934306569, |
|
"eval_loss": 2.3961234092712402, |
|
"eval_runtime": 395.2683, |
|
"eval_samples_per_second": 1064.72, |
|
"eval_steps_per_second": 4.159, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 6.00669099756691, |
|
"grad_norm": 1.7321972846984863, |
|
"learning_rate": 3.3623183039946427e-05, |
|
"loss": 2.4753, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 6.00669099756691, |
|
"eval_loss": 2.390777826309204, |
|
"eval_runtime": 395.3927, |
|
"eval_samples_per_second": 1064.385, |
|
"eval_steps_per_second": 4.158, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 6.082725060827251, |
|
"grad_norm": 1.6455748081207275, |
|
"learning_rate": 3.3240646328557325e-05, |
|
"loss": 2.4653, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 6.082725060827251, |
|
"eval_loss": 2.385394334793091, |
|
"eval_runtime": 395.2314, |
|
"eval_samples_per_second": 1064.819, |
|
"eval_steps_per_second": 4.16, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 6.158759124087592, |
|
"grad_norm": 1.6246484518051147, |
|
"learning_rate": 3.2855932520939756e-05, |
|
"loss": 2.4552, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 6.158759124087592, |
|
"eval_loss": 2.3780696392059326, |
|
"eval_runtime": 395.2284, |
|
"eval_samples_per_second": 1064.827, |
|
"eval_steps_per_second": 4.16, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 6.234793187347932, |
|
"grad_norm": 1.6907716989517212, |
|
"learning_rate": 3.246914325462873e-05, |
|
"loss": 2.4577, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 6.234793187347932, |
|
"eval_loss": 2.3710057735443115, |
|
"eval_runtime": 395.2817, |
|
"eval_samples_per_second": 1064.684, |
|
"eval_steps_per_second": 4.159, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 6.3108272506082725, |
|
"grad_norm": 1.733163595199585, |
|
"learning_rate": 3.208038071547463e-05, |
|
"loss": 2.4512, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 6.3108272506082725, |
|
"eval_loss": 2.364978313446045, |
|
"eval_runtime": 395.0989, |
|
"eval_samples_per_second": 1065.176, |
|
"eval_steps_per_second": 4.161, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 6.386861313868613, |
|
"grad_norm": 1.604212999343872, |
|
"learning_rate": 3.1690530675165916e-05, |
|
"loss": 2.4419, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 6.386861313868613, |
|
"eval_loss": 2.3593010902404785, |
|
"eval_runtime": 394.8589, |
|
"eval_samples_per_second": 1065.824, |
|
"eval_steps_per_second": 4.164, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 6.4628953771289535, |
|
"grad_norm": 1.799272060394287, |
|
"learning_rate": 3.1298133637437146e-05, |
|
"loss": 2.443, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 6.4628953771289535, |
|
"eval_loss": 2.3553106784820557, |
|
"eval_runtime": 395.5826, |
|
"eval_samples_per_second": 1063.874, |
|
"eval_steps_per_second": 4.156, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 6.538929440389294, |
|
"grad_norm": 1.5894908905029297, |
|
"learning_rate": 3.0904072695878296e-05, |
|
"loss": 2.4291, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 6.538929440389294, |
|
"eval_loss": 2.350308656692505, |
|
"eval_runtime": 395.6156, |
|
"eval_samples_per_second": 1063.785, |
|
"eval_steps_per_second": 4.156, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 6.614963503649635, |
|
"grad_norm": 1.6308026313781738, |
|
"learning_rate": 3.050845195744353e-05, |
|
"loss": 2.4212, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 6.614963503649635, |
|
"eval_loss": 2.3425817489624023, |
|
"eval_runtime": 395.5628, |
|
"eval_samples_per_second": 1063.927, |
|
"eval_steps_per_second": 4.156, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 6.690997566909976, |
|
"grad_norm": 1.5576202869415283, |
|
"learning_rate": 3.011137594116975e-05, |
|
"loss": 2.4217, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 6.690997566909976, |
|
"eval_loss": 2.3366506099700928, |
|
"eval_runtime": 395.6852, |
|
"eval_samples_per_second": 1063.598, |
|
"eval_steps_per_second": 4.155, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 6.767031630170316, |
|
"grad_norm": 1.698960542678833, |
|
"learning_rate": 2.9713747681111948e-05, |
|
"loss": 2.4191, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 6.767031630170316, |
|
"eval_loss": 2.3311471939086914, |
|
"eval_runtime": 395.6553, |
|
"eval_samples_per_second": 1063.678, |
|
"eval_steps_per_second": 4.155, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 6.843065693430657, |
|
"grad_norm": 1.700810194015503, |
|
"learning_rate": 2.931407856139074e-05, |
|
"loss": 2.4101, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 6.843065693430657, |
|
"eval_loss": 2.326604127883911, |
|
"eval_runtime": 395.4811, |
|
"eval_samples_per_second": 1064.147, |
|
"eval_steps_per_second": 4.157, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 6.919099756690997, |
|
"grad_norm": 1.675718069076538, |
|
"learning_rate": 2.8913269705319878e-05, |
|
"loss": 2.4092, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 6.919099756690997, |
|
"eval_loss": 2.3215043544769287, |
|
"eval_runtime": 395.6152, |
|
"eval_samples_per_second": 1063.786, |
|
"eval_steps_per_second": 4.156, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 6.995133819951338, |
|
"grad_norm": 1.7430431842803955, |
|
"learning_rate": 2.851142700258497e-05, |
|
"loss": 2.4028, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 6.995133819951338, |
|
"eval_loss": 2.3190836906433105, |
|
"eval_runtime": 395.7789, |
|
"eval_samples_per_second": 1063.346, |
|
"eval_steps_per_second": 4.154, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 7.071167883211679, |
|
"grad_norm": 1.7376880645751953, |
|
"learning_rate": 2.8108656616003542e-05, |
|
"loss": 2.393, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 7.071167883211679, |
|
"eval_loss": 2.314730167388916, |
|
"eval_runtime": 395.8715, |
|
"eval_samples_per_second": 1063.097, |
|
"eval_steps_per_second": 4.153, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 7.14720194647202, |
|
"grad_norm": 1.647200584411621, |
|
"learning_rate": 2.7705064953477926e-05, |
|
"loss": 2.3864, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 7.14720194647202, |
|
"eval_loss": 2.3095407485961914, |
|
"eval_runtime": 392.0209, |
|
"eval_samples_per_second": 1073.54, |
|
"eval_steps_per_second": 4.194, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 7.22323600973236, |
|
"grad_norm": 1.5628902912139893, |
|
"learning_rate": 2.7300758639883305e-05, |
|
"loss": 2.3853, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 7.22323600973236, |
|
"eval_loss": 2.3034095764160156, |
|
"eval_runtime": 392.407, |
|
"eval_samples_per_second": 1072.483, |
|
"eval_steps_per_second": 4.19, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 7.299270072992701, |
|
"grad_norm": 1.6254950761795044, |
|
"learning_rate": 2.6896654852743762e-05, |
|
"loss": 2.3778, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 7.299270072992701, |
|
"eval_loss": 2.3009138107299805, |
|
"eval_runtime": 392.2743, |
|
"eval_samples_per_second": 1072.846, |
|
"eval_steps_per_second": 4.191, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 7.375304136253042, |
|
"grad_norm": 1.7831765413284302, |
|
"learning_rate": 2.6491240733505536e-05, |
|
"loss": 2.3902, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 7.375304136253042, |
|
"eval_loss": 2.2940807342529297, |
|
"eval_runtime": 392.0933, |
|
"eval_samples_per_second": 1073.342, |
|
"eval_steps_per_second": 4.193, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 7.451338199513382, |
|
"grad_norm": 1.7135417461395264, |
|
"learning_rate": 2.608543264340055e-05, |
|
"loss": 2.3734, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 7.451338199513382, |
|
"eval_loss": 2.2903780937194824, |
|
"eval_runtime": 392.3395, |
|
"eval_samples_per_second": 1072.668, |
|
"eval_steps_per_second": 4.19, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 7.5273722627737225, |
|
"grad_norm": 1.7215466499328613, |
|
"learning_rate": 2.5679337792861973e-05, |
|
"loss": 2.3644, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 7.5273722627737225, |
|
"eval_loss": 2.2882533073425293, |
|
"eval_runtime": 391.7386, |
|
"eval_samples_per_second": 1074.313, |
|
"eval_steps_per_second": 4.197, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 7.603406326034063, |
|
"grad_norm": 1.5934220552444458, |
|
"learning_rate": 2.527306346808222e-05, |
|
"loss": 2.3644, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 7.603406326034063, |
|
"eval_loss": 2.278449296951294, |
|
"eval_runtime": 392.029, |
|
"eval_samples_per_second": 1073.517, |
|
"eval_steps_per_second": 4.194, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 7.679440389294404, |
|
"grad_norm": 1.734836459159851, |
|
"learning_rate": 2.4866717002668977e-05, |
|
"loss": 2.3643, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 7.679440389294404, |
|
"eval_loss": 2.2776286602020264, |
|
"eval_runtime": 391.9926, |
|
"eval_samples_per_second": 1073.617, |
|
"eval_steps_per_second": 4.194, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 7.755474452554744, |
|
"grad_norm": 1.6759928464889526, |
|
"learning_rate": 2.4461218265301844e-05, |
|
"loss": 2.3549, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 7.755474452554744, |
|
"eval_loss": 2.275527000427246, |
|
"eval_runtime": 392.0053, |
|
"eval_samples_per_second": 1073.582, |
|
"eval_steps_per_second": 4.194, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 7.831508515815085, |
|
"grad_norm": 1.6229385137557983, |
|
"learning_rate": 2.4055049175099393e-05, |
|
"loss": 2.3475, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 7.831508515815085, |
|
"eval_loss": 2.269463539123535, |
|
"eval_runtime": 392.7325, |
|
"eval_samples_per_second": 1071.594, |
|
"eval_steps_per_second": 4.186, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 7.907542579075426, |
|
"grad_norm": 1.5919690132141113, |
|
"learning_rate": 2.3649129731441017e-05, |
|
"loss": 2.3556, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 7.907542579075426, |
|
"eval_loss": 2.2632956504821777, |
|
"eval_runtime": 392.8483, |
|
"eval_samples_per_second": 1071.279, |
|
"eval_steps_per_second": 4.185, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 7.983576642335766, |
|
"grad_norm": 1.6283611059188843, |
|
"learning_rate": 2.32435671741784e-05, |
|
"loss": 2.3441, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 7.983576642335766, |
|
"eval_loss": 2.2631113529205322, |
|
"eval_runtime": 393.1076, |
|
"eval_samples_per_second": 1070.572, |
|
"eval_steps_per_second": 4.182, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 8.059610705596107, |
|
"grad_norm": 1.6927645206451416, |
|
"learning_rate": 2.2838468648877376e-05, |
|
"loss": 2.3396, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 8.059610705596107, |
|
"eval_loss": 2.2605204582214355, |
|
"eval_runtime": 393.0545, |
|
"eval_samples_per_second": 1070.717, |
|
"eval_steps_per_second": 4.183, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 8.135644768856448, |
|
"grad_norm": 1.6524484157562256, |
|
"learning_rate": 2.2433941178511185e-05, |
|
"loss": 2.3281, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 8.135644768856448, |
|
"eval_loss": 2.255591869354248, |
|
"eval_runtime": 393.065, |
|
"eval_samples_per_second": 1070.688, |
|
"eval_steps_per_second": 4.183, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 8.211678832116789, |
|
"grad_norm": 1.8136180639266968, |
|
"learning_rate": 2.2030091635186097e-05, |
|
"loss": 2.3251, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 8.211678832116789, |
|
"eval_loss": 2.2528815269470215, |
|
"eval_runtime": 393.1403, |
|
"eval_samples_per_second": 1070.483, |
|
"eval_steps_per_second": 4.182, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 8.28771289537713, |
|
"grad_norm": 1.7461555004119873, |
|
"learning_rate": 2.1627831987887616e-05, |
|
"loss": 2.3252, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 8.28771289537713, |
|
"eval_loss": 2.247727155685425, |
|
"eval_runtime": 394.607, |
|
"eval_samples_per_second": 1066.504, |
|
"eval_steps_per_second": 4.166, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 8.363746958637469, |
|
"grad_norm": 1.6148008108139038, |
|
"learning_rate": 2.1225656282037674e-05, |
|
"loss": 2.3231, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 8.363746958637469, |
|
"eval_loss": 2.245650291442871, |
|
"eval_runtime": 393.1496, |
|
"eval_samples_per_second": 1070.458, |
|
"eval_steps_per_second": 4.182, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 8.43978102189781, |
|
"grad_norm": 1.5390928983688354, |
|
"learning_rate": 2.082447771999728e-05, |
|
"loss": 2.3218, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 8.43978102189781, |
|
"eval_loss": 2.240283489227295, |
|
"eval_runtime": 393.128, |
|
"eval_samples_per_second": 1070.517, |
|
"eval_steps_per_second": 4.182, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 8.51581508515815, |
|
"grad_norm": 1.7353328466415405, |
|
"learning_rate": 2.0424402289124667e-05, |
|
"loss": 2.3113, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 8.51581508515815, |
|
"eval_loss": 2.236283540725708, |
|
"eval_runtime": 392.9933, |
|
"eval_samples_per_second": 1070.883, |
|
"eval_steps_per_second": 4.183, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 8.591849148418492, |
|
"grad_norm": 1.6553759574890137, |
|
"learning_rate": 2.0025535685341834e-05, |
|
"loss": 2.3137, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 8.591849148418492, |
|
"eval_loss": 2.2341954708099365, |
|
"eval_runtime": 394.1952, |
|
"eval_samples_per_second": 1067.618, |
|
"eval_steps_per_second": 4.171, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 8.667883211678832, |
|
"grad_norm": 1.6300148963928223, |
|
"learning_rate": 1.9627983285210795e-05, |
|
"loss": 2.3153, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 8.667883211678832, |
|
"eval_loss": 2.2316806316375732, |
|
"eval_runtime": 394.4429, |
|
"eval_samples_per_second": 1066.948, |
|
"eval_steps_per_second": 4.168, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 8.743917274939173, |
|
"grad_norm": 1.7760825157165527, |
|
"learning_rate": 1.9231850118094083e-05, |
|
"loss": 2.3086, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 8.743917274939173, |
|
"eval_loss": 2.2260444164276123, |
|
"eval_runtime": 394.1825, |
|
"eval_samples_per_second": 1067.653, |
|
"eval_steps_per_second": 4.171, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 8.819951338199512, |
|
"grad_norm": 1.6700938940048218, |
|
"learning_rate": 1.883724083840713e-05, |
|
"loss": 2.3051, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 8.819951338199512, |
|
"eval_loss": 2.2262229919433594, |
|
"eval_runtime": 394.2521, |
|
"eval_samples_per_second": 1067.464, |
|
"eval_steps_per_second": 4.17, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 8.895985401459853, |
|
"grad_norm": 1.6361171007156372, |
|
"learning_rate": 1.8445043966286124e-05, |
|
"loss": 2.2996, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 8.895985401459853, |
|
"eval_loss": 2.2197461128234863, |
|
"eval_runtime": 394.2947, |
|
"eval_samples_per_second": 1067.349, |
|
"eval_steps_per_second": 4.169, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 8.972019464720194, |
|
"grad_norm": 1.5987651348114014, |
|
"learning_rate": 1.805379121954309e-05, |
|
"loss": 2.295, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 8.972019464720194, |
|
"eval_loss": 2.218661069869995, |
|
"eval_runtime": 394.4471, |
|
"eval_samples_per_second": 1066.936, |
|
"eval_steps_per_second": 4.168, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 9.048053527980535, |
|
"grad_norm": 1.6805070638656616, |
|
"learning_rate": 1.7664373591592323e-05, |
|
"loss": 2.2898, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 9.048053527980535, |
|
"eval_loss": 2.2158923149108887, |
|
"eval_runtime": 394.3964, |
|
"eval_samples_per_second": 1067.074, |
|
"eval_steps_per_second": 4.168, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 9.124087591240876, |
|
"grad_norm": 1.559171199798584, |
|
"learning_rate": 1.727689396267106e-05, |
|
"loss": 2.294, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 9.124087591240876, |
|
"eval_loss": 2.213304281234741, |
|
"eval_runtime": 394.3761, |
|
"eval_samples_per_second": 1067.129, |
|
"eval_steps_per_second": 4.169, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 9.200121654501217, |
|
"grad_norm": 1.7154414653778076, |
|
"learning_rate": 1.689145470101657e-05, |
|
"loss": 2.2905, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 9.200121654501217, |
|
"eval_loss": 2.211729049682617, |
|
"eval_runtime": 394.4483, |
|
"eval_samples_per_second": 1066.933, |
|
"eval_steps_per_second": 4.168, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 9.276155717761558, |
|
"grad_norm": 1.7217854261398315, |
|
"learning_rate": 1.6508922024636513e-05, |
|
"loss": 2.2776, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 9.276155717761558, |
|
"eval_loss": 2.2076163291931152, |
|
"eval_runtime": 394.2479, |
|
"eval_samples_per_second": 1067.476, |
|
"eval_steps_per_second": 4.17, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 9.352189781021897, |
|
"grad_norm": 1.6988067626953125, |
|
"learning_rate": 1.6127863831556155e-05, |
|
"loss": 2.2888, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 9.352189781021897, |
|
"eval_loss": 2.2073538303375244, |
|
"eval_runtime": 394.4185, |
|
"eval_samples_per_second": 1067.014, |
|
"eval_steps_per_second": 4.168, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 9.428223844282238, |
|
"grad_norm": 1.6594995260238647, |
|
"learning_rate": 1.5749149567995482e-05, |
|
"loss": 2.2737, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 9.428223844282238, |
|
"eval_loss": 2.2045233249664307, |
|
"eval_runtime": 394.3688, |
|
"eval_samples_per_second": 1067.148, |
|
"eval_steps_per_second": 4.169, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 9.504257907542579, |
|
"grad_norm": 1.782347321510315, |
|
"learning_rate": 1.537287928647002e-05, |
|
"loss": 2.2715, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 9.504257907542579, |
|
"eval_loss": 2.1984219551086426, |
|
"eval_runtime": 394.2219, |
|
"eval_samples_per_second": 1067.546, |
|
"eval_steps_per_second": 4.17, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 9.58029197080292, |
|
"grad_norm": 1.7212417125701904, |
|
"learning_rate": 1.4999897243562522e-05, |
|
"loss": 2.2736, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 9.58029197080292, |
|
"eval_loss": 2.200115919113159, |
|
"eval_runtime": 394.4095, |
|
"eval_samples_per_second": 1067.038, |
|
"eval_steps_per_second": 4.168, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 9.65632603406326, |
|
"grad_norm": 1.636083722114563, |
|
"learning_rate": 1.4628807092364161e-05, |
|
"loss": 2.2714, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 9.65632603406326, |
|
"eval_loss": 2.196516752243042, |
|
"eval_runtime": 394.3398, |
|
"eval_samples_per_second": 1067.227, |
|
"eval_steps_per_second": 4.169, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 9.732360097323602, |
|
"grad_norm": 1.669154405593872, |
|
"learning_rate": 1.4260456906462644e-05, |
|
"loss": 2.2581, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 9.732360097323602, |
|
"eval_loss": 2.1947672367095947, |
|
"eval_runtime": 394.2775, |
|
"eval_samples_per_second": 1067.396, |
|
"eval_steps_per_second": 4.17, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 9.808394160583942, |
|
"grad_norm": 1.5820955038070679, |
|
"learning_rate": 1.3894944000287996e-05, |
|
"loss": 2.2673, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 9.808394160583942, |
|
"eval_loss": 2.1930572986602783, |
|
"eval_runtime": 394.3185, |
|
"eval_samples_per_second": 1067.284, |
|
"eval_steps_per_second": 4.169, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 9.884428223844282, |
|
"grad_norm": 1.878128170967102, |
|
"learning_rate": 1.3532364938689365e-05, |
|
"loss": 2.2532, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 9.884428223844282, |
|
"eval_loss": 2.186814069747925, |
|
"eval_runtime": 394.1633, |
|
"eval_samples_per_second": 1067.705, |
|
"eval_steps_per_second": 4.171, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 9.960462287104622, |
|
"grad_norm": 1.6541669368743896, |
|
"learning_rate": 1.3172815511423497e-05, |
|
"loss": 2.2599, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 9.960462287104622, |
|
"eval_loss": 2.186183452606201, |
|
"eval_runtime": 394.274, |
|
"eval_samples_per_second": 1067.405, |
|
"eval_steps_per_second": 4.17, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 10.036496350364963, |
|
"grad_norm": 1.6656322479248047, |
|
"learning_rate": 1.2817100376353228e-05, |
|
"loss": 2.2626, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 10.036496350364963, |
|
"eval_loss": 2.1833560466766357, |
|
"eval_runtime": 394.4838, |
|
"eval_samples_per_second": 1066.837, |
|
"eval_steps_per_second": 4.167, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 10.112530413625304, |
|
"grad_norm": 1.64789617061615, |
|
"learning_rate": 1.246388782934231e-05, |
|
"loss": 2.2476, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 10.112530413625304, |
|
"eval_loss": 2.1836633682250977, |
|
"eval_runtime": 394.475, |
|
"eval_samples_per_second": 1066.861, |
|
"eval_steps_per_second": 4.168, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 10.188564476885645, |
|
"grad_norm": 1.626693844795227, |
|
"learning_rate": 1.2113987197615472e-05, |
|
"loss": 2.2597, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 10.188564476885645, |
|
"eval_loss": 2.177664041519165, |
|
"eval_runtime": 394.4402, |
|
"eval_samples_per_second": 1066.955, |
|
"eval_steps_per_second": 4.168, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 10.264598540145986, |
|
"grad_norm": 1.660078525543213, |
|
"learning_rate": 1.1767490921415291e-05, |
|
"loss": 2.2525, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 10.264598540145986, |
|
"eval_loss": 2.177150011062622, |
|
"eval_runtime": 394.2691, |
|
"eval_samples_per_second": 1067.418, |
|
"eval_steps_per_second": 4.17, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 10.340632603406325, |
|
"grad_norm": 1.6624382734298706, |
|
"learning_rate": 1.1424490541587752e-05, |
|
"loss": 2.2477, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 10.340632603406325, |
|
"eval_loss": 2.175464630126953, |
|
"eval_runtime": 394.3358, |
|
"eval_samples_per_second": 1067.238, |
|
"eval_steps_per_second": 4.169, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 10.416666666666666, |
|
"grad_norm": 1.7029284238815308, |
|
"learning_rate": 1.1085076675397963e-05, |
|
"loss": 2.2442, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 10.416666666666666, |
|
"eval_loss": 2.172318935394287, |
|
"eval_runtime": 394.363, |
|
"eval_samples_per_second": 1067.164, |
|
"eval_steps_per_second": 4.169, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 10.492700729927007, |
|
"grad_norm": 1.7094260454177856, |
|
"learning_rate": 1.0750006740005564e-05, |
|
"loss": 2.2461, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 10.492700729927007, |
|
"eval_loss": 2.1725075244903564, |
|
"eval_runtime": 394.3359, |
|
"eval_samples_per_second": 1067.237, |
|
"eval_steps_per_second": 4.169, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 10.568734793187348, |
|
"grad_norm": 1.7138928174972534, |
|
"learning_rate": 1.04180263214852e-05, |
|
"loss": 2.2428, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 10.568734793187348, |
|
"eval_loss": 2.1679632663726807, |
|
"eval_runtime": 394.3498, |
|
"eval_samples_per_second": 1067.2, |
|
"eval_steps_per_second": 4.169, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 10.644768856447689, |
|
"grad_norm": 1.7748503684997559, |
|
"learning_rate": 1.0089898314369628e-05, |
|
"loss": 2.2409, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 10.644768856447689, |
|
"eval_loss": 2.167714834213257, |
|
"eval_runtime": 394.3096, |
|
"eval_samples_per_second": 1067.308, |
|
"eval_steps_per_second": 4.169, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 10.72080291970803, |
|
"grad_norm": 1.8225022554397583, |
|
"learning_rate": 9.765709406792067e-06, |
|
"loss": 2.2421, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 10.72080291970803, |
|
"eval_loss": 2.1677842140197754, |
|
"eval_runtime": 394.4354, |
|
"eval_samples_per_second": 1066.968, |
|
"eval_steps_per_second": 4.168, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 10.79683698296837, |
|
"grad_norm": 1.682428002357483, |
|
"learning_rate": 9.445545246215093e-06, |
|
"loss": 2.2405, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 10.79683698296837, |
|
"eval_loss": 2.162020206451416, |
|
"eval_runtime": 394.4337, |
|
"eval_samples_per_second": 1066.973, |
|
"eval_steps_per_second": 4.168, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 10.87287104622871, |
|
"grad_norm": 1.8187251091003418, |
|
"learning_rate": 9.130118369667984e-06, |
|
"loss": 2.2338, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 10.87287104622871, |
|
"eval_loss": 2.161623001098633, |
|
"eval_runtime": 394.3265, |
|
"eval_samples_per_second": 1067.263, |
|
"eval_steps_per_second": 4.169, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 10.94890510948905, |
|
"grad_norm": 1.586653470993042, |
|
"learning_rate": 8.818247901683923e-06, |
|
"loss": 2.2291, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 10.94890510948905, |
|
"eval_loss": 2.1573026180267334, |
|
"eval_runtime": 394.3904, |
|
"eval_samples_per_second": 1067.09, |
|
"eval_steps_per_second": 4.168, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 11.024939172749392, |
|
"grad_norm": 1.6375211477279663, |
|
"learning_rate": 8.510652490541102e-06, |
|
"loss": 2.2337, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 11.024939172749392, |
|
"eval_loss": 2.158447027206421, |
|
"eval_runtime": 394.8845, |
|
"eval_samples_per_second": 1065.755, |
|
"eval_steps_per_second": 4.163, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 11.100973236009732, |
|
"grad_norm": 1.9024183750152588, |
|
"learning_rate": 8.207413399866525e-06, |
|
"loss": 2.2243, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 11.100973236009732, |
|
"eval_loss": 2.1577627658843994, |
|
"eval_runtime": 394.3929, |
|
"eval_samples_per_second": 1067.083, |
|
"eval_steps_per_second": 4.168, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 11.177007299270073, |
|
"grad_norm": 1.6612706184387207, |
|
"learning_rate": 7.908610742390934e-06, |
|
"loss": 2.2206, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 11.177007299270073, |
|
"eval_loss": 2.156655788421631, |
|
"eval_runtime": 394.4918, |
|
"eval_samples_per_second": 1066.816, |
|
"eval_steps_per_second": 4.167, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 11.253041362530414, |
|
"grad_norm": 1.6041182279586792, |
|
"learning_rate": 7.614323458783904e-06, |
|
"loss": 2.2316, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 11.253041362530414, |
|
"eval_loss": 2.154806137084961, |
|
"eval_runtime": 394.5111, |
|
"eval_samples_per_second": 1066.763, |
|
"eval_steps_per_second": 4.167, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 11.329075425790755, |
|
"grad_norm": 1.7304446697235107, |
|
"learning_rate": 7.324629296798397e-06, |
|
"loss": 2.2252, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 11.329075425790755, |
|
"eval_loss": 2.1519484519958496, |
|
"eval_runtime": 394.2907, |
|
"eval_samples_per_second": 1067.36, |
|
"eval_steps_per_second": 4.17, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 11.405109489051094, |
|
"grad_norm": 1.6792948246002197, |
|
"learning_rate": 7.039604790730683e-06, |
|
"loss": 2.2257, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 11.405109489051094, |
|
"eval_loss": 2.1538424491882324, |
|
"eval_runtime": 394.5221, |
|
"eval_samples_per_second": 1066.734, |
|
"eval_steps_per_second": 4.167, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 11.481143552311435, |
|
"grad_norm": 1.5765753984451294, |
|
"learning_rate": 6.7598810154057336e-06, |
|
"loss": 2.2252, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 11.481143552311435, |
|
"eval_loss": 2.1519691944122314, |
|
"eval_runtime": 394.4824, |
|
"eval_samples_per_second": 1066.841, |
|
"eval_steps_per_second": 4.167, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 11.557177615571776, |
|
"grad_norm": 1.644453525543213, |
|
"learning_rate": 6.484410758400267e-06, |
|
"loss": 2.2228, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 11.557177615571776, |
|
"eval_loss": 2.1509506702423096, |
|
"eval_runtime": 394.5661, |
|
"eval_samples_per_second": 1066.615, |
|
"eval_steps_per_second": 4.167, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 11.633211678832117, |
|
"grad_norm": 1.7033356428146362, |
|
"learning_rate": 6.213832134635486e-06, |
|
"loss": 2.2217, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 11.633211678832117, |
|
"eval_loss": 2.1477901935577393, |
|
"eval_runtime": 394.5248, |
|
"eval_samples_per_second": 1066.726, |
|
"eval_steps_per_second": 4.167, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 11.709245742092458, |
|
"grad_norm": 1.6563267707824707, |
|
"learning_rate": 5.948216628273909e-06, |
|
"loss": 2.2135, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 11.709245742092458, |
|
"eval_loss": 2.1486401557922363, |
|
"eval_runtime": 394.3353, |
|
"eval_samples_per_second": 1067.239, |
|
"eval_steps_per_second": 4.169, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 11.785279805352799, |
|
"grad_norm": 1.6282879114151, |
|
"learning_rate": 5.687634412272127e-06, |
|
"loss": 2.2254, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 11.785279805352799, |
|
"eval_loss": 2.1465682983398438, |
|
"eval_runtime": 394.4898, |
|
"eval_samples_per_second": 1066.821, |
|
"eval_steps_per_second": 4.167, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 11.861313868613138, |
|
"grad_norm": 1.7813278436660767, |
|
"learning_rate": 5.432154329841835e-06, |
|
"loss": 2.2166, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 11.861313868613138, |
|
"eval_loss": 2.14347505569458, |
|
"eval_runtime": 394.4933, |
|
"eval_samples_per_second": 1066.812, |
|
"eval_steps_per_second": 4.167, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 11.937347931873479, |
|
"grad_norm": 1.723649024963379, |
|
"learning_rate": 5.181843876262127e-06, |
|
"loss": 2.2181, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 11.937347931873479, |
|
"eval_loss": 2.1440093517303467, |
|
"eval_runtime": 394.3682, |
|
"eval_samples_per_second": 1067.15, |
|
"eval_steps_per_second": 4.169, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 12.01338199513382, |
|
"grad_norm": 1.7719519138336182, |
|
"learning_rate": 4.936769181047937e-06, |
|
"loss": 2.2092, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 12.01338199513382, |
|
"eval_loss": 2.141754388809204, |
|
"eval_runtime": 394.1783, |
|
"eval_samples_per_second": 1067.664, |
|
"eval_steps_per_second": 4.171, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 12.08941605839416, |
|
"grad_norm": 1.696637749671936, |
|
"learning_rate": 4.697469206617919e-06, |
|
"loss": 2.2007, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 12.08941605839416, |
|
"eval_loss": 2.1432430744171143, |
|
"eval_runtime": 394.2858, |
|
"eval_samples_per_second": 1067.373, |
|
"eval_steps_per_second": 4.17, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 12.165450121654501, |
|
"grad_norm": 1.6854994297027588, |
|
"learning_rate": 4.463511524513736e-06, |
|
"loss": 2.2084, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 12.165450121654501, |
|
"eval_loss": 2.141733407974243, |
|
"eval_runtime": 394.4029, |
|
"eval_samples_per_second": 1067.056, |
|
"eval_steps_per_second": 4.168, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 12.241484184914842, |
|
"grad_norm": 1.6496477127075195, |
|
"learning_rate": 4.2345051393941574e-06, |
|
"loss": 2.2089, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 12.241484184914842, |
|
"eval_loss": 2.139671802520752, |
|
"eval_runtime": 394.4989, |
|
"eval_samples_per_second": 1066.796, |
|
"eval_steps_per_second": 4.167, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 12.317518248175183, |
|
"grad_norm": 1.6591581106185913, |
|
"learning_rate": 4.010984790046615e-06, |
|
"loss": 2.2058, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 12.317518248175183, |
|
"eval_loss": 2.1399948596954346, |
|
"eval_runtime": 394.4647, |
|
"eval_samples_per_second": 1066.889, |
|
"eval_steps_per_second": 4.168, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 12.393552311435522, |
|
"grad_norm": 1.7192113399505615, |
|
"learning_rate": 3.7930095283087966e-06, |
|
"loss": 2.2059, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 12.393552311435522, |
|
"eval_loss": 2.1405417919158936, |
|
"eval_runtime": 394.3798, |
|
"eval_samples_per_second": 1067.118, |
|
"eval_steps_per_second": 4.169, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 12.469586374695863, |
|
"grad_norm": 1.6483603715896606, |
|
"learning_rate": 3.5806369410618047e-06, |
|
"loss": 2.2144, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 12.469586374695863, |
|
"eval_loss": 2.1386895179748535, |
|
"eval_runtime": 394.4506, |
|
"eval_samples_per_second": 1066.927, |
|
"eval_steps_per_second": 4.168, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 12.545620437956204, |
|
"grad_norm": 1.6323285102844238, |
|
"learning_rate": 3.3739231350162437e-06, |
|
"loss": 2.2076, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 12.545620437956204, |
|
"eval_loss": 2.1366796493530273, |
|
"eval_runtime": 394.6499, |
|
"eval_samples_per_second": 1066.388, |
|
"eval_steps_per_second": 4.166, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 12.621654501216545, |
|
"grad_norm": 1.7512730360031128, |
|
"learning_rate": 3.173318985201379e-06, |
|
"loss": 2.21, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 12.621654501216545, |
|
"eval_loss": 2.1367809772491455, |
|
"eval_runtime": 394.4888, |
|
"eval_samples_per_second": 1066.824, |
|
"eval_steps_per_second": 4.167, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 12.697688564476886, |
|
"grad_norm": 1.7279080152511597, |
|
"learning_rate": 2.9780734823130846e-06, |
|
"loss": 2.2014, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 12.697688564476886, |
|
"eval_loss": 2.136183500289917, |
|
"eval_runtime": 394.5466, |
|
"eval_samples_per_second": 1066.667, |
|
"eval_steps_per_second": 4.167, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 12.773722627737227, |
|
"grad_norm": 1.7061643600463867, |
|
"learning_rate": 2.7886459518572467e-06, |
|
"loss": 2.2073, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 12.773722627737227, |
|
"eval_loss": 2.136634111404419, |
|
"eval_runtime": 394.4488, |
|
"eval_samples_per_second": 1066.932, |
|
"eval_steps_per_second": 4.168, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 12.849756690997566, |
|
"grad_norm": 1.6525273323059082, |
|
"learning_rate": 2.6050864386902433e-06, |
|
"loss": 2.2062, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 12.849756690997566, |
|
"eval_loss": 2.135418653488159, |
|
"eval_runtime": 394.6522, |
|
"eval_samples_per_second": 1066.382, |
|
"eval_steps_per_second": 4.166, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 12.925790754257907, |
|
"grad_norm": 1.753316879272461, |
|
"learning_rate": 2.4274434373970757e-06, |
|
"loss": 2.1969, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 12.925790754257907, |
|
"eval_loss": 2.130448579788208, |
|
"eval_runtime": 394.5649, |
|
"eval_samples_per_second": 1066.618, |
|
"eval_steps_per_second": 4.167, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 13.001824817518248, |
|
"grad_norm": 1.5890535116195679, |
|
"learning_rate": 2.256101256668691e-06, |
|
"loss": 2.2078, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 13.001824817518248, |
|
"eval_loss": 2.1335136890411377, |
|
"eval_runtime": 394.3918, |
|
"eval_samples_per_second": 1067.086, |
|
"eval_steps_per_second": 4.168, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 13.077858880778589, |
|
"grad_norm": 1.7298823595046997, |
|
"learning_rate": 2.0904184363357256e-06, |
|
"loss": 2.203, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 13.077858880778589, |
|
"eval_loss": 2.132927894592285, |
|
"eval_runtime": 394.4131, |
|
"eval_samples_per_second": 1067.029, |
|
"eval_steps_per_second": 4.168, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 13.15389294403893, |
|
"grad_norm": 1.7888143062591553, |
|
"learning_rate": 1.930788098008321e-06, |
|
"loss": 2.1993, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 13.15389294403893, |
|
"eval_loss": 2.1313769817352295, |
|
"eval_runtime": 394.3014, |
|
"eval_samples_per_second": 1067.331, |
|
"eval_steps_per_second": 4.169, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 13.22992700729927, |
|
"grad_norm": 1.7427315711975098, |
|
"learning_rate": 1.7772524144231473e-06, |
|
"loss": 2.2032, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 13.22992700729927, |
|
"eval_loss": 2.135279893875122, |
|
"eval_runtime": 394.1525, |
|
"eval_samples_per_second": 1067.734, |
|
"eval_steps_per_second": 4.171, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 13.305961070559611, |
|
"grad_norm": 1.700643539428711, |
|
"learning_rate": 1.6298519481701192e-06, |
|
"loss": 2.2, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 13.305961070559611, |
|
"eval_loss": 2.130155086517334, |
|
"eval_runtime": 393.7376, |
|
"eval_samples_per_second": 1068.859, |
|
"eval_steps_per_second": 4.175, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 13.38199513381995, |
|
"grad_norm": 1.6336027383804321, |
|
"learning_rate": 1.4889019067080928e-06, |
|
"loss": 2.1964, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 13.38199513381995, |
|
"eval_loss": 2.129770517349243, |
|
"eval_runtime": 394.1127, |
|
"eval_samples_per_second": 1067.842, |
|
"eval_steps_per_second": 4.171, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 13.458029197080291, |
|
"grad_norm": 1.698116421699524, |
|
"learning_rate": 1.3538746100630939e-06, |
|
"loss": 2.1957, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 13.458029197080291, |
|
"eval_loss": 2.1296403408050537, |
|
"eval_runtime": 394.7051, |
|
"eval_samples_per_second": 1066.239, |
|
"eval_steps_per_second": 4.165, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 13.534063260340632, |
|
"grad_norm": 1.7204720973968506, |
|
"learning_rate": 1.2250943829259454e-06, |
|
"loss": 2.1985, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 13.534063260340632, |
|
"eval_loss": 2.131389856338501, |
|
"eval_runtime": 394.7347, |
|
"eval_samples_per_second": 1066.159, |
|
"eval_steps_per_second": 4.165, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 13.610097323600973, |
|
"grad_norm": 1.7444037199020386, |
|
"learning_rate": 1.102595247742902e-06, |
|
"loss": 2.1967, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 13.610097323600973, |
|
"eval_loss": 2.13096284866333, |
|
"eval_runtime": 394.695, |
|
"eval_samples_per_second": 1066.266, |
|
"eval_steps_per_second": 4.165, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 13.686131386861314, |
|
"grad_norm": 1.7652897834777832, |
|
"learning_rate": 9.864095675586272e-07, |
|
"loss": 2.1979, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 13.686131386861314, |
|
"eval_loss": 2.1287431716918945, |
|
"eval_runtime": 394.6791, |
|
"eval_samples_per_second": 1066.309, |
|
"eval_steps_per_second": 4.165, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 13.762165450121655, |
|
"grad_norm": 1.6986685991287231, |
|
"learning_rate": 8.765680374662105e-07, |
|
"loss": 2.2055, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 13.762165450121655, |
|
"eval_loss": 2.128450870513916, |
|
"eval_runtime": 394.7254, |
|
"eval_samples_per_second": 1066.184, |
|
"eval_steps_per_second": 4.165, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 13.838199513381996, |
|
"grad_norm": 1.7826683521270752, |
|
"learning_rate": 7.730996764978071e-07, |
|
"loss": 2.1933, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 13.838199513381996, |
|
"eval_loss": 2.128603935241699, |
|
"eval_runtime": 394.6725, |
|
"eval_samples_per_second": 1066.327, |
|
"eval_steps_per_second": 4.165, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 13.914233576642335, |
|
"grad_norm": 1.7597603797912598, |
|
"learning_rate": 6.76031819958145e-07, |
|
"loss": 2.1945, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 13.914233576642335, |
|
"eval_loss": 2.1281092166900635, |
|
"eval_runtime": 394.6346, |
|
"eval_samples_per_second": 1066.43, |
|
"eval_steps_per_second": 4.166, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 13.990267639902676, |
|
"grad_norm": 1.5649290084838867, |
|
"learning_rate": 5.855649661219098e-07, |
|
"loss": 2.2016, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 13.990267639902676, |
|
"eval_loss": 2.129279613494873, |
|
"eval_runtime": 394.55, |
|
"eval_samples_per_second": 1066.658, |
|
"eval_steps_per_second": 4.167, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 14.066301703163017, |
|
"grad_norm": 1.6939290761947632, |
|
"learning_rate": 5.013604308242548e-07, |
|
"loss": 2.195, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 14.066301703163017, |
|
"eval_loss": 2.1266942024230957, |
|
"eval_runtime": 394.5988, |
|
"eval_samples_per_second": 1066.526, |
|
"eval_steps_per_second": 4.166, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 14.142335766423358, |
|
"grad_norm": 1.6481035947799683, |
|
"learning_rate": 4.236281907425227e-07, |
|
"loss": 2.1939, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 14.142335766423358, |
|
"eval_loss": 2.1291019916534424, |
|
"eval_runtime": 393.3337, |
|
"eval_samples_per_second": 1069.957, |
|
"eval_steps_per_second": 4.18, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 14.218369829683699, |
|
"grad_norm": 1.7540963888168335, |
|
"learning_rate": 3.523887819560451e-07, |
|
"loss": 2.1939, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 14.218369829683699, |
|
"eval_loss": 2.130265474319458, |
|
"eval_runtime": 393.7198, |
|
"eval_samples_per_second": 1068.907, |
|
"eval_steps_per_second": 4.176, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 14.29440389294404, |
|
"grad_norm": 1.7240368127822876, |
|
"learning_rate": 2.876610252031453e-07, |
|
"loss": 2.1907, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 14.29440389294404, |
|
"eval_loss": 2.126887321472168, |
|
"eval_runtime": 393.7098, |
|
"eval_samples_per_second": 1068.934, |
|
"eval_steps_per_second": 4.176, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 14.37043795620438, |
|
"grad_norm": 1.6906523704528809, |
|
"learning_rate": 2.2946202090889657e-07, |
|
"loss": 2.1999, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 14.37043795620438, |
|
"eval_loss": 2.126722812652588, |
|
"eval_runtime": 393.685, |
|
"eval_samples_per_second": 1069.002, |
|
"eval_steps_per_second": 4.176, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 14.44647201946472, |
|
"grad_norm": 1.7347662448883057, |
|
"learning_rate": 1.7790391402128793e-07, |
|
"loss": 2.1989, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 14.44647201946472, |
|
"eval_loss": 2.1272239685058594, |
|
"eval_runtime": 393.5995, |
|
"eval_samples_per_second": 1069.234, |
|
"eval_steps_per_second": 4.177, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 14.52250608272506, |
|
"grad_norm": 1.64090096950531, |
|
"learning_rate": 1.327936845155059e-07, |
|
"loss": 2.1963, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 14.52250608272506, |
|
"eval_loss": 2.126425266265869, |
|
"eval_runtime": 394.5292, |
|
"eval_samples_per_second": 1066.715, |
|
"eval_steps_per_second": 4.167, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 14.598540145985401, |
|
"grad_norm": 1.6597987413406372, |
|
"learning_rate": 9.425312186875923e-08, |
|
"loss": 2.1987, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 14.598540145985401, |
|
"eval_loss": 2.1285743713378906, |
|
"eval_runtime": 394.6211, |
|
"eval_samples_per_second": 1066.466, |
|
"eval_steps_per_second": 4.166, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 14.674574209245742, |
|
"grad_norm": 1.6827759742736816, |
|
"learning_rate": 6.2292408111711e-08, |
|
"loss": 2.2012, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 14.674574209245742, |
|
"eval_loss": 2.1267669200897217, |
|
"eval_runtime": 394.6661, |
|
"eval_samples_per_second": 1066.344, |
|
"eval_steps_per_second": 4.166, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 14.750608272506083, |
|
"grad_norm": 1.9470024108886719, |
|
"learning_rate": 3.691998694484722e-08, |
|
"loss": 2.2013, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 14.750608272506083, |
|
"eval_loss": 2.128140449523926, |
|
"eval_runtime": 394.6676, |
|
"eval_samples_per_second": 1066.34, |
|
"eval_steps_per_second": 4.166, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 14.826642335766424, |
|
"grad_norm": 1.6369675397872925, |
|
"learning_rate": 1.817353096532637e-08, |
|
"loss": 2.1923, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 14.826642335766424, |
|
"eval_loss": 2.128028392791748, |
|
"eval_runtime": 394.6764, |
|
"eval_samples_per_second": 1066.317, |
|
"eval_steps_per_second": 4.165, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 14.902676399026763, |
|
"grad_norm": 1.7755557298660278, |
|
"learning_rate": 5.982858360498167e-09, |
|
"loss": 2.1966, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 14.902676399026763, |
|
"eval_loss": 2.1286511421203613, |
|
"eval_runtime": 393.4618, |
|
"eval_samples_per_second": 1069.608, |
|
"eval_steps_per_second": 4.178, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 14.978710462287104, |
|
"grad_norm": 1.7456624507904053, |
|
"learning_rate": 3.953547649482303e-10, |
|
"loss": 2.1987, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 14.978710462287104, |
|
"eval_loss": 2.127889394760132, |
|
"eval_runtime": 393.3437, |
|
"eval_samples_per_second": 1069.93, |
|
"eval_steps_per_second": 4.18, |
|
"step": 98500 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 98640, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 15, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.646405662995644e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|