DA-BERT_Old_News_V1 / trainer_state.json
SirMappel's picture
Upload folder using huggingface_hub
2b7f874 verified
{
"best_metric": 2.126425266265869,
"best_model_checkpoint": "/work/Ccp-OldNewsBERT_2024/modelling/checkpoint-95500",
"epoch": 15.0,
"eval_steps": 500,
"global_step": 98640,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07603406326034064,
"grad_norm": 1.1528505086898804,
"learning_rate": 1.25e-05,
"loss": 8.5532,
"step": 500
},
{
"epoch": 0.07603406326034064,
"eval_loss": 7.3156208992004395,
"eval_runtime": 392.108,
"eval_samples_per_second": 1073.301,
"eval_steps_per_second": 4.193,
"step": 500
},
{
"epoch": 0.15206812652068127,
"grad_norm": 1.3152525424957275,
"learning_rate": 2.5e-05,
"loss": 7.0493,
"step": 1000
},
{
"epoch": 0.15206812652068127,
"eval_loss": 6.841813087463379,
"eval_runtime": 392.0485,
"eval_samples_per_second": 1073.464,
"eval_steps_per_second": 4.193,
"step": 1000
},
{
"epoch": 0.2281021897810219,
"grad_norm": 1.7870614528656006,
"learning_rate": 3.7500000000000003e-05,
"loss": 6.7805,
"step": 1500
},
{
"epoch": 0.2281021897810219,
"eval_loss": 6.62256383895874,
"eval_runtime": 391.9186,
"eval_samples_per_second": 1073.82,
"eval_steps_per_second": 4.195,
"step": 1500
},
{
"epoch": 0.30413625304136255,
"grad_norm": 1.1754438877105713,
"learning_rate": 5e-05,
"loss": 6.5763,
"step": 2000
},
{
"epoch": 0.30413625304136255,
"eval_loss": 6.442608833312988,
"eval_runtime": 391.8632,
"eval_samples_per_second": 1073.972,
"eval_steps_per_second": 4.195,
"step": 2000
},
{
"epoch": 0.38017031630170317,
"grad_norm": 1.4492470026016235,
"learning_rate": 4.999669762518974e-05,
"loss": 6.4084,
"step": 2500
},
{
"epoch": 0.38017031630170317,
"eval_loss": 6.2790422439575195,
"eval_runtime": 395.0617,
"eval_samples_per_second": 1065.277,
"eval_steps_per_second": 4.161,
"step": 2500
},
{
"epoch": 0.4562043795620438,
"grad_norm": 1.4527273178100586,
"learning_rate": 4.9986791373213283e-05,
"loss": 6.2422,
"step": 3000
},
{
"epoch": 0.4562043795620438,
"eval_loss": 6.124966621398926,
"eval_runtime": 394.995,
"eval_samples_per_second": 1065.457,
"eval_steps_per_second": 4.162,
"step": 3000
},
{
"epoch": 0.5322384428223844,
"grad_norm": 1.8543823957443237,
"learning_rate": 4.997028386120321e-05,
"loss": 6.0635,
"step": 3500
},
{
"epoch": 0.5322384428223844,
"eval_loss": 5.868374347686768,
"eval_runtime": 394.8994,
"eval_samples_per_second": 1065.715,
"eval_steps_per_second": 4.163,
"step": 3500
},
{
"epoch": 0.6082725060827251,
"grad_norm": 1.973868489265442,
"learning_rate": 4.994717945027886e-05,
"loss": 5.7633,
"step": 4000
},
{
"epoch": 0.6082725060827251,
"eval_loss": 5.493896961212158,
"eval_runtime": 394.9221,
"eval_samples_per_second": 1065.653,
"eval_steps_per_second": 4.163,
"step": 4000
},
{
"epoch": 0.6843065693430657,
"grad_norm": 1.8778235912322998,
"learning_rate": 4.99174842443942e-05,
"loss": 5.429,
"step": 4500
},
{
"epoch": 0.6843065693430657,
"eval_loss": 5.116617679595947,
"eval_runtime": 394.8716,
"eval_samples_per_second": 1065.79,
"eval_steps_per_second": 4.163,
"step": 4500
},
{
"epoch": 0.7603406326034063,
"grad_norm": 2.0149049758911133,
"learning_rate": 4.9881206088725227e-05,
"loss": 5.0808,
"step": 5000
},
{
"epoch": 0.7603406326034063,
"eval_loss": 4.774472713470459,
"eval_runtime": 395.1882,
"eval_samples_per_second": 1064.936,
"eval_steps_per_second": 4.16,
"step": 5000
},
{
"epoch": 0.8363746958637469,
"grad_norm": 1.7959963083267212,
"learning_rate": 4.983835456759734e-05,
"loss": 4.7725,
"step": 5500
},
{
"epoch": 0.8363746958637469,
"eval_loss": 4.4951066970825195,
"eval_runtime": 395.052,
"eval_samples_per_second": 1065.303,
"eval_steps_per_second": 4.161,
"step": 5500
},
{
"epoch": 0.9124087591240876,
"grad_norm": 1.6965287923812866,
"learning_rate": 4.978894100195325e-05,
"loss": 4.5344,
"step": 6000
},
{
"epoch": 0.9124087591240876,
"eval_loss": 4.28698205947876,
"eval_runtime": 395.1764,
"eval_samples_per_second": 1064.967,
"eval_steps_per_second": 4.16,
"step": 6000
},
{
"epoch": 0.9884428223844283,
"grad_norm": 1.6758971214294434,
"learning_rate": 4.973297844636212e-05,
"loss": 4.3347,
"step": 6500
},
{
"epoch": 0.9884428223844283,
"eval_loss": 4.129937648773193,
"eval_runtime": 395.104,
"eval_samples_per_second": 1065.163,
"eval_steps_per_second": 4.161,
"step": 6500
},
{
"epoch": 1.0644768856447688,
"grad_norm": 1.7391337156295776,
"learning_rate": 4.9670481685570645e-05,
"loss": 4.1883,
"step": 7000
},
{
"epoch": 1.0644768856447688,
"eval_loss": 4.002706050872803,
"eval_runtime": 395.1014,
"eval_samples_per_second": 1065.17,
"eval_steps_per_second": 4.161,
"step": 7000
},
{
"epoch": 1.1405109489051095,
"grad_norm": 1.592909812927246,
"learning_rate": 4.960146723059713e-05,
"loss": 4.0579,
"step": 7500
},
{
"epoch": 1.1405109489051095,
"eval_loss": 3.8906095027923584,
"eval_runtime": 395.0202,
"eval_samples_per_second": 1065.389,
"eval_steps_per_second": 4.162,
"step": 7500
},
{
"epoch": 1.2165450121654502,
"grad_norm": 1.7625865936279297,
"learning_rate": 4.952595331436939e-05,
"loss": 3.9484,
"step": 8000
},
{
"epoch": 1.2165450121654502,
"eval_loss": 3.793649673461914,
"eval_runtime": 395.0939,
"eval_samples_per_second": 1065.19,
"eval_steps_per_second": 4.161,
"step": 8000
},
{
"epoch": 1.2925790754257909,
"grad_norm": 1.5408483743667603,
"learning_rate": 4.9443959886907786e-05,
"loss": 3.8541,
"step": 8500
},
{
"epoch": 1.2925790754257909,
"eval_loss": 3.707909107208252,
"eval_runtime": 395.0107,
"eval_samples_per_second": 1065.414,
"eval_steps_per_second": 4.162,
"step": 8500
},
{
"epoch": 1.3686131386861313,
"grad_norm": 1.5377788543701172,
"learning_rate": 4.935550861005469e-05,
"loss": 3.7751,
"step": 9000
},
{
"epoch": 1.3686131386861313,
"eval_loss": 3.6381478309631348,
"eval_runtime": 395.0196,
"eval_samples_per_second": 1065.39,
"eval_steps_per_second": 4.162,
"step": 9000
},
{
"epoch": 1.444647201946472,
"grad_norm": 1.5185712575912476,
"learning_rate": 4.926062285175158e-05,
"loss": 3.7,
"step": 9500
},
{
"epoch": 1.444647201946472,
"eval_loss": 3.56645131111145,
"eval_runtime": 395.0168,
"eval_samples_per_second": 1065.398,
"eval_steps_per_second": 4.162,
"step": 9500
},
{
"epoch": 1.5206812652068127,
"grad_norm": 1.4640849828720093,
"learning_rate": 4.9159536649297986e-05,
"loss": 3.6397,
"step": 10000
},
{
"epoch": 1.5206812652068127,
"eval_loss": 3.5038576126098633,
"eval_runtime": 395.2609,
"eval_samples_per_second": 1064.74,
"eval_steps_per_second": 4.159,
"step": 10000
},
{
"epoch": 1.5967153284671531,
"grad_norm": 1.7205146551132202,
"learning_rate": 4.9051871562474056e-05,
"loss": 3.5783,
"step": 10500
},
{
"epoch": 1.5967153284671531,
"eval_loss": 3.4472110271453857,
"eval_runtime": 395.1986,
"eval_samples_per_second": 1064.908,
"eval_steps_per_second": 4.16,
"step": 10500
},
{
"epoch": 1.672749391727494,
"grad_norm": 1.605870008468628,
"learning_rate": 4.8937852212067106e-05,
"loss": 3.5196,
"step": 11000
},
{
"epoch": 1.672749391727494,
"eval_loss": 3.3966190814971924,
"eval_runtime": 395.1285,
"eval_samples_per_second": 1065.097,
"eval_steps_per_second": 4.161,
"step": 11000
},
{
"epoch": 1.7487834549878345,
"grad_norm": 1.6770403385162354,
"learning_rate": 4.8817508720847596e-05,
"loss": 3.4701,
"step": 11500
},
{
"epoch": 1.7487834549878345,
"eval_loss": 3.34128999710083,
"eval_runtime": 395.1091,
"eval_samples_per_second": 1065.149,
"eval_steps_per_second": 4.161,
"step": 11500
},
{
"epoch": 1.8248175182481752,
"grad_norm": 1.5218740701675415,
"learning_rate": 4.869087288236064e-05,
"loss": 3.4226,
"step": 12000
},
{
"epoch": 1.8248175182481752,
"eval_loss": 3.301135540008545,
"eval_runtime": 395.0668,
"eval_samples_per_second": 1065.263,
"eval_steps_per_second": 4.161,
"step": 12000
},
{
"epoch": 1.9008515815085159,
"grad_norm": 1.528290867805481,
"learning_rate": 4.855797815252648e-05,
"loss": 3.3704,
"step": 12500
},
{
"epoch": 1.9008515815085159,
"eval_loss": 3.255563735961914,
"eval_runtime": 395.0617,
"eval_samples_per_second": 1065.277,
"eval_steps_per_second": 4.161,
"step": 12500
},
{
"epoch": 1.9768856447688563,
"grad_norm": 1.4962824583053589,
"learning_rate": 4.8418859640801796e-05,
"loss": 3.3326,
"step": 13000
},
{
"epoch": 1.9768856447688563,
"eval_loss": 3.2163586616516113,
"eval_runtime": 395.1594,
"eval_samples_per_second": 1065.013,
"eval_steps_per_second": 4.16,
"step": 13000
},
{
"epoch": 2.052919708029197,
"grad_norm": 1.5214394330978394,
"learning_rate": 4.8273554100904066e-05,
"loss": 3.2872,
"step": 13500
},
{
"epoch": 2.052919708029197,
"eval_loss": 3.178077220916748,
"eval_runtime": 395.023,
"eval_samples_per_second": 1065.381,
"eval_steps_per_second": 4.162,
"step": 13500
},
{
"epoch": 2.1289537712895377,
"grad_norm": 1.6362810134887695,
"learning_rate": 4.8122408939478185e-05,
"loss": 3.2453,
"step": 14000
},
{
"epoch": 2.1289537712895377,
"eval_loss": 3.1436197757720947,
"eval_runtime": 395.163,
"eval_samples_per_second": 1065.004,
"eval_steps_per_second": 4.16,
"step": 14000
},
{
"epoch": 2.204987834549878,
"grad_norm": 1.6314831972122192,
"learning_rate": 4.79651794790509e-05,
"loss": 3.2149,
"step": 14500
},
{
"epoch": 2.204987834549878,
"eval_loss": 3.1076748371124268,
"eval_runtime": 395.3195,
"eval_samples_per_second": 1064.582,
"eval_steps_per_second": 4.159,
"step": 14500
},
{
"epoch": 2.281021897810219,
"grad_norm": 1.5647250413894653,
"learning_rate": 4.7801573854264494e-05,
"loss": 3.1836,
"step": 15000
},
{
"epoch": 2.281021897810219,
"eval_loss": 3.081753969192505,
"eval_runtime": 395.1551,
"eval_samples_per_second": 1065.025,
"eval_steps_per_second": 4.16,
"step": 15000
},
{
"epoch": 2.3570559610705595,
"grad_norm": 1.559869408607483,
"learning_rate": 4.763194428202762e-05,
"loss": 3.1459,
"step": 15500
},
{
"epoch": 2.3570559610705595,
"eval_loss": 3.044140100479126,
"eval_runtime": 395.2791,
"eval_samples_per_second": 1064.691,
"eval_steps_per_second": 4.159,
"step": 15500
},
{
"epoch": 2.4330900243309004,
"grad_norm": 1.669546365737915,
"learning_rate": 4.745633557677441e-05,
"loss": 3.1298,
"step": 16000
},
{
"epoch": 2.4330900243309004,
"eval_loss": 3.015268325805664,
"eval_runtime": 395.2158,
"eval_samples_per_second": 1064.861,
"eval_steps_per_second": 4.16,
"step": 16000
},
{
"epoch": 2.509124087591241,
"grad_norm": 1.5877552032470703,
"learning_rate": 4.727479413256602e-05,
"loss": 3.0882,
"step": 16500
},
{
"epoch": 2.509124087591241,
"eval_loss": 2.9866795539855957,
"eval_runtime": 395.1307,
"eval_samples_per_second": 1065.091,
"eval_steps_per_second": 4.161,
"step": 16500
},
{
"epoch": 2.5851581508515817,
"grad_norm": 1.6820305585861206,
"learning_rate": 4.708736791083384e-05,
"loss": 3.0738,
"step": 17000
},
{
"epoch": 2.5851581508515817,
"eval_loss": 2.957209587097168,
"eval_runtime": 395.2085,
"eval_samples_per_second": 1064.881,
"eval_steps_per_second": 4.16,
"step": 17000
},
{
"epoch": 2.661192214111922,
"grad_norm": 1.4878249168395996,
"learning_rate": 4.6894106427708574e-05,
"loss": 3.0409,
"step": 17500
},
{
"epoch": 2.661192214111922,
"eval_loss": 2.931816339492798,
"eval_runtime": 395.1436,
"eval_samples_per_second": 1065.056,
"eval_steps_per_second": 4.161,
"step": 17500
},
{
"epoch": 2.7372262773722627,
"grad_norm": 1.5256247520446777,
"learning_rate": 4.669546457024816e-05,
"loss": 3.0155,
"step": 18000
},
{
"epoch": 2.7372262773722627,
"eval_loss": 2.9121601581573486,
"eval_runtime": 395.308,
"eval_samples_per_second": 1064.613,
"eval_steps_per_second": 4.159,
"step": 18000
},
{
"epoch": 2.8132603406326036,
"grad_norm": 1.6648399829864502,
"learning_rate": 4.649069867545623e-05,
"loss": 2.9909,
"step": 18500
},
{
"epoch": 2.8132603406326036,
"eval_loss": 2.890857219696045,
"eval_runtime": 395.436,
"eval_samples_per_second": 1064.268,
"eval_steps_per_second": 4.157,
"step": 18500
},
{
"epoch": 2.889294403892944,
"grad_norm": 1.6078656911849976,
"learning_rate": 4.628025515330744e-05,
"loss": 2.9754,
"step": 19000
},
{
"epoch": 2.889294403892944,
"eval_loss": 2.865665912628174,
"eval_runtime": 395.2576,
"eval_samples_per_second": 1064.749,
"eval_steps_per_second": 4.159,
"step": 19000
},
{
"epoch": 2.9653284671532845,
"grad_norm": 1.595712661743164,
"learning_rate": 4.60641896008727e-05,
"loss": 2.9512,
"step": 19500
},
{
"epoch": 2.9653284671532845,
"eval_loss": 2.8427441120147705,
"eval_runtime": 395.1474,
"eval_samples_per_second": 1065.046,
"eval_steps_per_second": 4.16,
"step": 19500
},
{
"epoch": 3.0413625304136254,
"grad_norm": 1.5582592487335205,
"learning_rate": 4.584255910050703e-05,
"loss": 2.9132,
"step": 20000
},
{
"epoch": 3.0413625304136254,
"eval_loss": 2.821183681488037,
"eval_runtime": 395.3458,
"eval_samples_per_second": 1064.511,
"eval_steps_per_second": 4.158,
"step": 20000
},
{
"epoch": 3.117396593673966,
"grad_norm": 1.6548606157302856,
"learning_rate": 4.561588193429872e-05,
"loss": 2.9021,
"step": 20500
},
{
"epoch": 3.117396593673966,
"eval_loss": 2.802894115447998,
"eval_runtime": 395.4185,
"eval_samples_per_second": 1064.315,
"eval_steps_per_second": 4.158,
"step": 20500
},
{
"epoch": 3.1934306569343067,
"grad_norm": 1.6921550035476685,
"learning_rate": 4.538330948241111e-05,
"loss": 2.8889,
"step": 21000
},
{
"epoch": 3.1934306569343067,
"eval_loss": 2.7827913761138916,
"eval_runtime": 395.1602,
"eval_samples_per_second": 1065.011,
"eval_steps_per_second": 4.16,
"step": 21000
},
{
"epoch": 3.269464720194647,
"grad_norm": 1.7307897806167603,
"learning_rate": 4.514535196430073e-05,
"loss": 2.8642,
"step": 21500
},
{
"epoch": 3.269464720194647,
"eval_loss": 2.767017126083374,
"eval_runtime": 395.2268,
"eval_samples_per_second": 1064.832,
"eval_steps_per_second": 4.16,
"step": 21500
},
{
"epoch": 3.345498783454988,
"grad_norm": 1.7314034700393677,
"learning_rate": 4.490207224596068e-05,
"loss": 2.8517,
"step": 22000
},
{
"epoch": 3.345498783454988,
"eval_loss": 2.747631311416626,
"eval_runtime": 395.1865,
"eval_samples_per_second": 1064.94,
"eval_steps_per_second": 4.16,
"step": 22000
},
{
"epoch": 3.4215328467153285,
"grad_norm": 1.7844088077545166,
"learning_rate": 4.465353459945605e-05,
"loss": 2.8341,
"step": 22500
},
{
"epoch": 3.4215328467153285,
"eval_loss": 2.7319579124450684,
"eval_runtime": 395.3244,
"eval_samples_per_second": 1064.569,
"eval_steps_per_second": 4.159,
"step": 22500
},
{
"epoch": 3.497566909975669,
"grad_norm": 1.5570697784423828,
"learning_rate": 4.43998046859439e-05,
"loss": 2.8102,
"step": 23000
},
{
"epoch": 3.497566909975669,
"eval_loss": 2.7134299278259277,
"eval_runtime": 395.3813,
"eval_samples_per_second": 1064.416,
"eval_steps_per_second": 4.158,
"step": 23000
},
{
"epoch": 3.57360097323601,
"grad_norm": 1.5903196334838867,
"learning_rate": 4.414094953832625e-05,
"loss": 2.7942,
"step": 23500
},
{
"epoch": 3.57360097323601,
"eval_loss": 2.696880340576172,
"eval_runtime": 395.3996,
"eval_samples_per_second": 1064.366,
"eval_steps_per_second": 4.158,
"step": 23500
},
{
"epoch": 3.6496350364963503,
"grad_norm": 1.7155580520629883,
"learning_rate": 4.387703754354059e-05,
"loss": 2.7893,
"step": 24000
},
{
"epoch": 3.6496350364963503,
"eval_loss": 2.6850531101226807,
"eval_runtime": 395.3598,
"eval_samples_per_second": 1064.473,
"eval_steps_per_second": 4.158,
"step": 24000
},
{
"epoch": 3.725669099756691,
"grad_norm": 1.5948296785354614,
"learning_rate": 4.3608681152880126e-05,
"loss": 2.7681,
"step": 24500
},
{
"epoch": 3.725669099756691,
"eval_loss": 2.66740345954895,
"eval_runtime": 395.2365,
"eval_samples_per_second": 1064.805,
"eval_steps_per_second": 4.16,
"step": 24500
},
{
"epoch": 3.8017031630170317,
"grad_norm": 1.5723962783813477,
"learning_rate": 4.333487571042728e-05,
"loss": 2.7577,
"step": 25000
},
{
"epoch": 3.8017031630170317,
"eval_loss": 2.654303789138794,
"eval_runtime": 395.3645,
"eval_samples_per_second": 1064.461,
"eval_steps_per_second": 4.158,
"step": 25000
},
{
"epoch": 3.877737226277372,
"grad_norm": 1.6151896715164185,
"learning_rate": 4.3056226377438776e-05,
"loss": 2.7427,
"step": 25500
},
{
"epoch": 3.877737226277372,
"eval_loss": 2.643014669418335,
"eval_runtime": 395.2931,
"eval_samples_per_second": 1064.653,
"eval_steps_per_second": 4.159,
"step": 25500
},
{
"epoch": 3.9537712895377126,
"grad_norm": 1.670333743095398,
"learning_rate": 4.27728067702777e-05,
"loss": 2.7302,
"step": 26000
},
{
"epoch": 3.9537712895377126,
"eval_loss": 2.6284077167510986,
"eval_runtime": 395.1357,
"eval_samples_per_second": 1065.077,
"eval_steps_per_second": 4.161,
"step": 26000
},
{
"epoch": 4.0298053527980535,
"grad_norm": 1.551099419593811,
"learning_rate": 4.248469176556575e-05,
"loss": 2.7106,
"step": 26500
},
{
"epoch": 4.0298053527980535,
"eval_loss": 2.616875171661377,
"eval_runtime": 395.4614,
"eval_samples_per_second": 1064.2,
"eval_steps_per_second": 4.157,
"step": 26500
},
{
"epoch": 4.105839416058394,
"grad_norm": 1.6209259033203125,
"learning_rate": 4.219313751705213e-05,
"loss": 2.7007,
"step": 27000
},
{
"epoch": 4.105839416058394,
"eval_loss": 2.6068313121795654,
"eval_runtime": 395.5688,
"eval_samples_per_second": 1063.911,
"eval_steps_per_second": 4.156,
"step": 27000
},
{
"epoch": 4.181873479318734,
"grad_norm": 1.616698980331421,
"learning_rate": 4.189587930102075e-05,
"loss": 2.6858,
"step": 27500
},
{
"epoch": 4.181873479318734,
"eval_loss": 2.5947837829589844,
"eval_runtime": 395.5316,
"eval_samples_per_second": 1064.011,
"eval_steps_per_second": 4.156,
"step": 27500
},
{
"epoch": 4.257907542579075,
"grad_norm": 1.6252193450927734,
"learning_rate": 4.1594157362893294e-05,
"loss": 2.6748,
"step": 28000
},
{
"epoch": 4.257907542579075,
"eval_loss": 2.5821821689605713,
"eval_runtime": 395.6317,
"eval_samples_per_second": 1063.742,
"eval_steps_per_second": 4.155,
"step": 28000
},
{
"epoch": 4.333941605839416,
"grad_norm": 1.5178853273391724,
"learning_rate": 4.1288051414584004e-05,
"loss": 2.672,
"step": 28500
},
{
"epoch": 4.333941605839416,
"eval_loss": 2.566763162612915,
"eval_runtime": 395.3024,
"eval_samples_per_second": 1064.628,
"eval_steps_per_second": 4.159,
"step": 28500
},
{
"epoch": 4.409975669099756,
"grad_norm": 1.6428803205490112,
"learning_rate": 4.097764232621873e-05,
"loss": 2.6498,
"step": 29000
},
{
"epoch": 4.409975669099756,
"eval_loss": 2.560192823410034,
"eval_runtime": 395.2916,
"eval_samples_per_second": 1064.657,
"eval_steps_per_second": 4.159,
"step": 29000
},
{
"epoch": 4.486009732360097,
"grad_norm": 1.546608805656433,
"learning_rate": 4.066301210476981e-05,
"loss": 2.6422,
"step": 29500
},
{
"epoch": 4.486009732360097,
"eval_loss": 2.5504369735717773,
"eval_runtime": 395.4025,
"eval_samples_per_second": 1064.358,
"eval_steps_per_second": 4.158,
"step": 29500
},
{
"epoch": 4.562043795620438,
"grad_norm": 1.6463203430175781,
"learning_rate": 4.034424387239068e-05,
"loss": 2.6334,
"step": 30000
},
{
"epoch": 4.562043795620438,
"eval_loss": 2.540264844894409,
"eval_runtime": 395.609,
"eval_samples_per_second": 1063.803,
"eval_steps_per_second": 4.156,
"step": 30000
},
{
"epoch": 4.638077858880779,
"grad_norm": 1.69281005859375,
"learning_rate": 4.002142184445579e-05,
"loss": 2.6246,
"step": 30500
},
{
"epoch": 4.638077858880779,
"eval_loss": 2.529710292816162,
"eval_runtime": 395.534,
"eval_samples_per_second": 1064.005,
"eval_steps_per_second": 4.156,
"step": 30500
},
{
"epoch": 4.714111922141119,
"grad_norm": 1.4954875707626343,
"learning_rate": 3.969594626065171e-05,
"loss": 2.6194,
"step": 31000
},
{
"epoch": 4.714111922141119,
"eval_loss": 2.5173487663269043,
"eval_runtime": 395.5366,
"eval_samples_per_second": 1063.998,
"eval_steps_per_second": 4.156,
"step": 31000
},
{
"epoch": 4.79014598540146,
"grad_norm": 1.586890459060669,
"learning_rate": 3.936528890443755e-05,
"loss": 2.6044,
"step": 31500
},
{
"epoch": 4.79014598540146,
"eval_loss": 2.509347438812256,
"eval_runtime": 395.6037,
"eval_samples_per_second": 1063.817,
"eval_steps_per_second": 4.156,
"step": 31500
},
{
"epoch": 4.866180048661801,
"grad_norm": 1.4862339496612549,
"learning_rate": 3.903083638276577e-05,
"loss": 2.585,
"step": 32000
},
{
"epoch": 4.866180048661801,
"eval_loss": 2.498917579650879,
"eval_runtime": 395.4783,
"eval_samples_per_second": 1064.154,
"eval_steps_per_second": 4.157,
"step": 32000
},
{
"epoch": 4.942214111922141,
"grad_norm": 1.6119396686553955,
"learning_rate": 3.869267705464299e-05,
"loss": 2.5825,
"step": 32500
},
{
"epoch": 4.942214111922141,
"eval_loss": 2.4927380084991455,
"eval_runtime": 395.5817,
"eval_samples_per_second": 1063.876,
"eval_steps_per_second": 4.156,
"step": 32500
},
{
"epoch": 5.018248175182482,
"grad_norm": 1.5895634889602661,
"learning_rate": 3.835090025837699e-05,
"loss": 2.5708,
"step": 33000
},
{
"epoch": 5.018248175182482,
"eval_loss": 2.4862186908721924,
"eval_runtime": 395.6219,
"eval_samples_per_second": 1063.768,
"eval_steps_per_second": 4.155,
"step": 33000
},
{
"epoch": 5.094282238442823,
"grad_norm": 1.6652857065200806,
"learning_rate": 3.800559628797438e-05,
"loss": 2.5612,
"step": 33500
},
{
"epoch": 5.094282238442823,
"eval_loss": 2.475658416748047,
"eval_runtime": 394.9698,
"eval_samples_per_second": 1065.525,
"eval_steps_per_second": 4.162,
"step": 33500
},
{
"epoch": 5.170316301703163,
"grad_norm": 1.6712974309921265,
"learning_rate": 3.765685636928585e-05,
"loss": 2.5508,
"step": 34000
},
{
"epoch": 5.170316301703163,
"eval_loss": 2.4684622287750244,
"eval_runtime": 394.7029,
"eval_samples_per_second": 1066.245,
"eval_steps_per_second": 4.165,
"step": 34000
},
{
"epoch": 5.2463503649635035,
"grad_norm": 1.7370678186416626,
"learning_rate": 3.7305480078818275e-05,
"loss": 2.5517,
"step": 34500
},
{
"epoch": 5.2463503649635035,
"eval_loss": 2.4651219844818115,
"eval_runtime": 395.1235,
"eval_samples_per_second": 1065.11,
"eval_steps_per_second": 4.161,
"step": 34500
},
{
"epoch": 5.322384428223844,
"grad_norm": 1.6240907907485962,
"learning_rate": 3.6950151955931227e-05,
"loss": 2.536,
"step": 35000
},
{
"epoch": 5.322384428223844,
"eval_loss": 2.4535937309265137,
"eval_runtime": 394.9571,
"eval_samples_per_second": 1065.559,
"eval_steps_per_second": 4.162,
"step": 35000
},
{
"epoch": 5.398418491484185,
"grad_norm": 1.8107973337173462,
"learning_rate": 3.659166672258033e-05,
"loss": 2.5362,
"step": 35500
},
{
"epoch": 5.398418491484185,
"eval_loss": 2.4444773197174072,
"eval_runtime": 395.3,
"eval_samples_per_second": 1064.635,
"eval_steps_per_second": 4.159,
"step": 35500
},
{
"epoch": 5.474452554744525,
"grad_norm": 1.550801396369934,
"learning_rate": 3.623011908697394e-05,
"loss": 2.5267,
"step": 36000
},
{
"epoch": 5.474452554744525,
"eval_loss": 2.4367120265960693,
"eval_runtime": 395.2579,
"eval_samples_per_second": 1064.748,
"eval_steps_per_second": 4.159,
"step": 36000
},
{
"epoch": 5.550486618004866,
"grad_norm": 1.4852931499481201,
"learning_rate": 3.5866336492488555e-05,
"loss": 2.5165,
"step": 36500
},
{
"epoch": 5.550486618004866,
"eval_loss": 2.431751251220703,
"eval_runtime": 395.2301,
"eval_samples_per_second": 1064.823,
"eval_steps_per_second": 4.16,
"step": 36500
},
{
"epoch": 5.626520681265207,
"grad_norm": 1.603376865386963,
"learning_rate": 3.5498957032536564e-05,
"loss": 2.5194,
"step": 37000
},
{
"epoch": 5.626520681265207,
"eval_loss": 2.4255075454711914,
"eval_runtime": 395.4389,
"eval_samples_per_second": 1064.26,
"eval_steps_per_second": 4.157,
"step": 37000
},
{
"epoch": 5.702554744525547,
"grad_norm": 1.6579174995422363,
"learning_rate": 3.512880385328552e-05,
"loss": 2.5063,
"step": 37500
},
{
"epoch": 5.702554744525547,
"eval_loss": 2.4162917137145996,
"eval_runtime": 395.2478,
"eval_samples_per_second": 1064.775,
"eval_steps_per_second": 4.159,
"step": 37500
},
{
"epoch": 5.778588807785888,
"grad_norm": 1.6467429399490356,
"learning_rate": 3.475597474549821e-05,
"loss": 2.4969,
"step": 38000
},
{
"epoch": 5.778588807785888,
"eval_loss": 2.4108052253723145,
"eval_runtime": 395.1001,
"eval_samples_per_second": 1065.173,
"eval_steps_per_second": 4.161,
"step": 38000
},
{
"epoch": 5.854622871046229,
"grad_norm": 1.6167348623275757,
"learning_rate": 3.438056820689096e-05,
"loss": 2.492,
"step": 38500
},
{
"epoch": 5.854622871046229,
"eval_loss": 2.402526617050171,
"eval_runtime": 395.2077,
"eval_samples_per_second": 1064.883,
"eval_steps_per_second": 4.16,
"step": 38500
},
{
"epoch": 5.930656934306569,
"grad_norm": 1.7401496171951294,
"learning_rate": 3.400344159273908e-05,
"loss": 2.4729,
"step": 39000
},
{
"epoch": 5.930656934306569,
"eval_loss": 2.3961234092712402,
"eval_runtime": 395.2683,
"eval_samples_per_second": 1064.72,
"eval_steps_per_second": 4.159,
"step": 39000
},
{
"epoch": 6.00669099756691,
"grad_norm": 1.7321972846984863,
"learning_rate": 3.3623183039946427e-05,
"loss": 2.4753,
"step": 39500
},
{
"epoch": 6.00669099756691,
"eval_loss": 2.390777826309204,
"eval_runtime": 395.3927,
"eval_samples_per_second": 1064.385,
"eval_steps_per_second": 4.158,
"step": 39500
},
{
"epoch": 6.082725060827251,
"grad_norm": 1.6455748081207275,
"learning_rate": 3.3240646328557325e-05,
"loss": 2.4653,
"step": 40000
},
{
"epoch": 6.082725060827251,
"eval_loss": 2.385394334793091,
"eval_runtime": 395.2314,
"eval_samples_per_second": 1064.819,
"eval_steps_per_second": 4.16,
"step": 40000
},
{
"epoch": 6.158759124087592,
"grad_norm": 1.6246484518051147,
"learning_rate": 3.2855932520939756e-05,
"loss": 2.4552,
"step": 40500
},
{
"epoch": 6.158759124087592,
"eval_loss": 2.3780696392059326,
"eval_runtime": 395.2284,
"eval_samples_per_second": 1064.827,
"eval_steps_per_second": 4.16,
"step": 40500
},
{
"epoch": 6.234793187347932,
"grad_norm": 1.6907716989517212,
"learning_rate": 3.246914325462873e-05,
"loss": 2.4577,
"step": 41000
},
{
"epoch": 6.234793187347932,
"eval_loss": 2.3710057735443115,
"eval_runtime": 395.2817,
"eval_samples_per_second": 1064.684,
"eval_steps_per_second": 4.159,
"step": 41000
},
{
"epoch": 6.3108272506082725,
"grad_norm": 1.733163595199585,
"learning_rate": 3.208038071547463e-05,
"loss": 2.4512,
"step": 41500
},
{
"epoch": 6.3108272506082725,
"eval_loss": 2.364978313446045,
"eval_runtime": 395.0989,
"eval_samples_per_second": 1065.176,
"eval_steps_per_second": 4.161,
"step": 41500
},
{
"epoch": 6.386861313868613,
"grad_norm": 1.604212999343872,
"learning_rate": 3.1690530675165916e-05,
"loss": 2.4419,
"step": 42000
},
{
"epoch": 6.386861313868613,
"eval_loss": 2.3593010902404785,
"eval_runtime": 394.8589,
"eval_samples_per_second": 1065.824,
"eval_steps_per_second": 4.164,
"step": 42000
},
{
"epoch": 6.4628953771289535,
"grad_norm": 1.799272060394287,
"learning_rate": 3.1298133637437146e-05,
"loss": 2.443,
"step": 42500
},
{
"epoch": 6.4628953771289535,
"eval_loss": 2.3553106784820557,
"eval_runtime": 395.5826,
"eval_samples_per_second": 1063.874,
"eval_steps_per_second": 4.156,
"step": 42500
},
{
"epoch": 6.538929440389294,
"grad_norm": 1.5894908905029297,
"learning_rate": 3.0904072695878296e-05,
"loss": 2.4291,
"step": 43000
},
{
"epoch": 6.538929440389294,
"eval_loss": 2.350308656692505,
"eval_runtime": 395.6156,
"eval_samples_per_second": 1063.785,
"eval_steps_per_second": 4.156,
"step": 43000
},
{
"epoch": 6.614963503649635,
"grad_norm": 1.6308026313781738,
"learning_rate": 3.050845195744353e-05,
"loss": 2.4212,
"step": 43500
},
{
"epoch": 6.614963503649635,
"eval_loss": 2.3425817489624023,
"eval_runtime": 395.5628,
"eval_samples_per_second": 1063.927,
"eval_steps_per_second": 4.156,
"step": 43500
},
{
"epoch": 6.690997566909976,
"grad_norm": 1.5576202869415283,
"learning_rate": 3.011137594116975e-05,
"loss": 2.4217,
"step": 44000
},
{
"epoch": 6.690997566909976,
"eval_loss": 2.3366506099700928,
"eval_runtime": 395.6852,
"eval_samples_per_second": 1063.598,
"eval_steps_per_second": 4.155,
"step": 44000
},
{
"epoch": 6.767031630170316,
"grad_norm": 1.698960542678833,
"learning_rate": 2.9713747681111948e-05,
"loss": 2.4191,
"step": 44500
},
{
"epoch": 6.767031630170316,
"eval_loss": 2.3311471939086914,
"eval_runtime": 395.6553,
"eval_samples_per_second": 1063.678,
"eval_steps_per_second": 4.155,
"step": 44500
},
{
"epoch": 6.843065693430657,
"grad_norm": 1.700810194015503,
"learning_rate": 2.931407856139074e-05,
"loss": 2.4101,
"step": 45000
},
{
"epoch": 6.843065693430657,
"eval_loss": 2.326604127883911,
"eval_runtime": 395.4811,
"eval_samples_per_second": 1064.147,
"eval_steps_per_second": 4.157,
"step": 45000
},
{
"epoch": 6.919099756690997,
"grad_norm": 1.675718069076538,
"learning_rate": 2.8913269705319878e-05,
"loss": 2.4092,
"step": 45500
},
{
"epoch": 6.919099756690997,
"eval_loss": 2.3215043544769287,
"eval_runtime": 395.6152,
"eval_samples_per_second": 1063.786,
"eval_steps_per_second": 4.156,
"step": 45500
},
{
"epoch": 6.995133819951338,
"grad_norm": 1.7430431842803955,
"learning_rate": 2.851142700258497e-05,
"loss": 2.4028,
"step": 46000
},
{
"epoch": 6.995133819951338,
"eval_loss": 2.3190836906433105,
"eval_runtime": 395.7789,
"eval_samples_per_second": 1063.346,
"eval_steps_per_second": 4.154,
"step": 46000
},
{
"epoch": 7.071167883211679,
"grad_norm": 1.7376880645751953,
"learning_rate": 2.8108656616003542e-05,
"loss": 2.393,
"step": 46500
},
{
"epoch": 7.071167883211679,
"eval_loss": 2.314730167388916,
"eval_runtime": 395.8715,
"eval_samples_per_second": 1063.097,
"eval_steps_per_second": 4.153,
"step": 46500
},
{
"epoch": 7.14720194647202,
"grad_norm": 1.647200584411621,
"learning_rate": 2.7705064953477926e-05,
"loss": 2.3864,
"step": 47000
},
{
"epoch": 7.14720194647202,
"eval_loss": 2.3095407485961914,
"eval_runtime": 392.0209,
"eval_samples_per_second": 1073.54,
"eval_steps_per_second": 4.194,
"step": 47000
},
{
"epoch": 7.22323600973236,
"grad_norm": 1.5628902912139893,
"learning_rate": 2.7300758639883305e-05,
"loss": 2.3853,
"step": 47500
},
{
"epoch": 7.22323600973236,
"eval_loss": 2.3034095764160156,
"eval_runtime": 392.407,
"eval_samples_per_second": 1072.483,
"eval_steps_per_second": 4.19,
"step": 47500
},
{
"epoch": 7.299270072992701,
"grad_norm": 1.6254950761795044,
"learning_rate": 2.6896654852743762e-05,
"loss": 2.3778,
"step": 48000
},
{
"epoch": 7.299270072992701,
"eval_loss": 2.3009138107299805,
"eval_runtime": 392.2743,
"eval_samples_per_second": 1072.846,
"eval_steps_per_second": 4.191,
"step": 48000
},
{
"epoch": 7.375304136253042,
"grad_norm": 1.7831765413284302,
"learning_rate": 2.6491240733505536e-05,
"loss": 2.3902,
"step": 48500
},
{
"epoch": 7.375304136253042,
"eval_loss": 2.2940807342529297,
"eval_runtime": 392.0933,
"eval_samples_per_second": 1073.342,
"eval_steps_per_second": 4.193,
"step": 48500
},
{
"epoch": 7.451338199513382,
"grad_norm": 1.7135417461395264,
"learning_rate": 2.608543264340055e-05,
"loss": 2.3734,
"step": 49000
},
{
"epoch": 7.451338199513382,
"eval_loss": 2.2903780937194824,
"eval_runtime": 392.3395,
"eval_samples_per_second": 1072.668,
"eval_steps_per_second": 4.19,
"step": 49000
},
{
"epoch": 7.5273722627737225,
"grad_norm": 1.7215466499328613,
"learning_rate": 2.5679337792861973e-05,
"loss": 2.3644,
"step": 49500
},
{
"epoch": 7.5273722627737225,
"eval_loss": 2.2882533073425293,
"eval_runtime": 391.7386,
"eval_samples_per_second": 1074.313,
"eval_steps_per_second": 4.197,
"step": 49500
},
{
"epoch": 7.603406326034063,
"grad_norm": 1.5934220552444458,
"learning_rate": 2.527306346808222e-05,
"loss": 2.3644,
"step": 50000
},
{
"epoch": 7.603406326034063,
"eval_loss": 2.278449296951294,
"eval_runtime": 392.029,
"eval_samples_per_second": 1073.517,
"eval_steps_per_second": 4.194,
"step": 50000
},
{
"epoch": 7.679440389294404,
"grad_norm": 1.734836459159851,
"learning_rate": 2.4866717002668977e-05,
"loss": 2.3643,
"step": 50500
},
{
"epoch": 7.679440389294404,
"eval_loss": 2.2776286602020264,
"eval_runtime": 391.9926,
"eval_samples_per_second": 1073.617,
"eval_steps_per_second": 4.194,
"step": 50500
},
{
"epoch": 7.755474452554744,
"grad_norm": 1.6759928464889526,
"learning_rate": 2.4461218265301844e-05,
"loss": 2.3549,
"step": 51000
},
{
"epoch": 7.755474452554744,
"eval_loss": 2.275527000427246,
"eval_runtime": 392.0053,
"eval_samples_per_second": 1073.582,
"eval_steps_per_second": 4.194,
"step": 51000
},
{
"epoch": 7.831508515815085,
"grad_norm": 1.6229385137557983,
"learning_rate": 2.4055049175099393e-05,
"loss": 2.3475,
"step": 51500
},
{
"epoch": 7.831508515815085,
"eval_loss": 2.269463539123535,
"eval_runtime": 392.7325,
"eval_samples_per_second": 1071.594,
"eval_steps_per_second": 4.186,
"step": 51500
},
{
"epoch": 7.907542579075426,
"grad_norm": 1.5919690132141113,
"learning_rate": 2.3649129731441017e-05,
"loss": 2.3556,
"step": 52000
},
{
"epoch": 7.907542579075426,
"eval_loss": 2.2632956504821777,
"eval_runtime": 392.8483,
"eval_samples_per_second": 1071.279,
"eval_steps_per_second": 4.185,
"step": 52000
},
{
"epoch": 7.983576642335766,
"grad_norm": 1.6283611059188843,
"learning_rate": 2.32435671741784e-05,
"loss": 2.3441,
"step": 52500
},
{
"epoch": 7.983576642335766,
"eval_loss": 2.2631113529205322,
"eval_runtime": 393.1076,
"eval_samples_per_second": 1070.572,
"eval_steps_per_second": 4.182,
"step": 52500
},
{
"epoch": 8.059610705596107,
"grad_norm": 1.6927645206451416,
"learning_rate": 2.2838468648877376e-05,
"loss": 2.3396,
"step": 53000
},
{
"epoch": 8.059610705596107,
"eval_loss": 2.2605204582214355,
"eval_runtime": 393.0545,
"eval_samples_per_second": 1070.717,
"eval_steps_per_second": 4.183,
"step": 53000
},
{
"epoch": 8.135644768856448,
"grad_norm": 1.6524484157562256,
"learning_rate": 2.2433941178511185e-05,
"loss": 2.3281,
"step": 53500
},
{
"epoch": 8.135644768856448,
"eval_loss": 2.255591869354248,
"eval_runtime": 393.065,
"eval_samples_per_second": 1070.688,
"eval_steps_per_second": 4.183,
"step": 53500
},
{
"epoch": 8.211678832116789,
"grad_norm": 1.8136180639266968,
"learning_rate": 2.2030091635186097e-05,
"loss": 2.3251,
"step": 54000
},
{
"epoch": 8.211678832116789,
"eval_loss": 2.2528815269470215,
"eval_runtime": 393.1403,
"eval_samples_per_second": 1070.483,
"eval_steps_per_second": 4.182,
"step": 54000
},
{
"epoch": 8.28771289537713,
"grad_norm": 1.7461555004119873,
"learning_rate": 2.1627831987887616e-05,
"loss": 2.3252,
"step": 54500
},
{
"epoch": 8.28771289537713,
"eval_loss": 2.247727155685425,
"eval_runtime": 394.607,
"eval_samples_per_second": 1066.504,
"eval_steps_per_second": 4.166,
"step": 54500
},
{
"epoch": 8.363746958637469,
"grad_norm": 1.6148008108139038,
"learning_rate": 2.1225656282037674e-05,
"loss": 2.3231,
"step": 55000
},
{
"epoch": 8.363746958637469,
"eval_loss": 2.245650291442871,
"eval_runtime": 393.1496,
"eval_samples_per_second": 1070.458,
"eval_steps_per_second": 4.182,
"step": 55000
},
{
"epoch": 8.43978102189781,
"grad_norm": 1.5390928983688354,
"learning_rate": 2.082447771999728e-05,
"loss": 2.3218,
"step": 55500
},
{
"epoch": 8.43978102189781,
"eval_loss": 2.240283489227295,
"eval_runtime": 393.128,
"eval_samples_per_second": 1070.517,
"eval_steps_per_second": 4.182,
"step": 55500
},
{
"epoch": 8.51581508515815,
"grad_norm": 1.7353328466415405,
"learning_rate": 2.0424402289124667e-05,
"loss": 2.3113,
"step": 56000
},
{
"epoch": 8.51581508515815,
"eval_loss": 2.236283540725708,
"eval_runtime": 392.9933,
"eval_samples_per_second": 1070.883,
"eval_steps_per_second": 4.183,
"step": 56000
},
{
"epoch": 8.591849148418492,
"grad_norm": 1.6553759574890137,
"learning_rate": 2.0025535685341834e-05,
"loss": 2.3137,
"step": 56500
},
{
"epoch": 8.591849148418492,
"eval_loss": 2.2341954708099365,
"eval_runtime": 394.1952,
"eval_samples_per_second": 1067.618,
"eval_steps_per_second": 4.171,
"step": 56500
},
{
"epoch": 8.667883211678832,
"grad_norm": 1.6300148963928223,
"learning_rate": 1.9627983285210795e-05,
"loss": 2.3153,
"step": 57000
},
{
"epoch": 8.667883211678832,
"eval_loss": 2.2316806316375732,
"eval_runtime": 394.4429,
"eval_samples_per_second": 1066.948,
"eval_steps_per_second": 4.168,
"step": 57000
},
{
"epoch": 8.743917274939173,
"grad_norm": 1.7760825157165527,
"learning_rate": 1.9231850118094083e-05,
"loss": 2.3086,
"step": 57500
},
{
"epoch": 8.743917274939173,
"eval_loss": 2.2260444164276123,
"eval_runtime": 394.1825,
"eval_samples_per_second": 1067.653,
"eval_steps_per_second": 4.171,
"step": 57500
},
{
"epoch": 8.819951338199512,
"grad_norm": 1.6700938940048218,
"learning_rate": 1.883724083840713e-05,
"loss": 2.3051,
"step": 58000
},
{
"epoch": 8.819951338199512,
"eval_loss": 2.2262229919433594,
"eval_runtime": 394.2521,
"eval_samples_per_second": 1067.464,
"eval_steps_per_second": 4.17,
"step": 58000
},
{
"epoch": 8.895985401459853,
"grad_norm": 1.6361171007156372,
"learning_rate": 1.8445043966286124e-05,
"loss": 2.2996,
"step": 58500
},
{
"epoch": 8.895985401459853,
"eval_loss": 2.2197461128234863,
"eval_runtime": 394.2947,
"eval_samples_per_second": 1067.349,
"eval_steps_per_second": 4.169,
"step": 58500
},
{
"epoch": 8.972019464720194,
"grad_norm": 1.5987651348114014,
"learning_rate": 1.805379121954309e-05,
"loss": 2.295,
"step": 59000
},
{
"epoch": 8.972019464720194,
"eval_loss": 2.218661069869995,
"eval_runtime": 394.4471,
"eval_samples_per_second": 1066.936,
"eval_steps_per_second": 4.168,
"step": 59000
},
{
"epoch": 9.048053527980535,
"grad_norm": 1.6805070638656616,
"learning_rate": 1.7664373591592323e-05,
"loss": 2.2898,
"step": 59500
},
{
"epoch": 9.048053527980535,
"eval_loss": 2.2158923149108887,
"eval_runtime": 394.3964,
"eval_samples_per_second": 1067.074,
"eval_steps_per_second": 4.168,
"step": 59500
},
{
"epoch": 9.124087591240876,
"grad_norm": 1.559171199798584,
"learning_rate": 1.727689396267106e-05,
"loss": 2.294,
"step": 60000
},
{
"epoch": 9.124087591240876,
"eval_loss": 2.213304281234741,
"eval_runtime": 394.3761,
"eval_samples_per_second": 1067.129,
"eval_steps_per_second": 4.169,
"step": 60000
},
{
"epoch": 9.200121654501217,
"grad_norm": 1.7154414653778076,
"learning_rate": 1.689145470101657e-05,
"loss": 2.2905,
"step": 60500
},
{
"epoch": 9.200121654501217,
"eval_loss": 2.211729049682617,
"eval_runtime": 394.4483,
"eval_samples_per_second": 1066.933,
"eval_steps_per_second": 4.168,
"step": 60500
},
{
"epoch": 9.276155717761558,
"grad_norm": 1.7217854261398315,
"learning_rate": 1.6508922024636513e-05,
"loss": 2.2776,
"step": 61000
},
{
"epoch": 9.276155717761558,
"eval_loss": 2.2076163291931152,
"eval_runtime": 394.2479,
"eval_samples_per_second": 1067.476,
"eval_steps_per_second": 4.17,
"step": 61000
},
{
"epoch": 9.352189781021897,
"grad_norm": 1.6988067626953125,
"learning_rate": 1.6127863831556155e-05,
"loss": 2.2888,
"step": 61500
},
{
"epoch": 9.352189781021897,
"eval_loss": 2.2073538303375244,
"eval_runtime": 394.4185,
"eval_samples_per_second": 1067.014,
"eval_steps_per_second": 4.168,
"step": 61500
},
{
"epoch": 9.428223844282238,
"grad_norm": 1.6594995260238647,
"learning_rate": 1.5749149567995482e-05,
"loss": 2.2737,
"step": 62000
},
{
"epoch": 9.428223844282238,
"eval_loss": 2.2045233249664307,
"eval_runtime": 394.3688,
"eval_samples_per_second": 1067.148,
"eval_steps_per_second": 4.169,
"step": 62000
},
{
"epoch": 9.504257907542579,
"grad_norm": 1.782347321510315,
"learning_rate": 1.537287928647002e-05,
"loss": 2.2715,
"step": 62500
},
{
"epoch": 9.504257907542579,
"eval_loss": 2.1984219551086426,
"eval_runtime": 394.2219,
"eval_samples_per_second": 1067.546,
"eval_steps_per_second": 4.17,
"step": 62500
},
{
"epoch": 9.58029197080292,
"grad_norm": 1.7212417125701904,
"learning_rate": 1.4999897243562522e-05,
"loss": 2.2736,
"step": 63000
},
{
"epoch": 9.58029197080292,
"eval_loss": 2.200115919113159,
"eval_runtime": 394.4095,
"eval_samples_per_second": 1067.038,
"eval_steps_per_second": 4.168,
"step": 63000
},
{
"epoch": 9.65632603406326,
"grad_norm": 1.636083722114563,
"learning_rate": 1.4628807092364161e-05,
"loss": 2.2714,
"step": 63500
},
{
"epoch": 9.65632603406326,
"eval_loss": 2.196516752243042,
"eval_runtime": 394.3398,
"eval_samples_per_second": 1067.227,
"eval_steps_per_second": 4.169,
"step": 63500
},
{
"epoch": 9.732360097323602,
"grad_norm": 1.669154405593872,
"learning_rate": 1.4260456906462644e-05,
"loss": 2.2581,
"step": 64000
},
{
"epoch": 9.732360097323602,
"eval_loss": 2.1947672367095947,
"eval_runtime": 394.2775,
"eval_samples_per_second": 1067.396,
"eval_steps_per_second": 4.17,
"step": 64000
},
{
"epoch": 9.808394160583942,
"grad_norm": 1.5820955038070679,
"learning_rate": 1.3894944000287996e-05,
"loss": 2.2673,
"step": 64500
},
{
"epoch": 9.808394160583942,
"eval_loss": 2.1930572986602783,
"eval_runtime": 394.3185,
"eval_samples_per_second": 1067.284,
"eval_steps_per_second": 4.169,
"step": 64500
},
{
"epoch": 9.884428223844282,
"grad_norm": 1.878128170967102,
"learning_rate": 1.3532364938689365e-05,
"loss": 2.2532,
"step": 65000
},
{
"epoch": 9.884428223844282,
"eval_loss": 2.186814069747925,
"eval_runtime": 394.1633,
"eval_samples_per_second": 1067.705,
"eval_steps_per_second": 4.171,
"step": 65000
},
{
"epoch": 9.960462287104622,
"grad_norm": 1.6541669368743896,
"learning_rate": 1.3172815511423497e-05,
"loss": 2.2599,
"step": 65500
},
{
"epoch": 9.960462287104622,
"eval_loss": 2.186183452606201,
"eval_runtime": 394.274,
"eval_samples_per_second": 1067.405,
"eval_steps_per_second": 4.17,
"step": 65500
},
{
"epoch": 10.036496350364963,
"grad_norm": 1.6656322479248047,
"learning_rate": 1.2817100376353228e-05,
"loss": 2.2626,
"step": 66000
},
{
"epoch": 10.036496350364963,
"eval_loss": 2.1833560466766357,
"eval_runtime": 394.4838,
"eval_samples_per_second": 1066.837,
"eval_steps_per_second": 4.167,
"step": 66000
},
{
"epoch": 10.112530413625304,
"grad_norm": 1.64789617061615,
"learning_rate": 1.246388782934231e-05,
"loss": 2.2476,
"step": 66500
},
{
"epoch": 10.112530413625304,
"eval_loss": 2.1836633682250977,
"eval_runtime": 394.475,
"eval_samples_per_second": 1066.861,
"eval_steps_per_second": 4.168,
"step": 66500
},
{
"epoch": 10.188564476885645,
"grad_norm": 1.626693844795227,
"learning_rate": 1.2113987197615472e-05,
"loss": 2.2597,
"step": 67000
},
{
"epoch": 10.188564476885645,
"eval_loss": 2.177664041519165,
"eval_runtime": 394.4402,
"eval_samples_per_second": 1066.955,
"eval_steps_per_second": 4.168,
"step": 67000
},
{
"epoch": 10.264598540145986,
"grad_norm": 1.660078525543213,
"learning_rate": 1.1767490921415291e-05,
"loss": 2.2525,
"step": 67500
},
{
"epoch": 10.264598540145986,
"eval_loss": 2.177150011062622,
"eval_runtime": 394.2691,
"eval_samples_per_second": 1067.418,
"eval_steps_per_second": 4.17,
"step": 67500
},
{
"epoch": 10.340632603406325,
"grad_norm": 1.6624382734298706,
"learning_rate": 1.1424490541587752e-05,
"loss": 2.2477,
"step": 68000
},
{
"epoch": 10.340632603406325,
"eval_loss": 2.175464630126953,
"eval_runtime": 394.3358,
"eval_samples_per_second": 1067.238,
"eval_steps_per_second": 4.169,
"step": 68000
},
{
"epoch": 10.416666666666666,
"grad_norm": 1.7029284238815308,
"learning_rate": 1.1085076675397963e-05,
"loss": 2.2442,
"step": 68500
},
{
"epoch": 10.416666666666666,
"eval_loss": 2.172318935394287,
"eval_runtime": 394.363,
"eval_samples_per_second": 1067.164,
"eval_steps_per_second": 4.169,
"step": 68500
},
{
"epoch": 10.492700729927007,
"grad_norm": 1.7094260454177856,
"learning_rate": 1.0750006740005564e-05,
"loss": 2.2461,
"step": 69000
},
{
"epoch": 10.492700729927007,
"eval_loss": 2.1725075244903564,
"eval_runtime": 394.3359,
"eval_samples_per_second": 1067.237,
"eval_steps_per_second": 4.169,
"step": 69000
},
{
"epoch": 10.568734793187348,
"grad_norm": 1.7138928174972534,
"learning_rate": 1.04180263214852e-05,
"loss": 2.2428,
"step": 69500
},
{
"epoch": 10.568734793187348,
"eval_loss": 2.1679632663726807,
"eval_runtime": 394.3498,
"eval_samples_per_second": 1067.2,
"eval_steps_per_second": 4.169,
"step": 69500
},
{
"epoch": 10.644768856447689,
"grad_norm": 1.7748503684997559,
"learning_rate": 1.0089898314369628e-05,
"loss": 2.2409,
"step": 70000
},
{
"epoch": 10.644768856447689,
"eval_loss": 2.167714834213257,
"eval_runtime": 394.3096,
"eval_samples_per_second": 1067.308,
"eval_steps_per_second": 4.169,
"step": 70000
},
{
"epoch": 10.72080291970803,
"grad_norm": 1.8225022554397583,
"learning_rate": 9.765709406792067e-06,
"loss": 2.2421,
"step": 70500
},
{
"epoch": 10.72080291970803,
"eval_loss": 2.1677842140197754,
"eval_runtime": 394.4354,
"eval_samples_per_second": 1066.968,
"eval_steps_per_second": 4.168,
"step": 70500
},
{
"epoch": 10.79683698296837,
"grad_norm": 1.682428002357483,
"learning_rate": 9.445545246215093e-06,
"loss": 2.2405,
"step": 71000
},
{
"epoch": 10.79683698296837,
"eval_loss": 2.162020206451416,
"eval_runtime": 394.4337,
"eval_samples_per_second": 1066.973,
"eval_steps_per_second": 4.168,
"step": 71000
},
{
"epoch": 10.87287104622871,
"grad_norm": 1.8187251091003418,
"learning_rate": 9.130118369667984e-06,
"loss": 2.2338,
"step": 71500
},
{
"epoch": 10.87287104622871,
"eval_loss": 2.161623001098633,
"eval_runtime": 394.3265,
"eval_samples_per_second": 1067.263,
"eval_steps_per_second": 4.169,
"step": 71500
},
{
"epoch": 10.94890510948905,
"grad_norm": 1.586653470993042,
"learning_rate": 8.818247901683923e-06,
"loss": 2.2291,
"step": 72000
},
{
"epoch": 10.94890510948905,
"eval_loss": 2.1573026180267334,
"eval_runtime": 394.3904,
"eval_samples_per_second": 1067.09,
"eval_steps_per_second": 4.168,
"step": 72000
},
{
"epoch": 11.024939172749392,
"grad_norm": 1.6375211477279663,
"learning_rate": 8.510652490541102e-06,
"loss": 2.2337,
"step": 72500
},
{
"epoch": 11.024939172749392,
"eval_loss": 2.158447027206421,
"eval_runtime": 394.8845,
"eval_samples_per_second": 1065.755,
"eval_steps_per_second": 4.163,
"step": 72500
},
{
"epoch": 11.100973236009732,
"grad_norm": 1.9024183750152588,
"learning_rate": 8.207413399866525e-06,
"loss": 2.2243,
"step": 73000
},
{
"epoch": 11.100973236009732,
"eval_loss": 2.1577627658843994,
"eval_runtime": 394.3929,
"eval_samples_per_second": 1067.083,
"eval_steps_per_second": 4.168,
"step": 73000
},
{
"epoch": 11.177007299270073,
"grad_norm": 1.6612706184387207,
"learning_rate": 7.908610742390934e-06,
"loss": 2.2206,
"step": 73500
},
{
"epoch": 11.177007299270073,
"eval_loss": 2.156655788421631,
"eval_runtime": 394.4918,
"eval_samples_per_second": 1066.816,
"eval_steps_per_second": 4.167,
"step": 73500
},
{
"epoch": 11.253041362530414,
"grad_norm": 1.6041182279586792,
"learning_rate": 7.614323458783904e-06,
"loss": 2.2316,
"step": 74000
},
{
"epoch": 11.253041362530414,
"eval_loss": 2.154806137084961,
"eval_runtime": 394.5111,
"eval_samples_per_second": 1066.763,
"eval_steps_per_second": 4.167,
"step": 74000
},
{
"epoch": 11.329075425790755,
"grad_norm": 1.7304446697235107,
"learning_rate": 7.324629296798397e-06,
"loss": 2.2252,
"step": 74500
},
{
"epoch": 11.329075425790755,
"eval_loss": 2.1519484519958496,
"eval_runtime": 394.2907,
"eval_samples_per_second": 1067.36,
"eval_steps_per_second": 4.17,
"step": 74500
},
{
"epoch": 11.405109489051094,
"grad_norm": 1.6792948246002197,
"learning_rate": 7.039604790730683e-06,
"loss": 2.2257,
"step": 75000
},
{
"epoch": 11.405109489051094,
"eval_loss": 2.1538424491882324,
"eval_runtime": 394.5221,
"eval_samples_per_second": 1066.734,
"eval_steps_per_second": 4.167,
"step": 75000
},
{
"epoch": 11.481143552311435,
"grad_norm": 1.5765753984451294,
"learning_rate": 6.7598810154057336e-06,
"loss": 2.2252,
"step": 75500
},
{
"epoch": 11.481143552311435,
"eval_loss": 2.1519691944122314,
"eval_runtime": 394.4824,
"eval_samples_per_second": 1066.841,
"eval_steps_per_second": 4.167,
"step": 75500
},
{
"epoch": 11.557177615571776,
"grad_norm": 1.644453525543213,
"learning_rate": 6.484410758400267e-06,
"loss": 2.2228,
"step": 76000
},
{
"epoch": 11.557177615571776,
"eval_loss": 2.1509506702423096,
"eval_runtime": 394.5661,
"eval_samples_per_second": 1066.615,
"eval_steps_per_second": 4.167,
"step": 76000
},
{
"epoch": 11.633211678832117,
"grad_norm": 1.7033356428146362,
"learning_rate": 6.213832134635486e-06,
"loss": 2.2217,
"step": 76500
},
{
"epoch": 11.633211678832117,
"eval_loss": 2.1477901935577393,
"eval_runtime": 394.5248,
"eval_samples_per_second": 1066.726,
"eval_steps_per_second": 4.167,
"step": 76500
},
{
"epoch": 11.709245742092458,
"grad_norm": 1.6563267707824707,
"learning_rate": 5.948216628273909e-06,
"loss": 2.2135,
"step": 77000
},
{
"epoch": 11.709245742092458,
"eval_loss": 2.1486401557922363,
"eval_runtime": 394.3353,
"eval_samples_per_second": 1067.239,
"eval_steps_per_second": 4.169,
"step": 77000
},
{
"epoch": 11.785279805352799,
"grad_norm": 1.6282879114151,
"learning_rate": 5.687634412272127e-06,
"loss": 2.2254,
"step": 77500
},
{
"epoch": 11.785279805352799,
"eval_loss": 2.1465682983398438,
"eval_runtime": 394.4898,
"eval_samples_per_second": 1066.821,
"eval_steps_per_second": 4.167,
"step": 77500
},
{
"epoch": 11.861313868613138,
"grad_norm": 1.7813278436660767,
"learning_rate": 5.432154329841835e-06,
"loss": 2.2166,
"step": 78000
},
{
"epoch": 11.861313868613138,
"eval_loss": 2.14347505569458,
"eval_runtime": 394.4933,
"eval_samples_per_second": 1066.812,
"eval_steps_per_second": 4.167,
"step": 78000
},
{
"epoch": 11.937347931873479,
"grad_norm": 1.723649024963379,
"learning_rate": 5.181843876262127e-06,
"loss": 2.2181,
"step": 78500
},
{
"epoch": 11.937347931873479,
"eval_loss": 2.1440093517303467,
"eval_runtime": 394.3682,
"eval_samples_per_second": 1067.15,
"eval_steps_per_second": 4.169,
"step": 78500
},
{
"epoch": 12.01338199513382,
"grad_norm": 1.7719519138336182,
"learning_rate": 4.936769181047937e-06,
"loss": 2.2092,
"step": 79000
},
{
"epoch": 12.01338199513382,
"eval_loss": 2.141754388809204,
"eval_runtime": 394.1783,
"eval_samples_per_second": 1067.664,
"eval_steps_per_second": 4.171,
"step": 79000
},
{
"epoch": 12.08941605839416,
"grad_norm": 1.696637749671936,
"learning_rate": 4.697469206617919e-06,
"loss": 2.2007,
"step": 79500
},
{
"epoch": 12.08941605839416,
"eval_loss": 2.1432430744171143,
"eval_runtime": 394.2858,
"eval_samples_per_second": 1067.373,
"eval_steps_per_second": 4.17,
"step": 79500
},
{
"epoch": 12.165450121654501,
"grad_norm": 1.6854994297027588,
"learning_rate": 4.463511524513736e-06,
"loss": 2.2084,
"step": 80000
},
{
"epoch": 12.165450121654501,
"eval_loss": 2.141733407974243,
"eval_runtime": 394.4029,
"eval_samples_per_second": 1067.056,
"eval_steps_per_second": 4.168,
"step": 80000
},
{
"epoch": 12.241484184914842,
"grad_norm": 1.6496477127075195,
"learning_rate": 4.2345051393941574e-06,
"loss": 2.2089,
"step": 80500
},
{
"epoch": 12.241484184914842,
"eval_loss": 2.139671802520752,
"eval_runtime": 394.4989,
"eval_samples_per_second": 1066.796,
"eval_steps_per_second": 4.167,
"step": 80500
},
{
"epoch": 12.317518248175183,
"grad_norm": 1.6591581106185913,
"learning_rate": 4.010984790046615e-06,
"loss": 2.2058,
"step": 81000
},
{
"epoch": 12.317518248175183,
"eval_loss": 2.1399948596954346,
"eval_runtime": 394.4647,
"eval_samples_per_second": 1066.889,
"eval_steps_per_second": 4.168,
"step": 81000
},
{
"epoch": 12.393552311435522,
"grad_norm": 1.7192113399505615,
"learning_rate": 3.7930095283087966e-06,
"loss": 2.2059,
"step": 81500
},
{
"epoch": 12.393552311435522,
"eval_loss": 2.1405417919158936,
"eval_runtime": 394.3798,
"eval_samples_per_second": 1067.118,
"eval_steps_per_second": 4.169,
"step": 81500
},
{
"epoch": 12.469586374695863,
"grad_norm": 1.6483603715896606,
"learning_rate": 3.5806369410618047e-06,
"loss": 2.2144,
"step": 82000
},
{
"epoch": 12.469586374695863,
"eval_loss": 2.1386895179748535,
"eval_runtime": 394.4506,
"eval_samples_per_second": 1066.927,
"eval_steps_per_second": 4.168,
"step": 82000
},
{
"epoch": 12.545620437956204,
"grad_norm": 1.6323285102844238,
"learning_rate": 3.3739231350162437e-06,
"loss": 2.2076,
"step": 82500
},
{
"epoch": 12.545620437956204,
"eval_loss": 2.1366796493530273,
"eval_runtime": 394.6499,
"eval_samples_per_second": 1066.388,
"eval_steps_per_second": 4.166,
"step": 82500
},
{
"epoch": 12.621654501216545,
"grad_norm": 1.7512730360031128,
"learning_rate": 3.173318985201379e-06,
"loss": 2.21,
"step": 83000
},
{
"epoch": 12.621654501216545,
"eval_loss": 2.1367809772491455,
"eval_runtime": 394.4888,
"eval_samples_per_second": 1066.824,
"eval_steps_per_second": 4.167,
"step": 83000
},
{
"epoch": 12.697688564476886,
"grad_norm": 1.7279080152511597,
"learning_rate": 2.9780734823130846e-06,
"loss": 2.2014,
"step": 83500
},
{
"epoch": 12.697688564476886,
"eval_loss": 2.136183500289917,
"eval_runtime": 394.5466,
"eval_samples_per_second": 1066.667,
"eval_steps_per_second": 4.167,
"step": 83500
},
{
"epoch": 12.773722627737227,
"grad_norm": 1.7061643600463867,
"learning_rate": 2.7886459518572467e-06,
"loss": 2.2073,
"step": 84000
},
{
"epoch": 12.773722627737227,
"eval_loss": 2.136634111404419,
"eval_runtime": 394.4488,
"eval_samples_per_second": 1066.932,
"eval_steps_per_second": 4.168,
"step": 84000
},
{
"epoch": 12.849756690997566,
"grad_norm": 1.6525273323059082,
"learning_rate": 2.6050864386902433e-06,
"loss": 2.2062,
"step": 84500
},
{
"epoch": 12.849756690997566,
"eval_loss": 2.135418653488159,
"eval_runtime": 394.6522,
"eval_samples_per_second": 1066.382,
"eval_steps_per_second": 4.166,
"step": 84500
},
{
"epoch": 12.925790754257907,
"grad_norm": 1.753316879272461,
"learning_rate": 2.4274434373970757e-06,
"loss": 2.1969,
"step": 85000
},
{
"epoch": 12.925790754257907,
"eval_loss": 2.130448579788208,
"eval_runtime": 394.5649,
"eval_samples_per_second": 1066.618,
"eval_steps_per_second": 4.167,
"step": 85000
},
{
"epoch": 13.001824817518248,
"grad_norm": 1.5890535116195679,
"learning_rate": 2.256101256668691e-06,
"loss": 2.2078,
"step": 85500
},
{
"epoch": 13.001824817518248,
"eval_loss": 2.1335136890411377,
"eval_runtime": 394.3918,
"eval_samples_per_second": 1067.086,
"eval_steps_per_second": 4.168,
"step": 85500
},
{
"epoch": 13.077858880778589,
"grad_norm": 1.7298823595046997,
"learning_rate": 2.0904184363357256e-06,
"loss": 2.203,
"step": 86000
},
{
"epoch": 13.077858880778589,
"eval_loss": 2.132927894592285,
"eval_runtime": 394.4131,
"eval_samples_per_second": 1067.029,
"eval_steps_per_second": 4.168,
"step": 86000
},
{
"epoch": 13.15389294403893,
"grad_norm": 1.7888143062591553,
"learning_rate": 1.930788098008321e-06,
"loss": 2.1993,
"step": 86500
},
{
"epoch": 13.15389294403893,
"eval_loss": 2.1313769817352295,
"eval_runtime": 394.3014,
"eval_samples_per_second": 1067.331,
"eval_steps_per_second": 4.169,
"step": 86500
},
{
"epoch": 13.22992700729927,
"grad_norm": 1.7427315711975098,
"learning_rate": 1.7772524144231473e-06,
"loss": 2.2032,
"step": 87000
},
{
"epoch": 13.22992700729927,
"eval_loss": 2.135279893875122,
"eval_runtime": 394.1525,
"eval_samples_per_second": 1067.734,
"eval_steps_per_second": 4.171,
"step": 87000
},
{
"epoch": 13.305961070559611,
"grad_norm": 1.700643539428711,
"learning_rate": 1.6298519481701192e-06,
"loss": 2.2,
"step": 87500
},
{
"epoch": 13.305961070559611,
"eval_loss": 2.130155086517334,
"eval_runtime": 393.7376,
"eval_samples_per_second": 1068.859,
"eval_steps_per_second": 4.175,
"step": 87500
},
{
"epoch": 13.38199513381995,
"grad_norm": 1.6336027383804321,
"learning_rate": 1.4889019067080928e-06,
"loss": 2.1964,
"step": 88000
},
{
"epoch": 13.38199513381995,
"eval_loss": 2.129770517349243,
"eval_runtime": 394.1127,
"eval_samples_per_second": 1067.842,
"eval_steps_per_second": 4.171,
"step": 88000
},
{
"epoch": 13.458029197080291,
"grad_norm": 1.698116421699524,
"learning_rate": 1.3538746100630939e-06,
"loss": 2.1957,
"step": 88500
},
{
"epoch": 13.458029197080291,
"eval_loss": 2.1296403408050537,
"eval_runtime": 394.7051,
"eval_samples_per_second": 1066.239,
"eval_steps_per_second": 4.165,
"step": 88500
},
{
"epoch": 13.534063260340632,
"grad_norm": 1.7204720973968506,
"learning_rate": 1.2250943829259454e-06,
"loss": 2.1985,
"step": 89000
},
{
"epoch": 13.534063260340632,
"eval_loss": 2.131389856338501,
"eval_runtime": 394.7347,
"eval_samples_per_second": 1066.159,
"eval_steps_per_second": 4.165,
"step": 89000
},
{
"epoch": 13.610097323600973,
"grad_norm": 1.7444037199020386,
"learning_rate": 1.102595247742902e-06,
"loss": 2.1967,
"step": 89500
},
{
"epoch": 13.610097323600973,
"eval_loss": 2.13096284866333,
"eval_runtime": 394.695,
"eval_samples_per_second": 1066.266,
"eval_steps_per_second": 4.165,
"step": 89500
},
{
"epoch": 13.686131386861314,
"grad_norm": 1.7652897834777832,
"learning_rate": 9.864095675586272e-07,
"loss": 2.1979,
"step": 90000
},
{
"epoch": 13.686131386861314,
"eval_loss": 2.1287431716918945,
"eval_runtime": 394.6791,
"eval_samples_per_second": 1066.309,
"eval_steps_per_second": 4.165,
"step": 90000
},
{
"epoch": 13.762165450121655,
"grad_norm": 1.6986685991287231,
"learning_rate": 8.765680374662105e-07,
"loss": 2.2055,
"step": 90500
},
{
"epoch": 13.762165450121655,
"eval_loss": 2.128450870513916,
"eval_runtime": 394.7254,
"eval_samples_per_second": 1066.184,
"eval_steps_per_second": 4.165,
"step": 90500
},
{
"epoch": 13.838199513381996,
"grad_norm": 1.7826683521270752,
"learning_rate": 7.730996764978071e-07,
"loss": 2.1933,
"step": 91000
},
{
"epoch": 13.838199513381996,
"eval_loss": 2.128603935241699,
"eval_runtime": 394.6725,
"eval_samples_per_second": 1066.327,
"eval_steps_per_second": 4.165,
"step": 91000
},
{
"epoch": 13.914233576642335,
"grad_norm": 1.7597603797912598,
"learning_rate": 6.76031819958145e-07,
"loss": 2.1945,
"step": 91500
},
{
"epoch": 13.914233576642335,
"eval_loss": 2.1281092166900635,
"eval_runtime": 394.6346,
"eval_samples_per_second": 1066.43,
"eval_steps_per_second": 4.166,
"step": 91500
},
{
"epoch": 13.990267639902676,
"grad_norm": 1.5649290084838867,
"learning_rate": 5.855649661219098e-07,
"loss": 2.2016,
"step": 92000
},
{
"epoch": 13.990267639902676,
"eval_loss": 2.129279613494873,
"eval_runtime": 394.55,
"eval_samples_per_second": 1066.658,
"eval_steps_per_second": 4.167,
"step": 92000
},
{
"epoch": 14.066301703163017,
"grad_norm": 1.6939290761947632,
"learning_rate": 5.013604308242548e-07,
"loss": 2.195,
"step": 92500
},
{
"epoch": 14.066301703163017,
"eval_loss": 2.1266942024230957,
"eval_runtime": 394.5988,
"eval_samples_per_second": 1066.526,
"eval_steps_per_second": 4.166,
"step": 92500
},
{
"epoch": 14.142335766423358,
"grad_norm": 1.6481035947799683,
"learning_rate": 4.236281907425227e-07,
"loss": 2.1939,
"step": 93000
},
{
"epoch": 14.142335766423358,
"eval_loss": 2.1291019916534424,
"eval_runtime": 393.3337,
"eval_samples_per_second": 1069.957,
"eval_steps_per_second": 4.18,
"step": 93000
},
{
"epoch": 14.218369829683699,
"grad_norm": 1.7540963888168335,
"learning_rate": 3.523887819560451e-07,
"loss": 2.1939,
"step": 93500
},
{
"epoch": 14.218369829683699,
"eval_loss": 2.130265474319458,
"eval_runtime": 393.7198,
"eval_samples_per_second": 1068.907,
"eval_steps_per_second": 4.176,
"step": 93500
},
{
"epoch": 14.29440389294404,
"grad_norm": 1.7240368127822876,
"learning_rate": 2.876610252031453e-07,
"loss": 2.1907,
"step": 94000
},
{
"epoch": 14.29440389294404,
"eval_loss": 2.126887321472168,
"eval_runtime": 393.7098,
"eval_samples_per_second": 1068.934,
"eval_steps_per_second": 4.176,
"step": 94000
},
{
"epoch": 14.37043795620438,
"grad_norm": 1.6906523704528809,
"learning_rate": 2.2946202090889657e-07,
"loss": 2.1999,
"step": 94500
},
{
"epoch": 14.37043795620438,
"eval_loss": 2.126722812652588,
"eval_runtime": 393.685,
"eval_samples_per_second": 1069.002,
"eval_steps_per_second": 4.176,
"step": 94500
},
{
"epoch": 14.44647201946472,
"grad_norm": 1.7347662448883057,
"learning_rate": 1.7790391402128793e-07,
"loss": 2.1989,
"step": 95000
},
{
"epoch": 14.44647201946472,
"eval_loss": 2.1272239685058594,
"eval_runtime": 393.5995,
"eval_samples_per_second": 1069.234,
"eval_steps_per_second": 4.177,
"step": 95000
},
{
"epoch": 14.52250608272506,
"grad_norm": 1.64090096950531,
"learning_rate": 1.327936845155059e-07,
"loss": 2.1963,
"step": 95500
},
{
"epoch": 14.52250608272506,
"eval_loss": 2.126425266265869,
"eval_runtime": 394.5292,
"eval_samples_per_second": 1066.715,
"eval_steps_per_second": 4.167,
"step": 95500
},
{
"epoch": 14.598540145985401,
"grad_norm": 1.6597987413406372,
"learning_rate": 9.425312186875923e-08,
"loss": 2.1987,
"step": 96000
},
{
"epoch": 14.598540145985401,
"eval_loss": 2.1285743713378906,
"eval_runtime": 394.6211,
"eval_samples_per_second": 1066.466,
"eval_steps_per_second": 4.166,
"step": 96000
},
{
"epoch": 14.674574209245742,
"grad_norm": 1.6827759742736816,
"learning_rate": 6.2292408111711e-08,
"loss": 2.2012,
"step": 96500
},
{
"epoch": 14.674574209245742,
"eval_loss": 2.1267669200897217,
"eval_runtime": 394.6661,
"eval_samples_per_second": 1066.344,
"eval_steps_per_second": 4.166,
"step": 96500
},
{
"epoch": 14.750608272506083,
"grad_norm": 1.9470024108886719,
"learning_rate": 3.691998694484722e-08,
"loss": 2.2013,
"step": 97000
},
{
"epoch": 14.750608272506083,
"eval_loss": 2.128140449523926,
"eval_runtime": 394.6676,
"eval_samples_per_second": 1066.34,
"eval_steps_per_second": 4.166,
"step": 97000
},
{
"epoch": 14.826642335766424,
"grad_norm": 1.6369675397872925,
"learning_rate": 1.817353096532637e-08,
"loss": 2.1923,
"step": 97500
},
{
"epoch": 14.826642335766424,
"eval_loss": 2.128028392791748,
"eval_runtime": 394.6764,
"eval_samples_per_second": 1066.317,
"eval_steps_per_second": 4.165,
"step": 97500
},
{
"epoch": 14.902676399026763,
"grad_norm": 1.7755557298660278,
"learning_rate": 5.982858360498167e-09,
"loss": 2.1966,
"step": 98000
},
{
"epoch": 14.902676399026763,
"eval_loss": 2.1286511421203613,
"eval_runtime": 393.4618,
"eval_samples_per_second": 1069.608,
"eval_steps_per_second": 4.178,
"step": 98000
},
{
"epoch": 14.978710462287104,
"grad_norm": 1.7456624507904053,
"learning_rate": 3.953547649482303e-10,
"loss": 2.1987,
"step": 98500
},
{
"epoch": 14.978710462287104,
"eval_loss": 2.127889394760132,
"eval_runtime": 393.3437,
"eval_samples_per_second": 1069.93,
"eval_steps_per_second": 4.18,
"step": 98500
}
],
"logging_steps": 500,
"max_steps": 98640,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.646405662995644e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}