stella-en-ft-v1.0 / trainer_state.json
prasannad28's picture
Upload folder using huggingface_hub
61b3bd9 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.4881567525112644,
"eval_steps": 500,
"global_step": 18000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00826753751395147,
"grad_norm": 9.10895824432373,
"learning_rate": 2.9875981810665565e-06,
"loss": 0.7375,
"step": 100
},
{
"epoch": 0.01653507502790294,
"grad_norm": 12.439295768737793,
"learning_rate": 2.975196362133113e-06,
"loss": 0.3709,
"step": 200
},
{
"epoch": 0.02480261254185441,
"grad_norm": 11.995777130126953,
"learning_rate": 2.9627945431996694e-06,
"loss": 0.3086,
"step": 300
},
{
"epoch": 0.03307015005580588,
"grad_norm": 10.923591613769531,
"learning_rate": 2.950392724266226e-06,
"loss": 0.2387,
"step": 400
},
{
"epoch": 0.04133768756975735,
"grad_norm": 10.521248817443848,
"learning_rate": 2.9379909053327823e-06,
"loss": 0.2154,
"step": 500
},
{
"epoch": 0.04133768756975735,
"eval_cosine_accuracy": 0.9945472090779086,
"eval_loss": 0.16342051327228546,
"eval_runtime": 695.845,
"eval_samples_per_second": 123.607,
"eval_steps_per_second": 3.863,
"step": 500
},
{
"epoch": 0.04960522508370882,
"grad_norm": 9.583465576171875,
"learning_rate": 2.9255890863993388e-06,
"loss": 0.2001,
"step": 600
},
{
"epoch": 0.057872762597660284,
"grad_norm": 11.749183654785156,
"learning_rate": 2.913187267465895e-06,
"loss": 0.1832,
"step": 700
},
{
"epoch": 0.06614030011161176,
"grad_norm": 9.898832321166992,
"learning_rate": 2.9007854485324512e-06,
"loss": 0.1794,
"step": 800
},
{
"epoch": 0.07440783762556323,
"grad_norm": 8.169402122497559,
"learning_rate": 2.888383629599008e-06,
"loss": 0.1459,
"step": 900
},
{
"epoch": 0.0826753751395147,
"grad_norm": 4.54998779296875,
"learning_rate": 2.8759818106655646e-06,
"loss": 0.1279,
"step": 1000
},
{
"epoch": 0.0826753751395147,
"eval_cosine_accuracy": 0.9969887572519794,
"eval_loss": 0.09583044797182083,
"eval_runtime": 694.7532,
"eval_samples_per_second": 123.801,
"eval_steps_per_second": 3.869,
"step": 1000
},
{
"epoch": 0.09094291265346617,
"grad_norm": 9.351482391357422,
"learning_rate": 2.863579991732121e-06,
"loss": 0.1218,
"step": 1100
},
{
"epoch": 0.09921045016741764,
"grad_norm": 9.509383201599121,
"learning_rate": 2.851178172798677e-06,
"loss": 0.105,
"step": 1200
},
{
"epoch": 0.10747798768136911,
"grad_norm": 8.839822769165039,
"learning_rate": 2.8387763538652335e-06,
"loss": 0.119,
"step": 1300
},
{
"epoch": 0.11574552519532057,
"grad_norm": 6.181822776794434,
"learning_rate": 2.82637453493179e-06,
"loss": 0.0899,
"step": 1400
},
{
"epoch": 0.12401306270927204,
"grad_norm": 5.21838903427124,
"learning_rate": 2.813972715998347e-06,
"loss": 0.0862,
"step": 1500
},
{
"epoch": 0.12401306270927204,
"eval_cosine_accuracy": 0.9981630256595087,
"eval_loss": 0.06220981478691101,
"eval_runtime": 694.8959,
"eval_samples_per_second": 123.775,
"eval_steps_per_second": 3.868,
"step": 1500
},
{
"epoch": 0.13228060022322352,
"grad_norm": 6.435093402862549,
"learning_rate": 2.801570897064903e-06,
"loss": 0.0751,
"step": 1600
},
{
"epoch": 0.14054813773717498,
"grad_norm": 0.0,
"learning_rate": 2.7891690781314593e-06,
"loss": 0.077,
"step": 1700
},
{
"epoch": 0.14881567525112646,
"grad_norm": 6.846395015716553,
"learning_rate": 2.7767672591980157e-06,
"loss": 0.0726,
"step": 1800
},
{
"epoch": 0.15708321276507792,
"grad_norm": 8.860357284545898,
"learning_rate": 2.764365440264572e-06,
"loss": 0.0714,
"step": 1900
},
{
"epoch": 0.1653507502790294,
"grad_norm": 13.334174156188965,
"learning_rate": 2.7519636213311286e-06,
"loss": 0.0647,
"step": 2000
},
{
"epoch": 0.1653507502790294,
"eval_cosine_accuracy": 0.9989536222111125,
"eval_loss": 0.04396611452102661,
"eval_runtime": 694.7257,
"eval_samples_per_second": 123.806,
"eval_steps_per_second": 3.869,
"step": 2000
},
{
"epoch": 0.17361828779298086,
"grad_norm": 6.131624221801758,
"learning_rate": 2.739561802397685e-06,
"loss": 0.0627,
"step": 2100
},
{
"epoch": 0.18188582530693234,
"grad_norm": 4.629586696624756,
"learning_rate": 2.7271599834642415e-06,
"loss": 0.0555,
"step": 2200
},
{
"epoch": 0.1901533628208838,
"grad_norm": 7.412508487701416,
"learning_rate": 2.714758164530798e-06,
"loss": 0.0582,
"step": 2300
},
{
"epoch": 0.19842090033483528,
"grad_norm": 0.0,
"learning_rate": 2.7023563455973544e-06,
"loss": 0.0573,
"step": 2400
},
{
"epoch": 0.20668843784878674,
"grad_norm": 7.9056806564331055,
"learning_rate": 2.6899545266639104e-06,
"loss": 0.0566,
"step": 2500
},
{
"epoch": 0.20668843784878674,
"eval_cosine_accuracy": 0.999360546906791,
"eval_loss": 0.030446216464042664,
"eval_runtime": 693.3017,
"eval_samples_per_second": 124.06,
"eval_steps_per_second": 3.877,
"step": 2500
},
{
"epoch": 0.21495597536273822,
"grad_norm": 5.176001071929932,
"learning_rate": 2.6775527077304673e-06,
"loss": 0.0474,
"step": 2600
},
{
"epoch": 0.22322351287668968,
"grad_norm": 0.0,
"learning_rate": 2.6651508887970238e-06,
"loss": 0.0492,
"step": 2700
},
{
"epoch": 0.23149105039064113,
"grad_norm": 0.0,
"learning_rate": 2.65274906986358e-06,
"loss": 0.0452,
"step": 2800
},
{
"epoch": 0.23975858790459262,
"grad_norm": 6.902950763702393,
"learning_rate": 2.6403472509301367e-06,
"loss": 0.049,
"step": 2900
},
{
"epoch": 0.24802612541854407,
"grad_norm": 9.228604316711426,
"learning_rate": 2.6279454319966927e-06,
"loss": 0.0426,
"step": 3000
},
{
"epoch": 0.24802612541854407,
"eval_cosine_accuracy": 0.9996977130832103,
"eval_loss": 0.02179584465920925,
"eval_runtime": 694.8853,
"eval_samples_per_second": 123.777,
"eval_steps_per_second": 3.868,
"step": 3000
},
{
"epoch": 0.25629366293249556,
"grad_norm": 8.215713500976562,
"learning_rate": 2.615543613063249e-06,
"loss": 0.0386,
"step": 3100
},
{
"epoch": 0.26456120044644704,
"grad_norm": 3.6304705142974854,
"learning_rate": 2.603141794129806e-06,
"loss": 0.0358,
"step": 3200
},
{
"epoch": 0.27282873796039847,
"grad_norm": 7.101751804351807,
"learning_rate": 2.5907399751963624e-06,
"loss": 0.0318,
"step": 3300
},
{
"epoch": 0.28109627547434995,
"grad_norm": 0.0,
"learning_rate": 2.5783381562629185e-06,
"loss": 0.0345,
"step": 3400
},
{
"epoch": 0.28936381298830144,
"grad_norm": 7.361840724945068,
"learning_rate": 2.565936337329475e-06,
"loss": 0.0311,
"step": 3500
},
{
"epoch": 0.28936381298830144,
"eval_cosine_accuracy": 0.9996977130832103,
"eval_loss": 0.01837225630879402,
"eval_runtime": 693.1653,
"eval_samples_per_second": 124.084,
"eval_steps_per_second": 3.878,
"step": 3500
},
{
"epoch": 0.2976313505022529,
"grad_norm": 7.0488505363464355,
"learning_rate": 2.5535345183960314e-06,
"loss": 0.0383,
"step": 3600
},
{
"epoch": 0.30589888801620435,
"grad_norm": 3.100097894668579,
"learning_rate": 2.541132699462588e-06,
"loss": 0.0262,
"step": 3700
},
{
"epoch": 0.31416642553015584,
"grad_norm": 6.720247745513916,
"learning_rate": 2.5287308805291443e-06,
"loss": 0.0299,
"step": 3800
},
{
"epoch": 0.3224339630441073,
"grad_norm": 3.7630603313446045,
"learning_rate": 2.5163290615957007e-06,
"loss": 0.0251,
"step": 3900
},
{
"epoch": 0.3307015005580588,
"grad_norm": 7.238327980041504,
"learning_rate": 2.503927242662257e-06,
"loss": 0.0255,
"step": 4000
},
{
"epoch": 0.3307015005580588,
"eval_cosine_accuracy": 0.9997674716024695,
"eval_loss": 0.01472113560885191,
"eval_runtime": 691.5244,
"eval_samples_per_second": 124.379,
"eval_steps_per_second": 3.887,
"step": 4000
},
{
"epoch": 0.33896903807201023,
"grad_norm": 7.58219051361084,
"learning_rate": 2.4915254237288136e-06,
"loss": 0.026,
"step": 4100
},
{
"epoch": 0.3472365755859617,
"grad_norm": 5.715928554534912,
"learning_rate": 2.47912360479537e-06,
"loss": 0.029,
"step": 4200
},
{
"epoch": 0.3555041130999132,
"grad_norm": 4.353998184204102,
"learning_rate": 2.4667217858619265e-06,
"loss": 0.0312,
"step": 4300
},
{
"epoch": 0.3637716506138647,
"grad_norm": 4.321074962615967,
"learning_rate": 2.454319966928483e-06,
"loss": 0.0217,
"step": 4400
},
{
"epoch": 0.3720391881278161,
"grad_norm": 0.0,
"learning_rate": 2.4419181479950394e-06,
"loss": 0.0242,
"step": 4500
},
{
"epoch": 0.3720391881278161,
"eval_cosine_accuracy": 0.9998372301217286,
"eval_loss": 0.012444370426237583,
"eval_runtime": 691.587,
"eval_samples_per_second": 124.368,
"eval_steps_per_second": 3.887,
"step": 4500
},
{
"epoch": 0.3803067256417676,
"grad_norm": 0.0,
"learning_rate": 2.429516329061596e-06,
"loss": 0.0209,
"step": 4600
},
{
"epoch": 0.3885742631557191,
"grad_norm": 0.0,
"learning_rate": 2.4171145101281523e-06,
"loss": 0.0259,
"step": 4700
},
{
"epoch": 0.39684180066967056,
"grad_norm": 5.16447639465332,
"learning_rate": 2.4047126911947083e-06,
"loss": 0.0246,
"step": 4800
},
{
"epoch": 0.405109338183622,
"grad_norm": 6.7723870277404785,
"learning_rate": 2.392310872261265e-06,
"loss": 0.0219,
"step": 4900
},
{
"epoch": 0.4133768756975735,
"grad_norm": 0.0,
"learning_rate": 2.3799090533278216e-06,
"loss": 0.0228,
"step": 5000
},
{
"epoch": 0.4133768756975735,
"eval_cosine_accuracy": 0.9998372301217286,
"eval_loss": 0.009931358508765697,
"eval_runtime": 693.5328,
"eval_samples_per_second": 124.019,
"eval_steps_per_second": 3.876,
"step": 5000
},
{
"epoch": 0.42164441321152496,
"grad_norm": 7.131288051605225,
"learning_rate": 2.367507234394378e-06,
"loss": 0.0185,
"step": 5100
},
{
"epoch": 0.42991195072547644,
"grad_norm": 4.856233596801758,
"learning_rate": 2.355105415460934e-06,
"loss": 0.0194,
"step": 5200
},
{
"epoch": 0.4381794882394279,
"grad_norm": 0.0,
"learning_rate": 2.3427035965274906e-06,
"loss": 0.0177,
"step": 5300
},
{
"epoch": 0.44644702575337936,
"grad_norm": 3.5896942615509033,
"learning_rate": 2.330301777594047e-06,
"loss": 0.0183,
"step": 5400
},
{
"epoch": 0.45471456326733084,
"grad_norm": 5.037208080291748,
"learning_rate": 2.3178999586606035e-06,
"loss": 0.0163,
"step": 5500
},
{
"epoch": 0.45471456326733084,
"eval_cosine_accuracy": 0.9998837358012347,
"eval_loss": 0.009113303385674953,
"eval_runtime": 692.1079,
"eval_samples_per_second": 124.274,
"eval_steps_per_second": 3.884,
"step": 5500
},
{
"epoch": 0.46298210078128227,
"grad_norm": 3.856128692626953,
"learning_rate": 2.3054981397271603e-06,
"loss": 0.0179,
"step": 5600
},
{
"epoch": 0.47124963829523375,
"grad_norm": 4.501084804534912,
"learning_rate": 2.2930963207937164e-06,
"loss": 0.0201,
"step": 5700
},
{
"epoch": 0.47951717580918524,
"grad_norm": 0.0,
"learning_rate": 2.280694501860273e-06,
"loss": 0.0133,
"step": 5800
},
{
"epoch": 0.4877847133231367,
"grad_norm": 0.0,
"learning_rate": 2.2682926829268293e-06,
"loss": 0.0161,
"step": 5900
},
{
"epoch": 0.49605225083708815,
"grad_norm": 0.0,
"learning_rate": 2.2558908639933857e-06,
"loss": 0.0131,
"step": 6000
},
{
"epoch": 0.49605225083708815,
"eval_cosine_accuracy": 0.9998953622211113,
"eval_loss": 0.008038520812988281,
"eval_runtime": 692.7644,
"eval_samples_per_second": 124.156,
"eval_steps_per_second": 3.88,
"step": 6000
},
{
"epoch": 0.5043197883510396,
"grad_norm": 5.829000473022461,
"learning_rate": 2.243489045059942e-06,
"loss": 0.0147,
"step": 6100
},
{
"epoch": 0.5125873258649911,
"grad_norm": 8.35444164276123,
"learning_rate": 2.2310872261264986e-06,
"loss": 0.016,
"step": 6200
},
{
"epoch": 0.5208548633789426,
"grad_norm": 5.209952354431152,
"learning_rate": 2.218685407193055e-06,
"loss": 0.0141,
"step": 6300
},
{
"epoch": 0.5291224008928941,
"grad_norm": 6.257237434387207,
"learning_rate": 2.2062835882596115e-06,
"loss": 0.0173,
"step": 6400
},
{
"epoch": 0.5373899384068456,
"grad_norm": 0.0,
"learning_rate": 2.193881769326168e-06,
"loss": 0.0126,
"step": 6500
},
{
"epoch": 0.5373899384068456,
"eval_cosine_accuracy": 0.9999302414807408,
"eval_loss": 0.0074099162593483925,
"eval_runtime": 692.7522,
"eval_samples_per_second": 124.158,
"eval_steps_per_second": 3.88,
"step": 6500
},
{
"epoch": 0.5456574759207969,
"grad_norm": 3.5162978172302246,
"learning_rate": 2.1814799503927244e-06,
"loss": 0.0126,
"step": 6600
},
{
"epoch": 0.5539250134347484,
"grad_norm": 4.822145938873291,
"learning_rate": 2.169078131459281e-06,
"loss": 0.0166,
"step": 6700
},
{
"epoch": 0.5621925509486999,
"grad_norm": 0.0,
"learning_rate": 2.1566763125258373e-06,
"loss": 0.0127,
"step": 6800
},
{
"epoch": 0.5704600884626514,
"grad_norm": 0.0,
"learning_rate": 2.1442744935923937e-06,
"loss": 0.0149,
"step": 6900
},
{
"epoch": 0.5787276259766029,
"grad_norm": 3.7051377296447754,
"learning_rate": 2.13187267465895e-06,
"loss": 0.0087,
"step": 7000
},
{
"epoch": 0.5787276259766029,
"eval_cosine_accuracy": 0.9999186150608643,
"eval_loss": 0.006947255693376064,
"eval_runtime": 692.9985,
"eval_samples_per_second": 124.114,
"eval_steps_per_second": 3.879,
"step": 7000
},
{
"epoch": 0.5869951634905544,
"grad_norm": 4.005233287811279,
"learning_rate": 2.119470855725506e-06,
"loss": 0.0182,
"step": 7100
},
{
"epoch": 0.5952627010045058,
"grad_norm": 0.0,
"learning_rate": 2.1070690367920627e-06,
"loss": 0.0122,
"step": 7200
},
{
"epoch": 0.6035302385184573,
"grad_norm": 0.0,
"learning_rate": 2.0946672178586195e-06,
"loss": 0.0141,
"step": 7300
},
{
"epoch": 0.6117977760324087,
"grad_norm": 0.0,
"learning_rate": 2.082265398925176e-06,
"loss": 0.0145,
"step": 7400
},
{
"epoch": 0.6200653135463602,
"grad_norm": 0.0,
"learning_rate": 2.069863579991732e-06,
"loss": 0.0133,
"step": 7500
},
{
"epoch": 0.6200653135463602,
"eval_cosine_accuracy": 0.9999069886409878,
"eval_loss": 0.006750450469553471,
"eval_runtime": 692.7796,
"eval_samples_per_second": 124.153,
"eval_steps_per_second": 3.88,
"step": 7500
},
{
"epoch": 0.6283328510603117,
"grad_norm": 4.767193794250488,
"learning_rate": 2.0574617610582885e-06,
"loss": 0.0139,
"step": 7600
},
{
"epoch": 0.6366003885742632,
"grad_norm": 3.9448177814483643,
"learning_rate": 2.045059942124845e-06,
"loss": 0.0159,
"step": 7700
},
{
"epoch": 0.6448679260882146,
"grad_norm": 0.0,
"learning_rate": 2.0326581231914013e-06,
"loss": 0.0167,
"step": 7800
},
{
"epoch": 0.6531354636021661,
"grad_norm": 0.0,
"learning_rate": 2.0202563042579582e-06,
"loss": 0.0106,
"step": 7900
},
{
"epoch": 0.6614030011161176,
"grad_norm": 0.0,
"learning_rate": 2.0078544853245142e-06,
"loss": 0.0125,
"step": 8000
},
{
"epoch": 0.6614030011161176,
"eval_cosine_accuracy": 0.9999302414807408,
"eval_loss": 0.005728627555072308,
"eval_runtime": 690.5659,
"eval_samples_per_second": 124.551,
"eval_steps_per_second": 3.892,
"step": 8000
},
{
"epoch": 0.669670538630069,
"grad_norm": 5.22554874420166,
"learning_rate": 1.9954526663910707e-06,
"loss": 0.0086,
"step": 8100
},
{
"epoch": 0.6779380761440205,
"grad_norm": 2.913892984390259,
"learning_rate": 1.983050847457627e-06,
"loss": 0.0138,
"step": 8200
},
{
"epoch": 0.686205613657972,
"grad_norm": 4.088724613189697,
"learning_rate": 1.9706490285241836e-06,
"loss": 0.0151,
"step": 8300
},
{
"epoch": 0.6944731511719234,
"grad_norm": 0.0,
"learning_rate": 1.95824720959074e-06,
"loss": 0.0148,
"step": 8400
},
{
"epoch": 0.7027406886858749,
"grad_norm": 0.0,
"learning_rate": 1.9458453906572965e-06,
"loss": 0.0086,
"step": 8500
},
{
"epoch": 0.7027406886858749,
"eval_cosine_accuracy": 0.9999186150608643,
"eval_loss": 0.00535405520349741,
"eval_runtime": 688.2066,
"eval_samples_per_second": 124.978,
"eval_steps_per_second": 3.906,
"step": 8500
},
{
"epoch": 0.7110082261998264,
"grad_norm": 4.961321830749512,
"learning_rate": 1.933443571723853e-06,
"loss": 0.0082,
"step": 8600
},
{
"epoch": 0.7192757637137779,
"grad_norm": 0.0,
"learning_rate": 1.9210417527904094e-06,
"loss": 0.0088,
"step": 8700
},
{
"epoch": 0.7275433012277294,
"grad_norm": 4.613398551940918,
"learning_rate": 1.908639933856966e-06,
"loss": 0.0097,
"step": 8800
},
{
"epoch": 0.7358108387416807,
"grad_norm": 5.949629306793213,
"learning_rate": 1.896238114923522e-06,
"loss": 0.0103,
"step": 8900
},
{
"epoch": 0.7440783762556322,
"grad_norm": 0.0,
"learning_rate": 1.8838362959900785e-06,
"loss": 0.0108,
"step": 9000
},
{
"epoch": 0.7440783762556322,
"eval_cosine_accuracy": 0.9999069886409878,
"eval_loss": 0.004750753752887249,
"eval_runtime": 692.3618,
"eval_samples_per_second": 124.228,
"eval_steps_per_second": 3.882,
"step": 9000
},
{
"epoch": 0.7523459137695837,
"grad_norm": 0.0,
"learning_rate": 1.871434477056635e-06,
"loss": 0.0113,
"step": 9100
},
{
"epoch": 0.7606134512835352,
"grad_norm": 0.0,
"learning_rate": 1.8590326581231916e-06,
"loss": 0.0096,
"step": 9200
},
{
"epoch": 0.7688809887974867,
"grad_norm": 0.0,
"learning_rate": 1.8466308391897476e-06,
"loss": 0.0115,
"step": 9300
},
{
"epoch": 0.7771485263114382,
"grad_norm": 7.0339884757995605,
"learning_rate": 1.8342290202563043e-06,
"loss": 0.011,
"step": 9400
},
{
"epoch": 0.7854160638253896,
"grad_norm": 0.0,
"learning_rate": 1.8218272013228608e-06,
"loss": 0.0091,
"step": 9500
},
{
"epoch": 0.7854160638253896,
"eval_cosine_accuracy": 0.9999069886409878,
"eval_loss": 0.00442688912153244,
"eval_runtime": 692.6571,
"eval_samples_per_second": 124.175,
"eval_steps_per_second": 3.881,
"step": 9500
},
{
"epoch": 0.7936836013393411,
"grad_norm": 3.4286439418792725,
"learning_rate": 1.8094253823894172e-06,
"loss": 0.0157,
"step": 9600
},
{
"epoch": 0.8019511388532925,
"grad_norm": 0.0,
"learning_rate": 1.7970235634559737e-06,
"loss": 0.0119,
"step": 9700
},
{
"epoch": 0.810218676367244,
"grad_norm": 7.657674312591553,
"learning_rate": 1.7846217445225299e-06,
"loss": 0.0071,
"step": 9800
},
{
"epoch": 0.8184862138811955,
"grad_norm": 0.0,
"learning_rate": 1.7722199255890863e-06,
"loss": 0.008,
"step": 9900
},
{
"epoch": 0.826753751395147,
"grad_norm": 0.0,
"learning_rate": 1.759818106655643e-06,
"loss": 0.0111,
"step": 10000
},
{
"epoch": 0.826753751395147,
"eval_cosine_accuracy": 0.9999186150608643,
"eval_loss": 0.004347575828433037,
"eval_runtime": 691.4865,
"eval_samples_per_second": 124.386,
"eval_steps_per_second": 3.887,
"step": 10000
},
{
"epoch": 0.8350212889090984,
"grad_norm": 0.0,
"learning_rate": 1.7474162877221994e-06,
"loss": 0.0107,
"step": 10100
},
{
"epoch": 0.8432888264230499,
"grad_norm": 0.0,
"learning_rate": 1.7350144687887557e-06,
"loss": 0.0094,
"step": 10200
},
{
"epoch": 0.8515563639370014,
"grad_norm": 0.0,
"learning_rate": 1.7226126498553121e-06,
"loss": 0.0099,
"step": 10300
},
{
"epoch": 0.8598239014509529,
"grad_norm": 5.666162967681885,
"learning_rate": 1.7102108309218686e-06,
"loss": 0.0108,
"step": 10400
},
{
"epoch": 0.8680914389649043,
"grad_norm": 0.0,
"learning_rate": 1.697809011988425e-06,
"loss": 0.0077,
"step": 10500
},
{
"epoch": 0.8680914389649043,
"eval_cosine_accuracy": 0.9999418679006173,
"eval_loss": 0.003721497254446149,
"eval_runtime": 691.4305,
"eval_samples_per_second": 124.396,
"eval_steps_per_second": 3.888,
"step": 10500
},
{
"epoch": 0.8763589764788557,
"grad_norm": 0.0,
"learning_rate": 1.6854071930549817e-06,
"loss": 0.0065,
"step": 10600
},
{
"epoch": 0.8846265139928072,
"grad_norm": 5.624023914337158,
"learning_rate": 1.6730053741215377e-06,
"loss": 0.0111,
"step": 10700
},
{
"epoch": 0.8928940515067587,
"grad_norm": 0.0,
"learning_rate": 1.6606035551880942e-06,
"loss": 0.0099,
"step": 10800
},
{
"epoch": 0.9011615890207102,
"grad_norm": 4.608953952789307,
"learning_rate": 1.6482017362546508e-06,
"loss": 0.01,
"step": 10900
},
{
"epoch": 0.9094291265346617,
"grad_norm": 3.9025700092315674,
"learning_rate": 1.6357999173212073e-06,
"loss": 0.0098,
"step": 11000
},
{
"epoch": 0.9094291265346617,
"eval_cosine_accuracy": 0.9999534943204939,
"eval_loss": 0.0034107074607163668,
"eval_runtime": 690.7094,
"eval_samples_per_second": 124.526,
"eval_steps_per_second": 3.892,
"step": 11000
},
{
"epoch": 0.9176966640486132,
"grad_norm": 0.0,
"learning_rate": 1.6233980983877635e-06,
"loss": 0.0075,
"step": 11100
},
{
"epoch": 0.9259642015625645,
"grad_norm": 0.0,
"learning_rate": 1.61099627945432e-06,
"loss": 0.0049,
"step": 11200
},
{
"epoch": 0.934231739076516,
"grad_norm": 0.0,
"learning_rate": 1.5985944605208764e-06,
"loss": 0.0091,
"step": 11300
},
{
"epoch": 0.9424992765904675,
"grad_norm": 0.0,
"learning_rate": 1.5861926415874328e-06,
"loss": 0.0067,
"step": 11400
},
{
"epoch": 0.950766814104419,
"grad_norm": 0.0,
"learning_rate": 1.5737908226539895e-06,
"loss": 0.0089,
"step": 11500
},
{
"epoch": 0.950766814104419,
"eval_cosine_accuracy": 0.9999418679006173,
"eval_loss": 0.0033190434332937002,
"eval_runtime": 692.4967,
"eval_samples_per_second": 124.204,
"eval_steps_per_second": 3.882,
"step": 11500
},
{
"epoch": 0.9590343516183705,
"grad_norm": 0.0,
"learning_rate": 1.5613890037205455e-06,
"loss": 0.0087,
"step": 11600
},
{
"epoch": 0.967301889132322,
"grad_norm": 0.0,
"learning_rate": 1.5489871847871022e-06,
"loss": 0.0092,
"step": 11700
},
{
"epoch": 0.9755694266462734,
"grad_norm": 6.598356246948242,
"learning_rate": 1.5365853658536586e-06,
"loss": 0.0051,
"step": 11800
},
{
"epoch": 0.9838369641602249,
"grad_norm": 0.0,
"learning_rate": 1.524183546920215e-06,
"loss": 0.0088,
"step": 11900
},
{
"epoch": 0.9921045016741763,
"grad_norm": 0.0,
"learning_rate": 1.5117817279867713e-06,
"loss": 0.0089,
"step": 12000
},
{
"epoch": 0.9921045016741763,
"eval_cosine_accuracy": 0.9999302414807408,
"eval_loss": 0.0032671140506863594,
"eval_runtime": 691.2453,
"eval_samples_per_second": 124.429,
"eval_steps_per_second": 3.889,
"step": 12000
},
{
"epoch": 1.000372039188128,
"grad_norm": 5.8269877433776855,
"learning_rate": 1.4993799090533278e-06,
"loss": 0.0145,
"step": 12100
},
{
"epoch": 1.0086395767020793,
"grad_norm": 5.8136091232299805,
"learning_rate": 1.4869780901198842e-06,
"loss": 0.0078,
"step": 12200
},
{
"epoch": 1.0169071142160309,
"grad_norm": 7.668418884277344,
"learning_rate": 1.4745762711864409e-06,
"loss": 0.0087,
"step": 12300
},
{
"epoch": 1.0251746517299822,
"grad_norm": 0.0,
"learning_rate": 1.4621744522529971e-06,
"loss": 0.0047,
"step": 12400
},
{
"epoch": 1.0334421892439336,
"grad_norm": 0.0,
"learning_rate": 1.4497726333195536e-06,
"loss": 0.0064,
"step": 12500
},
{
"epoch": 1.0334421892439336,
"eval_cosine_accuracy": 0.9999651207403705,
"eval_loss": 0.003002994926646352,
"eval_runtime": 695.5578,
"eval_samples_per_second": 123.658,
"eval_steps_per_second": 3.865,
"step": 12500
},
{
"epoch": 1.0417097267578852,
"grad_norm": 0.0,
"learning_rate": 1.43737081438611e-06,
"loss": 0.0057,
"step": 12600
},
{
"epoch": 1.0499772642718366,
"grad_norm": 0.0,
"learning_rate": 1.4249689954526665e-06,
"loss": 0.0067,
"step": 12700
},
{
"epoch": 1.0582448017857882,
"grad_norm": 0.0,
"learning_rate": 1.4125671765192227e-06,
"loss": 0.0044,
"step": 12800
},
{
"epoch": 1.0665123392997395,
"grad_norm": 6.3078789710998535,
"learning_rate": 1.4001653575857794e-06,
"loss": 0.0051,
"step": 12900
},
{
"epoch": 1.0747798768136911,
"grad_norm": 0.0,
"learning_rate": 1.3877635386523356e-06,
"loss": 0.0057,
"step": 13000
},
{
"epoch": 1.0747798768136911,
"eval_cosine_accuracy": 0.9999418679006173,
"eval_loss": 0.003003525547683239,
"eval_runtime": 699.036,
"eval_samples_per_second": 123.042,
"eval_steps_per_second": 3.845,
"step": 13000
},
{
"epoch": 1.0830474143276425,
"grad_norm": 0.0,
"learning_rate": 1.375361719718892e-06,
"loss": 0.0064,
"step": 13100
},
{
"epoch": 1.0913149518415939,
"grad_norm": 3.789832830429077,
"learning_rate": 1.3629599007854487e-06,
"loss": 0.0051,
"step": 13200
},
{
"epoch": 1.0995824893555455,
"grad_norm": 0.0,
"learning_rate": 1.350558081852005e-06,
"loss": 0.0037,
"step": 13300
},
{
"epoch": 1.1078500268694969,
"grad_norm": 0.0,
"learning_rate": 1.3381562629185614e-06,
"loss": 0.0041,
"step": 13400
},
{
"epoch": 1.1161175643834484,
"grad_norm": 0.0,
"learning_rate": 1.3257544439851178e-06,
"loss": 0.0023,
"step": 13500
},
{
"epoch": 1.1161175643834484,
"eval_cosine_accuracy": 0.9999534943204939,
"eval_loss": 0.002926659770309925,
"eval_runtime": 609.8415,
"eval_samples_per_second": 141.038,
"eval_steps_per_second": 4.408,
"step": 13500
},
{
"epoch": 1.1243851018973998,
"grad_norm": 6.910253047943115,
"learning_rate": 1.3133526250516743e-06,
"loss": 0.0039,
"step": 13600
},
{
"epoch": 1.1326526394113514,
"grad_norm": 0.0,
"learning_rate": 1.3009508061182307e-06,
"loss": 0.0024,
"step": 13700
},
{
"epoch": 1.1409201769253028,
"grad_norm": 0.0,
"learning_rate": 1.2885489871847872e-06,
"loss": 0.0046,
"step": 13800
},
{
"epoch": 1.1491877144392544,
"grad_norm": 0.0,
"learning_rate": 1.2761471682513436e-06,
"loss": 0.0053,
"step": 13900
},
{
"epoch": 1.1574552519532058,
"grad_norm": 0.0,
"learning_rate": 1.2637453493179e-06,
"loss": 0.005,
"step": 14000
},
{
"epoch": 1.1574552519532058,
"eval_cosine_accuracy": 0.9999651207403705,
"eval_loss": 0.002777786459773779,
"eval_runtime": 544.44,
"eval_samples_per_second": 157.981,
"eval_steps_per_second": 4.937,
"step": 14000
},
{
"epoch": 1.1657227894671571,
"grad_norm": 0.0,
"learning_rate": 1.2513435303844565e-06,
"loss": 0.0036,
"step": 14100
},
{
"epoch": 1.1739903269811087,
"grad_norm": 0.0,
"learning_rate": 1.2389417114510128e-06,
"loss": 0.003,
"step": 14200
},
{
"epoch": 1.18225786449506,
"grad_norm": 0.0,
"learning_rate": 1.2265398925175692e-06,
"loss": 0.0024,
"step": 14300
},
{
"epoch": 1.1905254020090117,
"grad_norm": 0.0,
"learning_rate": 1.2141380735841257e-06,
"loss": 0.0045,
"step": 14400
},
{
"epoch": 1.198792939522963,
"grad_norm": 0.0,
"learning_rate": 1.2017362546506821e-06,
"loss": 0.0048,
"step": 14500
},
{
"epoch": 1.198792939522963,
"eval_cosine_accuracy": 0.9999651207403705,
"eval_loss": 0.0027241790667176247,
"eval_runtime": 709.377,
"eval_samples_per_second": 121.249,
"eval_steps_per_second": 3.789,
"step": 14500
},
{
"epoch": 1.2070604770369147,
"grad_norm": 0.0,
"learning_rate": 1.1893344357172386e-06,
"loss": 0.0037,
"step": 14600
},
{
"epoch": 1.215328014550866,
"grad_norm": 0.0,
"learning_rate": 1.176932616783795e-06,
"loss": 0.0041,
"step": 14700
},
{
"epoch": 1.2235955520648174,
"grad_norm": 0.0,
"learning_rate": 1.1645307978503515e-06,
"loss": 0.0035,
"step": 14800
},
{
"epoch": 1.231863089578769,
"grad_norm": 0.0,
"learning_rate": 1.152128978916908e-06,
"loss": 0.0024,
"step": 14900
},
{
"epoch": 1.2401306270927204,
"grad_norm": 3.7592828273773193,
"learning_rate": 1.1397271599834644e-06,
"loss": 0.0036,
"step": 15000
},
{
"epoch": 1.2401306270927204,
"eval_cosine_accuracy": 0.9999651207403705,
"eval_loss": 0.0026917748618870974,
"eval_runtime": 691.7866,
"eval_samples_per_second": 124.332,
"eval_steps_per_second": 3.886,
"step": 15000
},
{
"epoch": 1.248398164606672,
"grad_norm": 0.0,
"learning_rate": 1.1273253410500206e-06,
"loss": 0.0017,
"step": 15100
},
{
"epoch": 1.2566657021206233,
"grad_norm": 0.0,
"learning_rate": 1.1149235221165772e-06,
"loss": 0.0032,
"step": 15200
},
{
"epoch": 1.264933239634575,
"grad_norm": 0.0,
"learning_rate": 1.1025217031831335e-06,
"loss": 0.0032,
"step": 15300
},
{
"epoch": 1.2732007771485263,
"grad_norm": 0.0,
"learning_rate": 1.09011988424969e-06,
"loss": 0.004,
"step": 15400
},
{
"epoch": 1.2814683146624777,
"grad_norm": 0.0,
"learning_rate": 1.0777180653162464e-06,
"loss": 0.0034,
"step": 15500
},
{
"epoch": 1.2814683146624777,
"eval_cosine_accuracy": 0.9999651207403705,
"eval_loss": 0.0028443040791898966,
"eval_runtime": 691.2444,
"eval_samples_per_second": 124.429,
"eval_steps_per_second": 3.889,
"step": 15500
},
{
"epoch": 1.2897358521764293,
"grad_norm": 0.0,
"learning_rate": 1.0653162463828028e-06,
"loss": 0.0036,
"step": 15600
},
{
"epoch": 1.2980033896903806,
"grad_norm": 0.0,
"learning_rate": 1.0529144274493593e-06,
"loss": 0.0047,
"step": 15700
},
{
"epoch": 1.3062709272043322,
"grad_norm": 0.0,
"learning_rate": 1.0405126085159157e-06,
"loss": 0.0022,
"step": 15800
},
{
"epoch": 1.3145384647182836,
"grad_norm": 0.0,
"learning_rate": 1.0281107895824722e-06,
"loss": 0.0023,
"step": 15900
},
{
"epoch": 1.3228060022322352,
"grad_norm": 0.0,
"learning_rate": 1.0157089706490284e-06,
"loss": 0.0031,
"step": 16000
},
{
"epoch": 1.3228060022322352,
"eval_cosine_accuracy": 0.9999534943204939,
"eval_loss": 0.002732690190896392,
"eval_runtime": 690.7452,
"eval_samples_per_second": 124.519,
"eval_steps_per_second": 3.891,
"step": 16000
},
{
"epoch": 1.3310735397461866,
"grad_norm": 0.0,
"learning_rate": 1.003307151715585e-06,
"loss": 0.0024,
"step": 16100
},
{
"epoch": 1.339341077260138,
"grad_norm": 0.0,
"learning_rate": 9.909053327821413e-07,
"loss": 0.0027,
"step": 16200
},
{
"epoch": 1.3476086147740896,
"grad_norm": 4.422085285186768,
"learning_rate": 9.785035138486978e-07,
"loss": 0.0043,
"step": 16300
},
{
"epoch": 1.355876152288041,
"grad_norm": 0.0,
"learning_rate": 9.661016949152542e-07,
"loss": 0.004,
"step": 16400
},
{
"epoch": 1.3641436898019925,
"grad_norm": 3.7661290168762207,
"learning_rate": 9.536998759818107e-07,
"loss": 0.0018,
"step": 16500
},
{
"epoch": 1.3641436898019925,
"eval_cosine_accuracy": 0.9999651207403705,
"eval_loss": 0.0025276916567236185,
"eval_runtime": 691.7092,
"eval_samples_per_second": 124.346,
"eval_steps_per_second": 3.886,
"step": 16500
},
{
"epoch": 1.372411227315944,
"grad_norm": 0.0,
"learning_rate": 9.412980570483672e-07,
"loss": 0.0035,
"step": 16600
},
{
"epoch": 1.3806787648298955,
"grad_norm": 0.0,
"learning_rate": 9.288962381149235e-07,
"loss": 0.0028,
"step": 16700
},
{
"epoch": 1.3889463023438469,
"grad_norm": 0.0,
"learning_rate": 9.1649441918148e-07,
"loss": 0.003,
"step": 16800
},
{
"epoch": 1.3972138398577982,
"grad_norm": 3.7538058757781982,
"learning_rate": 9.040926002480363e-07,
"loss": 0.0031,
"step": 16900
},
{
"epoch": 1.4054813773717498,
"grad_norm": 0.0,
"learning_rate": 8.916907813145929e-07,
"loss": 0.0019,
"step": 17000
},
{
"epoch": 1.4054813773717498,
"eval_cosine_accuracy": 0.9999651207403705,
"eval_loss": 0.0025049280375242233,
"eval_runtime": 690.902,
"eval_samples_per_second": 124.491,
"eval_steps_per_second": 3.891,
"step": 17000
},
{
"epoch": 1.4137489148857014,
"grad_norm": 0.0,
"learning_rate": 8.792889623811492e-07,
"loss": 0.0032,
"step": 17100
},
{
"epoch": 1.4220164523996528,
"grad_norm": 0.0,
"learning_rate": 8.668871434477057e-07,
"loss": 0.0035,
"step": 17200
},
{
"epoch": 1.4302839899136042,
"grad_norm": 4.074612140655518,
"learning_rate": 8.54485324514262e-07,
"loss": 0.0017,
"step": 17300
},
{
"epoch": 1.4385515274275558,
"grad_norm": 0.0,
"learning_rate": 8.420835055808186e-07,
"loss": 0.0023,
"step": 17400
},
{
"epoch": 1.4468190649415071,
"grad_norm": 0.0,
"learning_rate": 8.29681686647375e-07,
"loss": 0.0024,
"step": 17500
},
{
"epoch": 1.4468190649415071,
"eval_cosine_accuracy": 0.9999534943204939,
"eval_loss": 0.0024029347114264965,
"eval_runtime": 690.076,
"eval_samples_per_second": 124.64,
"eval_steps_per_second": 3.895,
"step": 17500
},
{
"epoch": 1.4550866024554585,
"grad_norm": 0.0,
"learning_rate": 8.172798677139314e-07,
"loss": 0.0017,
"step": 17600
},
{
"epoch": 1.46335413996941,
"grad_norm": 0.0,
"learning_rate": 8.048780487804879e-07,
"loss": 0.0025,
"step": 17700
},
{
"epoch": 1.4716216774833617,
"grad_norm": 0.0,
"learning_rate": 7.924762298470442e-07,
"loss": 0.0019,
"step": 17800
},
{
"epoch": 1.479889214997313,
"grad_norm": 0.0,
"learning_rate": 7.800744109136007e-07,
"loss": 0.003,
"step": 17900
},
{
"epoch": 1.4881567525112644,
"grad_norm": 0.0,
"learning_rate": 7.676725919801571e-07,
"loss": 0.003,
"step": 18000
},
{
"epoch": 1.4881567525112644,
"eval_cosine_accuracy": 0.9999651207403705,
"eval_loss": 0.0024024369195103645,
"eval_runtime": 690.6226,
"eval_samples_per_second": 124.541,
"eval_steps_per_second": 3.892,
"step": 18000
}
],
"logging_steps": 100,
"max_steps": 24190,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}