|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.4881567525112644, |
|
"eval_steps": 500, |
|
"global_step": 18000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00826753751395147, |
|
"grad_norm": 9.10895824432373, |
|
"learning_rate": 2.9875981810665565e-06, |
|
"loss": 0.7375, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01653507502790294, |
|
"grad_norm": 12.439295768737793, |
|
"learning_rate": 2.975196362133113e-06, |
|
"loss": 0.3709, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02480261254185441, |
|
"grad_norm": 11.995777130126953, |
|
"learning_rate": 2.9627945431996694e-06, |
|
"loss": 0.3086, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03307015005580588, |
|
"grad_norm": 10.923591613769531, |
|
"learning_rate": 2.950392724266226e-06, |
|
"loss": 0.2387, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.04133768756975735, |
|
"grad_norm": 10.521248817443848, |
|
"learning_rate": 2.9379909053327823e-06, |
|
"loss": 0.2154, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.04133768756975735, |
|
"eval_cosine_accuracy": 0.9945472090779086, |
|
"eval_loss": 0.16342051327228546, |
|
"eval_runtime": 695.845, |
|
"eval_samples_per_second": 123.607, |
|
"eval_steps_per_second": 3.863, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.04960522508370882, |
|
"grad_norm": 9.583465576171875, |
|
"learning_rate": 2.9255890863993388e-06, |
|
"loss": 0.2001, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.057872762597660284, |
|
"grad_norm": 11.749183654785156, |
|
"learning_rate": 2.913187267465895e-06, |
|
"loss": 0.1832, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.06614030011161176, |
|
"grad_norm": 9.898832321166992, |
|
"learning_rate": 2.9007854485324512e-06, |
|
"loss": 0.1794, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.07440783762556323, |
|
"grad_norm": 8.169402122497559, |
|
"learning_rate": 2.888383629599008e-06, |
|
"loss": 0.1459, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.0826753751395147, |
|
"grad_norm": 4.54998779296875, |
|
"learning_rate": 2.8759818106655646e-06, |
|
"loss": 0.1279, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0826753751395147, |
|
"eval_cosine_accuracy": 0.9969887572519794, |
|
"eval_loss": 0.09583044797182083, |
|
"eval_runtime": 694.7532, |
|
"eval_samples_per_second": 123.801, |
|
"eval_steps_per_second": 3.869, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.09094291265346617, |
|
"grad_norm": 9.351482391357422, |
|
"learning_rate": 2.863579991732121e-06, |
|
"loss": 0.1218, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.09921045016741764, |
|
"grad_norm": 9.509383201599121, |
|
"learning_rate": 2.851178172798677e-06, |
|
"loss": 0.105, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.10747798768136911, |
|
"grad_norm": 8.839822769165039, |
|
"learning_rate": 2.8387763538652335e-06, |
|
"loss": 0.119, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.11574552519532057, |
|
"grad_norm": 6.181822776794434, |
|
"learning_rate": 2.82637453493179e-06, |
|
"loss": 0.0899, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.12401306270927204, |
|
"grad_norm": 5.21838903427124, |
|
"learning_rate": 2.813972715998347e-06, |
|
"loss": 0.0862, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.12401306270927204, |
|
"eval_cosine_accuracy": 0.9981630256595087, |
|
"eval_loss": 0.06220981478691101, |
|
"eval_runtime": 694.8959, |
|
"eval_samples_per_second": 123.775, |
|
"eval_steps_per_second": 3.868, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.13228060022322352, |
|
"grad_norm": 6.435093402862549, |
|
"learning_rate": 2.801570897064903e-06, |
|
"loss": 0.0751, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.14054813773717498, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.7891690781314593e-06, |
|
"loss": 0.077, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.14881567525112646, |
|
"grad_norm": 6.846395015716553, |
|
"learning_rate": 2.7767672591980157e-06, |
|
"loss": 0.0726, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.15708321276507792, |
|
"grad_norm": 8.860357284545898, |
|
"learning_rate": 2.764365440264572e-06, |
|
"loss": 0.0714, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.1653507502790294, |
|
"grad_norm": 13.334174156188965, |
|
"learning_rate": 2.7519636213311286e-06, |
|
"loss": 0.0647, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.1653507502790294, |
|
"eval_cosine_accuracy": 0.9989536222111125, |
|
"eval_loss": 0.04396611452102661, |
|
"eval_runtime": 694.7257, |
|
"eval_samples_per_second": 123.806, |
|
"eval_steps_per_second": 3.869, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.17361828779298086, |
|
"grad_norm": 6.131624221801758, |
|
"learning_rate": 2.739561802397685e-06, |
|
"loss": 0.0627, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.18188582530693234, |
|
"grad_norm": 4.629586696624756, |
|
"learning_rate": 2.7271599834642415e-06, |
|
"loss": 0.0555, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.1901533628208838, |
|
"grad_norm": 7.412508487701416, |
|
"learning_rate": 2.714758164530798e-06, |
|
"loss": 0.0582, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.19842090033483528, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.7023563455973544e-06, |
|
"loss": 0.0573, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.20668843784878674, |
|
"grad_norm": 7.9056806564331055, |
|
"learning_rate": 2.6899545266639104e-06, |
|
"loss": 0.0566, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.20668843784878674, |
|
"eval_cosine_accuracy": 0.999360546906791, |
|
"eval_loss": 0.030446216464042664, |
|
"eval_runtime": 693.3017, |
|
"eval_samples_per_second": 124.06, |
|
"eval_steps_per_second": 3.877, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.21495597536273822, |
|
"grad_norm": 5.176001071929932, |
|
"learning_rate": 2.6775527077304673e-06, |
|
"loss": 0.0474, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.22322351287668968, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.6651508887970238e-06, |
|
"loss": 0.0492, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.23149105039064113, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.65274906986358e-06, |
|
"loss": 0.0452, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.23975858790459262, |
|
"grad_norm": 6.902950763702393, |
|
"learning_rate": 2.6403472509301367e-06, |
|
"loss": 0.049, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.24802612541854407, |
|
"grad_norm": 9.228604316711426, |
|
"learning_rate": 2.6279454319966927e-06, |
|
"loss": 0.0426, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.24802612541854407, |
|
"eval_cosine_accuracy": 0.9996977130832103, |
|
"eval_loss": 0.02179584465920925, |
|
"eval_runtime": 694.8853, |
|
"eval_samples_per_second": 123.777, |
|
"eval_steps_per_second": 3.868, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.25629366293249556, |
|
"grad_norm": 8.215713500976562, |
|
"learning_rate": 2.615543613063249e-06, |
|
"loss": 0.0386, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.26456120044644704, |
|
"grad_norm": 3.6304705142974854, |
|
"learning_rate": 2.603141794129806e-06, |
|
"loss": 0.0358, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.27282873796039847, |
|
"grad_norm": 7.101751804351807, |
|
"learning_rate": 2.5907399751963624e-06, |
|
"loss": 0.0318, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.28109627547434995, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.5783381562629185e-06, |
|
"loss": 0.0345, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.28936381298830144, |
|
"grad_norm": 7.361840724945068, |
|
"learning_rate": 2.565936337329475e-06, |
|
"loss": 0.0311, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.28936381298830144, |
|
"eval_cosine_accuracy": 0.9996977130832103, |
|
"eval_loss": 0.01837225630879402, |
|
"eval_runtime": 693.1653, |
|
"eval_samples_per_second": 124.084, |
|
"eval_steps_per_second": 3.878, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.2976313505022529, |
|
"grad_norm": 7.0488505363464355, |
|
"learning_rate": 2.5535345183960314e-06, |
|
"loss": 0.0383, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.30589888801620435, |
|
"grad_norm": 3.100097894668579, |
|
"learning_rate": 2.541132699462588e-06, |
|
"loss": 0.0262, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.31416642553015584, |
|
"grad_norm": 6.720247745513916, |
|
"learning_rate": 2.5287308805291443e-06, |
|
"loss": 0.0299, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.3224339630441073, |
|
"grad_norm": 3.7630603313446045, |
|
"learning_rate": 2.5163290615957007e-06, |
|
"loss": 0.0251, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.3307015005580588, |
|
"grad_norm": 7.238327980041504, |
|
"learning_rate": 2.503927242662257e-06, |
|
"loss": 0.0255, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.3307015005580588, |
|
"eval_cosine_accuracy": 0.9997674716024695, |
|
"eval_loss": 0.01472113560885191, |
|
"eval_runtime": 691.5244, |
|
"eval_samples_per_second": 124.379, |
|
"eval_steps_per_second": 3.887, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.33896903807201023, |
|
"grad_norm": 7.58219051361084, |
|
"learning_rate": 2.4915254237288136e-06, |
|
"loss": 0.026, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.3472365755859617, |
|
"grad_norm": 5.715928554534912, |
|
"learning_rate": 2.47912360479537e-06, |
|
"loss": 0.029, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.3555041130999132, |
|
"grad_norm": 4.353998184204102, |
|
"learning_rate": 2.4667217858619265e-06, |
|
"loss": 0.0312, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.3637716506138647, |
|
"grad_norm": 4.321074962615967, |
|
"learning_rate": 2.454319966928483e-06, |
|
"loss": 0.0217, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.3720391881278161, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.4419181479950394e-06, |
|
"loss": 0.0242, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.3720391881278161, |
|
"eval_cosine_accuracy": 0.9998372301217286, |
|
"eval_loss": 0.012444370426237583, |
|
"eval_runtime": 691.587, |
|
"eval_samples_per_second": 124.368, |
|
"eval_steps_per_second": 3.887, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.3803067256417676, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.429516329061596e-06, |
|
"loss": 0.0209, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.3885742631557191, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.4171145101281523e-06, |
|
"loss": 0.0259, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.39684180066967056, |
|
"grad_norm": 5.16447639465332, |
|
"learning_rate": 2.4047126911947083e-06, |
|
"loss": 0.0246, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.405109338183622, |
|
"grad_norm": 6.7723870277404785, |
|
"learning_rate": 2.392310872261265e-06, |
|
"loss": 0.0219, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.4133768756975735, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.3799090533278216e-06, |
|
"loss": 0.0228, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.4133768756975735, |
|
"eval_cosine_accuracy": 0.9998372301217286, |
|
"eval_loss": 0.009931358508765697, |
|
"eval_runtime": 693.5328, |
|
"eval_samples_per_second": 124.019, |
|
"eval_steps_per_second": 3.876, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.42164441321152496, |
|
"grad_norm": 7.131288051605225, |
|
"learning_rate": 2.367507234394378e-06, |
|
"loss": 0.0185, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.42991195072547644, |
|
"grad_norm": 4.856233596801758, |
|
"learning_rate": 2.355105415460934e-06, |
|
"loss": 0.0194, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.4381794882394279, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.3427035965274906e-06, |
|
"loss": 0.0177, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.44644702575337936, |
|
"grad_norm": 3.5896942615509033, |
|
"learning_rate": 2.330301777594047e-06, |
|
"loss": 0.0183, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.45471456326733084, |
|
"grad_norm": 5.037208080291748, |
|
"learning_rate": 2.3178999586606035e-06, |
|
"loss": 0.0163, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.45471456326733084, |
|
"eval_cosine_accuracy": 0.9998837358012347, |
|
"eval_loss": 0.009113303385674953, |
|
"eval_runtime": 692.1079, |
|
"eval_samples_per_second": 124.274, |
|
"eval_steps_per_second": 3.884, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.46298210078128227, |
|
"grad_norm": 3.856128692626953, |
|
"learning_rate": 2.3054981397271603e-06, |
|
"loss": 0.0179, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.47124963829523375, |
|
"grad_norm": 4.501084804534912, |
|
"learning_rate": 2.2930963207937164e-06, |
|
"loss": 0.0201, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.47951717580918524, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.280694501860273e-06, |
|
"loss": 0.0133, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.4877847133231367, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.2682926829268293e-06, |
|
"loss": 0.0161, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.49605225083708815, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.2558908639933857e-06, |
|
"loss": 0.0131, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.49605225083708815, |
|
"eval_cosine_accuracy": 0.9998953622211113, |
|
"eval_loss": 0.008038520812988281, |
|
"eval_runtime": 692.7644, |
|
"eval_samples_per_second": 124.156, |
|
"eval_steps_per_second": 3.88, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.5043197883510396, |
|
"grad_norm": 5.829000473022461, |
|
"learning_rate": 2.243489045059942e-06, |
|
"loss": 0.0147, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.5125873258649911, |
|
"grad_norm": 8.35444164276123, |
|
"learning_rate": 2.2310872261264986e-06, |
|
"loss": 0.016, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.5208548633789426, |
|
"grad_norm": 5.209952354431152, |
|
"learning_rate": 2.218685407193055e-06, |
|
"loss": 0.0141, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.5291224008928941, |
|
"grad_norm": 6.257237434387207, |
|
"learning_rate": 2.2062835882596115e-06, |
|
"loss": 0.0173, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.5373899384068456, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.193881769326168e-06, |
|
"loss": 0.0126, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.5373899384068456, |
|
"eval_cosine_accuracy": 0.9999302414807408, |
|
"eval_loss": 0.0074099162593483925, |
|
"eval_runtime": 692.7522, |
|
"eval_samples_per_second": 124.158, |
|
"eval_steps_per_second": 3.88, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.5456574759207969, |
|
"grad_norm": 3.5162978172302246, |
|
"learning_rate": 2.1814799503927244e-06, |
|
"loss": 0.0126, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.5539250134347484, |
|
"grad_norm": 4.822145938873291, |
|
"learning_rate": 2.169078131459281e-06, |
|
"loss": 0.0166, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.5621925509486999, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.1566763125258373e-06, |
|
"loss": 0.0127, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.5704600884626514, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.1442744935923937e-06, |
|
"loss": 0.0149, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.5787276259766029, |
|
"grad_norm": 3.7051377296447754, |
|
"learning_rate": 2.13187267465895e-06, |
|
"loss": 0.0087, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.5787276259766029, |
|
"eval_cosine_accuracy": 0.9999186150608643, |
|
"eval_loss": 0.006947255693376064, |
|
"eval_runtime": 692.9985, |
|
"eval_samples_per_second": 124.114, |
|
"eval_steps_per_second": 3.879, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.5869951634905544, |
|
"grad_norm": 4.005233287811279, |
|
"learning_rate": 2.119470855725506e-06, |
|
"loss": 0.0182, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.5952627010045058, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.1070690367920627e-06, |
|
"loss": 0.0122, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.6035302385184573, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.0946672178586195e-06, |
|
"loss": 0.0141, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.6117977760324087, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.082265398925176e-06, |
|
"loss": 0.0145, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.6200653135463602, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.069863579991732e-06, |
|
"loss": 0.0133, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.6200653135463602, |
|
"eval_cosine_accuracy": 0.9999069886409878, |
|
"eval_loss": 0.006750450469553471, |
|
"eval_runtime": 692.7796, |
|
"eval_samples_per_second": 124.153, |
|
"eval_steps_per_second": 3.88, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.6283328510603117, |
|
"grad_norm": 4.767193794250488, |
|
"learning_rate": 2.0574617610582885e-06, |
|
"loss": 0.0139, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.6366003885742632, |
|
"grad_norm": 3.9448177814483643, |
|
"learning_rate": 2.045059942124845e-06, |
|
"loss": 0.0159, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.6448679260882146, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.0326581231914013e-06, |
|
"loss": 0.0167, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.6531354636021661, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.0202563042579582e-06, |
|
"loss": 0.0106, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.6614030011161176, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.0078544853245142e-06, |
|
"loss": 0.0125, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.6614030011161176, |
|
"eval_cosine_accuracy": 0.9999302414807408, |
|
"eval_loss": 0.005728627555072308, |
|
"eval_runtime": 690.5659, |
|
"eval_samples_per_second": 124.551, |
|
"eval_steps_per_second": 3.892, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.669670538630069, |
|
"grad_norm": 5.22554874420166, |
|
"learning_rate": 1.9954526663910707e-06, |
|
"loss": 0.0086, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.6779380761440205, |
|
"grad_norm": 2.913892984390259, |
|
"learning_rate": 1.983050847457627e-06, |
|
"loss": 0.0138, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.686205613657972, |
|
"grad_norm": 4.088724613189697, |
|
"learning_rate": 1.9706490285241836e-06, |
|
"loss": 0.0151, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.6944731511719234, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.95824720959074e-06, |
|
"loss": 0.0148, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.7027406886858749, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9458453906572965e-06, |
|
"loss": 0.0086, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.7027406886858749, |
|
"eval_cosine_accuracy": 0.9999186150608643, |
|
"eval_loss": 0.00535405520349741, |
|
"eval_runtime": 688.2066, |
|
"eval_samples_per_second": 124.978, |
|
"eval_steps_per_second": 3.906, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.7110082261998264, |
|
"grad_norm": 4.961321830749512, |
|
"learning_rate": 1.933443571723853e-06, |
|
"loss": 0.0082, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.7192757637137779, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.9210417527904094e-06, |
|
"loss": 0.0088, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.7275433012277294, |
|
"grad_norm": 4.613398551940918, |
|
"learning_rate": 1.908639933856966e-06, |
|
"loss": 0.0097, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.7358108387416807, |
|
"grad_norm": 5.949629306793213, |
|
"learning_rate": 1.896238114923522e-06, |
|
"loss": 0.0103, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.7440783762556322, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8838362959900785e-06, |
|
"loss": 0.0108, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.7440783762556322, |
|
"eval_cosine_accuracy": 0.9999069886409878, |
|
"eval_loss": 0.004750753752887249, |
|
"eval_runtime": 692.3618, |
|
"eval_samples_per_second": 124.228, |
|
"eval_steps_per_second": 3.882, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.7523459137695837, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.871434477056635e-06, |
|
"loss": 0.0113, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.7606134512835352, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8590326581231916e-06, |
|
"loss": 0.0096, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.7688809887974867, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8466308391897476e-06, |
|
"loss": 0.0115, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.7771485263114382, |
|
"grad_norm": 7.0339884757995605, |
|
"learning_rate": 1.8342290202563043e-06, |
|
"loss": 0.011, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.7854160638253896, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.8218272013228608e-06, |
|
"loss": 0.0091, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.7854160638253896, |
|
"eval_cosine_accuracy": 0.9999069886409878, |
|
"eval_loss": 0.00442688912153244, |
|
"eval_runtime": 692.6571, |
|
"eval_samples_per_second": 124.175, |
|
"eval_steps_per_second": 3.881, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.7936836013393411, |
|
"grad_norm": 3.4286439418792725, |
|
"learning_rate": 1.8094253823894172e-06, |
|
"loss": 0.0157, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.8019511388532925, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7970235634559737e-06, |
|
"loss": 0.0119, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.810218676367244, |
|
"grad_norm": 7.657674312591553, |
|
"learning_rate": 1.7846217445225299e-06, |
|
"loss": 0.0071, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.8184862138811955, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7722199255890863e-06, |
|
"loss": 0.008, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.826753751395147, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.759818106655643e-06, |
|
"loss": 0.0111, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.826753751395147, |
|
"eval_cosine_accuracy": 0.9999186150608643, |
|
"eval_loss": 0.004347575828433037, |
|
"eval_runtime": 691.4865, |
|
"eval_samples_per_second": 124.386, |
|
"eval_steps_per_second": 3.887, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.8350212889090984, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7474162877221994e-06, |
|
"loss": 0.0107, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.8432888264230499, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7350144687887557e-06, |
|
"loss": 0.0094, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.8515563639370014, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.7226126498553121e-06, |
|
"loss": 0.0099, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.8598239014509529, |
|
"grad_norm": 5.666162967681885, |
|
"learning_rate": 1.7102108309218686e-06, |
|
"loss": 0.0108, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.8680914389649043, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.697809011988425e-06, |
|
"loss": 0.0077, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.8680914389649043, |
|
"eval_cosine_accuracy": 0.9999418679006173, |
|
"eval_loss": 0.003721497254446149, |
|
"eval_runtime": 691.4305, |
|
"eval_samples_per_second": 124.396, |
|
"eval_steps_per_second": 3.888, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.8763589764788557, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6854071930549817e-06, |
|
"loss": 0.0065, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.8846265139928072, |
|
"grad_norm": 5.624023914337158, |
|
"learning_rate": 1.6730053741215377e-06, |
|
"loss": 0.0111, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.8928940515067587, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6606035551880942e-06, |
|
"loss": 0.0099, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.9011615890207102, |
|
"grad_norm": 4.608953952789307, |
|
"learning_rate": 1.6482017362546508e-06, |
|
"loss": 0.01, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.9094291265346617, |
|
"grad_norm": 3.9025700092315674, |
|
"learning_rate": 1.6357999173212073e-06, |
|
"loss": 0.0098, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.9094291265346617, |
|
"eval_cosine_accuracy": 0.9999534943204939, |
|
"eval_loss": 0.0034107074607163668, |
|
"eval_runtime": 690.7094, |
|
"eval_samples_per_second": 124.526, |
|
"eval_steps_per_second": 3.892, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.9176966640486132, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6233980983877635e-06, |
|
"loss": 0.0075, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.9259642015625645, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.61099627945432e-06, |
|
"loss": 0.0049, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.934231739076516, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5985944605208764e-06, |
|
"loss": 0.0091, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.9424992765904675, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5861926415874328e-06, |
|
"loss": 0.0067, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.950766814104419, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5737908226539895e-06, |
|
"loss": 0.0089, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.950766814104419, |
|
"eval_cosine_accuracy": 0.9999418679006173, |
|
"eval_loss": 0.0033190434332937002, |
|
"eval_runtime": 692.4967, |
|
"eval_samples_per_second": 124.204, |
|
"eval_steps_per_second": 3.882, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.9590343516183705, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5613890037205455e-06, |
|
"loss": 0.0087, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.967301889132322, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5489871847871022e-06, |
|
"loss": 0.0092, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.9755694266462734, |
|
"grad_norm": 6.598356246948242, |
|
"learning_rate": 1.5365853658536586e-06, |
|
"loss": 0.0051, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.9838369641602249, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.524183546920215e-06, |
|
"loss": 0.0088, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.9921045016741763, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5117817279867713e-06, |
|
"loss": 0.0089, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.9921045016741763, |
|
"eval_cosine_accuracy": 0.9999302414807408, |
|
"eval_loss": 0.0032671140506863594, |
|
"eval_runtime": 691.2453, |
|
"eval_samples_per_second": 124.429, |
|
"eval_steps_per_second": 3.889, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.000372039188128, |
|
"grad_norm": 5.8269877433776855, |
|
"learning_rate": 1.4993799090533278e-06, |
|
"loss": 0.0145, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 1.0086395767020793, |
|
"grad_norm": 5.8136091232299805, |
|
"learning_rate": 1.4869780901198842e-06, |
|
"loss": 0.0078, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 1.0169071142160309, |
|
"grad_norm": 7.668418884277344, |
|
"learning_rate": 1.4745762711864409e-06, |
|
"loss": 0.0087, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 1.0251746517299822, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4621744522529971e-06, |
|
"loss": 0.0047, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 1.0334421892439336, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4497726333195536e-06, |
|
"loss": 0.0064, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.0334421892439336, |
|
"eval_cosine_accuracy": 0.9999651207403705, |
|
"eval_loss": 0.003002994926646352, |
|
"eval_runtime": 695.5578, |
|
"eval_samples_per_second": 123.658, |
|
"eval_steps_per_second": 3.865, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.0417097267578852, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.43737081438611e-06, |
|
"loss": 0.0057, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 1.0499772642718366, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4249689954526665e-06, |
|
"loss": 0.0067, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 1.0582448017857882, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4125671765192227e-06, |
|
"loss": 0.0044, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 1.0665123392997395, |
|
"grad_norm": 6.3078789710998535, |
|
"learning_rate": 1.4001653575857794e-06, |
|
"loss": 0.0051, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 1.0747798768136911, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3877635386523356e-06, |
|
"loss": 0.0057, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.0747798768136911, |
|
"eval_cosine_accuracy": 0.9999418679006173, |
|
"eval_loss": 0.003003525547683239, |
|
"eval_runtime": 699.036, |
|
"eval_samples_per_second": 123.042, |
|
"eval_steps_per_second": 3.845, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.0830474143276425, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.375361719718892e-06, |
|
"loss": 0.0064, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 1.0913149518415939, |
|
"grad_norm": 3.789832830429077, |
|
"learning_rate": 1.3629599007854487e-06, |
|
"loss": 0.0051, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 1.0995824893555455, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.350558081852005e-06, |
|
"loss": 0.0037, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 1.1078500268694969, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3381562629185614e-06, |
|
"loss": 0.0041, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 1.1161175643834484, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3257544439851178e-06, |
|
"loss": 0.0023, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.1161175643834484, |
|
"eval_cosine_accuracy": 0.9999534943204939, |
|
"eval_loss": 0.002926659770309925, |
|
"eval_runtime": 609.8415, |
|
"eval_samples_per_second": 141.038, |
|
"eval_steps_per_second": 4.408, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.1243851018973998, |
|
"grad_norm": 6.910253047943115, |
|
"learning_rate": 1.3133526250516743e-06, |
|
"loss": 0.0039, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 1.1326526394113514, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3009508061182307e-06, |
|
"loss": 0.0024, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 1.1409201769253028, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2885489871847872e-06, |
|
"loss": 0.0046, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 1.1491877144392544, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2761471682513436e-06, |
|
"loss": 0.0053, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 1.1574552519532058, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2637453493179e-06, |
|
"loss": 0.005, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.1574552519532058, |
|
"eval_cosine_accuracy": 0.9999651207403705, |
|
"eval_loss": 0.002777786459773779, |
|
"eval_runtime": 544.44, |
|
"eval_samples_per_second": 157.981, |
|
"eval_steps_per_second": 4.937, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.1657227894671571, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2513435303844565e-06, |
|
"loss": 0.0036, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 1.1739903269811087, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2389417114510128e-06, |
|
"loss": 0.003, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 1.18225786449506, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2265398925175692e-06, |
|
"loss": 0.0024, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 1.1905254020090117, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2141380735841257e-06, |
|
"loss": 0.0045, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 1.198792939522963, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.2017362546506821e-06, |
|
"loss": 0.0048, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.198792939522963, |
|
"eval_cosine_accuracy": 0.9999651207403705, |
|
"eval_loss": 0.0027241790667176247, |
|
"eval_runtime": 709.377, |
|
"eval_samples_per_second": 121.249, |
|
"eval_steps_per_second": 3.789, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.2070604770369147, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1893344357172386e-06, |
|
"loss": 0.0037, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 1.215328014550866, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.176932616783795e-06, |
|
"loss": 0.0041, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 1.2235955520648174, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1645307978503515e-06, |
|
"loss": 0.0035, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 1.231863089578769, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.152128978916908e-06, |
|
"loss": 0.0024, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 1.2401306270927204, |
|
"grad_norm": 3.7592828273773193, |
|
"learning_rate": 1.1397271599834644e-06, |
|
"loss": 0.0036, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.2401306270927204, |
|
"eval_cosine_accuracy": 0.9999651207403705, |
|
"eval_loss": 0.0026917748618870974, |
|
"eval_runtime": 691.7866, |
|
"eval_samples_per_second": 124.332, |
|
"eval_steps_per_second": 3.886, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.248398164606672, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1273253410500206e-06, |
|
"loss": 0.0017, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 1.2566657021206233, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1149235221165772e-06, |
|
"loss": 0.0032, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 1.264933239634575, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1025217031831335e-06, |
|
"loss": 0.0032, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 1.2732007771485263, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.09011988424969e-06, |
|
"loss": 0.004, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 1.2814683146624777, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0777180653162464e-06, |
|
"loss": 0.0034, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.2814683146624777, |
|
"eval_cosine_accuracy": 0.9999651207403705, |
|
"eval_loss": 0.0028443040791898966, |
|
"eval_runtime": 691.2444, |
|
"eval_samples_per_second": 124.429, |
|
"eval_steps_per_second": 3.889, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.2897358521764293, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0653162463828028e-06, |
|
"loss": 0.0036, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 1.2980033896903806, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0529144274493593e-06, |
|
"loss": 0.0047, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 1.3062709272043322, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0405126085159157e-06, |
|
"loss": 0.0022, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 1.3145384647182836, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0281107895824722e-06, |
|
"loss": 0.0023, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 1.3228060022322352, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0157089706490284e-06, |
|
"loss": 0.0031, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.3228060022322352, |
|
"eval_cosine_accuracy": 0.9999534943204939, |
|
"eval_loss": 0.002732690190896392, |
|
"eval_runtime": 690.7452, |
|
"eval_samples_per_second": 124.519, |
|
"eval_steps_per_second": 3.891, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.3310735397461866, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.003307151715585e-06, |
|
"loss": 0.0024, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 1.339341077260138, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.909053327821413e-07, |
|
"loss": 0.0027, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 1.3476086147740896, |
|
"grad_norm": 4.422085285186768, |
|
"learning_rate": 9.785035138486978e-07, |
|
"loss": 0.0043, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 1.355876152288041, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.661016949152542e-07, |
|
"loss": 0.004, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 1.3641436898019925, |
|
"grad_norm": 3.7661290168762207, |
|
"learning_rate": 9.536998759818107e-07, |
|
"loss": 0.0018, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.3641436898019925, |
|
"eval_cosine_accuracy": 0.9999651207403705, |
|
"eval_loss": 0.0025276916567236185, |
|
"eval_runtime": 691.7092, |
|
"eval_samples_per_second": 124.346, |
|
"eval_steps_per_second": 3.886, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.372411227315944, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.412980570483672e-07, |
|
"loss": 0.0035, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 1.3806787648298955, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.288962381149235e-07, |
|
"loss": 0.0028, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 1.3889463023438469, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.1649441918148e-07, |
|
"loss": 0.003, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 1.3972138398577982, |
|
"grad_norm": 3.7538058757781982, |
|
"learning_rate": 9.040926002480363e-07, |
|
"loss": 0.0031, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 1.4054813773717498, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.916907813145929e-07, |
|
"loss": 0.0019, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.4054813773717498, |
|
"eval_cosine_accuracy": 0.9999651207403705, |
|
"eval_loss": 0.0025049280375242233, |
|
"eval_runtime": 690.902, |
|
"eval_samples_per_second": 124.491, |
|
"eval_steps_per_second": 3.891, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.4137489148857014, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.792889623811492e-07, |
|
"loss": 0.0032, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 1.4220164523996528, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.668871434477057e-07, |
|
"loss": 0.0035, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 1.4302839899136042, |
|
"grad_norm": 4.074612140655518, |
|
"learning_rate": 8.54485324514262e-07, |
|
"loss": 0.0017, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 1.4385515274275558, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.420835055808186e-07, |
|
"loss": 0.0023, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 1.4468190649415071, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.29681686647375e-07, |
|
"loss": 0.0024, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.4468190649415071, |
|
"eval_cosine_accuracy": 0.9999534943204939, |
|
"eval_loss": 0.0024029347114264965, |
|
"eval_runtime": 690.076, |
|
"eval_samples_per_second": 124.64, |
|
"eval_steps_per_second": 3.895, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.4550866024554585, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.172798677139314e-07, |
|
"loss": 0.0017, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 1.46335413996941, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.048780487804879e-07, |
|
"loss": 0.0025, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 1.4716216774833617, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.924762298470442e-07, |
|
"loss": 0.0019, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 1.479889214997313, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.800744109136007e-07, |
|
"loss": 0.003, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 1.4881567525112644, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.676725919801571e-07, |
|
"loss": 0.003, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.4881567525112644, |
|
"eval_cosine_accuracy": 0.9999651207403705, |
|
"eval_loss": 0.0024024369195103645, |
|
"eval_runtime": 690.6226, |
|
"eval_samples_per_second": 124.541, |
|
"eval_steps_per_second": 3.892, |
|
"step": 18000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 24190, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|