{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.4881567525112644, "eval_steps": 500, "global_step": 18000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00826753751395147, "grad_norm": 9.10895824432373, "learning_rate": 2.9875981810665565e-06, "loss": 0.7375, "step": 100 }, { "epoch": 0.01653507502790294, "grad_norm": 12.439295768737793, "learning_rate": 2.975196362133113e-06, "loss": 0.3709, "step": 200 }, { "epoch": 0.02480261254185441, "grad_norm": 11.995777130126953, "learning_rate": 2.9627945431996694e-06, "loss": 0.3086, "step": 300 }, { "epoch": 0.03307015005580588, "grad_norm": 10.923591613769531, "learning_rate": 2.950392724266226e-06, "loss": 0.2387, "step": 400 }, { "epoch": 0.04133768756975735, "grad_norm": 10.521248817443848, "learning_rate": 2.9379909053327823e-06, "loss": 0.2154, "step": 500 }, { "epoch": 0.04133768756975735, "eval_cosine_accuracy": 0.9945472090779086, "eval_loss": 0.16342051327228546, "eval_runtime": 695.845, "eval_samples_per_second": 123.607, "eval_steps_per_second": 3.863, "step": 500 }, { "epoch": 0.04960522508370882, "grad_norm": 9.583465576171875, "learning_rate": 2.9255890863993388e-06, "loss": 0.2001, "step": 600 }, { "epoch": 0.057872762597660284, "grad_norm": 11.749183654785156, "learning_rate": 2.913187267465895e-06, "loss": 0.1832, "step": 700 }, { "epoch": 0.06614030011161176, "grad_norm": 9.898832321166992, "learning_rate": 2.9007854485324512e-06, "loss": 0.1794, "step": 800 }, { "epoch": 0.07440783762556323, "grad_norm": 8.169402122497559, "learning_rate": 2.888383629599008e-06, "loss": 0.1459, "step": 900 }, { "epoch": 0.0826753751395147, "grad_norm": 4.54998779296875, "learning_rate": 2.8759818106655646e-06, "loss": 0.1279, "step": 1000 }, { "epoch": 0.0826753751395147, "eval_cosine_accuracy": 0.9969887572519794, "eval_loss": 0.09583044797182083, "eval_runtime": 694.7532, "eval_samples_per_second": 123.801, "eval_steps_per_second": 3.869, "step": 1000 }, { "epoch": 0.09094291265346617, "grad_norm": 9.351482391357422, "learning_rate": 2.863579991732121e-06, "loss": 0.1218, "step": 1100 }, { "epoch": 0.09921045016741764, "grad_norm": 9.509383201599121, "learning_rate": 2.851178172798677e-06, "loss": 0.105, "step": 1200 }, { "epoch": 0.10747798768136911, "grad_norm": 8.839822769165039, "learning_rate": 2.8387763538652335e-06, "loss": 0.119, "step": 1300 }, { "epoch": 0.11574552519532057, "grad_norm": 6.181822776794434, "learning_rate": 2.82637453493179e-06, "loss": 0.0899, "step": 1400 }, { "epoch": 0.12401306270927204, "grad_norm": 5.21838903427124, "learning_rate": 2.813972715998347e-06, "loss": 0.0862, "step": 1500 }, { "epoch": 0.12401306270927204, "eval_cosine_accuracy": 0.9981630256595087, "eval_loss": 0.06220981478691101, "eval_runtime": 694.8959, "eval_samples_per_second": 123.775, "eval_steps_per_second": 3.868, "step": 1500 }, { "epoch": 0.13228060022322352, "grad_norm": 6.435093402862549, "learning_rate": 2.801570897064903e-06, "loss": 0.0751, "step": 1600 }, { "epoch": 0.14054813773717498, "grad_norm": 0.0, "learning_rate": 2.7891690781314593e-06, "loss": 0.077, "step": 1700 }, { "epoch": 0.14881567525112646, "grad_norm": 6.846395015716553, "learning_rate": 2.7767672591980157e-06, "loss": 0.0726, "step": 1800 }, { "epoch": 0.15708321276507792, "grad_norm": 8.860357284545898, "learning_rate": 2.764365440264572e-06, "loss": 0.0714, "step": 1900 }, { "epoch": 0.1653507502790294, "grad_norm": 13.334174156188965, "learning_rate": 2.7519636213311286e-06, "loss": 0.0647, "step": 2000 }, { "epoch": 0.1653507502790294, "eval_cosine_accuracy": 0.9989536222111125, "eval_loss": 0.04396611452102661, "eval_runtime": 694.7257, "eval_samples_per_second": 123.806, "eval_steps_per_second": 3.869, "step": 2000 }, { "epoch": 0.17361828779298086, "grad_norm": 6.131624221801758, "learning_rate": 2.739561802397685e-06, "loss": 0.0627, "step": 2100 }, { "epoch": 0.18188582530693234, "grad_norm": 4.629586696624756, "learning_rate": 2.7271599834642415e-06, "loss": 0.0555, "step": 2200 }, { "epoch": 0.1901533628208838, "grad_norm": 7.412508487701416, "learning_rate": 2.714758164530798e-06, "loss": 0.0582, "step": 2300 }, { "epoch": 0.19842090033483528, "grad_norm": 0.0, "learning_rate": 2.7023563455973544e-06, "loss": 0.0573, "step": 2400 }, { "epoch": 0.20668843784878674, "grad_norm": 7.9056806564331055, "learning_rate": 2.6899545266639104e-06, "loss": 0.0566, "step": 2500 }, { "epoch": 0.20668843784878674, "eval_cosine_accuracy": 0.999360546906791, "eval_loss": 0.030446216464042664, "eval_runtime": 693.3017, "eval_samples_per_second": 124.06, "eval_steps_per_second": 3.877, "step": 2500 }, { "epoch": 0.21495597536273822, "grad_norm": 5.176001071929932, "learning_rate": 2.6775527077304673e-06, "loss": 0.0474, "step": 2600 }, { "epoch": 0.22322351287668968, "grad_norm": 0.0, "learning_rate": 2.6651508887970238e-06, "loss": 0.0492, "step": 2700 }, { "epoch": 0.23149105039064113, "grad_norm": 0.0, "learning_rate": 2.65274906986358e-06, "loss": 0.0452, "step": 2800 }, { "epoch": 0.23975858790459262, "grad_norm": 6.902950763702393, "learning_rate": 2.6403472509301367e-06, "loss": 0.049, "step": 2900 }, { "epoch": 0.24802612541854407, "grad_norm": 9.228604316711426, "learning_rate": 2.6279454319966927e-06, "loss": 0.0426, "step": 3000 }, { "epoch": 0.24802612541854407, "eval_cosine_accuracy": 0.9996977130832103, "eval_loss": 0.02179584465920925, "eval_runtime": 694.8853, "eval_samples_per_second": 123.777, "eval_steps_per_second": 3.868, "step": 3000 }, { "epoch": 0.25629366293249556, "grad_norm": 8.215713500976562, "learning_rate": 2.615543613063249e-06, "loss": 0.0386, "step": 3100 }, { "epoch": 0.26456120044644704, "grad_norm": 3.6304705142974854, "learning_rate": 2.603141794129806e-06, "loss": 0.0358, "step": 3200 }, { "epoch": 0.27282873796039847, "grad_norm": 7.101751804351807, "learning_rate": 2.5907399751963624e-06, "loss": 0.0318, "step": 3300 }, { "epoch": 0.28109627547434995, "grad_norm": 0.0, "learning_rate": 2.5783381562629185e-06, "loss": 0.0345, "step": 3400 }, { "epoch": 0.28936381298830144, "grad_norm": 7.361840724945068, "learning_rate": 2.565936337329475e-06, "loss": 0.0311, "step": 3500 }, { "epoch": 0.28936381298830144, "eval_cosine_accuracy": 0.9996977130832103, "eval_loss": 0.01837225630879402, "eval_runtime": 693.1653, "eval_samples_per_second": 124.084, "eval_steps_per_second": 3.878, "step": 3500 }, { "epoch": 0.2976313505022529, "grad_norm": 7.0488505363464355, "learning_rate": 2.5535345183960314e-06, "loss": 0.0383, "step": 3600 }, { "epoch": 0.30589888801620435, "grad_norm": 3.100097894668579, "learning_rate": 2.541132699462588e-06, "loss": 0.0262, "step": 3700 }, { "epoch": 0.31416642553015584, "grad_norm": 6.720247745513916, "learning_rate": 2.5287308805291443e-06, "loss": 0.0299, "step": 3800 }, { "epoch": 0.3224339630441073, "grad_norm": 3.7630603313446045, "learning_rate": 2.5163290615957007e-06, "loss": 0.0251, "step": 3900 }, { "epoch": 0.3307015005580588, "grad_norm": 7.238327980041504, "learning_rate": 2.503927242662257e-06, "loss": 0.0255, "step": 4000 }, { "epoch": 0.3307015005580588, "eval_cosine_accuracy": 0.9997674716024695, "eval_loss": 0.01472113560885191, "eval_runtime": 691.5244, "eval_samples_per_second": 124.379, "eval_steps_per_second": 3.887, "step": 4000 }, { "epoch": 0.33896903807201023, "grad_norm": 7.58219051361084, "learning_rate": 2.4915254237288136e-06, "loss": 0.026, "step": 4100 }, { "epoch": 0.3472365755859617, "grad_norm": 5.715928554534912, "learning_rate": 2.47912360479537e-06, "loss": 0.029, "step": 4200 }, { "epoch": 0.3555041130999132, "grad_norm": 4.353998184204102, "learning_rate": 2.4667217858619265e-06, "loss": 0.0312, "step": 4300 }, { "epoch": 0.3637716506138647, "grad_norm": 4.321074962615967, "learning_rate": 2.454319966928483e-06, "loss": 0.0217, "step": 4400 }, { "epoch": 0.3720391881278161, "grad_norm": 0.0, "learning_rate": 2.4419181479950394e-06, "loss": 0.0242, "step": 4500 }, { "epoch": 0.3720391881278161, "eval_cosine_accuracy": 0.9998372301217286, "eval_loss": 0.012444370426237583, "eval_runtime": 691.587, "eval_samples_per_second": 124.368, "eval_steps_per_second": 3.887, "step": 4500 }, { "epoch": 0.3803067256417676, "grad_norm": 0.0, "learning_rate": 2.429516329061596e-06, "loss": 0.0209, "step": 4600 }, { "epoch": 0.3885742631557191, "grad_norm": 0.0, "learning_rate": 2.4171145101281523e-06, "loss": 0.0259, "step": 4700 }, { "epoch": 0.39684180066967056, "grad_norm": 5.16447639465332, "learning_rate": 2.4047126911947083e-06, "loss": 0.0246, "step": 4800 }, { "epoch": 0.405109338183622, "grad_norm": 6.7723870277404785, "learning_rate": 2.392310872261265e-06, "loss": 0.0219, "step": 4900 }, { "epoch": 0.4133768756975735, "grad_norm": 0.0, "learning_rate": 2.3799090533278216e-06, "loss": 0.0228, "step": 5000 }, { "epoch": 0.4133768756975735, "eval_cosine_accuracy": 0.9998372301217286, "eval_loss": 0.009931358508765697, "eval_runtime": 693.5328, "eval_samples_per_second": 124.019, "eval_steps_per_second": 3.876, "step": 5000 }, { "epoch": 0.42164441321152496, "grad_norm": 7.131288051605225, "learning_rate": 2.367507234394378e-06, "loss": 0.0185, "step": 5100 }, { "epoch": 0.42991195072547644, "grad_norm": 4.856233596801758, "learning_rate": 2.355105415460934e-06, "loss": 0.0194, "step": 5200 }, { "epoch": 0.4381794882394279, "grad_norm": 0.0, "learning_rate": 2.3427035965274906e-06, "loss": 0.0177, "step": 5300 }, { "epoch": 0.44644702575337936, "grad_norm": 3.5896942615509033, "learning_rate": 2.330301777594047e-06, "loss": 0.0183, "step": 5400 }, { "epoch": 0.45471456326733084, "grad_norm": 5.037208080291748, "learning_rate": 2.3178999586606035e-06, "loss": 0.0163, "step": 5500 }, { "epoch": 0.45471456326733084, "eval_cosine_accuracy": 0.9998837358012347, "eval_loss": 0.009113303385674953, "eval_runtime": 692.1079, "eval_samples_per_second": 124.274, "eval_steps_per_second": 3.884, "step": 5500 }, { "epoch": 0.46298210078128227, "grad_norm": 3.856128692626953, "learning_rate": 2.3054981397271603e-06, "loss": 0.0179, "step": 5600 }, { "epoch": 0.47124963829523375, "grad_norm": 4.501084804534912, "learning_rate": 2.2930963207937164e-06, "loss": 0.0201, "step": 5700 }, { "epoch": 0.47951717580918524, "grad_norm": 0.0, "learning_rate": 2.280694501860273e-06, "loss": 0.0133, "step": 5800 }, { "epoch": 0.4877847133231367, "grad_norm": 0.0, "learning_rate": 2.2682926829268293e-06, "loss": 0.0161, "step": 5900 }, { "epoch": 0.49605225083708815, "grad_norm": 0.0, "learning_rate": 2.2558908639933857e-06, "loss": 0.0131, "step": 6000 }, { "epoch": 0.49605225083708815, "eval_cosine_accuracy": 0.9998953622211113, "eval_loss": 0.008038520812988281, "eval_runtime": 692.7644, "eval_samples_per_second": 124.156, "eval_steps_per_second": 3.88, "step": 6000 }, { "epoch": 0.5043197883510396, "grad_norm": 5.829000473022461, "learning_rate": 2.243489045059942e-06, "loss": 0.0147, "step": 6100 }, { "epoch": 0.5125873258649911, "grad_norm": 8.35444164276123, "learning_rate": 2.2310872261264986e-06, "loss": 0.016, "step": 6200 }, { "epoch": 0.5208548633789426, "grad_norm": 5.209952354431152, "learning_rate": 2.218685407193055e-06, "loss": 0.0141, "step": 6300 }, { "epoch": 0.5291224008928941, "grad_norm": 6.257237434387207, "learning_rate": 2.2062835882596115e-06, "loss": 0.0173, "step": 6400 }, { "epoch": 0.5373899384068456, "grad_norm": 0.0, "learning_rate": 2.193881769326168e-06, "loss": 0.0126, "step": 6500 }, { "epoch": 0.5373899384068456, "eval_cosine_accuracy": 0.9999302414807408, "eval_loss": 0.0074099162593483925, "eval_runtime": 692.7522, "eval_samples_per_second": 124.158, "eval_steps_per_second": 3.88, "step": 6500 }, { "epoch": 0.5456574759207969, "grad_norm": 3.5162978172302246, "learning_rate": 2.1814799503927244e-06, "loss": 0.0126, "step": 6600 }, { "epoch": 0.5539250134347484, "grad_norm": 4.822145938873291, "learning_rate": 2.169078131459281e-06, "loss": 0.0166, "step": 6700 }, { "epoch": 0.5621925509486999, "grad_norm": 0.0, "learning_rate": 2.1566763125258373e-06, "loss": 0.0127, "step": 6800 }, { "epoch": 0.5704600884626514, "grad_norm": 0.0, "learning_rate": 2.1442744935923937e-06, "loss": 0.0149, "step": 6900 }, { "epoch": 0.5787276259766029, "grad_norm": 3.7051377296447754, "learning_rate": 2.13187267465895e-06, "loss": 0.0087, "step": 7000 }, { "epoch": 0.5787276259766029, "eval_cosine_accuracy": 0.9999186150608643, "eval_loss": 0.006947255693376064, "eval_runtime": 692.9985, "eval_samples_per_second": 124.114, "eval_steps_per_second": 3.879, "step": 7000 }, { "epoch": 0.5869951634905544, "grad_norm": 4.005233287811279, "learning_rate": 2.119470855725506e-06, "loss": 0.0182, "step": 7100 }, { "epoch": 0.5952627010045058, "grad_norm": 0.0, "learning_rate": 2.1070690367920627e-06, "loss": 0.0122, "step": 7200 }, { "epoch": 0.6035302385184573, "grad_norm": 0.0, "learning_rate": 2.0946672178586195e-06, "loss": 0.0141, "step": 7300 }, { "epoch": 0.6117977760324087, "grad_norm": 0.0, "learning_rate": 2.082265398925176e-06, "loss": 0.0145, "step": 7400 }, { "epoch": 0.6200653135463602, "grad_norm": 0.0, "learning_rate": 2.069863579991732e-06, "loss": 0.0133, "step": 7500 }, { "epoch": 0.6200653135463602, "eval_cosine_accuracy": 0.9999069886409878, "eval_loss": 0.006750450469553471, "eval_runtime": 692.7796, "eval_samples_per_second": 124.153, "eval_steps_per_second": 3.88, "step": 7500 }, { "epoch": 0.6283328510603117, "grad_norm": 4.767193794250488, "learning_rate": 2.0574617610582885e-06, "loss": 0.0139, "step": 7600 }, { "epoch": 0.6366003885742632, "grad_norm": 3.9448177814483643, "learning_rate": 2.045059942124845e-06, "loss": 0.0159, "step": 7700 }, { "epoch": 0.6448679260882146, "grad_norm": 0.0, "learning_rate": 2.0326581231914013e-06, "loss": 0.0167, "step": 7800 }, { "epoch": 0.6531354636021661, "grad_norm": 0.0, "learning_rate": 2.0202563042579582e-06, "loss": 0.0106, "step": 7900 }, { "epoch": 0.6614030011161176, "grad_norm": 0.0, "learning_rate": 2.0078544853245142e-06, "loss": 0.0125, "step": 8000 }, { "epoch": 0.6614030011161176, "eval_cosine_accuracy": 0.9999302414807408, "eval_loss": 0.005728627555072308, "eval_runtime": 690.5659, "eval_samples_per_second": 124.551, "eval_steps_per_second": 3.892, "step": 8000 }, { "epoch": 0.669670538630069, "grad_norm": 5.22554874420166, "learning_rate": 1.9954526663910707e-06, "loss": 0.0086, "step": 8100 }, { "epoch": 0.6779380761440205, "grad_norm": 2.913892984390259, "learning_rate": 1.983050847457627e-06, "loss": 0.0138, "step": 8200 }, { "epoch": 0.686205613657972, "grad_norm": 4.088724613189697, "learning_rate": 1.9706490285241836e-06, "loss": 0.0151, "step": 8300 }, { "epoch": 0.6944731511719234, "grad_norm": 0.0, "learning_rate": 1.95824720959074e-06, "loss": 0.0148, "step": 8400 }, { "epoch": 0.7027406886858749, "grad_norm": 0.0, "learning_rate": 1.9458453906572965e-06, "loss": 0.0086, "step": 8500 }, { "epoch": 0.7027406886858749, "eval_cosine_accuracy": 0.9999186150608643, "eval_loss": 0.00535405520349741, "eval_runtime": 688.2066, "eval_samples_per_second": 124.978, "eval_steps_per_second": 3.906, "step": 8500 }, { "epoch": 0.7110082261998264, "grad_norm": 4.961321830749512, "learning_rate": 1.933443571723853e-06, "loss": 0.0082, "step": 8600 }, { "epoch": 0.7192757637137779, "grad_norm": 0.0, "learning_rate": 1.9210417527904094e-06, "loss": 0.0088, "step": 8700 }, { "epoch": 0.7275433012277294, "grad_norm": 4.613398551940918, "learning_rate": 1.908639933856966e-06, "loss": 0.0097, "step": 8800 }, { "epoch": 0.7358108387416807, "grad_norm": 5.949629306793213, "learning_rate": 1.896238114923522e-06, "loss": 0.0103, "step": 8900 }, { "epoch": 0.7440783762556322, "grad_norm": 0.0, "learning_rate": 1.8838362959900785e-06, "loss": 0.0108, "step": 9000 }, { "epoch": 0.7440783762556322, "eval_cosine_accuracy": 0.9999069886409878, "eval_loss": 0.004750753752887249, "eval_runtime": 692.3618, "eval_samples_per_second": 124.228, "eval_steps_per_second": 3.882, "step": 9000 }, { "epoch": 0.7523459137695837, "grad_norm": 0.0, "learning_rate": 1.871434477056635e-06, "loss": 0.0113, "step": 9100 }, { "epoch": 0.7606134512835352, "grad_norm": 0.0, "learning_rate": 1.8590326581231916e-06, "loss": 0.0096, "step": 9200 }, { "epoch": 0.7688809887974867, "grad_norm": 0.0, "learning_rate": 1.8466308391897476e-06, "loss": 0.0115, "step": 9300 }, { "epoch": 0.7771485263114382, "grad_norm": 7.0339884757995605, "learning_rate": 1.8342290202563043e-06, "loss": 0.011, "step": 9400 }, { "epoch": 0.7854160638253896, "grad_norm": 0.0, "learning_rate": 1.8218272013228608e-06, "loss": 0.0091, "step": 9500 }, { "epoch": 0.7854160638253896, "eval_cosine_accuracy": 0.9999069886409878, "eval_loss": 0.00442688912153244, "eval_runtime": 692.6571, "eval_samples_per_second": 124.175, "eval_steps_per_second": 3.881, "step": 9500 }, { "epoch": 0.7936836013393411, "grad_norm": 3.4286439418792725, "learning_rate": 1.8094253823894172e-06, "loss": 0.0157, "step": 9600 }, { "epoch": 0.8019511388532925, "grad_norm": 0.0, "learning_rate": 1.7970235634559737e-06, "loss": 0.0119, "step": 9700 }, { "epoch": 0.810218676367244, "grad_norm": 7.657674312591553, "learning_rate": 1.7846217445225299e-06, "loss": 0.0071, "step": 9800 }, { "epoch": 0.8184862138811955, "grad_norm": 0.0, "learning_rate": 1.7722199255890863e-06, "loss": 0.008, "step": 9900 }, { "epoch": 0.826753751395147, "grad_norm": 0.0, "learning_rate": 1.759818106655643e-06, "loss": 0.0111, "step": 10000 }, { "epoch": 0.826753751395147, "eval_cosine_accuracy": 0.9999186150608643, "eval_loss": 0.004347575828433037, "eval_runtime": 691.4865, "eval_samples_per_second": 124.386, "eval_steps_per_second": 3.887, "step": 10000 }, { "epoch": 0.8350212889090984, "grad_norm": 0.0, "learning_rate": 1.7474162877221994e-06, "loss": 0.0107, "step": 10100 }, { "epoch": 0.8432888264230499, "grad_norm": 0.0, "learning_rate": 1.7350144687887557e-06, "loss": 0.0094, "step": 10200 }, { "epoch": 0.8515563639370014, "grad_norm": 0.0, "learning_rate": 1.7226126498553121e-06, "loss": 0.0099, "step": 10300 }, { "epoch": 0.8598239014509529, "grad_norm": 5.666162967681885, "learning_rate": 1.7102108309218686e-06, "loss": 0.0108, "step": 10400 }, { "epoch": 0.8680914389649043, "grad_norm": 0.0, "learning_rate": 1.697809011988425e-06, "loss": 0.0077, "step": 10500 }, { "epoch": 0.8680914389649043, "eval_cosine_accuracy": 0.9999418679006173, "eval_loss": 0.003721497254446149, "eval_runtime": 691.4305, "eval_samples_per_second": 124.396, "eval_steps_per_second": 3.888, "step": 10500 }, { "epoch": 0.8763589764788557, "grad_norm": 0.0, "learning_rate": 1.6854071930549817e-06, "loss": 0.0065, "step": 10600 }, { "epoch": 0.8846265139928072, "grad_norm": 5.624023914337158, "learning_rate": 1.6730053741215377e-06, "loss": 0.0111, "step": 10700 }, { "epoch": 0.8928940515067587, "grad_norm": 0.0, "learning_rate": 1.6606035551880942e-06, "loss": 0.0099, "step": 10800 }, { "epoch": 0.9011615890207102, "grad_norm": 4.608953952789307, "learning_rate": 1.6482017362546508e-06, "loss": 0.01, "step": 10900 }, { "epoch": 0.9094291265346617, "grad_norm": 3.9025700092315674, "learning_rate": 1.6357999173212073e-06, "loss": 0.0098, "step": 11000 }, { "epoch": 0.9094291265346617, "eval_cosine_accuracy": 0.9999534943204939, "eval_loss": 0.0034107074607163668, "eval_runtime": 690.7094, "eval_samples_per_second": 124.526, "eval_steps_per_second": 3.892, "step": 11000 }, { "epoch": 0.9176966640486132, "grad_norm": 0.0, "learning_rate": 1.6233980983877635e-06, "loss": 0.0075, "step": 11100 }, { "epoch": 0.9259642015625645, "grad_norm": 0.0, "learning_rate": 1.61099627945432e-06, "loss": 0.0049, "step": 11200 }, { "epoch": 0.934231739076516, "grad_norm": 0.0, "learning_rate": 1.5985944605208764e-06, "loss": 0.0091, "step": 11300 }, { "epoch": 0.9424992765904675, "grad_norm": 0.0, "learning_rate": 1.5861926415874328e-06, "loss": 0.0067, "step": 11400 }, { "epoch": 0.950766814104419, "grad_norm": 0.0, "learning_rate": 1.5737908226539895e-06, "loss": 0.0089, "step": 11500 }, { "epoch": 0.950766814104419, "eval_cosine_accuracy": 0.9999418679006173, "eval_loss": 0.0033190434332937002, "eval_runtime": 692.4967, "eval_samples_per_second": 124.204, "eval_steps_per_second": 3.882, "step": 11500 }, { "epoch": 0.9590343516183705, "grad_norm": 0.0, "learning_rate": 1.5613890037205455e-06, "loss": 0.0087, "step": 11600 }, { "epoch": 0.967301889132322, "grad_norm": 0.0, "learning_rate": 1.5489871847871022e-06, "loss": 0.0092, "step": 11700 }, { "epoch": 0.9755694266462734, "grad_norm": 6.598356246948242, "learning_rate": 1.5365853658536586e-06, "loss": 0.0051, "step": 11800 }, { "epoch": 0.9838369641602249, "grad_norm": 0.0, "learning_rate": 1.524183546920215e-06, "loss": 0.0088, "step": 11900 }, { "epoch": 0.9921045016741763, "grad_norm": 0.0, "learning_rate": 1.5117817279867713e-06, "loss": 0.0089, "step": 12000 }, { "epoch": 0.9921045016741763, "eval_cosine_accuracy": 0.9999302414807408, "eval_loss": 0.0032671140506863594, "eval_runtime": 691.2453, "eval_samples_per_second": 124.429, "eval_steps_per_second": 3.889, "step": 12000 }, { "epoch": 1.000372039188128, "grad_norm": 5.8269877433776855, "learning_rate": 1.4993799090533278e-06, "loss": 0.0145, "step": 12100 }, { "epoch": 1.0086395767020793, "grad_norm": 5.8136091232299805, "learning_rate": 1.4869780901198842e-06, "loss": 0.0078, "step": 12200 }, { "epoch": 1.0169071142160309, "grad_norm": 7.668418884277344, "learning_rate": 1.4745762711864409e-06, "loss": 0.0087, "step": 12300 }, { "epoch": 1.0251746517299822, "grad_norm": 0.0, "learning_rate": 1.4621744522529971e-06, "loss": 0.0047, "step": 12400 }, { "epoch": 1.0334421892439336, "grad_norm": 0.0, "learning_rate": 1.4497726333195536e-06, "loss": 0.0064, "step": 12500 }, { "epoch": 1.0334421892439336, "eval_cosine_accuracy": 0.9999651207403705, "eval_loss": 0.003002994926646352, "eval_runtime": 695.5578, "eval_samples_per_second": 123.658, "eval_steps_per_second": 3.865, "step": 12500 }, { "epoch": 1.0417097267578852, "grad_norm": 0.0, "learning_rate": 1.43737081438611e-06, "loss": 0.0057, "step": 12600 }, { "epoch": 1.0499772642718366, "grad_norm": 0.0, "learning_rate": 1.4249689954526665e-06, "loss": 0.0067, "step": 12700 }, { "epoch": 1.0582448017857882, "grad_norm": 0.0, "learning_rate": 1.4125671765192227e-06, "loss": 0.0044, "step": 12800 }, { "epoch": 1.0665123392997395, "grad_norm": 6.3078789710998535, "learning_rate": 1.4001653575857794e-06, "loss": 0.0051, "step": 12900 }, { "epoch": 1.0747798768136911, "grad_norm": 0.0, "learning_rate": 1.3877635386523356e-06, "loss": 0.0057, "step": 13000 }, { "epoch": 1.0747798768136911, "eval_cosine_accuracy": 0.9999418679006173, "eval_loss": 0.003003525547683239, "eval_runtime": 699.036, "eval_samples_per_second": 123.042, "eval_steps_per_second": 3.845, "step": 13000 }, { "epoch": 1.0830474143276425, "grad_norm": 0.0, "learning_rate": 1.375361719718892e-06, "loss": 0.0064, "step": 13100 }, { "epoch": 1.0913149518415939, "grad_norm": 3.789832830429077, "learning_rate": 1.3629599007854487e-06, "loss": 0.0051, "step": 13200 }, { "epoch": 1.0995824893555455, "grad_norm": 0.0, "learning_rate": 1.350558081852005e-06, "loss": 0.0037, "step": 13300 }, { "epoch": 1.1078500268694969, "grad_norm": 0.0, "learning_rate": 1.3381562629185614e-06, "loss": 0.0041, "step": 13400 }, { "epoch": 1.1161175643834484, "grad_norm": 0.0, "learning_rate": 1.3257544439851178e-06, "loss": 0.0023, "step": 13500 }, { "epoch": 1.1161175643834484, "eval_cosine_accuracy": 0.9999534943204939, "eval_loss": 0.002926659770309925, "eval_runtime": 609.8415, "eval_samples_per_second": 141.038, "eval_steps_per_second": 4.408, "step": 13500 }, { "epoch": 1.1243851018973998, "grad_norm": 6.910253047943115, "learning_rate": 1.3133526250516743e-06, "loss": 0.0039, "step": 13600 }, { "epoch": 1.1326526394113514, "grad_norm": 0.0, "learning_rate": 1.3009508061182307e-06, "loss": 0.0024, "step": 13700 }, { "epoch": 1.1409201769253028, "grad_norm": 0.0, "learning_rate": 1.2885489871847872e-06, "loss": 0.0046, "step": 13800 }, { "epoch": 1.1491877144392544, "grad_norm": 0.0, "learning_rate": 1.2761471682513436e-06, "loss": 0.0053, "step": 13900 }, { "epoch": 1.1574552519532058, "grad_norm": 0.0, "learning_rate": 1.2637453493179e-06, "loss": 0.005, "step": 14000 }, { "epoch": 1.1574552519532058, "eval_cosine_accuracy": 0.9999651207403705, "eval_loss": 0.002777786459773779, "eval_runtime": 544.44, "eval_samples_per_second": 157.981, "eval_steps_per_second": 4.937, "step": 14000 }, { "epoch": 1.1657227894671571, "grad_norm": 0.0, "learning_rate": 1.2513435303844565e-06, "loss": 0.0036, "step": 14100 }, { "epoch": 1.1739903269811087, "grad_norm": 0.0, "learning_rate": 1.2389417114510128e-06, "loss": 0.003, "step": 14200 }, { "epoch": 1.18225786449506, "grad_norm": 0.0, "learning_rate": 1.2265398925175692e-06, "loss": 0.0024, "step": 14300 }, { "epoch": 1.1905254020090117, "grad_norm": 0.0, "learning_rate": 1.2141380735841257e-06, "loss": 0.0045, "step": 14400 }, { "epoch": 1.198792939522963, "grad_norm": 0.0, "learning_rate": 1.2017362546506821e-06, "loss": 0.0048, "step": 14500 }, { "epoch": 1.198792939522963, "eval_cosine_accuracy": 0.9999651207403705, "eval_loss": 0.0027241790667176247, "eval_runtime": 709.377, "eval_samples_per_second": 121.249, "eval_steps_per_second": 3.789, "step": 14500 }, { "epoch": 1.2070604770369147, "grad_norm": 0.0, "learning_rate": 1.1893344357172386e-06, "loss": 0.0037, "step": 14600 }, { "epoch": 1.215328014550866, "grad_norm": 0.0, "learning_rate": 1.176932616783795e-06, "loss": 0.0041, "step": 14700 }, { "epoch": 1.2235955520648174, "grad_norm": 0.0, "learning_rate": 1.1645307978503515e-06, "loss": 0.0035, "step": 14800 }, { "epoch": 1.231863089578769, "grad_norm": 0.0, "learning_rate": 1.152128978916908e-06, "loss": 0.0024, "step": 14900 }, { "epoch": 1.2401306270927204, "grad_norm": 3.7592828273773193, "learning_rate": 1.1397271599834644e-06, "loss": 0.0036, "step": 15000 }, { "epoch": 1.2401306270927204, "eval_cosine_accuracy": 0.9999651207403705, "eval_loss": 0.0026917748618870974, "eval_runtime": 691.7866, "eval_samples_per_second": 124.332, "eval_steps_per_second": 3.886, "step": 15000 }, { "epoch": 1.248398164606672, "grad_norm": 0.0, "learning_rate": 1.1273253410500206e-06, "loss": 0.0017, "step": 15100 }, { "epoch": 1.2566657021206233, "grad_norm": 0.0, "learning_rate": 1.1149235221165772e-06, "loss": 0.0032, "step": 15200 }, { "epoch": 1.264933239634575, "grad_norm": 0.0, "learning_rate": 1.1025217031831335e-06, "loss": 0.0032, "step": 15300 }, { "epoch": 1.2732007771485263, "grad_norm": 0.0, "learning_rate": 1.09011988424969e-06, "loss": 0.004, "step": 15400 }, { "epoch": 1.2814683146624777, "grad_norm": 0.0, "learning_rate": 1.0777180653162464e-06, "loss": 0.0034, "step": 15500 }, { "epoch": 1.2814683146624777, "eval_cosine_accuracy": 0.9999651207403705, "eval_loss": 0.0028443040791898966, "eval_runtime": 691.2444, "eval_samples_per_second": 124.429, "eval_steps_per_second": 3.889, "step": 15500 }, { "epoch": 1.2897358521764293, "grad_norm": 0.0, "learning_rate": 1.0653162463828028e-06, "loss": 0.0036, "step": 15600 }, { "epoch": 1.2980033896903806, "grad_norm": 0.0, "learning_rate": 1.0529144274493593e-06, "loss": 0.0047, "step": 15700 }, { "epoch": 1.3062709272043322, "grad_norm": 0.0, "learning_rate": 1.0405126085159157e-06, "loss": 0.0022, "step": 15800 }, { "epoch": 1.3145384647182836, "grad_norm": 0.0, "learning_rate": 1.0281107895824722e-06, "loss": 0.0023, "step": 15900 }, { "epoch": 1.3228060022322352, "grad_norm": 0.0, "learning_rate": 1.0157089706490284e-06, "loss": 0.0031, "step": 16000 }, { "epoch": 1.3228060022322352, "eval_cosine_accuracy": 0.9999534943204939, "eval_loss": 0.002732690190896392, "eval_runtime": 690.7452, "eval_samples_per_second": 124.519, "eval_steps_per_second": 3.891, "step": 16000 }, { "epoch": 1.3310735397461866, "grad_norm": 0.0, "learning_rate": 1.003307151715585e-06, "loss": 0.0024, "step": 16100 }, { "epoch": 1.339341077260138, "grad_norm": 0.0, "learning_rate": 9.909053327821413e-07, "loss": 0.0027, "step": 16200 }, { "epoch": 1.3476086147740896, "grad_norm": 4.422085285186768, "learning_rate": 9.785035138486978e-07, "loss": 0.0043, "step": 16300 }, { "epoch": 1.355876152288041, "grad_norm": 0.0, "learning_rate": 9.661016949152542e-07, "loss": 0.004, "step": 16400 }, { "epoch": 1.3641436898019925, "grad_norm": 3.7661290168762207, "learning_rate": 9.536998759818107e-07, "loss": 0.0018, "step": 16500 }, { "epoch": 1.3641436898019925, "eval_cosine_accuracy": 0.9999651207403705, "eval_loss": 0.0025276916567236185, "eval_runtime": 691.7092, "eval_samples_per_second": 124.346, "eval_steps_per_second": 3.886, "step": 16500 }, { "epoch": 1.372411227315944, "grad_norm": 0.0, "learning_rate": 9.412980570483672e-07, "loss": 0.0035, "step": 16600 }, { "epoch": 1.3806787648298955, "grad_norm": 0.0, "learning_rate": 9.288962381149235e-07, "loss": 0.0028, "step": 16700 }, { "epoch": 1.3889463023438469, "grad_norm": 0.0, "learning_rate": 9.1649441918148e-07, "loss": 0.003, "step": 16800 }, { "epoch": 1.3972138398577982, "grad_norm": 3.7538058757781982, "learning_rate": 9.040926002480363e-07, "loss": 0.0031, "step": 16900 }, { "epoch": 1.4054813773717498, "grad_norm": 0.0, "learning_rate": 8.916907813145929e-07, "loss": 0.0019, "step": 17000 }, { "epoch": 1.4054813773717498, "eval_cosine_accuracy": 0.9999651207403705, "eval_loss": 0.0025049280375242233, "eval_runtime": 690.902, "eval_samples_per_second": 124.491, "eval_steps_per_second": 3.891, "step": 17000 }, { "epoch": 1.4137489148857014, "grad_norm": 0.0, "learning_rate": 8.792889623811492e-07, "loss": 0.0032, "step": 17100 }, { "epoch": 1.4220164523996528, "grad_norm": 0.0, "learning_rate": 8.668871434477057e-07, "loss": 0.0035, "step": 17200 }, { "epoch": 1.4302839899136042, "grad_norm": 4.074612140655518, "learning_rate": 8.54485324514262e-07, "loss": 0.0017, "step": 17300 }, { "epoch": 1.4385515274275558, "grad_norm": 0.0, "learning_rate": 8.420835055808186e-07, "loss": 0.0023, "step": 17400 }, { "epoch": 1.4468190649415071, "grad_norm": 0.0, "learning_rate": 8.29681686647375e-07, "loss": 0.0024, "step": 17500 }, { "epoch": 1.4468190649415071, "eval_cosine_accuracy": 0.9999534943204939, "eval_loss": 0.0024029347114264965, "eval_runtime": 690.076, "eval_samples_per_second": 124.64, "eval_steps_per_second": 3.895, "step": 17500 }, { "epoch": 1.4550866024554585, "grad_norm": 0.0, "learning_rate": 8.172798677139314e-07, "loss": 0.0017, "step": 17600 }, { "epoch": 1.46335413996941, "grad_norm": 0.0, "learning_rate": 8.048780487804879e-07, "loss": 0.0025, "step": 17700 }, { "epoch": 1.4716216774833617, "grad_norm": 0.0, "learning_rate": 7.924762298470442e-07, "loss": 0.0019, "step": 17800 }, { "epoch": 1.479889214997313, "grad_norm": 0.0, "learning_rate": 7.800744109136007e-07, "loss": 0.003, "step": 17900 }, { "epoch": 1.4881567525112644, "grad_norm": 0.0, "learning_rate": 7.676725919801571e-07, "loss": 0.003, "step": 18000 }, { "epoch": 1.4881567525112644, "eval_cosine_accuracy": 0.9999651207403705, "eval_loss": 0.0024024369195103645, "eval_runtime": 690.6226, "eval_samples_per_second": 124.541, "eval_steps_per_second": 3.892, "step": 18000 } ], "logging_steps": 100, "max_steps": 24190, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }