{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7054574185902139, "eval_steps": 500, "global_step": 25000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0028218296743608554, "grad_norm": 175.4037628173828, "learning_rate": 1.3261851015801355e-07, "loss": 3.1599, "step": 100 }, { "epoch": 0.005643659348721711, "grad_norm": 24.302709579467773, "learning_rate": 2.737020316027088e-07, "loss": 1.8481, "step": 200 }, { "epoch": 0.008465489023082567, "grad_norm": 39.38914489746094, "learning_rate": 4.147855530474041e-07, "loss": 1.2574, "step": 300 }, { "epoch": 0.011287318697443422, "grad_norm": 28.08403205871582, "learning_rate": 5.558690744920993e-07, "loss": 1.0333, "step": 400 }, { "epoch": 0.014109148371804279, "grad_norm": 22.54578399658203, "learning_rate": 6.969525959367947e-07, "loss": 0.9413, "step": 500 }, { "epoch": 0.016930978046165134, "grad_norm": 17.940231323242188, "learning_rate": 8.3803611738149e-07, "loss": 0.7995, "step": 600 }, { "epoch": 0.01975280772052599, "grad_norm": 26.575462341308594, "learning_rate": 9.79119638826185e-07, "loss": 0.7403, "step": 700 }, { "epoch": 0.022574637394886844, "grad_norm": 51.95569610595703, "learning_rate": 1.1202031602708804e-06, "loss": 0.7774, "step": 800 }, { "epoch": 0.0253964670692477, "grad_norm": 140.2630157470703, "learning_rate": 1.2612866817155757e-06, "loss": 0.6903, "step": 900 }, { "epoch": 0.028218296743608557, "grad_norm": 28.976390838623047, "learning_rate": 1.402370203160271e-06, "loss": 0.6727, "step": 1000 }, { "epoch": 0.03104012641796941, "grad_norm": 36.016754150390625, "learning_rate": 1.5434537246049664e-06, "loss": 0.6351, "step": 1100 }, { "epoch": 0.03386195609233027, "grad_norm": 41.480491638183594, "learning_rate": 1.6845372460496615e-06, "loss": 0.6555, "step": 1200 }, { "epoch": 0.036683785766691124, "grad_norm": 24.90089225769043, "learning_rate": 1.8256207674943568e-06, "loss": 0.6714, "step": 1300 }, { "epoch": 0.03950561544105198, "grad_norm": 31.070730209350586, "learning_rate": 1.966704288939052e-06, "loss": 0.6781, "step": 1400 }, { "epoch": 0.04232744511541283, "grad_norm": 33.644046783447266, "learning_rate": 2.1077878103837474e-06, "loss": 0.6284, "step": 1500 }, { "epoch": 0.04514927478977369, "grad_norm": 22.074399948120117, "learning_rate": 2.2488713318284427e-06, "loss": 0.578, "step": 1600 }, { "epoch": 0.047971104464134544, "grad_norm": 30.381807327270508, "learning_rate": 2.389954853273138e-06, "loss": 0.631, "step": 1700 }, { "epoch": 0.0507929341384954, "grad_norm": 31.510488510131836, "learning_rate": 2.531038374717833e-06, "loss": 0.61, "step": 1800 }, { "epoch": 0.05361476381285626, "grad_norm": 27.364355087280273, "learning_rate": 2.6721218961625283e-06, "loss": 0.5953, "step": 1900 }, { "epoch": 0.056436593487217114, "grad_norm": 48.54792404174805, "learning_rate": 2.8132054176072236e-06, "loss": 0.6046, "step": 2000 }, { "epoch": 0.059258423161577964, "grad_norm": 49.58477020263672, "learning_rate": 2.9542889390519193e-06, "loss": 0.5609, "step": 2100 }, { "epoch": 0.06208025283593882, "grad_norm": 26.104825973510742, "learning_rate": 3.095372460496614e-06, "loss": 0.5931, "step": 2200 }, { "epoch": 0.06490208251029968, "grad_norm": 36.350685119628906, "learning_rate": 3.2364559819413096e-06, "loss": 0.5884, "step": 2300 }, { "epoch": 0.06772391218466053, "grad_norm": 36.55471420288086, "learning_rate": 3.377539503386005e-06, "loss": 0.5957, "step": 2400 }, { "epoch": 0.07054574185902139, "grad_norm": 35.742225646972656, "learning_rate": 3.5186230248307e-06, "loss": 0.5958, "step": 2500 }, { "epoch": 0.07336757153338225, "grad_norm": 20.766326904296875, "learning_rate": 3.6597065462753955e-06, "loss": 0.561, "step": 2600 }, { "epoch": 0.0761894012077431, "grad_norm": 43.46022033691406, "learning_rate": 3.8007900677200904e-06, "loss": 0.5751, "step": 2700 }, { "epoch": 0.07901123088210396, "grad_norm": 35.60908889770508, "learning_rate": 3.941873589164786e-06, "loss": 0.6054, "step": 2800 }, { "epoch": 0.08183306055646482, "grad_norm": 25.768211364746094, "learning_rate": 4.082957110609481e-06, "loss": 0.5796, "step": 2900 }, { "epoch": 0.08465489023082566, "grad_norm": 38.575496673583984, "learning_rate": 4.224040632054177e-06, "loss": 0.5874, "step": 3000 }, { "epoch": 0.08747671990518652, "grad_norm": 23.993473052978516, "learning_rate": 4.363713318284425e-06, "loss": 0.5564, "step": 3100 }, { "epoch": 0.09029854957954737, "grad_norm": 42.830509185791016, "learning_rate": 4.50479683972912e-06, "loss": 0.5645, "step": 3200 }, { "epoch": 0.09312037925390823, "grad_norm": 34.766197204589844, "learning_rate": 4.6458803611738155e-06, "loss": 0.5433, "step": 3300 }, { "epoch": 0.09594220892826909, "grad_norm": 32.30384826660156, "learning_rate": 4.78696388261851e-06, "loss": 0.5691, "step": 3400 }, { "epoch": 0.09876403860262994, "grad_norm": 77.13898468017578, "learning_rate": 4.928047404063206e-06, "loss": 0.5685, "step": 3500 }, { "epoch": 0.1015858682769908, "grad_norm": 42.225563049316406, "learning_rate": 5.069130925507901e-06, "loss": 0.5381, "step": 3600 }, { "epoch": 0.10440769795135166, "grad_norm": 32.32414245605469, "learning_rate": 5.210214446952596e-06, "loss": 0.5406, "step": 3700 }, { "epoch": 0.10722952762571251, "grad_norm": 25.47852325439453, "learning_rate": 5.3512979683972925e-06, "loss": 0.5309, "step": 3800 }, { "epoch": 0.11005135730007337, "grad_norm": 31.160581588745117, "learning_rate": 5.4923814898419865e-06, "loss": 0.5259, "step": 3900 }, { "epoch": 0.11287318697443423, "grad_norm": 36.84406661987305, "learning_rate": 5.632054176072235e-06, "loss": 0.5297, "step": 4000 }, { "epoch": 0.11569501664879508, "grad_norm": 10.82895565032959, "learning_rate": 5.77313769751693e-06, "loss": 0.4846, "step": 4100 }, { "epoch": 0.11851684632315593, "grad_norm": 21.587764739990234, "learning_rate": 5.914221218961625e-06, "loss": 0.5333, "step": 4200 }, { "epoch": 0.12133867599751678, "grad_norm": 22.369434356689453, "learning_rate": 6.055304740406322e-06, "loss": 0.5372, "step": 4300 }, { "epoch": 0.12416050567187764, "grad_norm": 33.766910552978516, "learning_rate": 6.196388261851016e-06, "loss": 0.5652, "step": 4400 }, { "epoch": 0.1269823353462385, "grad_norm": 27.229183197021484, "learning_rate": 6.337471783295711e-06, "loss": 0.531, "step": 4500 }, { "epoch": 0.12980416502059935, "grad_norm": 25.017833709716797, "learning_rate": 6.478555304740407e-06, "loss": 0.5483, "step": 4600 }, { "epoch": 0.13262599469496023, "grad_norm": 27.347057342529297, "learning_rate": 6.619638826185102e-06, "loss": 0.5289, "step": 4700 }, { "epoch": 0.13544782436932107, "grad_norm": 32.57697296142578, "learning_rate": 6.760722347629798e-06, "loss": 0.5225, "step": 4800 }, { "epoch": 0.1382696540436819, "grad_norm": 31.288978576660156, "learning_rate": 6.901805869074493e-06, "loss": 0.4876, "step": 4900 }, { "epoch": 0.14109148371804278, "grad_norm": 14.970600128173828, "learning_rate": 7.042889390519188e-06, "loss": 0.4723, "step": 5000 }, { "epoch": 0.14391331339240362, "grad_norm": 50.01182556152344, "learning_rate": 7.1839729119638835e-06, "loss": 0.518, "step": 5100 }, { "epoch": 0.1467351430667645, "grad_norm": 28.612035751342773, "learning_rate": 7.325056433408578e-06, "loss": 0.4673, "step": 5200 }, { "epoch": 0.14955697274112534, "grad_norm": 79.7120132446289, "learning_rate": 7.466139954853274e-06, "loss": 0.5274, "step": 5300 }, { "epoch": 0.1523788024154862, "grad_norm": 15.11323070526123, "learning_rate": 7.607223476297969e-06, "loss": 0.5082, "step": 5400 }, { "epoch": 0.15520063208984705, "grad_norm": 18.369705200195312, "learning_rate": 7.748306997742663e-06, "loss": 0.4996, "step": 5500 }, { "epoch": 0.15802246176420792, "grad_norm": 36.83811950683594, "learning_rate": 7.88939051918736e-06, "loss": 0.5587, "step": 5600 }, { "epoch": 0.16084429143856876, "grad_norm": 45.13284683227539, "learning_rate": 8.030474040632055e-06, "loss": 0.5121, "step": 5700 }, { "epoch": 0.16366612111292964, "grad_norm": 22.508358001708984, "learning_rate": 8.17155756207675e-06, "loss": 0.5167, "step": 5800 }, { "epoch": 0.16648795078729048, "grad_norm": 37.16364288330078, "learning_rate": 8.312641083521446e-06, "loss": 0.524, "step": 5900 }, { "epoch": 0.16930978046165132, "grad_norm": 25.45941925048828, "learning_rate": 8.453724604966141e-06, "loss": 0.4932, "step": 6000 }, { "epoch": 0.1721316101360122, "grad_norm": 29.486251831054688, "learning_rate": 8.594808126410836e-06, "loss": 0.5069, "step": 6100 }, { "epoch": 0.17495343981037303, "grad_norm": 36.87773513793945, "learning_rate": 8.73589164785553e-06, "loss": 0.5286, "step": 6200 }, { "epoch": 0.1777752694847339, "grad_norm": 27.660303115844727, "learning_rate": 8.876975169300226e-06, "loss": 0.5015, "step": 6300 }, { "epoch": 0.18059709915909475, "grad_norm": 20.193119049072266, "learning_rate": 9.018058690744922e-06, "loss": 0.5353, "step": 6400 }, { "epoch": 0.18341892883345562, "grad_norm": 46.997806549072266, "learning_rate": 9.159142212189617e-06, "loss": 0.4702, "step": 6500 }, { "epoch": 0.18624075850781646, "grad_norm": 33.448543548583984, "learning_rate": 9.300225733634312e-06, "loss": 0.51, "step": 6600 }, { "epoch": 0.18906258818217733, "grad_norm": 36.22298049926758, "learning_rate": 9.441309255079007e-06, "loss": 0.4891, "step": 6700 }, { "epoch": 0.19188441785653818, "grad_norm": 37.21258544921875, "learning_rate": 9.582392776523702e-06, "loss": 0.5353, "step": 6800 }, { "epoch": 0.19470624753089905, "grad_norm": 14.430715560913086, "learning_rate": 9.723476297968398e-06, "loss": 0.5091, "step": 6900 }, { "epoch": 0.1975280772052599, "grad_norm": 37.424869537353516, "learning_rate": 9.864559819413093e-06, "loss": 0.5326, "step": 7000 }, { "epoch": 0.20034990687962076, "grad_norm": 12.207716941833496, "learning_rate": 9.999372922806798e-06, "loss": 0.517, "step": 7100 }, { "epoch": 0.2031717365539816, "grad_norm": 10.985408782958984, "learning_rate": 9.983695992976736e-06, "loss": 0.5263, "step": 7200 }, { "epoch": 0.20599356622834245, "grad_norm": 32.765968322753906, "learning_rate": 9.968019063146674e-06, "loss": 0.5073, "step": 7300 }, { "epoch": 0.20881539590270332, "grad_norm": 24.93343734741211, "learning_rate": 9.952342133316612e-06, "loss": 0.5049, "step": 7400 }, { "epoch": 0.21163722557706416, "grad_norm": 42.219844818115234, "learning_rate": 9.93666520348655e-06, "loss": 0.5039, "step": 7500 }, { "epoch": 0.21445905525142503, "grad_norm": 23.783781051635742, "learning_rate": 9.920988273656488e-06, "loss": 0.5179, "step": 7600 }, { "epoch": 0.21728088492578587, "grad_norm": 29.034082412719727, "learning_rate": 9.905311343826426e-06, "loss": 0.5053, "step": 7700 }, { "epoch": 0.22010271460014674, "grad_norm": 33.75339126586914, "learning_rate": 9.889634413996364e-06, "loss": 0.4951, "step": 7800 }, { "epoch": 0.22292454427450759, "grad_norm": 13.03099536895752, "learning_rate": 9.873957484166302e-06, "loss": 0.5285, "step": 7900 }, { "epoch": 0.22574637394886846, "grad_norm": 31.321319580078125, "learning_rate": 9.858280554336239e-06, "loss": 0.5172, "step": 8000 }, { "epoch": 0.2285682036232293, "grad_norm": 16.2780704498291, "learning_rate": 9.842603624506177e-06, "loss": 0.5214, "step": 8100 }, { "epoch": 0.23139003329759017, "grad_norm": 23.63935661315918, "learning_rate": 9.826926694676115e-06, "loss": 0.4817, "step": 8200 }, { "epoch": 0.234211862971951, "grad_norm": 28.826778411865234, "learning_rate": 9.811249764846053e-06, "loss": 0.5106, "step": 8300 }, { "epoch": 0.23703369264631186, "grad_norm": 23.31501007080078, "learning_rate": 9.795572835015993e-06, "loss": 0.5101, "step": 8400 }, { "epoch": 0.23985552232067273, "grad_norm": 22.932710647583008, "learning_rate": 9.779895905185929e-06, "loss": 0.4779, "step": 8500 }, { "epoch": 0.24267735199503357, "grad_norm": 32.443641662597656, "learning_rate": 9.764218975355867e-06, "loss": 0.5057, "step": 8600 }, { "epoch": 0.24549918166939444, "grad_norm": 60.96305847167969, "learning_rate": 9.748542045525805e-06, "loss": 0.4986, "step": 8700 }, { "epoch": 0.24832101134375528, "grad_norm": 27.693511962890625, "learning_rate": 9.732865115695743e-06, "loss": 0.5099, "step": 8800 }, { "epoch": 0.2511428410181161, "grad_norm": 16.883127212524414, "learning_rate": 9.717188185865681e-06, "loss": 0.5244, "step": 8900 }, { "epoch": 0.253964670692477, "grad_norm": 30.549161911010742, "learning_rate": 9.701511256035619e-06, "loss": 0.5023, "step": 9000 }, { "epoch": 0.25678650036683787, "grad_norm": 23.576152801513672, "learning_rate": 9.685991095503856e-06, "loss": 0.5008, "step": 9100 }, { "epoch": 0.2596083300411987, "grad_norm": 21.88428497314453, "learning_rate": 9.670314165673796e-06, "loss": 0.5295, "step": 9200 }, { "epoch": 0.26243015971555955, "grad_norm": 21.60301971435547, "learning_rate": 9.654637235843734e-06, "loss": 0.5001, "step": 9300 }, { "epoch": 0.26525198938992045, "grad_norm": 36.93144607543945, "learning_rate": 9.638960306013672e-06, "loss": 0.4505, "step": 9400 }, { "epoch": 0.2680738190642813, "grad_norm": 17.426191329956055, "learning_rate": 9.623283376183608e-06, "loss": 0.4694, "step": 9500 }, { "epoch": 0.27089564873864214, "grad_norm": 22.8311710357666, "learning_rate": 9.607606446353546e-06, "loss": 0.488, "step": 9600 }, { "epoch": 0.273717478413003, "grad_norm": 32.39801025390625, "learning_rate": 9.591929516523485e-06, "loss": 0.5119, "step": 9700 }, { "epoch": 0.2765393080873638, "grad_norm": 14.913991928100586, "learning_rate": 9.576252586693423e-06, "loss": 0.4523, "step": 9800 }, { "epoch": 0.2793611377617247, "grad_norm": 28.448148727416992, "learning_rate": 9.56057565686336e-06, "loss": 0.4689, "step": 9900 }, { "epoch": 0.28218296743608556, "grad_norm": 49.07644271850586, "learning_rate": 9.544898727033299e-06, "loss": 0.5235, "step": 10000 }, { "epoch": 0.2850047971104464, "grad_norm": 31.61011505126953, "learning_rate": 9.529221797203237e-06, "loss": 0.5053, "step": 10100 }, { "epoch": 0.28782662678480725, "grad_norm": 23.562646865844727, "learning_rate": 9.513544867373175e-06, "loss": 0.485, "step": 10200 }, { "epoch": 0.29064845645916815, "grad_norm": 22.867277145385742, "learning_rate": 9.497867937543113e-06, "loss": 0.523, "step": 10300 }, { "epoch": 0.293470286133529, "grad_norm": 44.8724365234375, "learning_rate": 9.48219100771305e-06, "loss": 0.4691, "step": 10400 }, { "epoch": 0.29629211580788983, "grad_norm": 15.85916519165039, "learning_rate": 9.466514077882987e-06, "loss": 0.4646, "step": 10500 }, { "epoch": 0.2991139454822507, "grad_norm": 19.45340347290039, "learning_rate": 9.450837148052927e-06, "loss": 0.4907, "step": 10600 }, { "epoch": 0.3019357751566116, "grad_norm": 14.807464599609375, "learning_rate": 9.435160218222865e-06, "loss": 0.4539, "step": 10700 }, { "epoch": 0.3047576048309724, "grad_norm": 24.157548904418945, "learning_rate": 9.419483288392803e-06, "loss": 0.3937, "step": 10800 }, { "epoch": 0.30757943450533326, "grad_norm": 37.66196060180664, "learning_rate": 9.40380635856274e-06, "loss": 0.4346, "step": 10900 }, { "epoch": 0.3104012641796941, "grad_norm": 54.08269500732422, "learning_rate": 9.388129428732677e-06, "loss": 0.4566, "step": 11000 }, { "epoch": 0.31322309385405495, "grad_norm": 37.25579833984375, "learning_rate": 9.372452498902615e-06, "loss": 0.4333, "step": 11100 }, { "epoch": 0.31604492352841584, "grad_norm": 32.52021026611328, "learning_rate": 9.356775569072553e-06, "loss": 0.4971, "step": 11200 }, { "epoch": 0.3188667532027767, "grad_norm": 24.20098876953125, "learning_rate": 9.341098639242491e-06, "loss": 0.4936, "step": 11300 }, { "epoch": 0.32168858287713753, "grad_norm": 37.67250061035156, "learning_rate": 9.32542170941243e-06, "loss": 0.4925, "step": 11400 }, { "epoch": 0.3245104125514984, "grad_norm": 51.37995147705078, "learning_rate": 9.309744779582367e-06, "loss": 0.4567, "step": 11500 }, { "epoch": 0.32733224222585927, "grad_norm": 2.77247953414917, "learning_rate": 9.294067849752305e-06, "loss": 0.4273, "step": 11600 }, { "epoch": 0.3301540719002201, "grad_norm": 26.4583740234375, "learning_rate": 9.278390919922243e-06, "loss": 0.4972, "step": 11700 }, { "epoch": 0.33297590157458096, "grad_norm": 23.889802932739258, "learning_rate": 9.262713990092181e-06, "loss": 0.4748, "step": 11800 }, { "epoch": 0.3357977312489418, "grad_norm": 26.13481903076172, "learning_rate": 9.24703706026212e-06, "loss": 0.4772, "step": 11900 }, { "epoch": 0.33861956092330264, "grad_norm": 32.484073638916016, "learning_rate": 9.231360130432057e-06, "loss": 0.4582, "step": 12000 }, { "epoch": 0.34144139059766354, "grad_norm": 12.51020336151123, "learning_rate": 9.215683200601995e-06, "loss": 0.466, "step": 12100 }, { "epoch": 0.3442632202720244, "grad_norm": 22.987064361572266, "learning_rate": 9.200006270771933e-06, "loss": 0.474, "step": 12200 }, { "epoch": 0.3470850499463852, "grad_norm": 19.731643676757812, "learning_rate": 9.184329340941871e-06, "loss": 0.428, "step": 12300 }, { "epoch": 0.34990687962074607, "grad_norm": 30.095190048217773, "learning_rate": 9.16865241111181e-06, "loss": 0.4448, "step": 12400 }, { "epoch": 0.35272870929510697, "grad_norm": 23.412023544311523, "learning_rate": 9.152975481281746e-06, "loss": 0.4607, "step": 12500 }, { "epoch": 0.3555505389694678, "grad_norm": 15.968676567077637, "learning_rate": 9.137298551451684e-06, "loss": 0.4774, "step": 12600 }, { "epoch": 0.35837236864382865, "grad_norm": 22.27809715270996, "learning_rate": 9.121621621621622e-06, "loss": 0.4796, "step": 12700 }, { "epoch": 0.3611941983181895, "grad_norm": 25.096717834472656, "learning_rate": 9.106101461089861e-06, "loss": 0.4238, "step": 12800 }, { "epoch": 0.3640160279925504, "grad_norm": 32.24757766723633, "learning_rate": 9.090424531259799e-06, "loss": 0.4719, "step": 12900 }, { "epoch": 0.36683785766691124, "grad_norm": 43.72540283203125, "learning_rate": 9.074747601429737e-06, "loss": 0.4571, "step": 13000 }, { "epoch": 0.3696596873412721, "grad_norm": 25.39431381225586, "learning_rate": 9.059070671599675e-06, "loss": 0.464, "step": 13100 }, { "epoch": 0.3724815170156329, "grad_norm": 15.996291160583496, "learning_rate": 9.043393741769613e-06, "loss": 0.4345, "step": 13200 }, { "epoch": 0.37530334668999377, "grad_norm": 108.50527954101562, "learning_rate": 9.027716811939551e-06, "loss": 0.4013, "step": 13300 }, { "epoch": 0.37812517636435466, "grad_norm": 28.973108291625977, "learning_rate": 9.012039882109487e-06, "loss": 0.4512, "step": 13400 }, { "epoch": 0.3809470060387155, "grad_norm": 74.1708984375, "learning_rate": 8.996362952279425e-06, "loss": 0.4343, "step": 13500 }, { "epoch": 0.38376883571307635, "grad_norm": 19.657316207885742, "learning_rate": 8.980686022449363e-06, "loss": 0.4125, "step": 13600 }, { "epoch": 0.3865906653874372, "grad_norm": 28.859350204467773, "learning_rate": 8.965165861917603e-06, "loss": 0.4746, "step": 13700 }, { "epoch": 0.3894124950617981, "grad_norm": 18.13363265991211, "learning_rate": 8.94948893208754e-06, "loss": 0.4771, "step": 13800 }, { "epoch": 0.39223432473615893, "grad_norm": 22.466726303100586, "learning_rate": 8.933812002257479e-06, "loss": 0.4613, "step": 13900 }, { "epoch": 0.3950561544105198, "grad_norm": 27.679174423217773, "learning_rate": 8.918135072427417e-06, "loss": 0.462, "step": 14000 }, { "epoch": 0.3978779840848806, "grad_norm": 27.347543716430664, "learning_rate": 8.902458142597355e-06, "loss": 0.4828, "step": 14100 }, { "epoch": 0.4006998137592415, "grad_norm": 16.224889755249023, "learning_rate": 8.886781212767293e-06, "loss": 0.4608, "step": 14200 }, { "epoch": 0.40352164343360236, "grad_norm": 23.845518112182617, "learning_rate": 8.87110428293723e-06, "loss": 0.4187, "step": 14300 }, { "epoch": 0.4063434731079632, "grad_norm": 18.049110412597656, "learning_rate": 8.855427353107167e-06, "loss": 0.4794, "step": 14400 }, { "epoch": 0.40916530278232405, "grad_norm": 30.752511978149414, "learning_rate": 8.839750423277105e-06, "loss": 0.473, "step": 14500 }, { "epoch": 0.4119871324566849, "grad_norm": 29.279788970947266, "learning_rate": 8.824073493447045e-06, "loss": 0.4564, "step": 14600 }, { "epoch": 0.4148089621310458, "grad_norm": 7.239739894866943, "learning_rate": 8.808396563616983e-06, "loss": 0.4363, "step": 14700 }, { "epoch": 0.41763079180540663, "grad_norm": 19.604881286621094, "learning_rate": 8.79271963378692e-06, "loss": 0.4745, "step": 14800 }, { "epoch": 0.4204526214797675, "grad_norm": 21.136322021484375, "learning_rate": 8.777042703956857e-06, "loss": 0.4552, "step": 14900 }, { "epoch": 0.4232744511541283, "grad_norm": 23.589292526245117, "learning_rate": 8.761365774126795e-06, "loss": 0.4245, "step": 15000 }, { "epoch": 0.4260962808284892, "grad_norm": 9.279696464538574, "learning_rate": 8.745688844296733e-06, "loss": 0.4583, "step": 15100 }, { "epoch": 0.42891811050285006, "grad_norm": 23.76906394958496, "learning_rate": 8.730011914466671e-06, "loss": 0.4873, "step": 15200 }, { "epoch": 0.4317399401772109, "grad_norm": 29.270069122314453, "learning_rate": 8.71433498463661e-06, "loss": 0.453, "step": 15300 }, { "epoch": 0.43456176985157174, "grad_norm": 31.86260414123535, "learning_rate": 8.698658054806547e-06, "loss": 0.4939, "step": 15400 }, { "epoch": 0.4373835995259326, "grad_norm": 36.27268600463867, "learning_rate": 8.682981124976485e-06, "loss": 0.447, "step": 15500 }, { "epoch": 0.4402054292002935, "grad_norm": 26.660816192626953, "learning_rate": 8.667304195146423e-06, "loss": 0.5021, "step": 15600 }, { "epoch": 0.44302725887465433, "grad_norm": 24.500545501708984, "learning_rate": 8.651627265316361e-06, "loss": 0.4443, "step": 15700 }, { "epoch": 0.44584908854901517, "grad_norm": 30.132831573486328, "learning_rate": 8.6359503354863e-06, "loss": 0.4095, "step": 15800 }, { "epoch": 0.448670918223376, "grad_norm": 33.71192169189453, "learning_rate": 8.620273405656236e-06, "loss": 0.442, "step": 15900 }, { "epoch": 0.4514927478977369, "grad_norm": 59.854530334472656, "learning_rate": 8.604596475826175e-06, "loss": 0.4319, "step": 16000 }, { "epoch": 0.45431457757209776, "grad_norm": 23.0345516204834, "learning_rate": 8.588919545996113e-06, "loss": 0.4537, "step": 16100 }, { "epoch": 0.4571364072464586, "grad_norm": 12.112003326416016, "learning_rate": 8.573242616166051e-06, "loss": 0.4462, "step": 16200 }, { "epoch": 0.45995823692081944, "grad_norm": 24.53924560546875, "learning_rate": 8.55756568633599e-06, "loss": 0.4354, "step": 16300 }, { "epoch": 0.46278006659518034, "grad_norm": 13.580459594726562, "learning_rate": 8.541888756505926e-06, "loss": 0.4662, "step": 16400 }, { "epoch": 0.4656018962695412, "grad_norm": 14.42153263092041, "learning_rate": 8.526211826675864e-06, "loss": 0.4699, "step": 16500 }, { "epoch": 0.468423725943902, "grad_norm": 31.8900089263916, "learning_rate": 8.510534896845802e-06, "loss": 0.4473, "step": 16600 }, { "epoch": 0.47124555561826287, "grad_norm": 22.49077606201172, "learning_rate": 8.49485796701574e-06, "loss": 0.4516, "step": 16700 }, { "epoch": 0.4740673852926237, "grad_norm": 41.829689025878906, "learning_rate": 8.479181037185678e-06, "loss": 0.3909, "step": 16800 }, { "epoch": 0.4768892149669846, "grad_norm": 31.608774185180664, "learning_rate": 8.463504107355616e-06, "loss": 0.4598, "step": 16900 }, { "epoch": 0.47971104464134545, "grad_norm": 16.948047637939453, "learning_rate": 8.447827177525554e-06, "loss": 0.4696, "step": 17000 }, { "epoch": 0.4825328743157063, "grad_norm": 26.435142517089844, "learning_rate": 8.432150247695492e-06, "loss": 0.4523, "step": 17100 }, { "epoch": 0.48535470399006714, "grad_norm": 29.971567153930664, "learning_rate": 8.41647331786543e-06, "loss": 0.4253, "step": 17200 }, { "epoch": 0.48817653366442804, "grad_norm": 16.34187126159668, "learning_rate": 8.400796388035368e-06, "loss": 0.4364, "step": 17300 }, { "epoch": 0.4909983633387889, "grad_norm": 17.09442710876465, "learning_rate": 8.385119458205306e-06, "loss": 0.4243, "step": 17400 }, { "epoch": 0.4938201930131497, "grad_norm": 17.122453689575195, "learning_rate": 8.369442528375244e-06, "loss": 0.4456, "step": 17500 }, { "epoch": 0.49664202268751056, "grad_norm": 19.6232852935791, "learning_rate": 8.353765598545182e-06, "loss": 0.4297, "step": 17600 }, { "epoch": 0.49946385236187146, "grad_norm": 26.52530288696289, "learning_rate": 8.33808866871512e-06, "loss": 0.442, "step": 17700 }, { "epoch": 0.5022856820362323, "grad_norm": 23.909753799438477, "learning_rate": 8.322411738885058e-06, "loss": 0.4656, "step": 17800 }, { "epoch": 0.5051075117105931, "grad_norm": 28.525676727294922, "learning_rate": 8.306734809054994e-06, "loss": 0.4366, "step": 17900 }, { "epoch": 0.507929341384954, "grad_norm": 18.39270782470703, "learning_rate": 8.291057879224932e-06, "loss": 0.4473, "step": 18000 }, { "epoch": 0.5107511710593149, "grad_norm": 17.035900115966797, "learning_rate": 8.27538094939487e-06, "loss": 0.3899, "step": 18100 }, { "epoch": 0.5135730007336757, "grad_norm": 34.1118049621582, "learning_rate": 8.259704019564809e-06, "loss": 0.4569, "step": 18200 }, { "epoch": 0.5163948304080366, "grad_norm": 46.11088562011719, "learning_rate": 8.244027089734748e-06, "loss": 0.4155, "step": 18300 }, { "epoch": 0.5192166600823974, "grad_norm": 13.6370267868042, "learning_rate": 8.228350159904685e-06, "loss": 0.4137, "step": 18400 }, { "epoch": 0.5220384897567583, "grad_norm": 27.46001434326172, "learning_rate": 8.212673230074623e-06, "loss": 0.4355, "step": 18500 }, { "epoch": 0.5248603194311191, "grad_norm": 50.026153564453125, "learning_rate": 8.19699630024456e-06, "loss": 0.4602, "step": 18600 }, { "epoch": 0.52768214910548, "grad_norm": 10.866439819335938, "learning_rate": 8.181319370414499e-06, "loss": 0.4433, "step": 18700 }, { "epoch": 0.5305039787798409, "grad_norm": 25.32647705078125, "learning_rate": 8.165642440584437e-06, "loss": 0.4633, "step": 18800 }, { "epoch": 0.5333258084542017, "grad_norm": 24.343759536743164, "learning_rate": 8.149965510754375e-06, "loss": 0.4046, "step": 18900 }, { "epoch": 0.5361476381285626, "grad_norm": 6.707170009613037, "learning_rate": 8.134288580924313e-06, "loss": 0.371, "step": 19000 }, { "epoch": 0.5389694678029234, "grad_norm": 31.448177337646484, "learning_rate": 8.11861165109425e-06, "loss": 0.4444, "step": 19100 }, { "epoch": 0.5417912974772843, "grad_norm": 27.307594299316406, "learning_rate": 8.102934721264189e-06, "loss": 0.4495, "step": 19200 }, { "epoch": 0.5446131271516451, "grad_norm": 33.58913040161133, "learning_rate": 8.087257791434127e-06, "loss": 0.4265, "step": 19300 }, { "epoch": 0.547434956826006, "grad_norm": 10.490382194519043, "learning_rate": 8.071580861604063e-06, "loss": 0.4098, "step": 19400 }, { "epoch": 0.5502567865003668, "grad_norm": 34.94256591796875, "learning_rate": 8.055903931774001e-06, "loss": 0.4345, "step": 19500 }, { "epoch": 0.5530786161747276, "grad_norm": 15.932448387145996, "learning_rate": 8.040227001943939e-06, "loss": 0.4064, "step": 19600 }, { "epoch": 0.5559004458490886, "grad_norm": 43.63095474243164, "learning_rate": 8.024550072113879e-06, "loss": 0.4644, "step": 19700 }, { "epoch": 0.5587222755234494, "grad_norm": 16.069515228271484, "learning_rate": 8.008873142283817e-06, "loss": 0.4064, "step": 19800 }, { "epoch": 0.5615441051978103, "grad_norm": 21.504106521606445, "learning_rate": 7.993196212453753e-06, "loss": 0.4134, "step": 19900 }, { "epoch": 0.5643659348721711, "grad_norm": 30.280887603759766, "learning_rate": 7.977519282623691e-06, "loss": 0.4066, "step": 20000 }, { "epoch": 0.567187764546532, "grad_norm": 35.03623580932617, "learning_rate": 7.96184235279363e-06, "loss": 0.4568, "step": 20100 }, { "epoch": 0.5700095942208928, "grad_norm": 33.643226623535156, "learning_rate": 7.946165422963567e-06, "loss": 0.4131, "step": 20200 }, { "epoch": 0.5728314238952537, "grad_norm": 17.969562530517578, "learning_rate": 7.930488493133505e-06, "loss": 0.447, "step": 20300 }, { "epoch": 0.5756532535696145, "grad_norm": 24.389982223510742, "learning_rate": 7.914811563303443e-06, "loss": 0.448, "step": 20400 }, { "epoch": 0.5784750832439753, "grad_norm": 28.242773056030273, "learning_rate": 7.899134633473381e-06, "loss": 0.4822, "step": 20500 }, { "epoch": 0.5812969129183363, "grad_norm": 13.920737266540527, "learning_rate": 7.88345770364332e-06, "loss": 0.432, "step": 20600 }, { "epoch": 0.5841187425926971, "grad_norm": 24.488445281982422, "learning_rate": 7.867780773813257e-06, "loss": 0.4167, "step": 20700 }, { "epoch": 0.586940572267058, "grad_norm": 13.902533531188965, "learning_rate": 7.852103843983195e-06, "loss": 0.4354, "step": 20800 }, { "epoch": 0.5897624019414188, "grad_norm": 13.963842391967773, "learning_rate": 7.836426914153132e-06, "loss": 0.4484, "step": 20900 }, { "epoch": 0.5925842316157797, "grad_norm": 17.448009490966797, "learning_rate": 7.82074998432307e-06, "loss": 0.38, "step": 21000 }, { "epoch": 0.5954060612901405, "grad_norm": 16.734331130981445, "learning_rate": 7.80507305449301e-06, "loss": 0.4409, "step": 21100 }, { "epoch": 0.5982278909645014, "grad_norm": 29.44654655456543, "learning_rate": 7.789396124662948e-06, "loss": 0.4482, "step": 21200 }, { "epoch": 0.6010497206388622, "grad_norm": 21.073148727416992, "learning_rate": 7.773719194832886e-06, "loss": 0.4136, "step": 21300 }, { "epoch": 0.6038715503132231, "grad_norm": 8.795435905456543, "learning_rate": 7.758042265002822e-06, "loss": 0.4726, "step": 21400 }, { "epoch": 0.606693379987584, "grad_norm": 18.364139556884766, "learning_rate": 7.74236533517276e-06, "loss": 0.4165, "step": 21500 }, { "epoch": 0.6095152096619448, "grad_norm": 16.08391761779785, "learning_rate": 7.726688405342698e-06, "loss": 0.3864, "step": 21600 }, { "epoch": 0.6123370393363057, "grad_norm": 20.25522804260254, "learning_rate": 7.711011475512636e-06, "loss": 0.4517, "step": 21700 }, { "epoch": 0.6151588690106665, "grad_norm": 8.111662864685059, "learning_rate": 7.695334545682574e-06, "loss": 0.3994, "step": 21800 }, { "epoch": 0.6179806986850274, "grad_norm": 20.919527053833008, "learning_rate": 7.679657615852512e-06, "loss": 0.4073, "step": 21900 }, { "epoch": 0.6208025283593882, "grad_norm": 17.961687088012695, "learning_rate": 7.66398068602245e-06, "loss": 0.4139, "step": 22000 }, { "epoch": 0.623624358033749, "grad_norm": 28.198232650756836, "learning_rate": 7.648303756192388e-06, "loss": 0.4406, "step": 22100 }, { "epoch": 0.6264461877081099, "grad_norm": 9.413177490234375, "learning_rate": 7.632626826362326e-06, "loss": 0.4488, "step": 22200 }, { "epoch": 0.6292680173824708, "grad_norm": 15.02452278137207, "learning_rate": 7.616949896532264e-06, "loss": 0.4523, "step": 22300 }, { "epoch": 0.6320898470568317, "grad_norm": 37.498958587646484, "learning_rate": 7.601272966702201e-06, "loss": 0.4219, "step": 22400 }, { "epoch": 0.6349116767311925, "grad_norm": 14.869726181030273, "learning_rate": 7.585596036872139e-06, "loss": 0.4111, "step": 22500 }, { "epoch": 0.6377335064055534, "grad_norm": 19.956878662109375, "learning_rate": 7.569919107042077e-06, "loss": 0.4143, "step": 22600 }, { "epoch": 0.6405553360799142, "grad_norm": 20.845680236816406, "learning_rate": 7.554242177212015e-06, "loss": 0.4326, "step": 22700 }, { "epoch": 0.6433771657542751, "grad_norm": 7.167665958404541, "learning_rate": 7.538565247381953e-06, "loss": 0.3882, "step": 22800 }, { "epoch": 0.6461989954286359, "grad_norm": 24.01291847229004, "learning_rate": 7.5228883175518905e-06, "loss": 0.4506, "step": 22900 }, { "epoch": 0.6490208251029967, "grad_norm": 18.37822151184082, "learning_rate": 7.5072113877218286e-06, "loss": 0.4495, "step": 23000 }, { "epoch": 0.6518426547773576, "grad_norm": 17.8455810546875, "learning_rate": 7.4915344578917674e-06, "loss": 0.4631, "step": 23100 }, { "epoch": 0.6546644844517185, "grad_norm": 21.360794067382812, "learning_rate": 7.4758575280617055e-06, "loss": 0.3775, "step": 23200 }, { "epoch": 0.6574863141260794, "grad_norm": 25.73183250427246, "learning_rate": 7.4601805982316435e-06, "loss": 0.4126, "step": 23300 }, { "epoch": 0.6603081438004402, "grad_norm": 23.874588012695312, "learning_rate": 7.444503668401581e-06, "loss": 0.4415, "step": 23400 }, { "epoch": 0.6631299734748011, "grad_norm": 23.876819610595703, "learning_rate": 7.428826738571519e-06, "loss": 0.3982, "step": 23500 }, { "epoch": 0.6659518031491619, "grad_norm": 12.06535816192627, "learning_rate": 7.413149808741457e-06, "loss": 0.4357, "step": 23600 }, { "epoch": 0.6687736328235228, "grad_norm": 26.34955406188965, "learning_rate": 7.397472878911395e-06, "loss": 0.4377, "step": 23700 }, { "epoch": 0.6715954624978836, "grad_norm": 22.41992950439453, "learning_rate": 7.381795949081333e-06, "loss": 0.434, "step": 23800 }, { "epoch": 0.6744172921722444, "grad_norm": 33.86873245239258, "learning_rate": 7.36611901925127e-06, "loss": 0.4248, "step": 23900 }, { "epoch": 0.6772391218466053, "grad_norm": 10.177706718444824, "learning_rate": 7.350598858719509e-06, "loss": 0.3836, "step": 24000 }, { "epoch": 0.6800609515209662, "grad_norm": 16.718257904052734, "learning_rate": 7.334921928889447e-06, "loss": 0.4107, "step": 24100 }, { "epoch": 0.6828827811953271, "grad_norm": 27.072444915771484, "learning_rate": 7.319244999059385e-06, "loss": 0.3875, "step": 24200 }, { "epoch": 0.6857046108696879, "grad_norm": 14.283991813659668, "learning_rate": 7.303568069229323e-06, "loss": 0.4267, "step": 24300 }, { "epoch": 0.6885264405440488, "grad_norm": 12.753026008605957, "learning_rate": 7.28789113939926e-06, "loss": 0.4146, "step": 24400 }, { "epoch": 0.6913482702184096, "grad_norm": 40.42727279663086, "learning_rate": 7.272214209569198e-06, "loss": 0.4194, "step": 24500 }, { "epoch": 0.6941700998927705, "grad_norm": 13.08806324005127, "learning_rate": 7.256537279739136e-06, "loss": 0.3789, "step": 24600 }, { "epoch": 0.6969919295671313, "grad_norm": 31.469749450683594, "learning_rate": 7.240860349909074e-06, "loss": 0.3996, "step": 24700 }, { "epoch": 0.6998137592414921, "grad_norm": 12.432112693786621, "learning_rate": 7.225183420079012e-06, "loss": 0.4347, "step": 24800 }, { "epoch": 0.7026355889158531, "grad_norm": 22.593225479125977, "learning_rate": 7.2095064902489495e-06, "loss": 0.3896, "step": 24900 }, { "epoch": 0.7054574185902139, "grad_norm": 28.22087860107422, "learning_rate": 7.1938295604188876e-06, "loss": 0.4072, "step": 25000 } ], "logging_steps": 100, "max_steps": 70876, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }