{ "best_global_step": 6681, "best_metric": 3.168698787689209, "best_model_checkpoint": "./luc-bat-poet-model\\checkpoint-6681", "epoch": 3.0, "eval_steps": 500, "global_step": 6681, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00044917324050419694, "grad_norm": 5.473126411437988, "learning_rate": 0.0, "loss": 5.4778, "step": 1 }, { "epoch": 0.011229331012604924, "grad_norm": 0.8930467963218689, "learning_rate": 0.00012, "loss": 5.0452, "step": 25 }, { "epoch": 0.02245866202520985, "grad_norm": 0.7617964744567871, "learning_rate": 0.000245, "loss": 4.5144, "step": 50 }, { "epoch": 0.03368799303781477, "grad_norm": 0.8281979560852051, "learning_rate": 0.00037, "loss": 4.3613, "step": 75 }, { "epoch": 0.0449173240504197, "grad_norm": 0.8189941644668579, "learning_rate": 0.000495, "loss": 4.2239, "step": 100 }, { "epoch": 0.05614665506302462, "grad_norm": 1.01946222782135, "learning_rate": 0.00062, "loss": 4.1386, "step": 125 }, { "epoch": 0.06737598607562954, "grad_norm": 0.9489204287528992, "learning_rate": 0.000745, "loss": 4.0065, "step": 150 }, { "epoch": 0.07860531708823447, "grad_norm": 1.6589257717132568, "learning_rate": 0.00087, "loss": 3.9373, "step": 175 }, { "epoch": 0.0898346481008394, "grad_norm": 1.1351829767227173, "learning_rate": 0.000995, "loss": 3.9028, "step": 200 }, { "epoch": 0.10106397911344432, "grad_norm": 0.7443220615386963, "learning_rate": 0.000999995416032659, "loss": 3.8687, "step": 225 }, { "epoch": 0.11229331012604923, "grad_norm": 0.8907812833786011, "learning_rate": 0.0009999808922703088, "loss": 3.8144, "step": 250 }, { "epoch": 0.12352264113865416, "grad_norm": 0.651372492313385, "learning_rate": 0.0009999564210436207, "loss": 3.8007, "step": 275 }, { "epoch": 0.1347519721512591, "grad_norm": 0.7908700704574585, "learning_rate": 0.000999922002839467, "loss": 3.7684, "step": 300 }, { "epoch": 0.145981303163864, "grad_norm": 0.6001281142234802, "learning_rate": 0.0009998776383426215, "loss": 3.743, "step": 325 }, { "epoch": 0.15721063417646894, "grad_norm": 0.6484293937683105, "learning_rate": 0.0009998233284357462, "loss": 3.7416, "step": 350 }, { "epoch": 0.16843996518907386, "grad_norm": 0.7714053988456726, "learning_rate": 0.0009997590741993743, "loss": 3.7108, "step": 375 }, { "epoch": 0.1796692962016788, "grad_norm": 0.6280301213264465, "learning_rate": 0.0009996848769118882, "loss": 3.6854, "step": 400 }, { "epoch": 0.19089862721428372, "grad_norm": 0.644794225692749, "learning_rate": 0.0009996007380494937, "loss": 3.6737, "step": 425 }, { "epoch": 0.20212795822688864, "grad_norm": 0.6872850060462952, "learning_rate": 0.0009995066592861919, "loss": 3.662, "step": 450 }, { "epoch": 0.21335728923949357, "grad_norm": 0.601828396320343, "learning_rate": 0.0009994026424937441, "loss": 3.678, "step": 475 }, { "epoch": 0.22458662025209847, "grad_norm": 0.6280804872512817, "learning_rate": 0.0009992886897416365, "loss": 3.6652, "step": 500 }, { "epoch": 0.2358159512647034, "grad_norm": 0.5711939930915833, "learning_rate": 0.0009991648032970373, "loss": 3.627, "step": 525 }, { "epoch": 0.24704528227730832, "grad_norm": 0.5883836150169373, "learning_rate": 0.000999030985624753, "loss": 3.6433, "step": 550 }, { "epoch": 0.25827461328991325, "grad_norm": 0.5956864356994629, "learning_rate": 0.000998887239387178, "loss": 3.6207, "step": 575 }, { "epoch": 0.2695039443025182, "grad_norm": 0.577458918094635, "learning_rate": 0.000998733567444243, "loss": 3.6074, "step": 600 }, { "epoch": 0.2807332753151231, "grad_norm": 0.8079866170883179, "learning_rate": 0.0009985699728533573, "loss": 3.6119, "step": 625 }, { "epoch": 0.291962606327728, "grad_norm": 0.5645362138748169, "learning_rate": 0.0009983964588693478, "loss": 3.6411, "step": 650 }, { "epoch": 0.30319193734033295, "grad_norm": 0.5666602253913879, "learning_rate": 0.0009982130289443944, "loss": 3.5725, "step": 675 }, { "epoch": 0.3144212683529379, "grad_norm": 0.6160576343536377, "learning_rate": 0.0009980196867279626, "loss": 3.6055, "step": 700 }, { "epoch": 0.3256505993655428, "grad_norm": 0.6656046509742737, "learning_rate": 0.0009978164360667286, "loss": 3.5776, "step": 725 }, { "epoch": 0.33687993037814773, "grad_norm": 0.6236292123794556, "learning_rate": 0.0009976032810045043, "loss": 3.5509, "step": 750 }, { "epoch": 0.34810926139075266, "grad_norm": 0.5736802816390991, "learning_rate": 0.0009973802257821566, "loss": 3.557, "step": 775 }, { "epoch": 0.3593385924033576, "grad_norm": 0.5729912519454956, "learning_rate": 0.000997147274837523, "loss": 3.5374, "step": 800 }, { "epoch": 0.3705679234159625, "grad_norm": 0.6009777188301086, "learning_rate": 0.000996904432805323, "loss": 3.5564, "step": 825 }, { "epoch": 0.38179725442856743, "grad_norm": 0.5442889928817749, "learning_rate": 0.0009966517045170659, "loss": 3.5109, "step": 850 }, { "epoch": 0.39302658544117236, "grad_norm": 0.5199196338653564, "learning_rate": 0.0009963890950009549, "loss": 3.526, "step": 875 }, { "epoch": 0.4042559164537773, "grad_norm": 0.5444336533546448, "learning_rate": 0.000996116609481788, "loss": 3.5177, "step": 900 }, { "epoch": 0.4154852474663822, "grad_norm": 0.5710541009902954, "learning_rate": 0.000995834253380852, "loss": 3.4767, "step": 925 }, { "epoch": 0.42671457847898714, "grad_norm": 0.5203866362571716, "learning_rate": 0.000995542032315816, "loss": 3.4972, "step": 950 }, { "epoch": 0.43794390949159206, "grad_norm": 0.545213520526886, "learning_rate": 0.0009952399521006192, "loss": 3.4623, "step": 975 }, { "epoch": 0.44917324050419694, "grad_norm": 0.5384295582771301, "learning_rate": 0.0009949280187453561, "loss": 3.4367, "step": 1000 }, { "epoch": 0.46040257151680186, "grad_norm": 0.5814492106437683, "learning_rate": 0.0009946062384561555, "loss": 3.441, "step": 1025 }, { "epoch": 0.4716319025294068, "grad_norm": 0.5313106179237366, "learning_rate": 0.000994274617635058, "loss": 3.4336, "step": 1050 }, { "epoch": 0.4828612335420117, "grad_norm": 0.6174953579902649, "learning_rate": 0.0009939331628798882, "loss": 3.3919, "step": 1075 }, { "epoch": 0.49409056455461664, "grad_norm": 0.566247284412384, "learning_rate": 0.0009935818809841239, "loss": 3.4281, "step": 1100 }, { "epoch": 0.5053198955672216, "grad_norm": 0.5577934384346008, "learning_rate": 0.0009932207789367603, "loss": 3.4043, "step": 1125 }, { "epoch": 0.5165492265798265, "grad_norm": 0.5082374811172485, "learning_rate": 0.0009928498639221715, "loss": 3.3853, "step": 1150 }, { "epoch": 0.5277785575924314, "grad_norm": 0.49451902508735657, "learning_rate": 0.0009924691433199674, "loss": 3.3794, "step": 1175 }, { "epoch": 0.5390078886050363, "grad_norm": 0.5019585490226746, "learning_rate": 0.0009920786247048464, "loss": 3.4127, "step": 1200 }, { "epoch": 0.5502372196176413, "grad_norm": 0.599949300289154, "learning_rate": 0.0009916783158464455, "loss": 3.3923, "step": 1225 }, { "epoch": 0.5614665506302462, "grad_norm": 0.5792480707168579, "learning_rate": 0.0009912682247091853, "loss": 3.3656, "step": 1250 }, { "epoch": 0.5726958816428511, "grad_norm": 0.5839366912841797, "learning_rate": 0.0009908483594521116, "loss": 3.3895, "step": 1275 }, { "epoch": 0.583925212655456, "grad_norm": 0.5739409923553467, "learning_rate": 0.0009904187284287332, "loss": 3.3506, "step": 1300 }, { "epoch": 0.595154543668061, "grad_norm": 0.5897430777549744, "learning_rate": 0.0009899793401868546, "loss": 3.3247, "step": 1325 }, { "epoch": 0.6063838746806659, "grad_norm": 0.5063382983207703, "learning_rate": 0.0009895302034684083, "loss": 3.3369, "step": 1350 }, { "epoch": 0.6176132056932708, "grad_norm": 0.5431721806526184, "learning_rate": 0.0009890713272092786, "loss": 3.3446, "step": 1375 }, { "epoch": 0.6288425367058758, "grad_norm": 0.4924439489841461, "learning_rate": 0.0009886027205391248, "loss": 3.3377, "step": 1400 }, { "epoch": 0.6400718677184807, "grad_norm": 0.54843670129776, "learning_rate": 0.0009881243927811992, "loss": 3.345, "step": 1425 }, { "epoch": 0.6513011987310856, "grad_norm": 0.5399184226989746, "learning_rate": 0.0009876363534521626, "loss": 3.3344, "step": 1450 }, { "epoch": 0.6625305297436905, "grad_norm": 0.5677058696746826, "learning_rate": 0.0009871386122618933, "loss": 3.3328, "step": 1475 }, { "epoch": 0.6737598607562955, "grad_norm": 0.482010155916214, "learning_rate": 0.0009866311791132953, "loss": 3.3433, "step": 1500 }, { "epoch": 0.6849891917689004, "grad_norm": 0.49796995520591736, "learning_rate": 0.000986114064102101, "loss": 3.3413, "step": 1525 }, { "epoch": 0.6962185227815053, "grad_norm": 0.4976087808609009, "learning_rate": 0.0009855872775166696, "loss": 3.3176, "step": 1550 }, { "epoch": 0.7074478537941102, "grad_norm": 0.5586963891983032, "learning_rate": 0.0009850508298377832, "loss": 3.3188, "step": 1575 }, { "epoch": 0.7186771848067152, "grad_norm": 0.45419466495513916, "learning_rate": 0.0009845047317384378, "loss": 3.2902, "step": 1600 }, { "epoch": 0.7299065158193201, "grad_norm": 0.5402281880378723, "learning_rate": 0.0009839489940836317, "loss": 3.2893, "step": 1625 }, { "epoch": 0.741135846831925, "grad_norm": 0.5439639687538147, "learning_rate": 0.0009833836279301484, "loss": 3.3065, "step": 1650 }, { "epoch": 0.7523651778445299, "grad_norm": 0.49050387740135193, "learning_rate": 0.0009828086445263368, "loss": 3.2796, "step": 1675 }, { "epoch": 0.7635945088571349, "grad_norm": 0.5266304016113281, "learning_rate": 0.000982224055311888, "loss": 3.29, "step": 1700 }, { "epoch": 0.7748238398697398, "grad_norm": 0.5335237979888916, "learning_rate": 0.0009816298719176073, "loss": 3.2889, "step": 1725 }, { "epoch": 0.7860531708823447, "grad_norm": 0.5236843824386597, "learning_rate": 0.0009810261061651826, "loss": 3.2758, "step": 1750 }, { "epoch": 0.7972825018949496, "grad_norm": 0.541266679763794, "learning_rate": 0.0009804127700669496, "loss": 3.3053, "step": 1775 }, { "epoch": 0.8085118329075546, "grad_norm": 0.47298872470855713, "learning_rate": 0.0009797898758256525, "loss": 3.277, "step": 1800 }, { "epoch": 0.8197411639201595, "grad_norm": 0.5097940564155579, "learning_rate": 0.0009791574358342014, "loss": 3.2537, "step": 1825 }, { "epoch": 0.8309704949327644, "grad_norm": 0.5668537616729736, "learning_rate": 0.0009785154626754259, "loss": 3.2514, "step": 1850 }, { "epoch": 0.8421998259453694, "grad_norm": 0.5071256160736084, "learning_rate": 0.000977863969121824, "loss": 3.2467, "step": 1875 }, { "epoch": 0.8534291569579743, "grad_norm": 0.4892118573188782, "learning_rate": 0.000977202968135309, "loss": 3.2772, "step": 1900 }, { "epoch": 0.8646584879705792, "grad_norm": 0.463008850812912, "learning_rate": 0.000976532472866951, "loss": 3.2492, "step": 1925 }, { "epoch": 0.8758878189831841, "grad_norm": 0.5179749727249146, "learning_rate": 0.0009758524966567152, "loss": 3.2289, "step": 1950 }, { "epoch": 0.8871171499957891, "grad_norm": 0.5276876091957092, "learning_rate": 0.000975163053033197, "loss": 3.2474, "step": 1975 }, { "epoch": 0.8983464810083939, "grad_norm": 0.5618298649787903, "learning_rate": 0.000974464155713352, "loss": 3.2652, "step": 2000 }, { "epoch": 0.9095758120209988, "grad_norm": 0.4800003468990326, "learning_rate": 0.0009737558186022242, "loss": 3.2424, "step": 2025 }, { "epoch": 0.9208051430336037, "grad_norm": 0.47998154163360596, "learning_rate": 0.0009730380557926682, "loss": 3.2737, "step": 2050 }, { "epoch": 0.9320344740462086, "grad_norm": 0.5447694659233093, "learning_rate": 0.00097231088156507, "loss": 3.249, "step": 2075 }, { "epoch": 0.9432638050588136, "grad_norm": 0.5559055209159851, "learning_rate": 0.0009715743103870615, "loss": 3.2566, "step": 2100 }, { "epoch": 0.9544931360714185, "grad_norm": 0.478614866733551, "learning_rate": 0.0009708283569132341, "loss": 3.2076, "step": 2125 }, { "epoch": 0.9657224670840234, "grad_norm": 0.44457143545150757, "learning_rate": 0.000970073035984846, "loss": 3.2052, "step": 2150 }, { "epoch": 0.9769517980966284, "grad_norm": 0.5057160258293152, "learning_rate": 0.0009693083626295274, "loss": 3.1944, "step": 2175 }, { "epoch": 0.9881811291092333, "grad_norm": 0.487543523311615, "learning_rate": 0.0009685343520609816, "loss": 3.2862, "step": 2200 }, { "epoch": 0.9994104601218382, "grad_norm": 0.5547086000442505, "learning_rate": 0.0009677510196786822, "loss": 3.2249, "step": 2225 }, { "epoch": 1.0, "eval_loss": 3.2925968170166016, "eval_runtime": 230.4828, "eval_samples_per_second": 54.598, "eval_steps_per_second": 54.598, "step": 2227 }, { "epoch": 1.0103309845315964, "grad_norm": 0.5509684085845947, "learning_rate": 0.0009669583810675666, "loss": 3.0297, "step": 2250 }, { "epoch": 1.0215603155442015, "grad_norm": 0.5036989450454712, "learning_rate": 0.0009661564519977263, "loss": 2.9815, "step": 2275 }, { "epoch": 1.0327896465568063, "grad_norm": 0.5602796673774719, "learning_rate": 0.0009653452484240923, "loss": 2.994, "step": 2300 }, { "epoch": 1.0440189775694113, "grad_norm": 0.5729214549064636, "learning_rate": 0.0009645247864861191, "loss": 2.9956, "step": 2325 }, { "epoch": 1.0552483085820161, "grad_norm": 0.5456846356391907, "learning_rate": 0.0009636950825074618, "loss": 2.985, "step": 2350 }, { "epoch": 1.0664776395946212, "grad_norm": 0.544643223285675, "learning_rate": 0.0009628561529956529, "loss": 2.9973, "step": 2375 }, { "epoch": 1.077706970607226, "grad_norm": 0.5306060314178467, "learning_rate": 0.0009620080146417731, "loss": 3.0053, "step": 2400 }, { "epoch": 1.088936301619831, "grad_norm": 0.5001072883605957, "learning_rate": 0.0009611506843201193, "loss": 3.0244, "step": 2425 }, { "epoch": 1.1001656326324358, "grad_norm": 0.52583909034729, "learning_rate": 0.0009602841790878688, "loss": 3.0266, "step": 2450 }, { "epoch": 1.1113949636450409, "grad_norm": 0.536445677280426, "learning_rate": 0.0009594085161847405, "loss": 3.0124, "step": 2475 }, { "epoch": 1.1226242946576457, "grad_norm": 0.5341405272483826, "learning_rate": 0.0009585237130326508, "loss": 3.0272, "step": 2500 }, { "epoch": 1.1338536256702507, "grad_norm": 0.5340954661369324, "learning_rate": 0.0009576297872353686, "loss": 3.0152, "step": 2525 }, { "epoch": 1.1450829566828555, "grad_norm": 0.4479193687438965, "learning_rate": 0.0009567267565781628, "loss": 3.0202, "step": 2550 }, { "epoch": 1.1563122876954606, "grad_norm": 0.5316035747528076, "learning_rate": 0.0009558146390274512, "loss": 3.015, "step": 2575 }, { "epoch": 1.1675416187080654, "grad_norm": 0.5239371061325073, "learning_rate": 0.0009548934527304407, "loss": 3.0618, "step": 2600 }, { "epoch": 1.1787709497206704, "grad_norm": 0.6486944556236267, "learning_rate": 0.0009539632160147672, "loss": 3.0004, "step": 2625 }, { "epoch": 1.1900002807332752, "grad_norm": 0.5308857560157776, "learning_rate": 0.0009530239473881313, "loss": 3.0425, "step": 2650 }, { "epoch": 1.2012296117458803, "grad_norm": 0.5612149834632874, "learning_rate": 0.0009520756655379293, "loss": 3.0447, "step": 2675 }, { "epoch": 1.212458942758485, "grad_norm": 0.5429418683052063, "learning_rate": 0.0009511183893308821, "loss": 2.9887, "step": 2700 }, { "epoch": 1.2236882737710901, "grad_norm": 0.5688816905021667, "learning_rate": 0.0009501521378126594, "loss": 2.9961, "step": 2725 }, { "epoch": 1.234917604783695, "grad_norm": 0.5409512519836426, "learning_rate": 0.0009491769302075008, "loss": 3.0, "step": 2750 }, { "epoch": 1.2461469357963, "grad_norm": 0.5384955406188965, "learning_rate": 0.0009481927859178337, "loss": 3.0271, "step": 2775 }, { "epoch": 1.2573762668089048, "grad_norm": 0.5857961177825928, "learning_rate": 0.0009471997245238865, "loss": 2.9983, "step": 2800 }, { "epoch": 1.2686055978215098, "grad_norm": 0.5337027907371521, "learning_rate": 0.0009461977657833003, "loss": 3.0552, "step": 2825 }, { "epoch": 1.2798349288341146, "grad_norm": 0.5078946352005005, "learning_rate": 0.0009451869296307341, "loss": 3.0191, "step": 2850 }, { "epoch": 1.2910642598467197, "grad_norm": 0.5108660459518433, "learning_rate": 0.00094416723617747, "loss": 3.0234, "step": 2875 }, { "epoch": 1.3022935908593245, "grad_norm": 0.5631129741668701, "learning_rate": 0.0009431387057110118, "loss": 3.0319, "step": 2900 }, { "epoch": 1.3135229218719295, "grad_norm": 0.5249589085578918, "learning_rate": 0.0009421013586946816, "loss": 2.9866, "step": 2925 }, { "epoch": 1.3247522528845344, "grad_norm": 0.4992469251155853, "learning_rate": 0.000941055215767213, "loss": 3.0144, "step": 2950 }, { "epoch": 1.3359815838971394, "grad_norm": 0.4509263336658478, "learning_rate": 0.0009400002977423405, "loss": 3.0092, "step": 2975 }, { "epoch": 1.3472109149097442, "grad_norm": 0.515438973903656, "learning_rate": 0.0009389366256083849, "loss": 2.9993, "step": 3000 }, { "epoch": 1.3584402459223492, "grad_norm": 0.5087840557098389, "learning_rate": 0.0009378642205278363, "loss": 3.0242, "step": 3025 }, { "epoch": 1.369669576934954, "grad_norm": 0.5046051144599915, "learning_rate": 0.0009367831038369326, "loss": 2.9971, "step": 3050 }, { "epoch": 1.380898907947559, "grad_norm": 0.5728681087493896, "learning_rate": 0.0009356932970452353, "loss": 3.0292, "step": 3075 }, { "epoch": 1.392128238960164, "grad_norm": 0.5724380016326904, "learning_rate": 0.0009345948218352014, "loss": 3.0098, "step": 3100 }, { "epoch": 1.403357569972769, "grad_norm": 0.5322164297103882, "learning_rate": 0.0009334877000617518, "loss": 2.9968, "step": 3125 }, { "epoch": 1.4145869009853738, "grad_norm": 0.558423638343811, "learning_rate": 0.0009323719537518374, "loss": 3.0334, "step": 3150 }, { "epoch": 1.4258162319979788, "grad_norm": 0.5415078997612, "learning_rate": 0.0009312476051039994, "loss": 3.0313, "step": 3175 }, { "epoch": 1.4370455630105836, "grad_norm": 0.46919873356819153, "learning_rate": 0.0009301146764879292, "loss": 2.9992, "step": 3200 }, { "epoch": 1.4482748940231884, "grad_norm": 0.5965465903282166, "learning_rate": 0.0009289731904440217, "loss": 3.0071, "step": 3225 }, { "epoch": 1.4595042250357935, "grad_norm": 0.4882059693336487, "learning_rate": 0.0009278231696829288, "loss": 2.968, "step": 3250 }, { "epoch": 1.4707335560483985, "grad_norm": 0.6297493577003479, "learning_rate": 0.0009266646370851055, "loss": 3.0411, "step": 3275 }, { "epoch": 1.4819628870610033, "grad_norm": 0.5603842735290527, "learning_rate": 0.0009254976157003563, "loss": 3.0203, "step": 3300 }, { "epoch": 1.4931922180736081, "grad_norm": 0.49509698152542114, "learning_rate": 0.0009243221287473755, "loss": 3.0176, "step": 3325 }, { "epoch": 1.5044215490862132, "grad_norm": 0.48536983132362366, "learning_rate": 0.0009231381996132862, "loss": 2.9547, "step": 3350 }, { "epoch": 1.5156508800988182, "grad_norm": 0.47351208329200745, "learning_rate": 0.0009219458518531739, "loss": 2.9666, "step": 3375 }, { "epoch": 1.526880211111423, "grad_norm": 0.5615521669387817, "learning_rate": 0.0009207451091896191, "loss": 3.0295, "step": 3400 }, { "epoch": 1.5381095421240278, "grad_norm": 0.5138916969299316, "learning_rate": 0.0009195359955122244, "loss": 3.0146, "step": 3425 }, { "epoch": 1.5493388731366329, "grad_norm": 0.5883649587631226, "learning_rate": 0.0009183185348771392, "loss": 3.0151, "step": 3450 }, { "epoch": 1.560568204149238, "grad_norm": 0.5921751260757446, "learning_rate": 0.0009170927515065821, "loss": 3.0314, "step": 3475 }, { "epoch": 1.5717975351618427, "grad_norm": 0.5592530965805054, "learning_rate": 0.0009158586697883576, "loss": 2.9921, "step": 3500 }, { "epoch": 1.5830268661744475, "grad_norm": 0.5621814727783203, "learning_rate": 0.0009146163142753716, "loss": 2.9987, "step": 3525 }, { "epoch": 1.5942561971870526, "grad_norm": 0.5482603311538696, "learning_rate": 0.0009133657096851431, "loss": 2.9802, "step": 3550 }, { "epoch": 1.6054855281996576, "grad_norm": 0.5254377722740173, "learning_rate": 0.0009121068808993124, "loss": 3.0121, "step": 3575 }, { "epoch": 1.6167148592122624, "grad_norm": 0.47623664140701294, "learning_rate": 0.0009108398529631451, "loss": 3.0068, "step": 3600 }, { "epoch": 1.6279441902248672, "grad_norm": 0.49733710289001465, "learning_rate": 0.0009095646510850351, "loss": 3.0104, "step": 3625 }, { "epoch": 1.6391735212374723, "grad_norm": 0.5388875603675842, "learning_rate": 0.0009082813006360026, "loss": 2.9823, "step": 3650 }, { "epoch": 1.6504028522500773, "grad_norm": 0.5329872965812683, "learning_rate": 0.0009069898271491887, "loss": 2.9945, "step": 3675 }, { "epoch": 1.6616321832626821, "grad_norm": 0.5175071358680725, "learning_rate": 0.0009056902563193486, "loss": 2.9875, "step": 3700 }, { "epoch": 1.672861514275287, "grad_norm": 0.514216423034668, "learning_rate": 0.0009043826140023388, "loss": 3.016, "step": 3725 }, { "epoch": 1.684090845287892, "grad_norm": 0.5547803640365601, "learning_rate": 0.0009030669262146046, "loss": 2.9906, "step": 3750 }, { "epoch": 1.695320176300497, "grad_norm": 0.5035697817802429, "learning_rate": 0.0009017432191326611, "loss": 2.9795, "step": 3775 }, { "epoch": 1.7065495073131018, "grad_norm": 0.4960135519504547, "learning_rate": 0.0009004115190925724, "loss": 2.986, "step": 3800 }, { "epoch": 1.7177788383257067, "grad_norm": 0.5573786497116089, "learning_rate": 0.0008990718525894286, "loss": 2.9981, "step": 3825 }, { "epoch": 1.7290081693383117, "grad_norm": 0.558542788028717, "learning_rate": 0.0008977242462768177, "loss": 3.0122, "step": 3850 }, { "epoch": 1.7402375003509167, "grad_norm": 0.48205050826072693, "learning_rate": 0.0008963687269662957, "loss": 2.9558, "step": 3875 }, { "epoch": 1.7514668313635215, "grad_norm": 0.48525846004486084, "learning_rate": 0.0008950053216268534, "loss": 3.0034, "step": 3900 }, { "epoch": 1.7626961623761264, "grad_norm": 0.5863490700721741, "learning_rate": 0.0008936340573843795, "loss": 3.0222, "step": 3925 }, { "epoch": 1.7739254933887314, "grad_norm": 0.54740309715271, "learning_rate": 0.0008922549615211206, "loss": 2.9785, "step": 3950 }, { "epoch": 1.7851548244013364, "grad_norm": 0.5275555849075317, "learning_rate": 0.0008908680614751392, "loss": 2.982, "step": 3975 }, { "epoch": 1.7963841554139413, "grad_norm": 0.5472078919410706, "learning_rate": 0.0008894733848397674, "loss": 3.0128, "step": 4000 }, { "epoch": 1.807613486426546, "grad_norm": 0.5604407787322998, "learning_rate": 0.0008880709593630578, "loss": 3.0119, "step": 4025 }, { "epoch": 1.818842817439151, "grad_norm": 0.5137823224067688, "learning_rate": 0.0008866608129472313, "loss": 2.9858, "step": 4050 }, { "epoch": 1.8300721484517561, "grad_norm": 0.5707024931907654, "learning_rate": 0.0008852429736481227, "loss": 3.013, "step": 4075 }, { "epoch": 1.841301479464361, "grad_norm": 0.5344915986061096, "learning_rate": 0.0008838174696746215, "loss": 2.9899, "step": 4100 }, { "epoch": 1.8525308104769658, "grad_norm": 0.504295289516449, "learning_rate": 0.0008823843293881117, "loss": 3.0095, "step": 4125 }, { "epoch": 1.8637601414895708, "grad_norm": 0.5654752254486084, "learning_rate": 0.0008809435813019065, "loss": 2.988, "step": 4150 }, { "epoch": 1.8749894725021758, "grad_norm": 0.5086371302604675, "learning_rate": 0.0008794952540806817, "loss": 3.0304, "step": 4175 }, { "epoch": 1.8862188035147807, "grad_norm": 0.5218560099601746, "learning_rate": 0.0008780393765399055, "loss": 2.9817, "step": 4200 }, { "epoch": 1.8974481345273855, "grad_norm": 0.48831528425216675, "learning_rate": 0.0008765759776452646, "loss": 3.0245, "step": 4225 }, { "epoch": 1.9086774655399905, "grad_norm": 0.5015767812728882, "learning_rate": 0.0008751050865120882, "loss": 3.0238, "step": 4250 }, { "epoch": 1.9199067965525953, "grad_norm": 0.5325757265090942, "learning_rate": 0.000873626732404769, "loss": 2.9993, "step": 4275 }, { "epoch": 1.9311361275652001, "grad_norm": 0.48629334568977356, "learning_rate": 0.0008721409447361803, "loss": 2.9634, "step": 4300 }, { "epoch": 1.9423654585778052, "grad_norm": 0.49022358655929565, "learning_rate": 0.0008706477530670917, "loss": 2.9736, "step": 4325 }, { "epoch": 1.9535947895904102, "grad_norm": 0.5039647221565247, "learning_rate": 0.0008691471871055801, "loss": 2.9802, "step": 4350 }, { "epoch": 1.964824120603015, "grad_norm": 0.5223824977874756, "learning_rate": 0.0008676392767064391, "loss": 3.0397, "step": 4375 }, { "epoch": 1.9760534516156198, "grad_norm": 0.5172558426856995, "learning_rate": 0.0008661240518705854, "loss": 2.9756, "step": 4400 }, { "epoch": 1.9872827826282249, "grad_norm": 0.4955403208732605, "learning_rate": 0.0008646015427444609, "loss": 2.9748, "step": 4425 }, { "epoch": 1.99851211364083, "grad_norm": 0.553287923336029, "learning_rate": 0.0008630717796194337, "loss": 2.9501, "step": 4450 }, { "epoch": 2.0, "eval_loss": 3.1917288303375244, "eval_runtime": 227.8959, "eval_samples_per_second": 55.218, "eval_steps_per_second": 55.218, "step": 4454 }, { "epoch": 2.0094326380505882, "grad_norm": 0.5545716285705566, "learning_rate": 0.0008615347929311949, "loss": 2.7426, "step": 4475 }, { "epoch": 2.020661969063193, "grad_norm": 0.5116350650787354, "learning_rate": 0.0008599906132591541, "loss": 2.6669, "step": 4500 }, { "epoch": 2.031891300075798, "grad_norm": 0.6248686909675598, "learning_rate": 0.0008584392713258295, "loss": 2.6597, "step": 4525 }, { "epoch": 2.043120631088403, "grad_norm": 0.5305931568145752, "learning_rate": 0.0008568807979962379, "loss": 2.6635, "step": 4550 }, { "epoch": 2.054349962101008, "grad_norm": 0.5366395711898804, "learning_rate": 0.0008553152242772798, "loss": 2.668, "step": 4575 }, { "epoch": 2.0655792931136125, "grad_norm": 0.6074578762054443, "learning_rate": 0.0008537425813171232, "loss": 2.7031, "step": 4600 }, { "epoch": 2.0768086241262176, "grad_norm": 0.6074210405349731, "learning_rate": 0.0008521629004045832, "loss": 2.6721, "step": 4625 }, { "epoch": 2.0880379551388226, "grad_norm": 0.6065968871116638, "learning_rate": 0.0008505762129685002, "loss": 2.6774, "step": 4650 }, { "epoch": 2.0992672861514277, "grad_norm": 0.5635676383972168, "learning_rate": 0.0008489825505771136, "loss": 2.6537, "step": 4675 }, { "epoch": 2.1104966171640323, "grad_norm": 0.5447210669517517, "learning_rate": 0.000847381944937435, "loss": 2.6964, "step": 4700 }, { "epoch": 2.1217259481766373, "grad_norm": 0.6450474858283997, "learning_rate": 0.0008457744278946162, "loss": 2.6591, "step": 4725 }, { "epoch": 2.1329552791892423, "grad_norm": 0.5620629787445068, "learning_rate": 0.0008441600314313165, "loss": 2.6787, "step": 4750 }, { "epoch": 2.1441846102018474, "grad_norm": 0.5661433935165405, "learning_rate": 0.0008425387876670658, "loss": 2.7193, "step": 4775 }, { "epoch": 2.155413941214452, "grad_norm": 0.6069077849388123, "learning_rate": 0.0008409107288576259, "loss": 2.6947, "step": 4800 }, { "epoch": 2.166643272227057, "grad_norm": 0.6271758675575256, "learning_rate": 0.0008392758873943484, "loss": 2.6952, "step": 4825 }, { "epoch": 2.177872603239662, "grad_norm": 0.5473480820655823, "learning_rate": 0.0008376342958035308, "loss": 2.6981, "step": 4850 }, { "epoch": 2.189101934252267, "grad_norm": 0.631519079208374, "learning_rate": 0.0008359859867457686, "loss": 2.6921, "step": 4875 }, { "epoch": 2.2003312652648717, "grad_norm": 0.6639063954353333, "learning_rate": 0.0008343309930153064, "loss": 2.6837, "step": 4900 }, { "epoch": 2.2115605962774767, "grad_norm": 0.6233927011489868, "learning_rate": 0.0008326693475393846, "loss": 2.7112, "step": 4925 }, { "epoch": 2.2227899272900817, "grad_norm": 0.636464536190033, "learning_rate": 0.0008310010833775849, "loss": 2.7213, "step": 4950 }, { "epoch": 2.2340192583026868, "grad_norm": 0.5854448676109314, "learning_rate": 0.0008293262337211723, "loss": 2.7131, "step": 4975 }, { "epoch": 2.2452485893152914, "grad_norm": 0.6958891749382019, "learning_rate": 0.0008276448318924346, "loss": 2.6883, "step": 5000 }, { "epoch": 2.2564779203278964, "grad_norm": 0.5899659991264343, "learning_rate": 0.0008259569113440198, "loss": 2.6872, "step": 5025 }, { "epoch": 2.2677072513405014, "grad_norm": 0.6791245937347412, "learning_rate": 0.0008242625056582698, "loss": 2.7202, "step": 5050 }, { "epoch": 2.2789365823531065, "grad_norm": 0.5778390169143677, "learning_rate": 0.0008225616485465535, "loss": 2.7153, "step": 5075 }, { "epoch": 2.290165913365711, "grad_norm": 0.5727918148040771, "learning_rate": 0.000820854373848595, "loss": 2.7314, "step": 5100 }, { "epoch": 2.301395244378316, "grad_norm": 0.6461373567581177, "learning_rate": 0.0008191407155318007, "loss": 2.6973, "step": 5125 }, { "epoch": 2.312624575390921, "grad_norm": 0.6795935034751892, "learning_rate": 0.0008174207076905835, "loss": 2.6605, "step": 5150 }, { "epoch": 2.323853906403526, "grad_norm": 0.6217265725135803, "learning_rate": 0.0008156943845456843, "loss": 2.6715, "step": 5175 }, { "epoch": 2.3350832374161308, "grad_norm": 0.6071234941482544, "learning_rate": 0.0008139617804434918, "loss": 2.6806, "step": 5200 }, { "epoch": 2.346312568428736, "grad_norm": 0.6218218207359314, "learning_rate": 0.0008122229298553583, "loss": 2.7077, "step": 5225 }, { "epoch": 2.357541899441341, "grad_norm": 0.5912306904792786, "learning_rate": 0.0008104778673769142, "loss": 2.7314, "step": 5250 }, { "epoch": 2.368771230453946, "grad_norm": 0.5841456651687622, "learning_rate": 0.0008087266277273799, "loss": 2.6645, "step": 5275 }, { "epoch": 2.3800005614665505, "grad_norm": 0.6491414904594421, "learning_rate": 0.0008069692457488749, "loss": 2.7115, "step": 5300 }, { "epoch": 2.3912298924791555, "grad_norm": 0.6534895896911621, "learning_rate": 0.0008052057564057244, "loss": 2.7057, "step": 5325 }, { "epoch": 2.4024592234917606, "grad_norm": 0.5900655388832092, "learning_rate": 0.000803436194783764, "loss": 2.7302, "step": 5350 }, { "epoch": 2.4136885545043656, "grad_norm": 0.5586231350898743, "learning_rate": 0.0008016605960896412, "loss": 2.7339, "step": 5375 }, { "epoch": 2.42491788551697, "grad_norm": 0.705515444278717, "learning_rate": 0.0007998789956501159, "loss": 2.7323, "step": 5400 }, { "epoch": 2.436147216529575, "grad_norm": 0.5936200022697449, "learning_rate": 0.0007980914289113558, "loss": 2.7116, "step": 5425 }, { "epoch": 2.4473765475421803, "grad_norm": 0.6085701584815979, "learning_rate": 0.000796297931438233, "loss": 2.7406, "step": 5450 }, { "epoch": 2.458605878554785, "grad_norm": 0.5549573302268982, "learning_rate": 0.0007944985389136157, "loss": 2.7408, "step": 5475 }, { "epoch": 2.46983520956739, "grad_norm": 0.5694999694824219, "learning_rate": 0.0007926932871376575, "loss": 2.7216, "step": 5500 }, { "epoch": 2.481064540579995, "grad_norm": 0.5795106887817383, "learning_rate": 0.0007908822120270867, "loss": 2.6724, "step": 5525 }, { "epoch": 2.4922938715926, "grad_norm": 0.5619019865989685, "learning_rate": 0.0007890653496144902, "loss": 2.6867, "step": 5550 }, { "epoch": 2.503523202605205, "grad_norm": 0.5836601257324219, "learning_rate": 0.0007872427360475974, "loss": 2.7091, "step": 5575 }, { "epoch": 2.5147525336178096, "grad_norm": 0.6521953344345093, "learning_rate": 0.0007854144075885614, "loss": 2.7138, "step": 5600 }, { "epoch": 2.5259818646304146, "grad_norm": 0.6129563450813293, "learning_rate": 0.0007835804006132364, "loss": 2.6796, "step": 5625 }, { "epoch": 2.5372111956430197, "grad_norm": 0.5933245420455933, "learning_rate": 0.0007817407516104547, "loss": 2.6541, "step": 5650 }, { "epoch": 2.5484405266556243, "grad_norm": 0.5935245156288147, "learning_rate": 0.0007798954971813009, "loss": 2.6849, "step": 5675 }, { "epoch": 2.5596698576682293, "grad_norm": 0.7134594321250916, "learning_rate": 0.0007780446740383829, "loss": 2.7141, "step": 5700 }, { "epoch": 2.5708991886808343, "grad_norm": 0.6013050675392151, "learning_rate": 0.0007761883190051029, "loss": 2.7276, "step": 5725 }, { "epoch": 2.5821285196934394, "grad_norm": 0.6081655025482178, "learning_rate": 0.000774326469014923, "loss": 2.7205, "step": 5750 }, { "epoch": 2.5933578507060444, "grad_norm": 0.6464730501174927, "learning_rate": 0.0007724591611106315, "loss": 2.6872, "step": 5775 }, { "epoch": 2.604587181718649, "grad_norm": 0.578700840473175, "learning_rate": 0.0007705864324436059, "loss": 2.7152, "step": 5800 }, { "epoch": 2.615816512731254, "grad_norm": 0.5782270431518555, "learning_rate": 0.000768708320273073, "loss": 2.7233, "step": 5825 }, { "epoch": 2.627045843743859, "grad_norm": 0.5789017081260681, "learning_rate": 0.000766824861965369, "loss": 2.7474, "step": 5850 }, { "epoch": 2.6382751747564637, "grad_norm": 0.6152642369270325, "learning_rate": 0.0007649360949931941, "loss": 2.7071, "step": 5875 }, { "epoch": 2.6495045057690687, "grad_norm": 0.6417413353919983, "learning_rate": 0.0007630420569348688, "loss": 2.694, "step": 5900 }, { "epoch": 2.6607338367816737, "grad_norm": 0.5956742167472839, "learning_rate": 0.0007611427854735855, "loss": 2.7318, "step": 5925 }, { "epoch": 2.671963167794279, "grad_norm": 0.6362649202346802, "learning_rate": 0.0007592383183966581, "loss": 2.6966, "step": 5950 }, { "epoch": 2.683192498806884, "grad_norm": 0.5551230311393738, "learning_rate": 0.0007573286935947715, "loss": 2.6876, "step": 5975 }, { "epoch": 2.6944218298194884, "grad_norm": 0.6288260817527771, "learning_rate": 0.0007554139490612269, "loss": 2.7336, "step": 6000 }, { "epoch": 2.7056511608320934, "grad_norm": 0.5820605158805847, "learning_rate": 0.0007534941228911856, "loss": 2.683, "step": 6025 }, { "epoch": 2.7168804918446985, "grad_norm": 0.6264435648918152, "learning_rate": 0.0007515692532809126, "loss": 2.7461, "step": 6050 }, { "epoch": 2.728109822857303, "grad_norm": 0.6948567628860474, "learning_rate": 0.0007496393785270148, "loss": 2.7297, "step": 6075 }, { "epoch": 2.739339153869908, "grad_norm": 0.5976940393447876, "learning_rate": 0.0007477045370256802, "loss": 2.7419, "step": 6100 }, { "epoch": 2.750568484882513, "grad_norm": 0.5582289099693298, "learning_rate": 0.0007457647672719133, "loss": 2.7238, "step": 6125 }, { "epoch": 2.761797815895118, "grad_norm": 0.6097133755683899, "learning_rate": 0.00074389798763174, "loss": 2.7345, "step": 6150 }, { "epoch": 2.7730271469077232, "grad_norm": 0.576604425907135, "learning_rate": 0.0007419486705442532, "loss": 2.7075, "step": 6175 }, { "epoch": 2.784256477920328, "grad_norm": 0.6071319580078125, "learning_rate": 0.0007399945397212636, "loss": 2.7122, "step": 6200 }, { "epoch": 2.795485808932933, "grad_norm": 0.6164671182632446, "learning_rate": 0.0007380356340415503, "loss": 2.698, "step": 6225 }, { "epoch": 2.806715139945538, "grad_norm": 0.5970659255981445, "learning_rate": 0.0007360719924788919, "loss": 2.7429, "step": 6250 }, { "epoch": 2.8179444709581425, "grad_norm": 0.5380481481552124, "learning_rate": 0.0007341036541012898, "loss": 2.6655, "step": 6275 }, { "epoch": 2.8291738019707475, "grad_norm": 0.5859966278076172, "learning_rate": 0.0007321306580701923, "loss": 2.7115, "step": 6300 }, { "epoch": 2.8404031329833526, "grad_norm": 0.5644718408584595, "learning_rate": 0.0007301530436397148, "loss": 2.6945, "step": 6325 }, { "epoch": 2.8516324639959576, "grad_norm": 0.5971605777740479, "learning_rate": 0.0007281708501558591, "loss": 2.7082, "step": 6350 }, { "epoch": 2.8628617950085626, "grad_norm": 0.6274561882019043, "learning_rate": 0.0007261841170557303, "loss": 2.7207, "step": 6375 }, { "epoch": 2.8740911260211672, "grad_norm": 0.611348032951355, "learning_rate": 0.0007241928838667522, "loss": 2.7155, "step": 6400 }, { "epoch": 2.8853204570337723, "grad_norm": 0.5650250911712646, "learning_rate": 0.000722197190205881, "loss": 2.7063, "step": 6425 }, { "epoch": 2.896549788046377, "grad_norm": 0.6043295860290527, "learning_rate": 0.0007201970757788173, "loss": 2.6909, "step": 6450 }, { "epoch": 2.907779119058982, "grad_norm": 0.582062304019928, "learning_rate": 0.0007181925803792153, "loss": 2.7262, "step": 6475 }, { "epoch": 2.919008450071587, "grad_norm": 0.6272075772285461, "learning_rate": 0.0007161837438878926, "loss": 2.7224, "step": 6500 }, { "epoch": 2.930237781084192, "grad_norm": 0.6399256587028503, "learning_rate": 0.0007141706062720349, "loss": 2.7202, "step": 6525 }, { "epoch": 2.941467112096797, "grad_norm": 0.637417197227478, "learning_rate": 0.0007121532075844023, "loss": 2.6624, "step": 6550 }, { "epoch": 2.9526964431094016, "grad_norm": 0.7301665544509888, "learning_rate": 0.0007101315879625315, "loss": 2.7103, "step": 6575 }, { "epoch": 2.9639257741220066, "grad_norm": 0.6282750964164734, "learning_rate": 0.000708105787627938, "loss": 2.6985, "step": 6600 }, { "epoch": 2.9751551051346117, "grad_norm": 0.7559499144554138, "learning_rate": 0.0007060758468853153, "loss": 2.6989, "step": 6625 }, { "epoch": 2.9863844361472163, "grad_norm": 0.6338511109352112, "learning_rate": 0.0007040418061217324, "loss": 2.7278, "step": 6650 }, { "epoch": 2.9976137671598213, "grad_norm": 0.6456329226493835, "learning_rate": 0.0007020037058058326, "loss": 2.6851, "step": 6675 }, { "epoch": 3.0, "eval_loss": 3.168698787689209, "eval_runtime": 228.8127, "eval_samples_per_second": 54.997, "eval_steps_per_second": 54.997, "step": 6681 } ], "logging_steps": 25, "max_steps": 17808, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3044200408064e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }