{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.04485511796896026, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004485511796896026, "grad_norm": 14.375, "learning_rate": 4e-06, "loss": 2.1672, "step": 5 }, { "epoch": 0.0008971023593792052, "grad_norm": 21.875, "learning_rate": 8.999999999999999e-06, "loss": 1.8368, "step": 10 }, { "epoch": 0.0013456535390688076, "grad_norm": 10.9375, "learning_rate": 1.4e-05, "loss": 2.0673, "step": 15 }, { "epoch": 0.0017942047187584104, "grad_norm": 12.8125, "learning_rate": 1.8999999999999998e-05, "loss": 1.7473, "step": 20 }, { "epoch": 0.002242755898448013, "grad_norm": 8.0625, "learning_rate": 2.3999999999999997e-05, "loss": 1.5355, "step": 25 }, { "epoch": 0.0026913070781376153, "grad_norm": 10.125, "learning_rate": 2.9e-05, "loss": 1.5981, "step": 30 }, { "epoch": 0.003139858257827218, "grad_norm": 7.25, "learning_rate": 3.4e-05, "loss": 1.3957, "step": 35 }, { "epoch": 0.003588409437516821, "grad_norm": 8.1875, "learning_rate": 3.499999907137561e-05, "loss": 1.5353, "step": 40 }, { "epoch": 0.004036960617206423, "grad_norm": 7.9375, "learning_rate": 3.499999529883925e-05, "loss": 1.418, "step": 45 }, { "epoch": 0.004485511796896026, "grad_norm": 5.59375, "learning_rate": 3.499998862435274e-05, "loss": 1.5083, "step": 50 }, { "epoch": 0.004934062976585629, "grad_norm": 6.84375, "learning_rate": 3.499997904791754e-05, "loss": 1.461, "step": 55 }, { "epoch": 0.005382614156275231, "grad_norm": 7.0, "learning_rate": 3.4999966569535776e-05, "loss": 1.4581, "step": 60 }, { "epoch": 0.005831165335964833, "grad_norm": 13.8125, "learning_rate": 3.499995118921021e-05, "loss": 1.7087, "step": 65 }, { "epoch": 0.006279716515654436, "grad_norm": 7.15625, "learning_rate": 3.4999932906944236e-05, "loss": 1.348, "step": 70 }, { "epoch": 0.006728267695344039, "grad_norm": 11.9375, "learning_rate": 3.4999911722741894e-05, "loss": 1.4475, "step": 75 }, { "epoch": 0.007176818875033642, "grad_norm": 10.1875, "learning_rate": 3.499988763660788e-05, "loss": 1.517, "step": 80 }, { "epoch": 0.0076253700547232435, "grad_norm": 7.09375, "learning_rate": 3.4999860648547514e-05, "loss": 1.329, "step": 85 }, { "epoch": 0.008073921234412846, "grad_norm": 13.375, "learning_rate": 3.499983075856676e-05, "loss": 1.7583, "step": 90 }, { "epoch": 0.00852247241410245, "grad_norm": 8.25, "learning_rate": 3.499979796667222e-05, "loss": 1.382, "step": 95 }, { "epoch": 0.008971023593792052, "grad_norm": 7.3125, "learning_rate": 3.4999762272871155e-05, "loss": 1.4598, "step": 100 }, { "epoch": 0.009419574773481654, "grad_norm": 10.25, "learning_rate": 3.499972367717146e-05, "loss": 1.5433, "step": 105 }, { "epoch": 0.009868125953171257, "grad_norm": 7.375, "learning_rate": 3.4999682179581665e-05, "loss": 1.5629, "step": 110 }, { "epoch": 0.01031667713286086, "grad_norm": 4.90625, "learning_rate": 3.499963778011093e-05, "loss": 1.5992, "step": 115 }, { "epoch": 0.010765228312550461, "grad_norm": 5.34375, "learning_rate": 3.499959047876909e-05, "loss": 1.4565, "step": 120 }, { "epoch": 0.011213779492240065, "grad_norm": 8.4375, "learning_rate": 3.49995402755666e-05, "loss": 1.543, "step": 125 }, { "epoch": 0.011662330671929667, "grad_norm": 7.0, "learning_rate": 3.4999487170514556e-05, "loss": 1.5431, "step": 130 }, { "epoch": 0.01211088185161927, "grad_norm": 6.3125, "learning_rate": 3.499943116362471e-05, "loss": 1.4024, "step": 
135 }, { "epoch": 0.012559433031308872, "grad_norm": 10.3125, "learning_rate": 3.499937225490943e-05, "loss": 1.8023, "step": 140 }, { "epoch": 0.013007984210998474, "grad_norm": 9.1875, "learning_rate": 3.4999310444381744e-05, "loss": 1.5551, "step": 145 }, { "epoch": 0.013456535390688078, "grad_norm": 4.6875, "learning_rate": 3.499924573205533e-05, "loss": 1.2654, "step": 150 }, { "epoch": 0.01390508657037768, "grad_norm": 11.9375, "learning_rate": 3.499917811794448e-05, "loss": 1.7612, "step": 155 }, { "epoch": 0.014353637750067283, "grad_norm": 5.21875, "learning_rate": 3.4999107602064155e-05, "loss": 1.5516, "step": 160 }, { "epoch": 0.014802188929756885, "grad_norm": 5.5625, "learning_rate": 3.4999034184429946e-05, "loss": 1.6631, "step": 165 }, { "epoch": 0.015250740109446487, "grad_norm": 6.03125, "learning_rate": 3.4998957865058077e-05, "loss": 1.2633, "step": 170 }, { "epoch": 0.01569929128913609, "grad_norm": 5.375, "learning_rate": 3.4998878643965425e-05, "loss": 1.3481, "step": 175 }, { "epoch": 0.016147842468825693, "grad_norm": 5.84375, "learning_rate": 3.4998796521169516e-05, "loss": 1.5153, "step": 180 }, { "epoch": 0.016596393648515294, "grad_norm": 6.09375, "learning_rate": 3.4998711496688497e-05, "loss": 1.4984, "step": 185 }, { "epoch": 0.0170449448282049, "grad_norm": 5.6875, "learning_rate": 3.4998623570541177e-05, "loss": 1.5425, "step": 190 }, { "epoch": 0.017493496007894502, "grad_norm": 5.53125, "learning_rate": 3.499853274274698e-05, "loss": 1.6314, "step": 195 }, { "epoch": 0.017942047187584104, "grad_norm": 6.0625, "learning_rate": 3.4998439013326e-05, "loss": 1.252, "step": 200 }, { "epoch": 0.018390598367273706, "grad_norm": 4.96875, "learning_rate": 3.499834238229896e-05, "loss": 1.1834, "step": 205 }, { "epoch": 0.018839149546963307, "grad_norm": 3.734375, "learning_rate": 3.499824284968722e-05, "loss": 1.3504, "step": 210 }, { "epoch": 0.019287700726652913, "grad_norm": 9.1875, "learning_rate": 3.499814041551279e-05, "loss": 1.7496, "step": 215 }, { "epoch": 0.019736251906342515, "grad_norm": 10.375, "learning_rate": 3.499803507979832e-05, "loss": 1.8731, "step": 220 }, { "epoch": 0.020184803086032117, "grad_norm": 4.8125, "learning_rate": 3.4997926842567095e-05, "loss": 1.4044, "step": 225 }, { "epoch": 0.02063335426572172, "grad_norm": 7.28125, "learning_rate": 3.499781570384305e-05, "loss": 1.7739, "step": 230 }, { "epoch": 0.02108190544541132, "grad_norm": 4.8125, "learning_rate": 3.499770166365077e-05, "loss": 1.5356, "step": 235 }, { "epoch": 0.021530456625100922, "grad_norm": 5.40625, "learning_rate": 3.499758472201544e-05, "loss": 1.3888, "step": 240 }, { "epoch": 0.021979007804790528, "grad_norm": 4.65625, "learning_rate": 3.4997464878962945e-05, "loss": 1.3103, "step": 245 }, { "epoch": 0.02242755898448013, "grad_norm": 6.28125, "learning_rate": 3.499734213451976e-05, "loss": 1.6716, "step": 250 }, { "epoch": 0.02287611016416973, "grad_norm": 11.375, "learning_rate": 3.499721648871305e-05, "loss": 1.9034, "step": 255 }, { "epoch": 0.023324661343859333, "grad_norm": 5.4375, "learning_rate": 3.499708794157057e-05, "loss": 1.5581, "step": 260 }, { "epoch": 0.023773212523548935, "grad_norm": 5.625, "learning_rate": 3.4996956493120746e-05, "loss": 1.438, "step": 265 }, { "epoch": 0.02422176370323854, "grad_norm": 7.03125, "learning_rate": 3.499682214339265e-05, "loss": 1.6683, "step": 270 }, { "epoch": 0.024670314882928143, "grad_norm": 5.09375, "learning_rate": 3.499668489241599e-05, "loss": 1.2929, "step": 275 }, { "epoch": 0.025118866062617744, 
"grad_norm": 6.53125, "learning_rate": 3.4996544740221106e-05, "loss": 1.4972, "step": 280 }, { "epoch": 0.025567417242307346, "grad_norm": 5.125, "learning_rate": 3.499640168683898e-05, "loss": 1.2642, "step": 285 }, { "epoch": 0.026015968421996948, "grad_norm": 4.71875, "learning_rate": 3.499625573230125e-05, "loss": 1.481, "step": 290 }, { "epoch": 0.026464519601686554, "grad_norm": 7.0, "learning_rate": 3.499610687664018e-05, "loss": 1.4126, "step": 295 }, { "epoch": 0.026913070781376155, "grad_norm": 4.46875, "learning_rate": 3.499595511988869e-05, "loss": 1.7327, "step": 300 }, { "epoch": 0.027361621961065757, "grad_norm": 4.65625, "learning_rate": 3.499580046208033e-05, "loss": 1.4368, "step": 305 }, { "epoch": 0.02781017314075536, "grad_norm": 4.5625, "learning_rate": 3.49956429032493e-05, "loss": 1.4169, "step": 310 }, { "epoch": 0.02825872432044496, "grad_norm": 6.71875, "learning_rate": 3.4995482443430426e-05, "loss": 1.5973, "step": 315 }, { "epoch": 0.028707275500134567, "grad_norm": 3.453125, "learning_rate": 3.499531908265919e-05, "loss": 1.6349, "step": 320 }, { "epoch": 0.02915582667982417, "grad_norm": 2.65625, "learning_rate": 3.499515282097171e-05, "loss": 1.3379, "step": 325 }, { "epoch": 0.02960437785951377, "grad_norm": 4.46875, "learning_rate": 3.4994983658404754e-05, "loss": 1.5752, "step": 330 }, { "epoch": 0.030052929039203372, "grad_norm": 4.40625, "learning_rate": 3.499481159499572e-05, "loss": 1.3574, "step": 335 }, { "epoch": 0.030501480218892974, "grad_norm": 4.0625, "learning_rate": 3.499463663078264e-05, "loss": 1.4591, "step": 340 }, { "epoch": 0.03095003139858258, "grad_norm": 4.53125, "learning_rate": 3.499445876580422e-05, "loss": 1.5197, "step": 345 }, { "epoch": 0.03139858257827218, "grad_norm": 3.390625, "learning_rate": 3.499427800009977e-05, "loss": 1.3433, "step": 350 }, { "epoch": 0.03184713375796178, "grad_norm": 4.34375, "learning_rate": 3.4994094333709264e-05, "loss": 1.6308, "step": 355 }, { "epoch": 0.032295684937651385, "grad_norm": 8.125, "learning_rate": 3.499390776667331e-05, "loss": 1.5933, "step": 360 }, { "epoch": 0.03274423611734099, "grad_norm": 5.5625, "learning_rate": 3.499371829903316e-05, "loss": 1.4659, "step": 365 }, { "epoch": 0.03319278729703059, "grad_norm": 4.03125, "learning_rate": 3.4993525930830694e-05, "loss": 1.3809, "step": 370 }, { "epoch": 0.03364133847672019, "grad_norm": 5.0, "learning_rate": 3.499333066210846e-05, "loss": 1.4262, "step": 375 }, { "epoch": 0.0340898896564098, "grad_norm": 3.171875, "learning_rate": 3.4993132492909624e-05, "loss": 1.5301, "step": 380 }, { "epoch": 0.0345384408360994, "grad_norm": 5.09375, "learning_rate": 3.499293142327801e-05, "loss": 1.4591, "step": 385 }, { "epoch": 0.034986992015789004, "grad_norm": 3.703125, "learning_rate": 3.499272745325806e-05, "loss": 1.4161, "step": 390 }, { "epoch": 0.035435543195478605, "grad_norm": 5.40625, "learning_rate": 3.4992520582894886e-05, "loss": 1.4047, "step": 395 }, { "epoch": 0.03588409437516821, "grad_norm": 5.1875, "learning_rate": 3.499231081223422e-05, "loss": 1.9751, "step": 400 }, { "epoch": 0.03633264555485781, "grad_norm": 6.5625, "learning_rate": 3.499209814132244e-05, "loss": 1.5034, "step": 405 }, { "epoch": 0.03678119673454741, "grad_norm": 4.96875, "learning_rate": 3.499188257020657e-05, "loss": 1.5578, "step": 410 }, { "epoch": 0.03722974791423701, "grad_norm": 5.875, "learning_rate": 3.499166409893428e-05, "loss": 1.552, "step": 415 }, { "epoch": 0.037678299093926615, "grad_norm": 4.5, "learning_rate": 
3.499144272755387e-05, "loss": 1.4408, "step": 420 }, { "epoch": 0.03812685027361622, "grad_norm": 4.21875, "learning_rate": 3.499121845611428e-05, "loss": 1.4257, "step": 425 }, { "epoch": 0.038575401453305826, "grad_norm": 5.9375, "learning_rate": 3.49909912846651e-05, "loss": 1.4551, "step": 430 }, { "epoch": 0.03902395263299543, "grad_norm": 2.984375, "learning_rate": 3.499076121325657e-05, "loss": 1.2085, "step": 435 }, { "epoch": 0.03947250381268503, "grad_norm": 4.875, "learning_rate": 3.499052824193953e-05, "loss": 1.2269, "step": 440 }, { "epoch": 0.03992105499237463, "grad_norm": 5.1875, "learning_rate": 3.499029237076552e-05, "loss": 1.5631, "step": 445 }, { "epoch": 0.04036960617206423, "grad_norm": 5.28125, "learning_rate": 3.499005359978668e-05, "loss": 1.4777, "step": 450 }, { "epoch": 0.040818157351753835, "grad_norm": 4.3125, "learning_rate": 3.49898119290558e-05, "loss": 1.3492, "step": 455 }, { "epoch": 0.04126670853144344, "grad_norm": 3.59375, "learning_rate": 3.4989567358626314e-05, "loss": 1.5453, "step": 460 }, { "epoch": 0.04171525971113304, "grad_norm": 5.0625, "learning_rate": 3.49893198885523e-05, "loss": 1.4498, "step": 465 }, { "epoch": 0.04216381089082264, "grad_norm": 3.359375, "learning_rate": 3.498906951888847e-05, "loss": 1.4422, "step": 470 }, { "epoch": 0.04261236207051224, "grad_norm": 3.46875, "learning_rate": 3.4988816249690185e-05, "loss": 1.4364, "step": 475 }, { "epoch": 0.043060913250201845, "grad_norm": 4.125, "learning_rate": 3.498856008101344e-05, "loss": 1.6421, "step": 480 }, { "epoch": 0.04350946442989145, "grad_norm": 3.453125, "learning_rate": 3.4988301012914875e-05, "loss": 1.4868, "step": 485 }, { "epoch": 0.043958015609581055, "grad_norm": 3.765625, "learning_rate": 3.498803904545177e-05, "loss": 1.6498, "step": 490 }, { "epoch": 0.04440656678927066, "grad_norm": 5.59375, "learning_rate": 3.4987774178682054e-05, "loss": 2.0081, "step": 495 }, { "epoch": 0.04485511796896026, "grad_norm": 3.734375, "learning_rate": 3.4987506412664274e-05, "loss": 1.4555, "step": 500 }, { "epoch": 0.04485511796896026, "eval_loss": 2.2115273475646973, "eval_runtime": 14.9781, "eval_samples_per_second": 13.353, "eval_steps_per_second": 13.353, "step": 500 } ], "logging_steps": 5, "max_steps": 33441, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.37005101498368e+17, "train_batch_size": 18, "trial_name": null, "trial_params": null }
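
The JSON above is a Hugging Face Trainer state snapshot: `log_history` holds one entry per logging step (here every 5 steps, with `loss`, `learning_rate`, and `grad_norm`) plus an evaluation entry at step 500. As a minimal sketch of reading it back for a quick summary, assuming the snapshot is saved locally as `trainer_state.json` (the filename is an assumption, not given above):

```python
# Minimal sketch: load the Trainer state snapshot and summarize its log_history.
# Assumes the JSON above has been saved as "trainer_state.json" in the working directory.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training-log entries carry "loss"; evaluation entries carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"logged training points: {len(train_logs)} (every {state['logging_steps']} steps)")
last = train_logs[-1]
print(f"train loss at step {last['step']}: {last['loss']}")
for e in eval_logs:
    print(f"eval loss at step {e['step']}: {e['eval_loss']}")
```

Run against the data above, this would report the final logged training loss (1.4555 at step 500) and the single evaluation loss (2.2115 at step 500).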