{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.04485511796896026,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0004485511796896026,
      "grad_norm": 14.375,
      "learning_rate": 4e-06,
      "loss": 2.1672,
      "step": 5
    },
    {
      "epoch": 0.0008971023593792052,
      "grad_norm": 21.875,
      "learning_rate": 8.999999999999999e-06,
      "loss": 1.8368,
      "step": 10
    },
    {
      "epoch": 0.0013456535390688076,
      "grad_norm": 10.9375,
      "learning_rate": 1.4e-05,
      "loss": 2.0673,
      "step": 15
    },
    {
      "epoch": 0.0017942047187584104,
      "grad_norm": 12.8125,
      "learning_rate": 1.8999999999999998e-05,
      "loss": 1.7473,
      "step": 20
    },
    {
      "epoch": 0.002242755898448013,
      "grad_norm": 8.0625,
      "learning_rate": 2.3999999999999997e-05,
      "loss": 1.5355,
      "step": 25
    },
    {
      "epoch": 0.0026913070781376153,
      "grad_norm": 10.125,
      "learning_rate": 2.9e-05,
      "loss": 1.5981,
      "step": 30
    },
    {
      "epoch": 0.003139858257827218,
      "grad_norm": 7.25,
      "learning_rate": 3.4e-05,
      "loss": 1.3957,
      "step": 35
    },
    {
      "epoch": 0.003588409437516821,
      "grad_norm": 8.1875,
      "learning_rate": 3.499999907137561e-05,
      "loss": 1.5353,
      "step": 40
    },
    {
      "epoch": 0.004036960617206423,
      "grad_norm": 7.9375,
      "learning_rate": 3.499999529883925e-05,
      "loss": 1.418,
      "step": 45
    },
    {
      "epoch": 0.004485511796896026,
      "grad_norm": 5.59375,
      "learning_rate": 3.499998862435274e-05,
      "loss": 1.5083,
      "step": 50
    },
    {
      "epoch": 0.004934062976585629,
      "grad_norm": 6.84375,
      "learning_rate": 3.499997904791754e-05,
      "loss": 1.461,
      "step": 55
    },
    {
      "epoch": 0.005382614156275231,
      "grad_norm": 7.0,
      "learning_rate": 3.4999966569535776e-05,
      "loss": 1.4581,
      "step": 60
    },
    {
      "epoch": 0.005831165335964833,
      "grad_norm": 13.8125,
      "learning_rate": 3.499995118921021e-05,
      "loss": 1.7087,
      "step": 65
    },
    {
      "epoch": 0.006279716515654436,
      "grad_norm": 7.15625,
      "learning_rate": 3.4999932906944236e-05,
      "loss": 1.348,
      "step": 70
    },
    {
      "epoch": 0.006728267695344039,
      "grad_norm": 11.9375,
      "learning_rate": 3.4999911722741894e-05,
      "loss": 1.4475,
      "step": 75
    },
    {
      "epoch": 0.007176818875033642,
      "grad_norm": 10.1875,
      "learning_rate": 3.499988763660788e-05,
      "loss": 1.517,
      "step": 80
    },
    {
      "epoch": 0.0076253700547232435,
      "grad_norm": 7.09375,
      "learning_rate": 3.4999860648547514e-05,
      "loss": 1.329,
      "step": 85
    },
    {
      "epoch": 0.008073921234412846,
      "grad_norm": 13.375,
      "learning_rate": 3.499983075856676e-05,
      "loss": 1.7583,
      "step": 90
    },
    {
      "epoch": 0.00852247241410245,
      "grad_norm": 8.25,
      "learning_rate": 3.499979796667222e-05,
      "loss": 1.382,
      "step": 95
    },
    {
      "epoch": 0.008971023593792052,
      "grad_norm": 7.3125,
      "learning_rate": 3.4999762272871155e-05,
      "loss": 1.4598,
      "step": 100
    },
    {
      "epoch": 0.009419574773481654,
      "grad_norm": 10.25,
      "learning_rate": 3.499972367717146e-05,
      "loss": 1.5433,
      "step": 105
    },
    {
      "epoch": 0.009868125953171257,
      "grad_norm": 7.375,
      "learning_rate": 3.4999682179581665e-05,
      "loss": 1.5629,
      "step": 110
    },
    {
      "epoch": 0.01031667713286086,
      "grad_norm": 4.90625,
      "learning_rate": 3.499963778011093e-05,
      "loss": 1.5992,
      "step": 115
    },
    {
      "epoch": 0.010765228312550461,
      "grad_norm": 5.34375,
      "learning_rate": 3.499959047876909e-05,
      "loss": 1.4565,
      "step": 120
    },
    {
      "epoch": 0.011213779492240065,
      "grad_norm": 8.4375,
      "learning_rate": 3.49995402755666e-05,
      "loss": 1.543,
      "step": 125
    },
    {
      "epoch": 0.011662330671929667,
      "grad_norm": 7.0,
      "learning_rate": 3.4999487170514556e-05,
      "loss": 1.5431,
      "step": 130
    },
    {
      "epoch": 0.01211088185161927,
      "grad_norm": 6.3125,
      "learning_rate": 3.499943116362471e-05,
      "loss": 1.4024,
      "step": 135
    },
    {
      "epoch": 0.012559433031308872,
      "grad_norm": 10.3125,
      "learning_rate": 3.499937225490943e-05,
      "loss": 1.8023,
      "step": 140
    },
    {
      "epoch": 0.013007984210998474,
      "grad_norm": 9.1875,
      "learning_rate": 3.4999310444381744e-05,
      "loss": 1.5551,
      "step": 145
    },
    {
      "epoch": 0.013456535390688078,
      "grad_norm": 4.6875,
      "learning_rate": 3.499924573205533e-05,
      "loss": 1.2654,
      "step": 150
    },
    {
      "epoch": 0.01390508657037768,
      "grad_norm": 11.9375,
      "learning_rate": 3.499917811794448e-05,
      "loss": 1.7612,
      "step": 155
    },
    {
      "epoch": 0.014353637750067283,
      "grad_norm": 5.21875,
      "learning_rate": 3.4999107602064155e-05,
      "loss": 1.5516,
      "step": 160
    },
    {
      "epoch": 0.014802188929756885,
      "grad_norm": 5.5625,
      "learning_rate": 3.4999034184429946e-05,
      "loss": 1.6631,
      "step": 165
    },
    {
      "epoch": 0.015250740109446487,
      "grad_norm": 6.03125,
      "learning_rate": 3.4998957865058077e-05,
      "loss": 1.2633,
      "step": 170
    },
    {
      "epoch": 0.01569929128913609,
      "grad_norm": 5.375,
      "learning_rate": 3.4998878643965425e-05,
      "loss": 1.3481,
      "step": 175
    },
    {
      "epoch": 0.016147842468825693,
      "grad_norm": 5.84375,
      "learning_rate": 3.4998796521169516e-05,
      "loss": 1.5153,
      "step": 180
    },
    {
      "epoch": 0.016596393648515294,
      "grad_norm": 6.09375,
      "learning_rate": 3.4998711496688497e-05,
      "loss": 1.4984,
      "step": 185
    },
    {
      "epoch": 0.0170449448282049,
      "grad_norm": 5.6875,
      "learning_rate": 3.4998623570541177e-05,
      "loss": 1.5425,
      "step": 190
    },
    {
      "epoch": 0.017493496007894502,
      "grad_norm": 5.53125,
      "learning_rate": 3.499853274274698e-05,
      "loss": 1.6314,
      "step": 195
    },
    {
      "epoch": 0.017942047187584104,
      "grad_norm": 6.0625,
      "learning_rate": 3.4998439013326e-05,
      "loss": 1.252,
      "step": 200
    },
    {
      "epoch": 0.018390598367273706,
      "grad_norm": 4.96875,
      "learning_rate": 3.499834238229896e-05,
      "loss": 1.1834,
      "step": 205
    },
    {
      "epoch": 0.018839149546963307,
      "grad_norm": 3.734375,
      "learning_rate": 3.499824284968722e-05,
      "loss": 1.3504,
      "step": 210
    },
    {
      "epoch": 0.019287700726652913,
      "grad_norm": 9.1875,
      "learning_rate": 3.499814041551279e-05,
      "loss": 1.7496,
      "step": 215
    },
    {
      "epoch": 0.019736251906342515,
      "grad_norm": 10.375,
      "learning_rate": 3.499803507979832e-05,
      "loss": 1.8731,
      "step": 220
    },
    {
      "epoch": 0.020184803086032117,
      "grad_norm": 4.8125,
      "learning_rate": 3.4997926842567095e-05,
      "loss": 1.4044,
      "step": 225
    },
    {
      "epoch": 0.02063335426572172,
      "grad_norm": 7.28125,
      "learning_rate": 3.499781570384305e-05,
      "loss": 1.7739,
      "step": 230
    },
    {
      "epoch": 0.02108190544541132,
      "grad_norm": 4.8125,
      "learning_rate": 3.499770166365077e-05,
      "loss": 1.5356,
      "step": 235
    },
    {
      "epoch": 0.021530456625100922,
      "grad_norm": 5.40625,
      "learning_rate": 3.499758472201544e-05,
      "loss": 1.3888,
      "step": 240
    },
    {
      "epoch": 0.021979007804790528,
      "grad_norm": 4.65625,
      "learning_rate": 3.4997464878962945e-05,
      "loss": 1.3103,
      "step": 245
    },
    {
      "epoch": 0.02242755898448013,
      "grad_norm": 6.28125,
      "learning_rate": 3.499734213451976e-05,
      "loss": 1.6716,
      "step": 250
    },
    {
      "epoch": 0.02287611016416973,
      "grad_norm": 11.375,
      "learning_rate": 3.499721648871305e-05,
      "loss": 1.9034,
      "step": 255
    },
    {
      "epoch": 0.023324661343859333,
      "grad_norm": 5.4375,
      "learning_rate": 3.499708794157057e-05,
      "loss": 1.5581,
      "step": 260
    },
    {
      "epoch": 0.023773212523548935,
      "grad_norm": 5.625,
      "learning_rate": 3.4996956493120746e-05,
      "loss": 1.438,
      "step": 265
    },
    {
      "epoch": 0.02422176370323854,
      "grad_norm": 7.03125,
      "learning_rate": 3.499682214339265e-05,
      "loss": 1.6683,
      "step": 270
    },
    {
      "epoch": 0.024670314882928143,
      "grad_norm": 5.09375,
      "learning_rate": 3.499668489241599e-05,
      "loss": 1.2929,
      "step": 275
    },
    {
      "epoch": 0.025118866062617744,
      "grad_norm": 6.53125,
      "learning_rate": 3.4996544740221106e-05,
      "loss": 1.4972,
      "step": 280
    },
    {
      "epoch": 0.025567417242307346,
      "grad_norm": 5.125,
      "learning_rate": 3.499640168683898e-05,
      "loss": 1.2642,
      "step": 285
    },
    {
      "epoch": 0.026015968421996948,
      "grad_norm": 4.71875,
      "learning_rate": 3.499625573230125e-05,
      "loss": 1.481,
      "step": 290
    },
    {
      "epoch": 0.026464519601686554,
      "grad_norm": 7.0,
      "learning_rate": 3.499610687664018e-05,
      "loss": 1.4126,
      "step": 295
    },
    {
      "epoch": 0.026913070781376155,
      "grad_norm": 4.46875,
      "learning_rate": 3.499595511988869e-05,
      "loss": 1.7327,
      "step": 300
    },
    {
      "epoch": 0.027361621961065757,
      "grad_norm": 4.65625,
      "learning_rate": 3.499580046208033e-05,
      "loss": 1.4368,
      "step": 305
    },
    {
      "epoch": 0.02781017314075536,
      "grad_norm": 4.5625,
      "learning_rate": 3.49956429032493e-05,
      "loss": 1.4169,
      "step": 310
    },
    {
      "epoch": 0.02825872432044496,
      "grad_norm": 6.71875,
      "learning_rate": 3.4995482443430426e-05,
      "loss": 1.5973,
      "step": 315
    },
    {
      "epoch": 0.028707275500134567,
      "grad_norm": 3.453125,
      "learning_rate": 3.499531908265919e-05,
      "loss": 1.6349,
      "step": 320
    },
    {
      "epoch": 0.02915582667982417,
      "grad_norm": 2.65625,
      "learning_rate": 3.499515282097171e-05,
      "loss": 1.3379,
      "step": 325
    },
    {
      "epoch": 0.02960437785951377,
      "grad_norm": 4.46875,
      "learning_rate": 3.4994983658404754e-05,
      "loss": 1.5752,
      "step": 330
    },
    {
      "epoch": 0.030052929039203372,
      "grad_norm": 4.40625,
      "learning_rate": 3.499481159499572e-05,
      "loss": 1.3574,
      "step": 335
    },
    {
      "epoch": 0.030501480218892974,
      "grad_norm": 4.0625,
      "learning_rate": 3.499463663078264e-05,
      "loss": 1.4591,
      "step": 340
    },
    {
      "epoch": 0.03095003139858258,
      "grad_norm": 4.53125,
      "learning_rate": 3.499445876580422e-05,
      "loss": 1.5197,
      "step": 345
    },
    {
      "epoch": 0.03139858257827218,
      "grad_norm": 3.390625,
      "learning_rate": 3.499427800009977e-05,
      "loss": 1.3433,
      "step": 350
    },
    {
      "epoch": 0.03184713375796178,
      "grad_norm": 4.34375,
      "learning_rate": 3.4994094333709264e-05,
      "loss": 1.6308,
      "step": 355
    },
    {
      "epoch": 0.032295684937651385,
      "grad_norm": 8.125,
      "learning_rate": 3.499390776667331e-05,
      "loss": 1.5933,
      "step": 360
    },
    {
      "epoch": 0.03274423611734099,
      "grad_norm": 5.5625,
      "learning_rate": 3.499371829903316e-05,
      "loss": 1.4659,
      "step": 365
    },
    {
      "epoch": 0.03319278729703059,
      "grad_norm": 4.03125,
      "learning_rate": 3.4993525930830694e-05,
      "loss": 1.3809,
      "step": 370
    },
    {
      "epoch": 0.03364133847672019,
      "grad_norm": 5.0,
      "learning_rate": 3.499333066210846e-05,
      "loss": 1.4262,
      "step": 375
    },
    {
      "epoch": 0.0340898896564098,
      "grad_norm": 3.171875,
      "learning_rate": 3.4993132492909624e-05,
      "loss": 1.5301,
      "step": 380
    },
    {
      "epoch": 0.0345384408360994,
      "grad_norm": 5.09375,
      "learning_rate": 3.499293142327801e-05,
      "loss": 1.4591,
      "step": 385
    },
    {
      "epoch": 0.034986992015789004,
      "grad_norm": 3.703125,
      "learning_rate": 3.499272745325806e-05,
      "loss": 1.4161,
      "step": 390
    },
    {
      "epoch": 0.035435543195478605,
      "grad_norm": 5.40625,
      "learning_rate": 3.4992520582894886e-05,
      "loss": 1.4047,
      "step": 395
    },
    {
      "epoch": 0.03588409437516821,
      "grad_norm": 5.1875,
      "learning_rate": 3.499231081223422e-05,
      "loss": 1.9751,
      "step": 400
    },
    {
      "epoch": 0.03633264555485781,
      "grad_norm": 6.5625,
      "learning_rate": 3.499209814132244e-05,
      "loss": 1.5034,
      "step": 405
    },
    {
      "epoch": 0.03678119673454741,
      "grad_norm": 4.96875,
      "learning_rate": 3.499188257020657e-05,
      "loss": 1.5578,
      "step": 410
    },
    {
      "epoch": 0.03722974791423701,
      "grad_norm": 5.875,
      "learning_rate": 3.499166409893428e-05,
      "loss": 1.552,
      "step": 415
    },
    {
      "epoch": 0.037678299093926615,
      "grad_norm": 4.5,
      "learning_rate": 3.499144272755387e-05,
      "loss": 1.4408,
      "step": 420
    },
    {
      "epoch": 0.03812685027361622,
      "grad_norm": 4.21875,
      "learning_rate": 3.499121845611428e-05,
      "loss": 1.4257,
      "step": 425
    },
    {
      "epoch": 0.038575401453305826,
      "grad_norm": 5.9375,
      "learning_rate": 3.49909912846651e-05,
      "loss": 1.4551,
      "step": 430
    },
    {
      "epoch": 0.03902395263299543,
      "grad_norm": 2.984375,
      "learning_rate": 3.499076121325657e-05,
      "loss": 1.2085,
      "step": 435
    },
    {
      "epoch": 0.03947250381268503,
      "grad_norm": 4.875,
      "learning_rate": 3.499052824193953e-05,
      "loss": 1.2269,
      "step": 440
    },
    {
      "epoch": 0.03992105499237463,
      "grad_norm": 5.1875,
      "learning_rate": 3.499029237076552e-05,
      "loss": 1.5631,
      "step": 445
    },
    {
      "epoch": 0.04036960617206423,
      "grad_norm": 5.28125,
      "learning_rate": 3.499005359978668e-05,
      "loss": 1.4777,
      "step": 450
    },
    {
      "epoch": 0.040818157351753835,
      "grad_norm": 4.3125,
      "learning_rate": 3.49898119290558e-05,
      "loss": 1.3492,
      "step": 455
    },
    {
      "epoch": 0.04126670853144344,
      "grad_norm": 3.59375,
      "learning_rate": 3.4989567358626314e-05,
      "loss": 1.5453,
      "step": 460
    },
    {
      "epoch": 0.04171525971113304,
      "grad_norm": 5.0625,
      "learning_rate": 3.49893198885523e-05,
      "loss": 1.4498,
      "step": 465
    },
    {
      "epoch": 0.04216381089082264,
      "grad_norm": 3.359375,
      "learning_rate": 3.498906951888847e-05,
      "loss": 1.4422,
      "step": 470
    },
    {
      "epoch": 0.04261236207051224,
      "grad_norm": 3.46875,
      "learning_rate": 3.4988816249690185e-05,
      "loss": 1.4364,
      "step": 475
    },
    {
      "epoch": 0.043060913250201845,
      "grad_norm": 4.125,
      "learning_rate": 3.498856008101344e-05,
      "loss": 1.6421,
      "step": 480
    },
    {
      "epoch": 0.04350946442989145,
      "grad_norm": 3.453125,
      "learning_rate": 3.4988301012914875e-05,
      "loss": 1.4868,
      "step": 485
    },
    {
      "epoch": 0.043958015609581055,
      "grad_norm": 3.765625,
      "learning_rate": 3.498803904545177e-05,
      "loss": 1.6498,
      "step": 490
    },
    {
      "epoch": 0.04440656678927066,
      "grad_norm": 5.59375,
      "learning_rate": 3.4987774178682054e-05,
      "loss": 2.0081,
      "step": 495
    },
    {
      "epoch": 0.04485511796896026,
      "grad_norm": 3.734375,
      "learning_rate": 3.4987506412664274e-05,
      "loss": 1.4555,
      "step": 500
    },
    {
      "epoch": 0.04485511796896026,
      "eval_loss": 2.2115273475646973,
      "eval_runtime": 14.9781,
      "eval_samples_per_second": 13.353,
      "eval_steps_per_second": 13.353,
      "step": 500
    }
  ],
  "logging_steps": 5,
  "max_steps": 33441,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 200.0,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 8.37005101498368e+17,
  "train_batch_size": 18,
  "trial_name": null,
  "trial_params": null
}