{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.04485511796896026,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004485511796896026,
"grad_norm": 14.375,
"learning_rate": 4e-06,
"loss": 2.1672,
"step": 5
},
{
"epoch": 0.0008971023593792052,
"grad_norm": 21.875,
"learning_rate": 8.999999999999999e-06,
"loss": 1.8368,
"step": 10
},
{
"epoch": 0.0013456535390688076,
"grad_norm": 10.9375,
"learning_rate": 1.4e-05,
"loss": 2.0673,
"step": 15
},
{
"epoch": 0.0017942047187584104,
"grad_norm": 12.8125,
"learning_rate": 1.8999999999999998e-05,
"loss": 1.7473,
"step": 20
},
{
"epoch": 0.002242755898448013,
"grad_norm": 8.0625,
"learning_rate": 2.3999999999999997e-05,
"loss": 1.5355,
"step": 25
},
{
"epoch": 0.0026913070781376153,
"grad_norm": 10.125,
"learning_rate": 2.9e-05,
"loss": 1.5981,
"step": 30
},
{
"epoch": 0.003139858257827218,
"grad_norm": 7.25,
"learning_rate": 3.4e-05,
"loss": 1.3957,
"step": 35
},
{
"epoch": 0.003588409437516821,
"grad_norm": 8.1875,
"learning_rate": 3.499999907137561e-05,
"loss": 1.5353,
"step": 40
},
{
"epoch": 0.004036960617206423,
"grad_norm": 7.9375,
"learning_rate": 3.499999529883925e-05,
"loss": 1.418,
"step": 45
},
{
"epoch": 0.004485511796896026,
"grad_norm": 5.59375,
"learning_rate": 3.499998862435274e-05,
"loss": 1.5083,
"step": 50
},
{
"epoch": 0.004934062976585629,
"grad_norm": 6.84375,
"learning_rate": 3.499997904791754e-05,
"loss": 1.461,
"step": 55
},
{
"epoch": 0.005382614156275231,
"grad_norm": 7.0,
"learning_rate": 3.4999966569535776e-05,
"loss": 1.4581,
"step": 60
},
{
"epoch": 0.005831165335964833,
"grad_norm": 13.8125,
"learning_rate": 3.499995118921021e-05,
"loss": 1.7087,
"step": 65
},
{
"epoch": 0.006279716515654436,
"grad_norm": 7.15625,
"learning_rate": 3.4999932906944236e-05,
"loss": 1.348,
"step": 70
},
{
"epoch": 0.006728267695344039,
"grad_norm": 11.9375,
"learning_rate": 3.4999911722741894e-05,
"loss": 1.4475,
"step": 75
},
{
"epoch": 0.007176818875033642,
"grad_norm": 10.1875,
"learning_rate": 3.499988763660788e-05,
"loss": 1.517,
"step": 80
},
{
"epoch": 0.0076253700547232435,
"grad_norm": 7.09375,
"learning_rate": 3.4999860648547514e-05,
"loss": 1.329,
"step": 85
},
{
"epoch": 0.008073921234412846,
"grad_norm": 13.375,
"learning_rate": 3.499983075856676e-05,
"loss": 1.7583,
"step": 90
},
{
"epoch": 0.00852247241410245,
"grad_norm": 8.25,
"learning_rate": 3.499979796667222e-05,
"loss": 1.382,
"step": 95
},
{
"epoch": 0.008971023593792052,
"grad_norm": 7.3125,
"learning_rate": 3.4999762272871155e-05,
"loss": 1.4598,
"step": 100
},
{
"epoch": 0.009419574773481654,
"grad_norm": 10.25,
"learning_rate": 3.499972367717146e-05,
"loss": 1.5433,
"step": 105
},
{
"epoch": 0.009868125953171257,
"grad_norm": 7.375,
"learning_rate": 3.4999682179581665e-05,
"loss": 1.5629,
"step": 110
},
{
"epoch": 0.01031667713286086,
"grad_norm": 4.90625,
"learning_rate": 3.499963778011093e-05,
"loss": 1.5992,
"step": 115
},
{
"epoch": 0.010765228312550461,
"grad_norm": 5.34375,
"learning_rate": 3.499959047876909e-05,
"loss": 1.4565,
"step": 120
},
{
"epoch": 0.011213779492240065,
"grad_norm": 8.4375,
"learning_rate": 3.49995402755666e-05,
"loss": 1.543,
"step": 125
},
{
"epoch": 0.011662330671929667,
"grad_norm": 7.0,
"learning_rate": 3.4999487170514556e-05,
"loss": 1.5431,
"step": 130
},
{
"epoch": 0.01211088185161927,
"grad_norm": 6.3125,
"learning_rate": 3.499943116362471e-05,
"loss": 1.4024,
"step": 135
},
{
"epoch": 0.012559433031308872,
"grad_norm": 10.3125,
"learning_rate": 3.499937225490943e-05,
"loss": 1.8023,
"step": 140
},
{
"epoch": 0.013007984210998474,
"grad_norm": 9.1875,
"learning_rate": 3.4999310444381744e-05,
"loss": 1.5551,
"step": 145
},
{
"epoch": 0.013456535390688078,
"grad_norm": 4.6875,
"learning_rate": 3.499924573205533e-05,
"loss": 1.2654,
"step": 150
},
{
"epoch": 0.01390508657037768,
"grad_norm": 11.9375,
"learning_rate": 3.499917811794448e-05,
"loss": 1.7612,
"step": 155
},
{
"epoch": 0.014353637750067283,
"grad_norm": 5.21875,
"learning_rate": 3.4999107602064155e-05,
"loss": 1.5516,
"step": 160
},
{
"epoch": 0.014802188929756885,
"grad_norm": 5.5625,
"learning_rate": 3.4999034184429946e-05,
"loss": 1.6631,
"step": 165
},
{
"epoch": 0.015250740109446487,
"grad_norm": 6.03125,
"learning_rate": 3.4998957865058077e-05,
"loss": 1.2633,
"step": 170
},
{
"epoch": 0.01569929128913609,
"grad_norm": 5.375,
"learning_rate": 3.4998878643965425e-05,
"loss": 1.3481,
"step": 175
},
{
"epoch": 0.016147842468825693,
"grad_norm": 5.84375,
"learning_rate": 3.4998796521169516e-05,
"loss": 1.5153,
"step": 180
},
{
"epoch": 0.016596393648515294,
"grad_norm": 6.09375,
"learning_rate": 3.4998711496688497e-05,
"loss": 1.4984,
"step": 185
},
{
"epoch": 0.0170449448282049,
"grad_norm": 5.6875,
"learning_rate": 3.4998623570541177e-05,
"loss": 1.5425,
"step": 190
},
{
"epoch": 0.017493496007894502,
"grad_norm": 5.53125,
"learning_rate": 3.499853274274698e-05,
"loss": 1.6314,
"step": 195
},
{
"epoch": 0.017942047187584104,
"grad_norm": 6.0625,
"learning_rate": 3.4998439013326e-05,
"loss": 1.252,
"step": 200
},
{
"epoch": 0.018390598367273706,
"grad_norm": 4.96875,
"learning_rate": 3.499834238229896e-05,
"loss": 1.1834,
"step": 205
},
{
"epoch": 0.018839149546963307,
"grad_norm": 3.734375,
"learning_rate": 3.499824284968722e-05,
"loss": 1.3504,
"step": 210
},
{
"epoch": 0.019287700726652913,
"grad_norm": 9.1875,
"learning_rate": 3.499814041551279e-05,
"loss": 1.7496,
"step": 215
},
{
"epoch": 0.019736251906342515,
"grad_norm": 10.375,
"learning_rate": 3.499803507979832e-05,
"loss": 1.8731,
"step": 220
},
{
"epoch": 0.020184803086032117,
"grad_norm": 4.8125,
"learning_rate": 3.4997926842567095e-05,
"loss": 1.4044,
"step": 225
},
{
"epoch": 0.02063335426572172,
"grad_norm": 7.28125,
"learning_rate": 3.499781570384305e-05,
"loss": 1.7739,
"step": 230
},
{
"epoch": 0.02108190544541132,
"grad_norm": 4.8125,
"learning_rate": 3.499770166365077e-05,
"loss": 1.5356,
"step": 235
},
{
"epoch": 0.021530456625100922,
"grad_norm": 5.40625,
"learning_rate": 3.499758472201544e-05,
"loss": 1.3888,
"step": 240
},
{
"epoch": 0.021979007804790528,
"grad_norm": 4.65625,
"learning_rate": 3.4997464878962945e-05,
"loss": 1.3103,
"step": 245
},
{
"epoch": 0.02242755898448013,
"grad_norm": 6.28125,
"learning_rate": 3.499734213451976e-05,
"loss": 1.6716,
"step": 250
},
{
"epoch": 0.02287611016416973,
"grad_norm": 11.375,
"learning_rate": 3.499721648871305e-05,
"loss": 1.9034,
"step": 255
},
{
"epoch": 0.023324661343859333,
"grad_norm": 5.4375,
"learning_rate": 3.499708794157057e-05,
"loss": 1.5581,
"step": 260
},
{
"epoch": 0.023773212523548935,
"grad_norm": 5.625,
"learning_rate": 3.4996956493120746e-05,
"loss": 1.438,
"step": 265
},
{
"epoch": 0.02422176370323854,
"grad_norm": 7.03125,
"learning_rate": 3.499682214339265e-05,
"loss": 1.6683,
"step": 270
},
{
"epoch": 0.024670314882928143,
"grad_norm": 5.09375,
"learning_rate": 3.499668489241599e-05,
"loss": 1.2929,
"step": 275
},
{
"epoch": 0.025118866062617744,
"grad_norm": 6.53125,
"learning_rate": 3.4996544740221106e-05,
"loss": 1.4972,
"step": 280
},
{
"epoch": 0.025567417242307346,
"grad_norm": 5.125,
"learning_rate": 3.499640168683898e-05,
"loss": 1.2642,
"step": 285
},
{
"epoch": 0.026015968421996948,
"grad_norm": 4.71875,
"learning_rate": 3.499625573230125e-05,
"loss": 1.481,
"step": 290
},
{
"epoch": 0.026464519601686554,
"grad_norm": 7.0,
"learning_rate": 3.499610687664018e-05,
"loss": 1.4126,
"step": 295
},
{
"epoch": 0.026913070781376155,
"grad_norm": 4.46875,
"learning_rate": 3.499595511988869e-05,
"loss": 1.7327,
"step": 300
},
{
"epoch": 0.027361621961065757,
"grad_norm": 4.65625,
"learning_rate": 3.499580046208033e-05,
"loss": 1.4368,
"step": 305
},
{
"epoch": 0.02781017314075536,
"grad_norm": 4.5625,
"learning_rate": 3.49956429032493e-05,
"loss": 1.4169,
"step": 310
},
{
"epoch": 0.02825872432044496,
"grad_norm": 6.71875,
"learning_rate": 3.4995482443430426e-05,
"loss": 1.5973,
"step": 315
},
{
"epoch": 0.028707275500134567,
"grad_norm": 3.453125,
"learning_rate": 3.499531908265919e-05,
"loss": 1.6349,
"step": 320
},
{
"epoch": 0.02915582667982417,
"grad_norm": 2.65625,
"learning_rate": 3.499515282097171e-05,
"loss": 1.3379,
"step": 325
},
{
"epoch": 0.02960437785951377,
"grad_norm": 4.46875,
"learning_rate": 3.4994983658404754e-05,
"loss": 1.5752,
"step": 330
},
{
"epoch": 0.030052929039203372,
"grad_norm": 4.40625,
"learning_rate": 3.499481159499572e-05,
"loss": 1.3574,
"step": 335
},
{
"epoch": 0.030501480218892974,
"grad_norm": 4.0625,
"learning_rate": 3.499463663078264e-05,
"loss": 1.4591,
"step": 340
},
{
"epoch": 0.03095003139858258,
"grad_norm": 4.53125,
"learning_rate": 3.499445876580422e-05,
"loss": 1.5197,
"step": 345
},
{
"epoch": 0.03139858257827218,
"grad_norm": 3.390625,
"learning_rate": 3.499427800009977e-05,
"loss": 1.3433,
"step": 350
},
{
"epoch": 0.03184713375796178,
"grad_norm": 4.34375,
"learning_rate": 3.4994094333709264e-05,
"loss": 1.6308,
"step": 355
},
{
"epoch": 0.032295684937651385,
"grad_norm": 8.125,
"learning_rate": 3.499390776667331e-05,
"loss": 1.5933,
"step": 360
},
{
"epoch": 0.03274423611734099,
"grad_norm": 5.5625,
"learning_rate": 3.499371829903316e-05,
"loss": 1.4659,
"step": 365
},
{
"epoch": 0.03319278729703059,
"grad_norm": 4.03125,
"learning_rate": 3.4993525930830694e-05,
"loss": 1.3809,
"step": 370
},
{
"epoch": 0.03364133847672019,
"grad_norm": 5.0,
"learning_rate": 3.499333066210846e-05,
"loss": 1.4262,
"step": 375
},
{
"epoch": 0.0340898896564098,
"grad_norm": 3.171875,
"learning_rate": 3.4993132492909624e-05,
"loss": 1.5301,
"step": 380
},
{
"epoch": 0.0345384408360994,
"grad_norm": 5.09375,
"learning_rate": 3.499293142327801e-05,
"loss": 1.4591,
"step": 385
},
{
"epoch": 0.034986992015789004,
"grad_norm": 3.703125,
"learning_rate": 3.499272745325806e-05,
"loss": 1.4161,
"step": 390
},
{
"epoch": 0.035435543195478605,
"grad_norm": 5.40625,
"learning_rate": 3.4992520582894886e-05,
"loss": 1.4047,
"step": 395
},
{
"epoch": 0.03588409437516821,
"grad_norm": 5.1875,
"learning_rate": 3.499231081223422e-05,
"loss": 1.9751,
"step": 400
},
{
"epoch": 0.03633264555485781,
"grad_norm": 6.5625,
"learning_rate": 3.499209814132244e-05,
"loss": 1.5034,
"step": 405
},
{
"epoch": 0.03678119673454741,
"grad_norm": 4.96875,
"learning_rate": 3.499188257020657e-05,
"loss": 1.5578,
"step": 410
},
{
"epoch": 0.03722974791423701,
"grad_norm": 5.875,
"learning_rate": 3.499166409893428e-05,
"loss": 1.552,
"step": 415
},
{
"epoch": 0.037678299093926615,
"grad_norm": 4.5,
"learning_rate": 3.499144272755387e-05,
"loss": 1.4408,
"step": 420
},
{
"epoch": 0.03812685027361622,
"grad_norm": 4.21875,
"learning_rate": 3.499121845611428e-05,
"loss": 1.4257,
"step": 425
},
{
"epoch": 0.038575401453305826,
"grad_norm": 5.9375,
"learning_rate": 3.49909912846651e-05,
"loss": 1.4551,
"step": 430
},
{
"epoch": 0.03902395263299543,
"grad_norm": 2.984375,
"learning_rate": 3.499076121325657e-05,
"loss": 1.2085,
"step": 435
},
{
"epoch": 0.03947250381268503,
"grad_norm": 4.875,
"learning_rate": 3.499052824193953e-05,
"loss": 1.2269,
"step": 440
},
{
"epoch": 0.03992105499237463,
"grad_norm": 5.1875,
"learning_rate": 3.499029237076552e-05,
"loss": 1.5631,
"step": 445
},
{
"epoch": 0.04036960617206423,
"grad_norm": 5.28125,
"learning_rate": 3.499005359978668e-05,
"loss": 1.4777,
"step": 450
},
{
"epoch": 0.040818157351753835,
"grad_norm": 4.3125,
"learning_rate": 3.49898119290558e-05,
"loss": 1.3492,
"step": 455
},
{
"epoch": 0.04126670853144344,
"grad_norm": 3.59375,
"learning_rate": 3.4989567358626314e-05,
"loss": 1.5453,
"step": 460
},
{
"epoch": 0.04171525971113304,
"grad_norm": 5.0625,
"learning_rate": 3.49893198885523e-05,
"loss": 1.4498,
"step": 465
},
{
"epoch": 0.04216381089082264,
"grad_norm": 3.359375,
"learning_rate": 3.498906951888847e-05,
"loss": 1.4422,
"step": 470
},
{
"epoch": 0.04261236207051224,
"grad_norm": 3.46875,
"learning_rate": 3.4988816249690185e-05,
"loss": 1.4364,
"step": 475
},
{
"epoch": 0.043060913250201845,
"grad_norm": 4.125,
"learning_rate": 3.498856008101344e-05,
"loss": 1.6421,
"step": 480
},
{
"epoch": 0.04350946442989145,
"grad_norm": 3.453125,
"learning_rate": 3.4988301012914875e-05,
"loss": 1.4868,
"step": 485
},
{
"epoch": 0.043958015609581055,
"grad_norm": 3.765625,
"learning_rate": 3.498803904545177e-05,
"loss": 1.6498,
"step": 490
},
{
"epoch": 0.04440656678927066,
"grad_norm": 5.59375,
"learning_rate": 3.4987774178682054e-05,
"loss": 2.0081,
"step": 495
},
{
"epoch": 0.04485511796896026,
"grad_norm": 3.734375,
"learning_rate": 3.4987506412664274e-05,
"loss": 1.4555,
"step": 500
},
{
"epoch": 0.04485511796896026,
"eval_loss": 2.2115273475646973,
"eval_runtime": 14.9781,
"eval_samples_per_second": 13.353,
"eval_steps_per_second": 13.353,
"step": 500
}
],
"logging_steps": 5,
"max_steps": 33441,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200.0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.37005101498368e+17,
"train_batch_size": 18,
"trial_name": null,
"trial_params": null
}