assignment3-step1 / checkpoint-2500 / trainer_state.json
{
"best_global_step": 2500,
"best_metric": 0.7841161489486694,
"best_model_checkpoint": "./llama2-m2/checkpoint-2500",
"epoch": 2.9767718880285887,
"eval_steps": 100,
"global_step": 2500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05955926146515783,
"grad_norm": 11.590325355529785,
"learning_rate": 4.9000000000000005e-06,
"loss": 3.0703,
"step": 50
},
{
"epoch": 0.11911852293031566,
"grad_norm": 6.117842197418213,
"learning_rate": 9.9e-06,
"loss": 2.3868,
"step": 100
},
{
"epoch": 0.11911852293031566,
"eval_loss": 1.5943528413772583,
"eval_runtime": 86.9995,
"eval_samples_per_second": 8.081,
"eval_steps_per_second": 2.023,
"step": 100
},
{
"epoch": 0.1786777843954735,
"grad_norm": 0.4276258051395416,
"learning_rate": 9.797269342159703e-06,
"loss": 1.1152,
"step": 150
},
{
"epoch": 0.23823704586063132,
"grad_norm": 0.34813204407691956,
"learning_rate": 9.590401323955318e-06,
"loss": 0.9458,
"step": 200
},
{
"epoch": 0.23823704586063132,
"eval_loss": 0.9201429486274719,
"eval_runtime": 86.8761,
"eval_samples_per_second": 8.092,
"eval_steps_per_second": 2.026,
"step": 200
},
{
"epoch": 0.29779630732578916,
"grad_norm": 0.31664666533470154,
"learning_rate": 9.383533305750931e-06,
"loss": 0.8754,
"step": 250
},
{
"epoch": 0.357355568790947,
"grad_norm": 0.3039833903312683,
"learning_rate": 9.176665287546546e-06,
"loss": 0.8236,
"step": 300
},
{
"epoch": 0.357355568790947,
"eval_loss": 0.8294563293457031,
"eval_runtime": 86.6916,
"eval_samples_per_second": 8.109,
"eval_steps_per_second": 2.03,
"step": 300
},
{
"epoch": 0.4169148302561048,
"grad_norm": 0.3569670021533966,
"learning_rate": 8.969797269342161e-06,
"loss": 0.7643,
"step": 350
},
{
"epoch": 0.47647409172126265,
"grad_norm": 0.4587797224521637,
"learning_rate": 8.762929251137776e-06,
"loss": 0.7638,
"step": 400
},
{
"epoch": 0.47647409172126265,
"eval_loss": 0.8007138967514038,
"eval_runtime": 86.7325,
"eval_samples_per_second": 8.105,
"eval_steps_per_second": 2.029,
"step": 400
},
{
"epoch": 0.5360333531864205,
"grad_norm": 0.2903870940208435,
"learning_rate": 8.556061232933389e-06,
"loss": 0.7505,
"step": 450
},
{
"epoch": 0.5955926146515783,
"grad_norm": 0.39271315932273865,
"learning_rate": 8.349193214729004e-06,
"loss": 0.7773,
"step": 500
},
{
"epoch": 0.5955926146515783,
"eval_loss": 0.7964405417442322,
"eval_runtime": 86.7496,
"eval_samples_per_second": 8.104,
"eval_steps_per_second": 2.029,
"step": 500
},
{
"epoch": 0.6551518761167362,
"grad_norm": 0.2611350119113922,
"learning_rate": 8.142325196524617e-06,
"loss": 0.7339,
"step": 550
},
{
"epoch": 0.714711137581894,
"grad_norm": 0.3096601665019989,
"learning_rate": 7.935457178320233e-06,
"loss": 0.7867,
"step": 600
},
{
"epoch": 0.714711137581894,
"eval_loss": 0.7935438752174377,
"eval_runtime": 86.8192,
"eval_samples_per_second": 8.097,
"eval_steps_per_second": 2.027,
"step": 600
},
{
"epoch": 0.7742703990470519,
"grad_norm": 0.28062084317207336,
"learning_rate": 7.728589160115847e-06,
"loss": 0.7642,
"step": 650
},
{
"epoch": 0.8338296605122096,
"grad_norm": 0.2916211783885956,
"learning_rate": 7.521721141911461e-06,
"loss": 0.7436,
"step": 700
},
{
"epoch": 0.8338296605122096,
"eval_loss": 0.7918882369995117,
"eval_runtime": 86.8706,
"eval_samples_per_second": 8.092,
"eval_steps_per_second": 2.026,
"step": 700
},
{
"epoch": 0.8933889219773675,
"grad_norm": 0.4260661005973816,
"learning_rate": 7.3148531237070755e-06,
"loss": 0.7944,
"step": 750
},
{
"epoch": 0.9529481834425253,
"grad_norm": 0.3311309218406677,
"learning_rate": 7.1079851055026895e-06,
"loss": 0.7618,
"step": 800
},
{
"epoch": 0.9529481834425253,
"eval_loss": 0.7905948758125305,
"eval_runtime": 86.7293,
"eval_samples_per_second": 8.106,
"eval_steps_per_second": 2.029,
"step": 800
},
{
"epoch": 1.0119118522930315,
"grad_norm": 0.33902204036712646,
"learning_rate": 6.901117087298304e-06,
"loss": 0.7565,
"step": 850
},
{
"epoch": 1.0714711137581894,
"grad_norm": 0.3156481981277466,
"learning_rate": 6.694249069093918e-06,
"loss": 0.7834,
"step": 900
},
{
"epoch": 1.0714711137581894,
"eval_loss": 0.789471447467804,
"eval_runtime": 86.7639,
"eval_samples_per_second": 8.102,
"eval_steps_per_second": 2.028,
"step": 900
},
{
"epoch": 1.1310303752233473,
"grad_norm": 0.29626569151878357,
"learning_rate": 6.487381050889533e-06,
"loss": 0.7636,
"step": 950
},
{
"epoch": 1.1905896366885051,
"grad_norm": 0.32058003544807434,
"learning_rate": 6.280513032685147e-06,
"loss": 0.7588,
"step": 1000
},
{
"epoch": 1.1905896366885051,
"eval_loss": 0.7887451648712158,
"eval_runtime": 86.8029,
"eval_samples_per_second": 8.099,
"eval_steps_per_second": 2.028,
"step": 1000
},
{
"epoch": 1.2501488981536628,
"grad_norm": 0.3029298484325409,
"learning_rate": 6.073645014480761e-06,
"loss": 0.7651,
"step": 1050
},
{
"epoch": 1.3097081596188207,
"grad_norm": 0.30075645446777344,
"learning_rate": 5.866776996276376e-06,
"loss": 0.747,
"step": 1100
},
{
"epoch": 1.3097081596188207,
"eval_loss": 0.7880399227142334,
"eval_runtime": 86.7707,
"eval_samples_per_second": 8.102,
"eval_steps_per_second": 2.028,
"step": 1100
},
{
"epoch": 1.3692674210839786,
"grad_norm": 0.30230703949928284,
"learning_rate": 5.659908978071991e-06,
"loss": 0.7694,
"step": 1150
},
{
"epoch": 1.4288266825491365,
"grad_norm": 0.2981889545917511,
"learning_rate": 5.453040959867605e-06,
"loss": 0.7546,
"step": 1200
},
{
"epoch": 1.4288266825491365,
"eval_loss": 0.7873143553733826,
"eval_runtime": 86.9249,
"eval_samples_per_second": 8.087,
"eval_steps_per_second": 2.025,
"step": 1200
},
{
"epoch": 1.4883859440142944,
"grad_norm": 0.33295580744743347,
"learning_rate": 5.246172941663219e-06,
"loss": 0.7356,
"step": 1250
},
{
"epoch": 1.547945205479452,
"grad_norm": 0.2881334125995636,
"learning_rate": 5.039304923458833e-06,
"loss": 0.7616,
"step": 1300
},
{
"epoch": 1.547945205479452,
"eval_loss": 0.7868330478668213,
"eval_runtime": 86.9371,
"eval_samples_per_second": 8.086,
"eval_steps_per_second": 2.024,
"step": 1300
},
{
"epoch": 1.60750446694461,
"grad_norm": 0.42549142241477966,
"learning_rate": 4.832436905254448e-06,
"loss": 0.7613,
"step": 1350
},
{
"epoch": 1.6670637284097678,
"grad_norm": 0.32537880539894104,
"learning_rate": 4.625568887050063e-06,
"loss": 0.777,
"step": 1400
},
{
"epoch": 1.6670637284097678,
"eval_loss": 0.7863583564758301,
"eval_runtime": 86.9105,
"eval_samples_per_second": 8.089,
"eval_steps_per_second": 2.025,
"step": 1400
},
{
"epoch": 1.7266229898749255,
"grad_norm": 0.31612130999565125,
"learning_rate": 4.418700868845677e-06,
"loss": 0.7123,
"step": 1450
},
{
"epoch": 1.7861822513400833,
"grad_norm": 0.39497706294059753,
"learning_rate": 4.211832850641292e-06,
"loss": 0.7999,
"step": 1500
},
{
"epoch": 1.7861822513400833,
"eval_loss": 0.7859570980072021,
"eval_runtime": 86.7739,
"eval_samples_per_second": 8.102,
"eval_steps_per_second": 2.028,
"step": 1500
},
{
"epoch": 1.8457415128052412,
"grad_norm": 0.3905975818634033,
"learning_rate": 4.004964832436906e-06,
"loss": 0.7105,
"step": 1550
},
{
"epoch": 1.905300774270399,
"grad_norm": 0.3420596718788147,
"learning_rate": 3.7980968142325196e-06,
"loss": 0.7735,
"step": 1600
},
{
"epoch": 1.905300774270399,
"eval_loss": 0.7855594754219055,
"eval_runtime": 86.9977,
"eval_samples_per_second": 8.081,
"eval_steps_per_second": 2.023,
"step": 1600
},
{
"epoch": 1.964860035735557,
"grad_norm": 0.2925880551338196,
"learning_rate": 3.5912287960281345e-06,
"loss": 0.7675,
"step": 1650
},
{
"epoch": 2.023823704586063,
"grad_norm": 0.42387983202934265,
"learning_rate": 3.3843607778237485e-06,
"loss": 0.7679,
"step": 1700
},
{
"epoch": 2.023823704586063,
"eval_loss": 0.7852116227149963,
"eval_runtime": 86.7932,
"eval_samples_per_second": 8.1,
"eval_steps_per_second": 2.028,
"step": 1700
},
{
"epoch": 2.083382966051221,
"grad_norm": 0.3012678325176239,
"learning_rate": 3.1774927596193634e-06,
"loss": 0.7529,
"step": 1750
},
{
"epoch": 2.1429422275163788,
"grad_norm": 0.3647378385066986,
"learning_rate": 2.9706247414149774e-06,
"loss": 0.7772,
"step": 1800
},
{
"epoch": 2.1429422275163788,
"eval_loss": 0.7850247025489807,
"eval_runtime": 86.7181,
"eval_samples_per_second": 8.107,
"eval_steps_per_second": 2.03,
"step": 1800
},
{
"epoch": 2.202501488981537,
"grad_norm": 0.30863115191459656,
"learning_rate": 2.763756723210592e-06,
"loss": 0.7485,
"step": 1850
},
{
"epoch": 2.2620607504466945,
"grad_norm": 0.3829723298549652,
"learning_rate": 2.5568887050062062e-06,
"loss": 0.7449,
"step": 1900
},
{
"epoch": 2.2620607504466945,
"eval_loss": 0.7847884893417358,
"eval_runtime": 86.725,
"eval_samples_per_second": 8.106,
"eval_steps_per_second": 2.029,
"step": 1900
},
{
"epoch": 2.321620011911852,
"grad_norm": 0.3733135759830475,
"learning_rate": 2.3500206868018207e-06,
"loss": 0.7508,
"step": 1950
},
{
"epoch": 2.3811792733770103,
"grad_norm": 0.37344199419021606,
"learning_rate": 2.143152668597435e-06,
"loss": 0.7509,
"step": 2000
},
{
"epoch": 2.3811792733770103,
"eval_loss": 0.7846249938011169,
"eval_runtime": 86.7361,
"eval_samples_per_second": 8.105,
"eval_steps_per_second": 2.029,
"step": 2000
},
{
"epoch": 2.440738534842168,
"grad_norm": 0.46035104990005493,
"learning_rate": 1.9362846503930496e-06,
"loss": 0.7901,
"step": 2050
},
{
"epoch": 2.5002977963073256,
"grad_norm": 0.31786802411079407,
"learning_rate": 1.7294166321886638e-06,
"loss": 0.7654,
"step": 2100
},
{
"epoch": 2.5002977963073256,
"eval_loss": 0.7844468951225281,
"eval_runtime": 86.7628,
"eval_samples_per_second": 8.103,
"eval_steps_per_second": 2.029,
"step": 2100
},
{
"epoch": 2.5598570577724837,
"grad_norm": 0.337811678647995,
"learning_rate": 1.5225486139842782e-06,
"loss": 0.7524,
"step": 2150
},
{
"epoch": 2.6194163192376414,
"grad_norm": 0.29232126474380493,
"learning_rate": 1.3156805957798926e-06,
"loss": 0.7279,
"step": 2200
},
{
"epoch": 2.6194163192376414,
"eval_loss": 0.7843312621116638,
"eval_runtime": 86.7533,
"eval_samples_per_second": 8.103,
"eval_steps_per_second": 2.029,
"step": 2200
},
{
"epoch": 2.678975580702799,
"grad_norm": 0.4377705454826355,
"learning_rate": 1.1088125775755069e-06,
"loss": 0.7593,
"step": 2250
},
{
"epoch": 2.738534842167957,
"grad_norm": 0.36447674036026,
"learning_rate": 9.019445593711212e-07,
"loss": 0.7523,
"step": 2300
},
{
"epoch": 2.738534842167957,
"eval_loss": 0.7842342257499695,
"eval_runtime": 86.8097,
"eval_samples_per_second": 8.098,
"eval_steps_per_second": 2.027,
"step": 2300
},
{
"epoch": 2.798094103633115,
"grad_norm": 0.38712531328201294,
"learning_rate": 6.950765411667356e-07,
"loss": 0.7347,
"step": 2350
},
{
"epoch": 2.857653365098273,
"grad_norm": 0.34733325242996216,
"learning_rate": 4.882085229623501e-07,
"loss": 0.7605,
"step": 2400
},
{
"epoch": 2.857653365098273,
"eval_loss": 0.7841441035270691,
"eval_runtime": 86.8339,
"eval_samples_per_second": 8.096,
"eval_steps_per_second": 2.027,
"step": 2400
},
{
"epoch": 2.9172126265634306,
"grad_norm": 0.3819723129272461,
"learning_rate": 2.8134050475796445e-07,
"loss": 0.7412,
"step": 2450
},
{
"epoch": 2.9767718880285887,
"grad_norm": 0.3409363329410553,
"learning_rate": 7.447248655357883e-08,
"loss": 0.7425,
"step": 2500
},
{
"epoch": 2.9767718880285887,
"eval_loss": 0.7841161489486694,
"eval_runtime": 87.2598,
"eval_samples_per_second": 8.056,
"eval_steps_per_second": 2.017,
"step": 2500
}
],
"logging_steps": 50,
"max_steps": 2517,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.120601880087757e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
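
The JSON above is the state file the Hugging Face Transformers Trainer writes alongside each checkpoint; its keys (`log_history`, `best_metric`, `best_global_step`, and so on) can be read back with the standard `json` module. A minimal sketch of inspecting it, assuming the file sits in a local `checkpoint-2500/` folder (the path is an assumption, not part of the checkpoint):

```python
# Sketch: load trainer_state.json and summarize the logged metrics.
# Assumes the checkpoint folder has been downloaded locally; adjust the path as needed.
import json

with open("checkpoint-2500/trainer_state.json") as f:
    state = json.load(f)

print("best step:", state["best_global_step"])            # 2500
print("best eval_loss:", state["best_metric"])            # ~0.7841
print("best checkpoint:", state["best_model_checkpoint"])

# log_history mixes training-loss entries ("loss") and evaluation entries ("eval_loss");
# split them by key so each series can be tabulated or plotted separately.
train_log = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_log = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

for step, eval_loss in eval_log:
    print(f"step {step:>5}: eval_loss = {eval_loss:.4f}")
```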