|
{ |
|
"best_global_step": 2500, |
|
"best_metric": 0.7841161489486694, |
|
"best_model_checkpoint": "./llama2-m2/checkpoint-2500", |
|
"epoch": 2.9767718880285887, |
|
"eval_steps": 100, |
|
"global_step": 2500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05955926146515783, |
|
"grad_norm": 11.590325355529785, |
|
"learning_rate": 4.9000000000000005e-06, |
|
"loss": 3.0703, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.11911852293031566, |
|
"grad_norm": 6.117842197418213, |
|
"learning_rate": 9.9e-06, |
|
"loss": 2.3868, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11911852293031566, |
|
"eval_loss": 1.5943528413772583, |
|
"eval_runtime": 86.9995, |
|
"eval_samples_per_second": 8.081, |
|
"eval_steps_per_second": 2.023, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1786777843954735, |
|
"grad_norm": 0.4276258051395416, |
|
"learning_rate": 9.797269342159703e-06, |
|
"loss": 1.1152, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.23823704586063132, |
|
"grad_norm": 0.34813204407691956, |
|
"learning_rate": 9.590401323955318e-06, |
|
"loss": 0.9458, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.23823704586063132, |
|
"eval_loss": 0.9201429486274719, |
|
"eval_runtime": 86.8761, |
|
"eval_samples_per_second": 8.092, |
|
"eval_steps_per_second": 2.026, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.29779630732578916, |
|
"grad_norm": 0.31664666533470154, |
|
"learning_rate": 9.383533305750931e-06, |
|
"loss": 0.8754, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.357355568790947, |
|
"grad_norm": 0.3039833903312683, |
|
"learning_rate": 9.176665287546546e-06, |
|
"loss": 0.8236, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.357355568790947, |
|
"eval_loss": 0.8294563293457031, |
|
"eval_runtime": 86.6916, |
|
"eval_samples_per_second": 8.109, |
|
"eval_steps_per_second": 2.03, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4169148302561048, |
|
"grad_norm": 0.3569670021533966, |
|
"learning_rate": 8.969797269342161e-06, |
|
"loss": 0.7643, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.47647409172126265, |
|
"grad_norm": 0.4587797224521637, |
|
"learning_rate": 8.762929251137776e-06, |
|
"loss": 0.7638, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.47647409172126265, |
|
"eval_loss": 0.8007138967514038, |
|
"eval_runtime": 86.7325, |
|
"eval_samples_per_second": 8.105, |
|
"eval_steps_per_second": 2.029, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5360333531864205, |
|
"grad_norm": 0.2903870940208435, |
|
"learning_rate": 8.556061232933389e-06, |
|
"loss": 0.7505, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5955926146515783, |
|
"grad_norm": 0.39271315932273865, |
|
"learning_rate": 8.349193214729004e-06, |
|
"loss": 0.7773, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5955926146515783, |
|
"eval_loss": 0.7964405417442322, |
|
"eval_runtime": 86.7496, |
|
"eval_samples_per_second": 8.104, |
|
"eval_steps_per_second": 2.029, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6551518761167362, |
|
"grad_norm": 0.2611350119113922, |
|
"learning_rate": 8.142325196524617e-06, |
|
"loss": 0.7339, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.714711137581894, |
|
"grad_norm": 0.3096601665019989, |
|
"learning_rate": 7.935457178320233e-06, |
|
"loss": 0.7867, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.714711137581894, |
|
"eval_loss": 0.7935438752174377, |
|
"eval_runtime": 86.8192, |
|
"eval_samples_per_second": 8.097, |
|
"eval_steps_per_second": 2.027, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7742703990470519, |
|
"grad_norm": 0.28062084317207336, |
|
"learning_rate": 7.728589160115847e-06, |
|
"loss": 0.7642, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.8338296605122096, |
|
"grad_norm": 0.2916211783885956, |
|
"learning_rate": 7.521721141911461e-06, |
|
"loss": 0.7436, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8338296605122096, |
|
"eval_loss": 0.7918882369995117, |
|
"eval_runtime": 86.8706, |
|
"eval_samples_per_second": 8.092, |
|
"eval_steps_per_second": 2.026, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8933889219773675, |
|
"grad_norm": 0.4260661005973816, |
|
"learning_rate": 7.3148531237070755e-06, |
|
"loss": 0.7944, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.9529481834425253, |
|
"grad_norm": 0.3311309218406677, |
|
"learning_rate": 7.1079851055026895e-06, |
|
"loss": 0.7618, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.9529481834425253, |
|
"eval_loss": 0.7905948758125305, |
|
"eval_runtime": 86.7293, |
|
"eval_samples_per_second": 8.106, |
|
"eval_steps_per_second": 2.029, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.0119118522930315, |
|
"grad_norm": 0.33902204036712646, |
|
"learning_rate": 6.901117087298304e-06, |
|
"loss": 0.7565, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.0714711137581894, |
|
"grad_norm": 0.3156481981277466, |
|
"learning_rate": 6.694249069093918e-06, |
|
"loss": 0.7834, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.0714711137581894, |
|
"eval_loss": 0.789471447467804, |
|
"eval_runtime": 86.7639, |
|
"eval_samples_per_second": 8.102, |
|
"eval_steps_per_second": 2.028, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.1310303752233473, |
|
"grad_norm": 0.29626569151878357, |
|
"learning_rate": 6.487381050889533e-06, |
|
"loss": 0.7636, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.1905896366885051, |
|
"grad_norm": 0.32058003544807434, |
|
"learning_rate": 6.280513032685147e-06, |
|
"loss": 0.7588, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.1905896366885051, |
|
"eval_loss": 0.7887451648712158, |
|
"eval_runtime": 86.8029, |
|
"eval_samples_per_second": 8.099, |
|
"eval_steps_per_second": 2.028, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.2501488981536628, |
|
"grad_norm": 0.3029298484325409, |
|
"learning_rate": 6.073645014480761e-06, |
|
"loss": 0.7651, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.3097081596188207, |
|
"grad_norm": 0.30075645446777344, |
|
"learning_rate": 5.866776996276376e-06, |
|
"loss": 0.747, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.3097081596188207, |
|
"eval_loss": 0.7880399227142334, |
|
"eval_runtime": 86.7707, |
|
"eval_samples_per_second": 8.102, |
|
"eval_steps_per_second": 2.028, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.3692674210839786, |
|
"grad_norm": 0.30230703949928284, |
|
"learning_rate": 5.659908978071991e-06, |
|
"loss": 0.7694, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.4288266825491365, |
|
"grad_norm": 0.2981889545917511, |
|
"learning_rate": 5.453040959867605e-06, |
|
"loss": 0.7546, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.4288266825491365, |
|
"eval_loss": 0.7873143553733826, |
|
"eval_runtime": 86.9249, |
|
"eval_samples_per_second": 8.087, |
|
"eval_steps_per_second": 2.025, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.4883859440142944, |
|
"grad_norm": 0.33295580744743347, |
|
"learning_rate": 5.246172941663219e-06, |
|
"loss": 0.7356, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.547945205479452, |
|
"grad_norm": 0.2881334125995636, |
|
"learning_rate": 5.039304923458833e-06, |
|
"loss": 0.7616, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.547945205479452, |
|
"eval_loss": 0.7868330478668213, |
|
"eval_runtime": 86.9371, |
|
"eval_samples_per_second": 8.086, |
|
"eval_steps_per_second": 2.024, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.60750446694461, |
|
"grad_norm": 0.42549142241477966, |
|
"learning_rate": 4.832436905254448e-06, |
|
"loss": 0.7613, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.6670637284097678, |
|
"grad_norm": 0.32537880539894104, |
|
"learning_rate": 4.625568887050063e-06, |
|
"loss": 0.777, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.6670637284097678, |
|
"eval_loss": 0.7863583564758301, |
|
"eval_runtime": 86.9105, |
|
"eval_samples_per_second": 8.089, |
|
"eval_steps_per_second": 2.025, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.7266229898749255, |
|
"grad_norm": 0.31612130999565125, |
|
"learning_rate": 4.418700868845677e-06, |
|
"loss": 0.7123, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.7861822513400833, |
|
"grad_norm": 0.39497706294059753, |
|
"learning_rate": 4.211832850641292e-06, |
|
"loss": 0.7999, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.7861822513400833, |
|
"eval_loss": 0.7859570980072021, |
|
"eval_runtime": 86.7739, |
|
"eval_samples_per_second": 8.102, |
|
"eval_steps_per_second": 2.028, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.8457415128052412, |
|
"grad_norm": 0.3905975818634033, |
|
"learning_rate": 4.004964832436906e-06, |
|
"loss": 0.7105, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.905300774270399, |
|
"grad_norm": 0.3420596718788147, |
|
"learning_rate": 3.7980968142325196e-06, |
|
"loss": 0.7735, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.905300774270399, |
|
"eval_loss": 0.7855594754219055, |
|
"eval_runtime": 86.9977, |
|
"eval_samples_per_second": 8.081, |
|
"eval_steps_per_second": 2.023, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.964860035735557, |
|
"grad_norm": 0.2925880551338196, |
|
"learning_rate": 3.5912287960281345e-06, |
|
"loss": 0.7675, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.023823704586063, |
|
"grad_norm": 0.42387983202934265, |
|
"learning_rate": 3.3843607778237485e-06, |
|
"loss": 0.7679, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.023823704586063, |
|
"eval_loss": 0.7852116227149963, |
|
"eval_runtime": 86.7932, |
|
"eval_samples_per_second": 8.1, |
|
"eval_steps_per_second": 2.028, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.083382966051221, |
|
"grad_norm": 0.3012678325176239, |
|
"learning_rate": 3.1774927596193634e-06, |
|
"loss": 0.7529, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.1429422275163788, |
|
"grad_norm": 0.3647378385066986, |
|
"learning_rate": 2.9706247414149774e-06, |
|
"loss": 0.7772, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.1429422275163788, |
|
"eval_loss": 0.7850247025489807, |
|
"eval_runtime": 86.7181, |
|
"eval_samples_per_second": 8.107, |
|
"eval_steps_per_second": 2.03, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.202501488981537, |
|
"grad_norm": 0.30863115191459656, |
|
"learning_rate": 2.763756723210592e-06, |
|
"loss": 0.7485, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.2620607504466945, |
|
"grad_norm": 0.3829723298549652, |
|
"learning_rate": 2.5568887050062062e-06, |
|
"loss": 0.7449, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.2620607504466945, |
|
"eval_loss": 0.7847884893417358, |
|
"eval_runtime": 86.725, |
|
"eval_samples_per_second": 8.106, |
|
"eval_steps_per_second": 2.029, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.321620011911852, |
|
"grad_norm": 0.3733135759830475, |
|
"learning_rate": 2.3500206868018207e-06, |
|
"loss": 0.7508, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.3811792733770103, |
|
"grad_norm": 0.37344199419021606, |
|
"learning_rate": 2.143152668597435e-06, |
|
"loss": 0.7509, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.3811792733770103, |
|
"eval_loss": 0.7846249938011169, |
|
"eval_runtime": 86.7361, |
|
"eval_samples_per_second": 8.105, |
|
"eval_steps_per_second": 2.029, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.440738534842168, |
|
"grad_norm": 0.46035104990005493, |
|
"learning_rate": 1.9362846503930496e-06, |
|
"loss": 0.7901, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.5002977963073256, |
|
"grad_norm": 0.31786802411079407, |
|
"learning_rate": 1.7294166321886638e-06, |
|
"loss": 0.7654, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.5002977963073256, |
|
"eval_loss": 0.7844468951225281, |
|
"eval_runtime": 86.7628, |
|
"eval_samples_per_second": 8.103, |
|
"eval_steps_per_second": 2.029, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.5598570577724837, |
|
"grad_norm": 0.337811678647995, |
|
"learning_rate": 1.5225486139842782e-06, |
|
"loss": 0.7524, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.6194163192376414, |
|
"grad_norm": 0.29232126474380493, |
|
"learning_rate": 1.3156805957798926e-06, |
|
"loss": 0.7279, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.6194163192376414, |
|
"eval_loss": 0.7843312621116638, |
|
"eval_runtime": 86.7533, |
|
"eval_samples_per_second": 8.103, |
|
"eval_steps_per_second": 2.029, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.678975580702799, |
|
"grad_norm": 0.4377705454826355, |
|
"learning_rate": 1.1088125775755069e-06, |
|
"loss": 0.7593, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.738534842167957, |
|
"grad_norm": 0.36447674036026, |
|
"learning_rate": 9.019445593711212e-07, |
|
"loss": 0.7523, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.738534842167957, |
|
"eval_loss": 0.7842342257499695, |
|
"eval_runtime": 86.8097, |
|
"eval_samples_per_second": 8.098, |
|
"eval_steps_per_second": 2.027, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.798094103633115, |
|
"grad_norm": 0.38712531328201294, |
|
"learning_rate": 6.950765411667356e-07, |
|
"loss": 0.7347, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.857653365098273, |
|
"grad_norm": 0.34733325242996216, |
|
"learning_rate": 4.882085229623501e-07, |
|
"loss": 0.7605, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.857653365098273, |
|
"eval_loss": 0.7841441035270691, |
|
"eval_runtime": 86.8339, |
|
"eval_samples_per_second": 8.096, |
|
"eval_steps_per_second": 2.027, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.9172126265634306, |
|
"grad_norm": 0.3819723129272461, |
|
"learning_rate": 2.8134050475796445e-07, |
|
"loss": 0.7412, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.9767718880285887, |
|
"grad_norm": 0.3409363329410553, |
|
"learning_rate": 7.447248655357883e-08, |
|
"loss": 0.7425, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.9767718880285887, |
|
"eval_loss": 0.7841161489486694, |
|
"eval_runtime": 87.2598, |
|
"eval_samples_per_second": 8.056, |
|
"eval_steps_per_second": 2.017, |
|
"step": 2500 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 2517, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.120601880087757e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|