gpt-medmentions / trainer_state.json
{
"best_global_step": 11700,
"best_metric": 0.5111122131347656,
"best_model_checkpoint": "./output/gpt-medmentions/checkpoint-11700",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 29250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.08547008547008547,
"grad_norm": 7.1186957359313965,
"learning_rate": 4.9153846153846157e-05,
"loss": 0.9374,
"step": 500
},
{
"epoch": 0.17094017094017094,
"grad_norm": 2.6842756271362305,
"learning_rate": 4.829914529914531e-05,
"loss": 0.6708,
"step": 1000
},
{
"epoch": 0.2564102564102564,
"grad_norm": 4.874898433685303,
"learning_rate": 4.7444444444444445e-05,
"loss": 0.6453,
"step": 1500
},
{
"epoch": 0.3418803418803419,
"grad_norm": 5.255520820617676,
"learning_rate": 4.658974358974359e-05,
"loss": 0.6121,
"step": 2000
},
{
"epoch": 0.42735042735042733,
"grad_norm": 2.5840187072753906,
"learning_rate": 4.573504273504274e-05,
"loss": 0.5936,
"step": 2500
},
{
"epoch": 0.5128205128205128,
"grad_norm": 5.246805667877197,
"learning_rate": 4.488547008547009e-05,
"loss": 0.5943,
"step": 3000
},
{
"epoch": 0.5982905982905983,
"grad_norm": 3.491377592086792,
"learning_rate": 4.4032478632478637e-05,
"loss": 0.5663,
"step": 3500
},
{
"epoch": 0.6837606837606838,
"grad_norm": 4.0636210441589355,
"learning_rate": 4.317777777777778e-05,
"loss": 0.5611,
"step": 4000
},
{
"epoch": 0.7692307692307693,
"grad_norm": 4.316960334777832,
"learning_rate": 4.2323076923076925e-05,
"loss": 0.5449,
"step": 4500
},
{
"epoch": 0.8547008547008547,
"grad_norm": 4.260451316833496,
"learning_rate": 4.146837606837607e-05,
"loss": 0.5481,
"step": 5000
},
{
"epoch": 0.9401709401709402,
"grad_norm": 5.917566299438477,
"learning_rate": 4.061367521367522e-05,
"loss": 0.5307,
"step": 5500
},
{
"epoch": 1.0,
"eval_accuracy": 0.8341444876242783,
"eval_f1": 0.44006334125098967,
"eval_loss": 0.5368949770927429,
"eval_precision": 0.41286584459961373,
"eval_recall": 0.47109679606713,
"eval_runtime": 20.612,
"eval_samples_per_second": 141.859,
"eval_steps_per_second": 17.757,
"step": 5850
},
{
"epoch": 1.0256410256410255,
"grad_norm": 3.265369415283203,
"learning_rate": 3.975897435897436e-05,
"loss": 0.4748,
"step": 6000
},
{
"epoch": 1.1111111111111112,
"grad_norm": 3.2452337741851807,
"learning_rate": 3.890427350427351e-05,
"loss": 0.3767,
"step": 6500
},
{
"epoch": 1.1965811965811965,
"grad_norm": 2.8009512424468994,
"learning_rate": 3.804957264957265e-05,
"loss": 0.3761,
"step": 7000
},
{
"epoch": 1.282051282051282,
"grad_norm": 5.858109951019287,
"learning_rate": 3.71948717948718e-05,
"loss": 0.3922,
"step": 7500
},
{
"epoch": 1.3675213675213675,
"grad_norm": 1.7275584936141968,
"learning_rate": 3.634017094017094e-05,
"loss": 0.3677,
"step": 8000
},
{
"epoch": 1.452991452991453,
"grad_norm": 4.6104044914245605,
"learning_rate": 3.5485470085470085e-05,
"loss": 0.3746,
"step": 8500
},
{
"epoch": 1.5384615384615383,
"grad_norm": 2.6036839485168457,
"learning_rate": 3.4630769230769236e-05,
"loss": 0.3661,
"step": 9000
},
{
"epoch": 1.623931623931624,
"grad_norm": 2.7406065464019775,
"learning_rate": 3.3776068376068374e-05,
"loss": 0.3712,
"step": 9500
},
{
"epoch": 1.7094017094017095,
"grad_norm": 5.254650115966797,
"learning_rate": 3.2921367521367525e-05,
"loss": 0.3774,
"step": 10000
},
{
"epoch": 1.7948717948717947,
"grad_norm": 2.274414539337158,
"learning_rate": 3.206666666666667e-05,
"loss": 0.3541,
"step": 10500
},
{
"epoch": 1.8803418803418803,
"grad_norm": 3.5981504917144775,
"learning_rate": 3.121196581196581e-05,
"loss": 0.3694,
"step": 11000
},
{
"epoch": 1.965811965811966,
"grad_norm": 3.6442151069641113,
"learning_rate": 3.0357264957264958e-05,
"loss": 0.3585,
"step": 11500
},
{
"epoch": 2.0,
"eval_accuracy": 0.8454107464662687,
"eval_f1": 0.48178988326848243,
"eval_loss": 0.5111122131347656,
"eval_precision": 0.4453316069630269,
"eval_recall": 0.5247499576199356,
"eval_runtime": 20.6467,
"eval_samples_per_second": 141.621,
"eval_steps_per_second": 17.727,
"step": 11700
},
{
"epoch": 2.051282051282051,
"grad_norm": 3.4292666912078857,
"learning_rate": 2.9502564102564105e-05,
"loss": 0.2559,
"step": 12000
},
{
"epoch": 2.1367521367521367,
"grad_norm": 6.138054370880127,
"learning_rate": 2.864786324786325e-05,
"loss": 0.1817,
"step": 12500
},
{
"epoch": 2.2222222222222223,
"grad_norm": 3.659104347229004,
"learning_rate": 2.7793162393162394e-05,
"loss": 0.1759,
"step": 13000
},
{
"epoch": 2.3076923076923075,
"grad_norm": 5.470319747924805,
"learning_rate": 2.693846153846154e-05,
"loss": 0.184,
"step": 13500
},
{
"epoch": 2.393162393162393,
"grad_norm": 2.9865291118621826,
"learning_rate": 2.6083760683760682e-05,
"loss": 0.1761,
"step": 14000
},
{
"epoch": 2.4786324786324787,
"grad_norm": 6.152403831481934,
"learning_rate": 2.522905982905983e-05,
"loss": 0.1798,
"step": 14500
},
{
"epoch": 2.564102564102564,
"grad_norm": 4.3192338943481445,
"learning_rate": 2.4374358974358977e-05,
"loss": 0.1757,
"step": 15000
},
{
"epoch": 2.6495726495726495,
"grad_norm": 3.217804193496704,
"learning_rate": 2.3521367521367523e-05,
"loss": 0.1738,
"step": 15500
},
{
"epoch": 2.735042735042735,
"grad_norm": 3.670557737350464,
"learning_rate": 2.2666666666666668e-05,
"loss": 0.1861,
"step": 16000
},
{
"epoch": 2.8205128205128203,
"grad_norm": 2.3006069660186768,
"learning_rate": 2.1811965811965812e-05,
"loss": 0.1705,
"step": 16500
},
{
"epoch": 2.905982905982906,
"grad_norm": 1.9008346796035767,
"learning_rate": 2.0958974358974358e-05,
"loss": 0.1672,
"step": 17000
},
{
"epoch": 2.9914529914529915,
"grad_norm": 1.8553671836853027,
"learning_rate": 2.0104273504273506e-05,
"loss": 0.1758,
"step": 17500
},
{
"epoch": 3.0,
"eval_accuracy": 0.8497137463068983,
"eval_f1": 0.48074844074844075,
"eval_loss": 0.6349462270736694,
"eval_precision": 0.4718413320274241,
"eval_recall": 0.4899983047974233,
"eval_runtime": 20.6084,
"eval_samples_per_second": 141.884,
"eval_steps_per_second": 17.76,
"step": 17550
},
{
"epoch": 3.076923076923077,
"grad_norm": 1.145456314086914,
"learning_rate": 1.924957264957265e-05,
"loss": 0.0777,
"step": 18000
},
{
"epoch": 3.1623931623931623,
"grad_norm": 2.4131710529327393,
"learning_rate": 1.8394871794871797e-05,
"loss": 0.0769,
"step": 18500
},
{
"epoch": 3.247863247863248,
"grad_norm": 2.5216588973999023,
"learning_rate": 1.754017094017094e-05,
"loss": 0.0779,
"step": 19000
},
{
"epoch": 3.3333333333333335,
"grad_norm": 2.811354160308838,
"learning_rate": 1.6685470085470086e-05,
"loss": 0.0804,
"step": 19500
},
{
"epoch": 3.4188034188034186,
"grad_norm": 0.5833438634872437,
"learning_rate": 1.5830769230769233e-05,
"loss": 0.0772,
"step": 20000
},
{
"epoch": 3.5042735042735043,
"grad_norm": 3.458584785461426,
"learning_rate": 1.4976068376068378e-05,
"loss": 0.0751,
"step": 20500
},
{
"epoch": 3.58974358974359,
"grad_norm": 0.8929054141044617,
"learning_rate": 1.4121367521367524e-05,
"loss": 0.0761,
"step": 21000
},
{
"epoch": 3.6752136752136755,
"grad_norm": 5.908766269683838,
"learning_rate": 1.3268376068376068e-05,
"loss": 0.0736,
"step": 21500
},
{
"epoch": 3.7606837606837606,
"grad_norm": 1.0228583812713623,
"learning_rate": 1.2413675213675214e-05,
"loss": 0.0716,
"step": 22000
},
{
"epoch": 3.8461538461538463,
"grad_norm": 0.626966118812561,
"learning_rate": 1.1560683760683762e-05,
"loss": 0.0718,
"step": 22500
},
{
"epoch": 3.931623931623932,
"grad_norm": 1.9417258501052856,
"learning_rate": 1.0705982905982906e-05,
"loss": 0.0751,
"step": 23000
},
{
"epoch": 4.0,
"eval_accuracy": 0.8496892277892879,
"eval_f1": 0.49008894029434047,
"eval_loss": 0.9264360070228577,
"eval_precision": 0.46282485875706214,
"eval_recall": 0.5207662315646719,
"eval_runtime": 20.6775,
"eval_samples_per_second": 141.41,
"eval_steps_per_second": 17.7,
"step": 23400
},
{
"epoch": 4.017094017094017,
"grad_norm": 0.14389610290527344,
"learning_rate": 9.851282051282052e-06,
"loss": 0.0641,
"step": 23500
},
{
"epoch": 4.102564102564102,
"grad_norm": 0.6148084402084351,
"learning_rate": 8.996581196581196e-06,
"loss": 0.0352,
"step": 24000
},
{
"epoch": 4.188034188034188,
"grad_norm": 1.3859856128692627,
"learning_rate": 8.141880341880342e-06,
"loss": 0.0399,
"step": 24500
},
{
"epoch": 4.273504273504273,
"grad_norm": 1.6096197366714478,
"learning_rate": 7.287179487179488e-06,
"loss": 0.0385,
"step": 25000
},
{
"epoch": 4.358974358974359,
"grad_norm": 0.26734989881515503,
"learning_rate": 6.432478632478633e-06,
"loss": 0.0411,
"step": 25500
},
{
"epoch": 4.444444444444445,
"grad_norm": 1.3472919464111328,
"learning_rate": 5.577777777777778e-06,
"loss": 0.0395,
"step": 26000
},
{
"epoch": 4.52991452991453,
"grad_norm": 0.8369725942611694,
"learning_rate": 4.723076923076923e-06,
"loss": 0.0432,
"step": 26500
},
{
"epoch": 4.615384615384615,
"grad_norm": 0.7225199341773987,
"learning_rate": 3.87008547008547e-06,
"loss": 0.0412,
"step": 27000
},
{
"epoch": 4.700854700854701,
"grad_norm": 0.6592767834663391,
"learning_rate": 3.0153846153846154e-06,
"loss": 0.0407,
"step": 27500
},
{
"epoch": 4.786324786324786,
"grad_norm": 0.9876635670661926,
"learning_rate": 2.160683760683761e-06,
"loss": 0.0372,
"step": 28000
},
{
"epoch": 4.871794871794872,
"grad_norm": 0.7053186297416687,
"learning_rate": 1.3059829059829061e-06,
"loss": 0.0384,
"step": 28500
},
{
"epoch": 4.957264957264957,
"grad_norm": 0.24596278369426727,
"learning_rate": 4.52991452991453e-07,
"loss": 0.0387,
"step": 29000
},
{
"epoch": 5.0,
"eval_accuracy": 0.8518223388213949,
"eval_f1": 0.4960441433034446,
"eval_loss": 1.0903491973876953,
"eval_precision": 0.47575686823877344,
"eval_recall": 0.5181386675707748,
"eval_runtime": 20.8036,
"eval_samples_per_second": 140.552,
"eval_steps_per_second": 17.593,
"step": 29250
},
{
"epoch": 5.0,
"step": 29250,
"total_flos": 5.182622875540416e+16,
"train_loss": 0.2549698821499816,
"train_runtime": 7489.8753,
"train_samples_per_second": 15.62,
"train_steps_per_second": 3.905
}
],
"logging_steps": 500,
"max_steps": 29250,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.182622875540416e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}