{ "best_global_step": 11700, "best_metric": 0.5111122131347656, "best_model_checkpoint": "./output/gpt-medmentions/checkpoint-11700", "epoch": 5.0, "eval_steps": 500, "global_step": 29250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08547008547008547, "grad_norm": 7.1186957359313965, "learning_rate": 4.9153846153846157e-05, "loss": 0.9374, "step": 500 }, { "epoch": 0.17094017094017094, "grad_norm": 2.6842756271362305, "learning_rate": 4.829914529914531e-05, "loss": 0.6708, "step": 1000 }, { "epoch": 0.2564102564102564, "grad_norm": 4.874898433685303, "learning_rate": 4.7444444444444445e-05, "loss": 0.6453, "step": 1500 }, { "epoch": 0.3418803418803419, "grad_norm": 5.255520820617676, "learning_rate": 4.658974358974359e-05, "loss": 0.6121, "step": 2000 }, { "epoch": 0.42735042735042733, "grad_norm": 2.5840187072753906, "learning_rate": 4.573504273504274e-05, "loss": 0.5936, "step": 2500 }, { "epoch": 0.5128205128205128, "grad_norm": 5.246805667877197, "learning_rate": 4.488547008547009e-05, "loss": 0.5943, "step": 3000 }, { "epoch": 0.5982905982905983, "grad_norm": 3.491377592086792, "learning_rate": 4.4032478632478637e-05, "loss": 0.5663, "step": 3500 }, { "epoch": 0.6837606837606838, "grad_norm": 4.0636210441589355, "learning_rate": 4.317777777777778e-05, "loss": 0.5611, "step": 4000 }, { "epoch": 0.7692307692307693, "grad_norm": 4.316960334777832, "learning_rate": 4.2323076923076925e-05, "loss": 0.5449, "step": 4500 }, { "epoch": 0.8547008547008547, "grad_norm": 4.260451316833496, "learning_rate": 4.146837606837607e-05, "loss": 0.5481, "step": 5000 }, { "epoch": 0.9401709401709402, "grad_norm": 5.917566299438477, "learning_rate": 4.061367521367522e-05, "loss": 0.5307, "step": 5500 }, { "epoch": 1.0, "eval_accuracy": 0.8341444876242783, "eval_f1": 0.44006334125098967, "eval_loss": 0.5368949770927429, "eval_precision": 0.41286584459961373, "eval_recall": 0.47109679606713, "eval_runtime": 20.612, "eval_samples_per_second": 141.859, "eval_steps_per_second": 17.757, "step": 5850 }, { "epoch": 1.0256410256410255, "grad_norm": 3.265369415283203, "learning_rate": 3.975897435897436e-05, "loss": 0.4748, "step": 6000 }, { "epoch": 1.1111111111111112, "grad_norm": 3.2452337741851807, "learning_rate": 3.890427350427351e-05, "loss": 0.3767, "step": 6500 }, { "epoch": 1.1965811965811965, "grad_norm": 2.8009512424468994, "learning_rate": 3.804957264957265e-05, "loss": 0.3761, "step": 7000 }, { "epoch": 1.282051282051282, "grad_norm": 5.858109951019287, "learning_rate": 3.71948717948718e-05, "loss": 0.3922, "step": 7500 }, { "epoch": 1.3675213675213675, "grad_norm": 1.7275584936141968, "learning_rate": 3.634017094017094e-05, "loss": 0.3677, "step": 8000 }, { "epoch": 1.452991452991453, "grad_norm": 4.6104044914245605, "learning_rate": 3.5485470085470085e-05, "loss": 0.3746, "step": 8500 }, { "epoch": 1.5384615384615383, "grad_norm": 2.6036839485168457, "learning_rate": 3.4630769230769236e-05, "loss": 0.3661, "step": 9000 }, { "epoch": 1.623931623931624, "grad_norm": 2.7406065464019775, "learning_rate": 3.3776068376068374e-05, "loss": 0.3712, "step": 9500 }, { "epoch": 1.7094017094017095, "grad_norm": 5.254650115966797, "learning_rate": 3.2921367521367525e-05, "loss": 0.3774, "step": 10000 }, { "epoch": 1.7948717948717947, "grad_norm": 2.274414539337158, "learning_rate": 3.206666666666667e-05, "loss": 0.3541, "step": 10500 }, { "epoch": 1.8803418803418803, "grad_norm": 3.5981504917144775, "learning_rate": 3.121196581196581e-05, "loss": 0.3694, "step": 11000 }, { "epoch": 1.965811965811966, "grad_norm": 3.6442151069641113, "learning_rate": 3.0357264957264958e-05, "loss": 0.3585, "step": 11500 }, { "epoch": 2.0, "eval_accuracy": 0.8454107464662687, "eval_f1": 0.48178988326848243, "eval_loss": 0.5111122131347656, "eval_precision": 0.4453316069630269, "eval_recall": 0.5247499576199356, "eval_runtime": 20.6467, "eval_samples_per_second": 141.621, "eval_steps_per_second": 17.727, "step": 11700 }, { "epoch": 2.051282051282051, "grad_norm": 3.4292666912078857, "learning_rate": 2.9502564102564105e-05, "loss": 0.2559, "step": 12000 }, { "epoch": 2.1367521367521367, "grad_norm": 6.138054370880127, "learning_rate": 2.864786324786325e-05, "loss": 0.1817, "step": 12500 }, { "epoch": 2.2222222222222223, "grad_norm": 3.659104347229004, "learning_rate": 2.7793162393162394e-05, "loss": 0.1759, "step": 13000 }, { "epoch": 2.3076923076923075, "grad_norm": 5.470319747924805, "learning_rate": 2.693846153846154e-05, "loss": 0.184, "step": 13500 }, { "epoch": 2.393162393162393, "grad_norm": 2.9865291118621826, "learning_rate": 2.6083760683760682e-05, "loss": 0.1761, "step": 14000 }, { "epoch": 2.4786324786324787, "grad_norm": 6.152403831481934, "learning_rate": 2.522905982905983e-05, "loss": 0.1798, "step": 14500 }, { "epoch": 2.564102564102564, "grad_norm": 4.3192338943481445, "learning_rate": 2.4374358974358977e-05, "loss": 0.1757, "step": 15000 }, { "epoch": 2.6495726495726495, "grad_norm": 3.217804193496704, "learning_rate": 2.3521367521367523e-05, "loss": 0.1738, "step": 15500 }, { "epoch": 2.735042735042735, "grad_norm": 3.670557737350464, "learning_rate": 2.2666666666666668e-05, "loss": 0.1861, "step": 16000 }, { "epoch": 2.8205128205128203, "grad_norm": 2.3006069660186768, "learning_rate": 2.1811965811965812e-05, "loss": 0.1705, "step": 16500 }, { "epoch": 2.905982905982906, "grad_norm": 1.9008346796035767, "learning_rate": 2.0958974358974358e-05, "loss": 0.1672, "step": 17000 }, { "epoch": 2.9914529914529915, "grad_norm": 1.8553671836853027, "learning_rate": 2.0104273504273506e-05, "loss": 0.1758, "step": 17500 }, { "epoch": 3.0, "eval_accuracy": 0.8497137463068983, "eval_f1": 0.48074844074844075, "eval_loss": 0.6349462270736694, "eval_precision": 0.4718413320274241, "eval_recall": 0.4899983047974233, "eval_runtime": 20.6084, "eval_samples_per_second": 141.884, "eval_steps_per_second": 17.76, "step": 17550 }, { "epoch": 3.076923076923077, "grad_norm": 1.145456314086914, "learning_rate": 1.924957264957265e-05, "loss": 0.0777, "step": 18000 }, { "epoch": 3.1623931623931623, "grad_norm": 2.4131710529327393, "learning_rate": 1.8394871794871797e-05, "loss": 0.0769, "step": 18500 }, { "epoch": 3.247863247863248, "grad_norm": 2.5216588973999023, "learning_rate": 1.754017094017094e-05, "loss": 0.0779, "step": 19000 }, { "epoch": 3.3333333333333335, "grad_norm": 2.811354160308838, "learning_rate": 1.6685470085470086e-05, "loss": 0.0804, "step": 19500 }, { "epoch": 3.4188034188034186, "grad_norm": 0.5833438634872437, "learning_rate": 1.5830769230769233e-05, "loss": 0.0772, "step": 20000 }, { "epoch": 3.5042735042735043, "grad_norm": 3.458584785461426, "learning_rate": 1.4976068376068378e-05, "loss": 0.0751, "step": 20500 }, { "epoch": 3.58974358974359, "grad_norm": 0.8929054141044617, "learning_rate": 1.4121367521367524e-05, "loss": 0.0761, "step": 21000 }, { "epoch": 3.6752136752136755, "grad_norm": 5.908766269683838, "learning_rate": 1.3268376068376068e-05, "loss": 0.0736, "step": 21500 }, { "epoch": 3.7606837606837606, "grad_norm": 1.0228583812713623, "learning_rate": 1.2413675213675214e-05, "loss": 0.0716, "step": 22000 }, { "epoch": 3.8461538461538463, "grad_norm": 0.626966118812561, "learning_rate": 1.1560683760683762e-05, "loss": 0.0718, "step": 22500 }, { "epoch": 3.931623931623932, "grad_norm": 1.9417258501052856, "learning_rate": 1.0705982905982906e-05, "loss": 0.0751, "step": 23000 }, { "epoch": 4.0, "eval_accuracy": 0.8496892277892879, "eval_f1": 0.49008894029434047, "eval_loss": 0.9264360070228577, "eval_precision": 0.46282485875706214, "eval_recall": 0.5207662315646719, "eval_runtime": 20.6775, "eval_samples_per_second": 141.41, "eval_steps_per_second": 17.7, "step": 23400 }, { "epoch": 4.017094017094017, "grad_norm": 0.14389610290527344, "learning_rate": 9.851282051282052e-06, "loss": 0.0641, "step": 23500 }, { "epoch": 4.102564102564102, "grad_norm": 0.6148084402084351, "learning_rate": 8.996581196581196e-06, "loss": 0.0352, "step": 24000 }, { "epoch": 4.188034188034188, "grad_norm": 1.3859856128692627, "learning_rate": 8.141880341880342e-06, "loss": 0.0399, "step": 24500 }, { "epoch": 4.273504273504273, "grad_norm": 1.6096197366714478, "learning_rate": 7.287179487179488e-06, "loss": 0.0385, "step": 25000 }, { "epoch": 4.358974358974359, "grad_norm": 0.26734989881515503, "learning_rate": 6.432478632478633e-06, "loss": 0.0411, "step": 25500 }, { "epoch": 4.444444444444445, "grad_norm": 1.3472919464111328, "learning_rate": 5.577777777777778e-06, "loss": 0.0395, "step": 26000 }, { "epoch": 4.52991452991453, "grad_norm": 0.8369725942611694, "learning_rate": 4.723076923076923e-06, "loss": 0.0432, "step": 26500 }, { "epoch": 4.615384615384615, "grad_norm": 0.7225199341773987, "learning_rate": 3.87008547008547e-06, "loss": 0.0412, "step": 27000 }, { "epoch": 4.700854700854701, "grad_norm": 0.6592767834663391, "learning_rate": 3.0153846153846154e-06, "loss": 0.0407, "step": 27500 }, { "epoch": 4.786324786324786, "grad_norm": 0.9876635670661926, "learning_rate": 2.160683760683761e-06, "loss": 0.0372, "step": 28000 }, { "epoch": 4.871794871794872, "grad_norm": 0.7053186297416687, "learning_rate": 1.3059829059829061e-06, "loss": 0.0384, "step": 28500 }, { "epoch": 4.957264957264957, "grad_norm": 0.24596278369426727, "learning_rate": 4.52991452991453e-07, "loss": 0.0387, "step": 29000 }, { "epoch": 5.0, "eval_accuracy": 0.8518223388213949, "eval_f1": 0.4960441433034446, "eval_loss": 1.0903491973876953, "eval_precision": 0.47575686823877344, "eval_recall": 0.5181386675707748, "eval_runtime": 20.8036, "eval_samples_per_second": 140.552, "eval_steps_per_second": 17.593, "step": 29250 }, { "epoch": 5.0, "step": 29250, "total_flos": 5.182622875540416e+16, "train_loss": 0.2549698821499816, "train_runtime": 7489.8753, "train_samples_per_second": 15.62, "train_steps_per_second": 3.905 } ], "logging_steps": 500, "max_steps": 29250, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.182622875540416e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }