qwen2.5-1.5b-sft3-25-3 / trainer_state.json
hZzy's picture
Model save
5bdae9f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.75609756097561,
"eval_steps": 5,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04878048780487805,
"grad_norm": 5.151791580979949,
"learning_rate": 5e-08,
"loss": 2.9815,
"step": 1
},
{
"epoch": 0.24390243902439024,
"grad_norm": 5.13763305558627,
"learning_rate": 2.5e-07,
"loss": 2.9732,
"step": 5
},
{
"epoch": 0.24390243902439024,
"eval_loss": 2.9549503326416016,
"eval_runtime": 23.256,
"eval_samples_per_second": 31.734,
"eval_steps_per_second": 0.817,
"step": 5
},
{
"epoch": 0.4878048780487805,
"grad_norm": 4.3040377415100535,
"learning_rate": 5e-07,
"loss": 2.9685,
"step": 10
},
{
"epoch": 0.4878048780487805,
"eval_loss": 2.9328665733337402,
"eval_runtime": 19.8689,
"eval_samples_per_second": 37.144,
"eval_steps_per_second": 0.956,
"step": 10
},
{
"epoch": 0.7317073170731707,
"grad_norm": 3.403638208876821,
"learning_rate": 7.5e-07,
"loss": 2.9341,
"step": 15
},
{
"epoch": 0.7317073170731707,
"eval_loss": 2.886631488800049,
"eval_runtime": 18.4607,
"eval_samples_per_second": 39.977,
"eval_steps_per_second": 1.029,
"step": 15
},
{
"epoch": 0.975609756097561,
"grad_norm": 3.00111206949746,
"learning_rate": 1e-06,
"loss": 2.8788,
"step": 20
},
{
"epoch": 0.975609756097561,
"eval_loss": 2.8079330921173096,
"eval_runtime": 19.9381,
"eval_samples_per_second": 37.014,
"eval_steps_per_second": 0.953,
"step": 20
},
{
"epoch": 1.2195121951219512,
"grad_norm": 2.7591628779527766,
"learning_rate": 9.980973490458728e-07,
"loss": 2.8082,
"step": 25
},
{
"epoch": 1.2195121951219512,
"eval_loss": 2.7484195232391357,
"eval_runtime": 18.8326,
"eval_samples_per_second": 39.187,
"eval_steps_per_second": 1.009,
"step": 25
},
{
"epoch": 1.4634146341463414,
"grad_norm": 2.388152230770368,
"learning_rate": 9.92403876506104e-07,
"loss": 2.7341,
"step": 30
},
{
"epoch": 1.4634146341463414,
"eval_loss": 2.6838204860687256,
"eval_runtime": 20.3367,
"eval_samples_per_second": 36.289,
"eval_steps_per_second": 0.934,
"step": 30
},
{
"epoch": 1.7073170731707317,
"grad_norm": 2.339030304927524,
"learning_rate": 9.82962913144534e-07,
"loss": 2.6784,
"step": 35
},
{
"epoch": 1.7073170731707317,
"eval_loss": 2.633502244949341,
"eval_runtime": 18.7828,
"eval_samples_per_second": 39.291,
"eval_steps_per_second": 1.012,
"step": 35
},
{
"epoch": 1.951219512195122,
"grad_norm": 2.289213281990378,
"learning_rate": 9.698463103929541e-07,
"loss": 2.6326,
"step": 40
},
{
"epoch": 1.951219512195122,
"eval_loss": 2.5951168537139893,
"eval_runtime": 20.4763,
"eval_samples_per_second": 36.042,
"eval_steps_per_second": 0.928,
"step": 40
},
{
"epoch": 2.1951219512195124,
"grad_norm": 2.224029018392997,
"learning_rate": 9.531538935183249e-07,
"loss": 2.5934,
"step": 45
},
{
"epoch": 2.1951219512195124,
"eval_loss": 2.5593512058258057,
"eval_runtime": 18.9816,
"eval_samples_per_second": 38.88,
"eval_steps_per_second": 1.001,
"step": 45
},
{
"epoch": 2.4390243902439024,
"grad_norm": 2.2412373397938583,
"learning_rate": 9.330127018922193e-07,
"loss": 2.5543,
"step": 50
},
{
"epoch": 2.4390243902439024,
"eval_loss": 2.521718740463257,
"eval_runtime": 20.215,
"eval_samples_per_second": 36.508,
"eval_steps_per_second": 0.94,
"step": 50
},
{
"epoch": 2.682926829268293,
"grad_norm": 2.209083408074763,
"learning_rate": 9.095760221444959e-07,
"loss": 2.513,
"step": 55
},
{
"epoch": 2.682926829268293,
"eval_loss": 2.4829368591308594,
"eval_runtime": 19.4395,
"eval_samples_per_second": 37.964,
"eval_steps_per_second": 0.977,
"step": 55
},
{
"epoch": 2.926829268292683,
"grad_norm": 2.297068739774513,
"learning_rate": 8.83022221559489e-07,
"loss": 2.4712,
"step": 60
},
{
"epoch": 2.926829268292683,
"eval_loss": 2.446091890335083,
"eval_runtime": 19.007,
"eval_samples_per_second": 38.828,
"eval_steps_per_second": 1.0,
"step": 60
},
{
"epoch": 3.1707317073170733,
"grad_norm": 2.3349163677591314,
"learning_rate": 8.535533905932737e-07,
"loss": 2.4365,
"step": 65
},
{
"epoch": 3.1707317073170733,
"eval_loss": 2.413790225982666,
"eval_runtime": 20.2301,
"eval_samples_per_second": 36.48,
"eval_steps_per_second": 0.939,
"step": 65
},
{
"epoch": 3.4146341463414633,
"grad_norm": 2.285003975033001,
"learning_rate": 8.213938048432696e-07,
"loss": 2.4066,
"step": 70
},
{
"epoch": 3.4146341463414633,
"eval_loss": 2.3858845233917236,
"eval_runtime": 18.3399,
"eval_samples_per_second": 40.24,
"eval_steps_per_second": 1.036,
"step": 70
},
{
"epoch": 3.658536585365854,
"grad_norm": 2.1725408410741203,
"learning_rate": 7.86788218175523e-07,
"loss": 2.375,
"step": 75
},
{
"epoch": 3.658536585365854,
"eval_loss": 2.3606066703796387,
"eval_runtime": 19.9461,
"eval_samples_per_second": 37.0,
"eval_steps_per_second": 0.953,
"step": 75
},
{
"epoch": 3.902439024390244,
"grad_norm": 2.2674210698852133,
"learning_rate": 7.5e-07,
"loss": 2.3415,
"step": 80
},
{
"epoch": 3.902439024390244,
"eval_loss": 2.336864709854126,
"eval_runtime": 18.4703,
"eval_samples_per_second": 39.956,
"eval_steps_per_second": 1.029,
"step": 80
},
{
"epoch": 4.146341463414634,
"grad_norm": 2.22603925687789,
"learning_rate": 7.113091308703497e-07,
"loss": 2.3225,
"step": 85
},
{
"epoch": 4.146341463414634,
"eval_loss": 2.3142693042755127,
"eval_runtime": 19.8254,
"eval_samples_per_second": 37.225,
"eval_steps_per_second": 0.958,
"step": 85
},
{
"epoch": 4.390243902439025,
"grad_norm": 2.250461857867485,
"learning_rate": 6.710100716628344e-07,
"loss": 2.2989,
"step": 90
},
{
"epoch": 4.390243902439025,
"eval_loss": 2.292672634124756,
"eval_runtime": 19.1763,
"eval_samples_per_second": 38.485,
"eval_steps_per_second": 0.991,
"step": 90
},
{
"epoch": 4.634146341463414,
"grad_norm": 2.2063059930566107,
"learning_rate": 6.294095225512604e-07,
"loss": 2.2748,
"step": 95
},
{
"epoch": 4.634146341463414,
"eval_loss": 2.2731637954711914,
"eval_runtime": 18.4462,
"eval_samples_per_second": 40.008,
"eval_steps_per_second": 1.03,
"step": 95
},
{
"epoch": 4.878048780487805,
"grad_norm": 2.2114255470751205,
"learning_rate": 5.868240888334652e-07,
"loss": 2.2513,
"step": 100
},
{
"epoch": 4.878048780487805,
"eval_loss": 2.2562241554260254,
"eval_runtime": 19.9755,
"eval_samples_per_second": 36.945,
"eval_steps_per_second": 0.951,
"step": 100
},
{
"epoch": 5.121951219512195,
"grad_norm": 2.1470949247310247,
"learning_rate": 5.435778713738292e-07,
"loss": 2.2401,
"step": 105
},
{
"epoch": 5.121951219512195,
"eval_loss": 2.241244077682495,
"eval_runtime": 18.4347,
"eval_samples_per_second": 40.033,
"eval_steps_per_second": 1.031,
"step": 105
},
{
"epoch": 5.365853658536586,
"grad_norm": 2.2399669699712543,
"learning_rate": 5e-07,
"loss": 2.2172,
"step": 110
},
{
"epoch": 5.365853658536586,
"eval_loss": 2.228184461593628,
"eval_runtime": 19.9167,
"eval_samples_per_second": 37.054,
"eval_steps_per_second": 0.954,
"step": 110
},
{
"epoch": 5.609756097560975,
"grad_norm": 2.248609346835798,
"learning_rate": 4.5642212862617085e-07,
"loss": 2.204,
"step": 115
},
{
"epoch": 5.609756097560975,
"eval_loss": 2.216791868209839,
"eval_runtime": 18.6439,
"eval_samples_per_second": 39.584,
"eval_steps_per_second": 1.019,
"step": 115
},
{
"epoch": 5.853658536585366,
"grad_norm": 2.2843160705737393,
"learning_rate": 4.131759111665348e-07,
"loss": 2.1893,
"step": 120
},
{
"epoch": 5.853658536585366,
"eval_loss": 2.2068991661071777,
"eval_runtime": 19.7257,
"eval_samples_per_second": 37.413,
"eval_steps_per_second": 0.963,
"step": 120
},
{
"epoch": 6.097560975609756,
"grad_norm": 2.198659146419105,
"learning_rate": 3.7059047744873955e-07,
"loss": 2.1784,
"step": 125
},
{
"epoch": 6.097560975609756,
"eval_loss": 2.1983890533447266,
"eval_runtime": 20.2531,
"eval_samples_per_second": 36.439,
"eval_steps_per_second": 0.938,
"step": 125
},
{
"epoch": 6.341463414634147,
"grad_norm": 2.2746004739227996,
"learning_rate": 3.2898992833716563e-07,
"loss": 2.1646,
"step": 130
},
{
"epoch": 6.341463414634147,
"eval_loss": 2.191380500793457,
"eval_runtime": 18.8387,
"eval_samples_per_second": 39.175,
"eval_steps_per_second": 1.009,
"step": 130
},
{
"epoch": 6.585365853658536,
"grad_norm": 2.182140726336947,
"learning_rate": 2.8869086912965036e-07,
"loss": 2.1673,
"step": 135
},
{
"epoch": 6.585365853658536,
"eval_loss": 2.185249090194702,
"eval_runtime": 20.3689,
"eval_samples_per_second": 36.232,
"eval_steps_per_second": 0.933,
"step": 135
},
{
"epoch": 6.829268292682927,
"grad_norm": 2.2128884344288244,
"learning_rate": 2.500000000000001e-07,
"loss": 2.1555,
"step": 140
},
{
"epoch": 6.829268292682927,
"eval_loss": 2.180130958557129,
"eval_runtime": 19.0502,
"eval_samples_per_second": 38.74,
"eval_steps_per_second": 0.997,
"step": 140
},
{
"epoch": 7.073170731707317,
"grad_norm": 2.102015386581095,
"learning_rate": 2.1321178182447709e-07,
"loss": 2.1599,
"step": 145
},
{
"epoch": 7.073170731707317,
"eval_loss": 2.175684690475464,
"eval_runtime": 20.511,
"eval_samples_per_second": 35.981,
"eval_steps_per_second": 0.926,
"step": 145
},
{
"epoch": 7.317073170731708,
"grad_norm": 2.313350074598611,
"learning_rate": 1.7860619515673032e-07,
"loss": 2.145,
"step": 150
},
{
"epoch": 7.317073170731708,
"eval_loss": 2.1720824241638184,
"eval_runtime": 18.8834,
"eval_samples_per_second": 39.082,
"eval_steps_per_second": 1.006,
"step": 150
},
{
"epoch": 7.560975609756097,
"grad_norm": 2.253715272870049,
"learning_rate": 1.4644660940672627e-07,
"loss": 2.1359,
"step": 155
},
{
"epoch": 7.560975609756097,
"eval_loss": 2.169234275817871,
"eval_runtime": 19.3523,
"eval_samples_per_second": 38.135,
"eval_steps_per_second": 0.982,
"step": 155
},
{
"epoch": 7.804878048780488,
"grad_norm": 2.216308681434654,
"learning_rate": 1.1697777844051104e-07,
"loss": 2.1391,
"step": 160
},
{
"epoch": 7.804878048780488,
"eval_loss": 2.1668455600738525,
"eval_runtime": 20.0953,
"eval_samples_per_second": 36.725,
"eval_steps_per_second": 0.945,
"step": 160
},
{
"epoch": 8.048780487804878,
"grad_norm": 2.2398335056684116,
"learning_rate": 9.042397785550404e-08,
"loss": 2.1274,
"step": 165
},
{
"epoch": 8.048780487804878,
"eval_loss": 2.1650218963623047,
"eval_runtime": 18.6515,
"eval_samples_per_second": 39.568,
"eval_steps_per_second": 1.019,
"step": 165
},
{
"epoch": 8.292682926829269,
"grad_norm": 2.183896419200447,
"learning_rate": 6.698729810778064e-08,
"loss": 2.1342,
"step": 170
},
{
"epoch": 8.292682926829269,
"eval_loss": 2.163686752319336,
"eval_runtime": 20.1658,
"eval_samples_per_second": 36.597,
"eval_steps_per_second": 0.942,
"step": 170
},
{
"epoch": 8.536585365853659,
"grad_norm": 2.260276175787966,
"learning_rate": 4.684610648167503e-08,
"loss": 2.1272,
"step": 175
},
{
"epoch": 8.536585365853659,
"eval_loss": 2.1627187728881836,
"eval_runtime": 18.4069,
"eval_samples_per_second": 40.094,
"eval_steps_per_second": 1.032,
"step": 175
},
{
"epoch": 8.78048780487805,
"grad_norm": 2.2467910530017043,
"learning_rate": 3.015368960704584e-08,
"loss": 2.133,
"step": 180
},
{
"epoch": 8.78048780487805,
"eval_loss": 2.162067413330078,
"eval_runtime": 19.691,
"eval_samples_per_second": 37.479,
"eval_steps_per_second": 0.965,
"step": 180
},
{
"epoch": 9.024390243902438,
"grad_norm": 2.2631951574285387,
"learning_rate": 1.7037086855465898e-08,
"loss": 2.1286,
"step": 185
},
{
"epoch": 9.024390243902438,
"eval_loss": 2.1616575717926025,
"eval_runtime": 19.1797,
"eval_samples_per_second": 38.478,
"eval_steps_per_second": 0.991,
"step": 185
},
{
"epoch": 9.268292682926829,
"grad_norm": 2.2305961635805214,
"learning_rate": 7.59612349389599e-09,
"loss": 2.1296,
"step": 190
},
{
"epoch": 9.268292682926829,
"eval_loss": 2.161451578140259,
"eval_runtime": 18.6125,
"eval_samples_per_second": 39.651,
"eval_steps_per_second": 1.021,
"step": 190
},
{
"epoch": 9.512195121951219,
"grad_norm": 2.209641020058134,
"learning_rate": 1.9026509541272273e-09,
"loss": 2.1256,
"step": 195
},
{
"epoch": 9.512195121951219,
"eval_loss": 2.161362409591675,
"eval_runtime": 20.0337,
"eval_samples_per_second": 36.838,
"eval_steps_per_second": 0.948,
"step": 195
},
{
"epoch": 9.75609756097561,
"grad_norm": 2.2480648233150617,
"learning_rate": 0.0,
"loss": 2.1267,
"step": 200
},
{
"epoch": 9.75609756097561,
"eval_loss": 2.1613569259643555,
"eval_runtime": 18.4232,
"eval_samples_per_second": 40.058,
"eval_steps_per_second": 1.031,
"step": 200
},
{
"epoch": 9.75609756097561,
"step": 200,
"total_flos": 1.742036445167616e+16,
"train_loss": 2.3644245743751524,
"train_runtime": 7118.2054,
"train_samples_per_second": 9.167,
"train_steps_per_second": 0.028
}
],
"logging_steps": 5,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.742036445167616e+16,
"train_batch_size": 10,
"trial_name": null,
"trial_params": null
}