JesseLiu's picture
Upload folder using huggingface_hub
311969c verified
{
"best_global_step": 200,
"best_metric": 0.27645987272262573,
"best_model_checkpoint": "./model_weights/llama32-1b-re-kpath-baseline/checkpoint-200",
"epoch": 0.2222222222222222,
"eval_steps": 25,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005555555555555556,
"grad_norm": 2.139066219329834,
"learning_rate": 8.88888888888889e-06,
"loss": 2.687,
"step": 5
},
{
"epoch": 0.011111111111111112,
"grad_norm": 1.8011914491653442,
"learning_rate": 2e-05,
"loss": 2.8564,
"step": 10
},
{
"epoch": 0.016666666666666666,
"grad_norm": 1.9902386665344238,
"learning_rate": 3.111111111111111e-05,
"loss": 2.8579,
"step": 15
},
{
"epoch": 0.022222222222222223,
"grad_norm": 2.4330344200134277,
"learning_rate": 4.222222222222222e-05,
"loss": 2.7422,
"step": 20
},
{
"epoch": 0.027777777777777776,
"grad_norm": 3.047966957092285,
"learning_rate": 5.333333333333333e-05,
"loss": 2.2828,
"step": 25
},
{
"epoch": 0.027777777777777776,
"eval_loss": 2.117645263671875,
"eval_runtime": 7.2642,
"eval_samples_per_second": 27.532,
"eval_steps_per_second": 27.532,
"step": 25
},
{
"epoch": 0.03333333333333333,
"grad_norm": 2.9378368854522705,
"learning_rate": 6.444444444444446e-05,
"loss": 1.8589,
"step": 30
},
{
"epoch": 0.03888888888888889,
"grad_norm": 3.6299028396606445,
"learning_rate": 7.555555555555556e-05,
"loss": 1.3353,
"step": 35
},
{
"epoch": 0.044444444444444446,
"grad_norm": 2.8062655925750732,
"learning_rate": 8.666666666666667e-05,
"loss": 0.9788,
"step": 40
},
{
"epoch": 0.05,
"grad_norm": 2.1027402877807617,
"learning_rate": 9.777777777777778e-05,
"loss": 0.6529,
"step": 45
},
{
"epoch": 0.05555555555555555,
"grad_norm": 4.867608547210693,
"learning_rate": 0.00010888888888888889,
"loss": 0.6028,
"step": 50
},
{
"epoch": 0.05555555555555555,
"eval_loss": 0.5804749131202698,
"eval_runtime": 7.2598,
"eval_samples_per_second": 27.549,
"eval_steps_per_second": 27.549,
"step": 50
},
{
"epoch": 0.06111111111111111,
"grad_norm": 3.358004331588745,
"learning_rate": 0.00012,
"loss": 0.6569,
"step": 55
},
{
"epoch": 0.06666666666666667,
"grad_norm": 3.5663840770721436,
"learning_rate": 0.00013111111111111111,
"loss": 0.4684,
"step": 60
},
{
"epoch": 0.07222222222222222,
"grad_norm": 4.208554267883301,
"learning_rate": 0.00014222222222222224,
"loss": 0.5365,
"step": 65
},
{
"epoch": 0.07777777777777778,
"grad_norm": 1.8781005144119263,
"learning_rate": 0.00015333333333333334,
"loss": 0.4501,
"step": 70
},
{
"epoch": 0.08333333333333333,
"grad_norm": 1.7693791389465332,
"learning_rate": 0.00016444444444444444,
"loss": 0.414,
"step": 75
},
{
"epoch": 0.08333333333333333,
"eval_loss": 0.42754867672920227,
"eval_runtime": 7.3627,
"eval_samples_per_second": 27.164,
"eval_steps_per_second": 27.164,
"step": 75
},
{
"epoch": 0.08888888888888889,
"grad_norm": 1.6324015855789185,
"learning_rate": 0.00017555555555555556,
"loss": 0.4589,
"step": 80
},
{
"epoch": 0.09444444444444444,
"grad_norm": 1.6526750326156616,
"learning_rate": 0.0001866666666666667,
"loss": 0.3764,
"step": 85
},
{
"epoch": 0.1,
"grad_norm": 2.409635066986084,
"learning_rate": 0.00019777777777777778,
"loss": 0.4764,
"step": 90
},
{
"epoch": 0.10555555555555556,
"grad_norm": 1.594348669052124,
"learning_rate": 0.0001999972998023366,
"loss": 0.3086,
"step": 95
},
{
"epoch": 0.1111111111111111,
"grad_norm": 2.8289716243743896,
"learning_rate": 0.0001999863304992469,
"loss": 0.3952,
"step": 100
},
{
"epoch": 0.1111111111111111,
"eval_loss": 0.34535014629364014,
"eval_runtime": 7.6969,
"eval_samples_per_second": 25.984,
"eval_steps_per_second": 25.984,
"step": 100
},
{
"epoch": 0.11666666666666667,
"grad_norm": 1.6983684301376343,
"learning_rate": 0.00019996692425326533,
"loss": 0.4116,
"step": 105
},
{
"epoch": 0.12222222222222222,
"grad_norm": 1.1369361877441406,
"learning_rate": 0.0001999390827019096,
"loss": 0.3407,
"step": 110
},
{
"epoch": 0.12777777777777777,
"grad_norm": 1.4411648511886597,
"learning_rate": 0.0001999028081944766,
"loss": 0.318,
"step": 115
},
{
"epoch": 0.13333333333333333,
"grad_norm": 1.4671945571899414,
"learning_rate": 0.00019985810379184426,
"loss": 0.2887,
"step": 120
},
{
"epoch": 0.1388888888888889,
"grad_norm": 1.9020946025848389,
"learning_rate": 0.00019980497326621316,
"loss": 0.4495,
"step": 125
},
{
"epoch": 0.1388888888888889,
"eval_loss": 0.2982136905193329,
"eval_runtime": 7.4623,
"eval_samples_per_second": 26.801,
"eval_steps_per_second": 26.801,
"step": 125
},
{
"epoch": 0.14444444444444443,
"grad_norm": 2.5713489055633545,
"learning_rate": 0.00019974342110078817,
"loss": 0.3463,
"step": 130
},
{
"epoch": 0.15,
"grad_norm": 0.6249582767486572,
"learning_rate": 0.00019967345248940034,
"loss": 0.3235,
"step": 135
},
{
"epoch": 0.15555555555555556,
"grad_norm": 0.5932977199554443,
"learning_rate": 0.00019959507333606853,
"loss": 0.3018,
"step": 140
},
{
"epoch": 0.16111111111111112,
"grad_norm": 1.2969094514846802,
"learning_rate": 0.00019950829025450114,
"loss": 0.2389,
"step": 145
},
{
"epoch": 0.16666666666666666,
"grad_norm": 2.421125888824463,
"learning_rate": 0.00019941311056753826,
"loss": 0.3315,
"step": 150
},
{
"epoch": 0.16666666666666666,
"eval_loss": 0.2966170310974121,
"eval_runtime": 7.463,
"eval_samples_per_second": 26.799,
"eval_steps_per_second": 26.799,
"step": 150
},
{
"epoch": 0.17222222222222222,
"grad_norm": 2.2616477012634277,
"learning_rate": 0.00019930954230653355,
"loss": 0.3305,
"step": 155
},
{
"epoch": 0.17777777777777778,
"grad_norm": 2.539074659347534,
"learning_rate": 0.0001991975942106767,
"loss": 0.351,
"step": 160
},
{
"epoch": 0.18333333333333332,
"grad_norm": 1.5154845714569092,
"learning_rate": 0.0001990772757262558,
"loss": 0.2465,
"step": 165
},
{
"epoch": 0.18888888888888888,
"grad_norm": 1.7008557319641113,
"learning_rate": 0.00019894859700586047,
"loss": 0.2776,
"step": 170
},
{
"epoch": 0.19444444444444445,
"grad_norm": 0.9177167415618896,
"learning_rate": 0.00019881156890752517,
"loss": 0.2448,
"step": 175
},
{
"epoch": 0.19444444444444445,
"eval_loss": 0.27951088547706604,
"eval_runtime": 7.3091,
"eval_samples_per_second": 27.363,
"eval_steps_per_second": 27.363,
"step": 175
},
{
"epoch": 0.2,
"grad_norm": 1.8103035688400269,
"learning_rate": 0.00019866620299381285,
"loss": 0.27,
"step": 180
},
{
"epoch": 0.20555555555555555,
"grad_norm": 2.015127658843994,
"learning_rate": 0.0001985125115308393,
"loss": 0.2856,
"step": 185
},
{
"epoch": 0.2111111111111111,
"grad_norm": 1.7647919654846191,
"learning_rate": 0.00019835050748723824,
"loss": 0.312,
"step": 190
},
{
"epoch": 0.21666666666666667,
"grad_norm": 1.8039538860321045,
"learning_rate": 0.00019818020453306697,
"loss": 0.3223,
"step": 195
},
{
"epoch": 0.2222222222222222,
"grad_norm": 1.753201961517334,
"learning_rate": 0.00019800161703865282,
"loss": 0.2553,
"step": 200
},
{
"epoch": 0.2222222222222222,
"eval_loss": 0.27645987272262573,
"eval_runtime": 7.272,
"eval_samples_per_second": 27.503,
"eval_steps_per_second": 27.503,
"step": 200
}
],
"logging_steps": 5,
"max_steps": 1800,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 200,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.005
},
"attributes": {
"early_stopping_patience_counter": 1
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1202727577190400.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}