Instruments-16bit-3B-4Epoch / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 8240,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.12135922330097088,
"grad_norm": 0.9599943161010742,
"learning_rate": 9.698956057295461e-05,
"loss": 1.2196,
"step": 250
},
{
"epoch": 0.24271844660194175,
"grad_norm": 1.0285232067108154,
"learning_rate": 9.395484340859432e-05,
"loss": 0.6688,
"step": 500
},
{
"epoch": 0.3640776699029126,
"grad_norm": 0.8658091425895691,
"learning_rate": 9.092012624423404e-05,
"loss": 0.5603,
"step": 750
},
{
"epoch": 0.4854368932038835,
"grad_norm": 0.8770154714584351,
"learning_rate": 8.788540907987377e-05,
"loss": 0.5165,
"step": 1000
},
{
"epoch": 0.6067961165048543,
"grad_norm": 0.6812583208084106,
"learning_rate": 8.485069191551348e-05,
"loss": 0.5015,
"step": 1250
},
{
"epoch": 0.7281553398058253,
"grad_norm": 0.6911689043045044,
"learning_rate": 8.181597475115321e-05,
"loss": 0.4846,
"step": 1500
},
{
"epoch": 0.8495145631067961,
"grad_norm": 0.6556753516197205,
"learning_rate": 7.878125758679291e-05,
"loss": 0.4769,
"step": 1750
},
{
"epoch": 0.970873786407767,
"grad_norm": 0.5876182317733765,
"learning_rate": 7.574654042243264e-05,
"loss": 0.4701,
"step": 2000
},
{
"epoch": 1.0922330097087378,
"grad_norm": 0.6175569891929626,
"learning_rate": 7.271182325807235e-05,
"loss": 0.4616,
"step": 2250
},
{
"epoch": 1.2135922330097086,
"grad_norm": 0.6353004574775696,
"learning_rate": 6.967710609371208e-05,
"loss": 0.4518,
"step": 2500
},
{
"epoch": 1.3349514563106797,
"grad_norm": 0.5879459977149963,
"learning_rate": 6.664238892935178e-05,
"loss": 0.4483,
"step": 2750
},
{
"epoch": 1.4563106796116505,
"grad_norm": 0.6575189232826233,
"learning_rate": 6.360767176499151e-05,
"loss": 0.4367,
"step": 3000
},
{
"epoch": 1.5776699029126213,
"grad_norm": 0.724533200263977,
"learning_rate": 6.0572954600631224e-05,
"loss": 0.4226,
"step": 3250
},
{
"epoch": 1.6990291262135924,
"grad_norm": 0.7686433792114258,
"learning_rate": 5.7538237436270945e-05,
"loss": 0.4104,
"step": 3500
},
{
"epoch": 1.820388349514563,
"grad_norm": 0.7101556658744812,
"learning_rate": 5.450352027191066e-05,
"loss": 0.3954,
"step": 3750
},
{
"epoch": 1.941747572815534,
"grad_norm": 0.7856088280677795,
"learning_rate": 5.146880310755038e-05,
"loss": 0.3827,
"step": 4000
},
{
"epoch": 2.063106796116505,
"grad_norm": 0.8785816431045532,
"learning_rate": 4.84340859431901e-05,
"loss": 0.3585,
"step": 4250
},
{
"epoch": 2.1844660194174756,
"grad_norm": 0.858726441860199,
"learning_rate": 4.539936877882982e-05,
"loss": 0.341,
"step": 4500
},
{
"epoch": 2.3058252427184467,
"grad_norm": 0.8789017200469971,
"learning_rate": 4.236465161446954e-05,
"loss": 0.3313,
"step": 4750
},
{
"epoch": 2.4271844660194173,
"grad_norm": 0.9984813928604126,
"learning_rate": 3.932993445010925e-05,
"loss": 0.321,
"step": 5000
},
{
"epoch": 2.5485436893203883,
"grad_norm": 0.8649771213531494,
"learning_rate": 3.6295217285748975e-05,
"loss": 0.3104,
"step": 5250
},
{
"epoch": 2.6699029126213594,
"grad_norm": 0.9905620217323303,
"learning_rate": 3.326050012138869e-05,
"loss": 0.3008,
"step": 5500
},
{
"epoch": 2.79126213592233,
"grad_norm": 0.9460727572441101,
"learning_rate": 3.022578295702841e-05,
"loss": 0.2965,
"step": 5750
},
{
"epoch": 2.912621359223301,
"grad_norm": 0.8885589241981506,
"learning_rate": 2.7191065792668125e-05,
"loss": 0.2876,
"step": 6000
},
{
"epoch": 3.033980582524272,
"grad_norm": 0.9261214733123779,
"learning_rate": 2.4156348628307843e-05,
"loss": 0.2759,
"step": 6250
},
{
"epoch": 3.1553398058252426,
"grad_norm": 0.9241772294044495,
"learning_rate": 2.112163146394756e-05,
"loss": 0.2618,
"step": 6500
},
{
"epoch": 3.2766990291262137,
"grad_norm": 0.929602861404419,
"learning_rate": 1.808691429958728e-05,
"loss": 0.2578,
"step": 6750
},
{
"epoch": 3.3980582524271843,
"grad_norm": 0.9885833263397217,
"learning_rate": 1.5052197135226997e-05,
"loss": 0.2547,
"step": 7000
},
{
"epoch": 3.5194174757281553,
"grad_norm": 0.9474493861198425,
"learning_rate": 1.2017479970866715e-05,
"loss": 0.2528,
"step": 7250
},
{
"epoch": 3.6407766990291264,
"grad_norm": 0.9105657935142517,
"learning_rate": 8.982762806506435e-06,
"loss": 0.25,
"step": 7500
},
{
"epoch": 3.762135922330097,
"grad_norm": 0.9185407161712646,
"learning_rate": 5.948045642146152e-06,
"loss": 0.2463,
"step": 7750
},
{
"epoch": 3.883495145631068,
"grad_norm": 0.8763870000839233,
"learning_rate": 2.9133284777858704e-06,
"loss": 0.2462,
"step": 8000
},
{
"epoch": 4.0,
"step": 8240,
"total_flos": 1.1079720316327956e+18,
"train_loss": 0.39851856185394585,
"train_runtime": 97554.2615,
"train_samples_per_second": 5.406,
"train_steps_per_second": 0.084
}
],
"logging_steps": 250,
"max_steps": 8240,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1079720316327956e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
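
The JSON above is the raw state dumped by the Hugging Face Trainer; the training curves it records can be read straight out of "log_history". Below is a minimal sketch in Python, assuming the file is saved locally as "trainer_state.json" and that matplotlib is installed (neither assumption comes from the file itself).

import json

import matplotlib.pyplot as plt

# Load the trainer state written by transformers.Trainer.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step logging entries; the final summary entry
# (train_loss, train_runtime, ...) carries no "loss" key.
logs = [entry for entry in state["log_history"] if "loss" in entry]

steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]
lrs = [entry["learning_rate"] for entry in logs]

# Plot training loss and learning rate against the global step.
fig, loss_ax = plt.subplots()
loss_ax.plot(steps, losses, label="training loss")
loss_ax.set_xlabel("global step")
loss_ax.set_ylabel("loss")

lr_ax = loss_ax.twinx()
lr_ax.plot(steps, lrs, color="tab:orange", label="learning rate")
lr_ax.set_ylabel("learning rate")

fig.tight_layout()
plt.show()

Read this way, the logged loss falls from 1.22 at step 250 to about 0.25 by step 8000, while the learning rate decays roughly linearly from ~9.7e-05 toward zero over the 8240 steps of the 4-epoch run.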