{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 500,
  "global_step": 8240,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.12135922330097088,
      "grad_norm": 0.9599943161010742,
      "learning_rate": 9.698956057295461e-05,
      "loss": 1.2196,
      "step": 250
    },
    {
      "epoch": 0.24271844660194175,
      "grad_norm": 1.0285232067108154,
      "learning_rate": 9.395484340859432e-05,
      "loss": 0.6688,
      "step": 500
    },
    {
      "epoch": 0.3640776699029126,
      "grad_norm": 0.8658091425895691,
      "learning_rate": 9.092012624423404e-05,
      "loss": 0.5603,
      "step": 750
    },
    {
      "epoch": 0.4854368932038835,
      "grad_norm": 0.8770154714584351,
      "learning_rate": 8.788540907987377e-05,
      "loss": 0.5165,
      "step": 1000
    },
    {
      "epoch": 0.6067961165048543,
      "grad_norm": 0.6812583208084106,
      "learning_rate": 8.485069191551348e-05,
      "loss": 0.5015,
      "step": 1250
    },
    {
      "epoch": 0.7281553398058253,
      "grad_norm": 0.6911689043045044,
      "learning_rate": 8.181597475115321e-05,
      "loss": 0.4846,
      "step": 1500
    },
    {
      "epoch": 0.8495145631067961,
      "grad_norm": 0.6556753516197205,
      "learning_rate": 7.878125758679291e-05,
      "loss": 0.4769,
      "step": 1750
    },
    {
      "epoch": 0.970873786407767,
      "grad_norm": 0.5876182317733765,
      "learning_rate": 7.574654042243264e-05,
      "loss": 0.4701,
      "step": 2000
    },
    {
      "epoch": 1.0922330097087378,
      "grad_norm": 0.6175569891929626,
      "learning_rate": 7.271182325807235e-05,
      "loss": 0.4616,
      "step": 2250
    },
    {
      "epoch": 1.2135922330097086,
      "grad_norm": 0.6353004574775696,
      "learning_rate": 6.967710609371208e-05,
      "loss": 0.4518,
      "step": 2500
    },
    {
      "epoch": 1.3349514563106797,
      "grad_norm": 0.5879459977149963,
      "learning_rate": 6.664238892935178e-05,
      "loss": 0.4483,
      "step": 2750
    },
    {
      "epoch": 1.4563106796116505,
      "grad_norm": 0.6575189232826233,
      "learning_rate": 6.360767176499151e-05,
      "loss": 0.4367,
      "step": 3000
    },
    {
      "epoch": 1.5776699029126213,
      "grad_norm": 0.724533200263977,
      "learning_rate": 6.0572954600631224e-05,
      "loss": 0.4226,
      "step": 3250
    },
    {
      "epoch": 1.6990291262135924,
      "grad_norm": 0.7686433792114258,
      "learning_rate": 5.7538237436270945e-05,
      "loss": 0.4104,
      "step": 3500
    },
    {
      "epoch": 1.820388349514563,
      "grad_norm": 0.7101556658744812,
      "learning_rate": 5.450352027191066e-05,
      "loss": 0.3954,
      "step": 3750
    },
    {
      "epoch": 1.941747572815534,
      "grad_norm": 0.7856088280677795,
      "learning_rate": 5.146880310755038e-05,
      "loss": 0.3827,
      "step": 4000
    },
    {
      "epoch": 2.063106796116505,
      "grad_norm": 0.8785816431045532,
      "learning_rate": 4.84340859431901e-05,
      "loss": 0.3585,
      "step": 4250
    },
    {
      "epoch": 2.1844660194174756,
      "grad_norm": 0.858726441860199,
      "learning_rate": 4.539936877882982e-05,
      "loss": 0.341,
      "step": 4500
    },
    {
      "epoch": 2.3058252427184467,
      "grad_norm": 0.8789017200469971,
      "learning_rate": 4.236465161446954e-05,
      "loss": 0.3313,
      "step": 4750
    },
    {
      "epoch": 2.4271844660194173,
      "grad_norm": 0.9984813928604126,
      "learning_rate": 3.932993445010925e-05,
      "loss": 0.321,
      "step": 5000
    },
    {
      "epoch": 2.5485436893203883,
      "grad_norm": 0.8649771213531494,
      "learning_rate": 3.6295217285748975e-05,
      "loss": 0.3104,
      "step": 5250
    },
    {
      "epoch": 2.6699029126213594,
      "grad_norm": 0.9905620217323303,
      "learning_rate": 3.326050012138869e-05,
      "loss": 0.3008,
      "step": 5500
    },
    {
      "epoch": 2.79126213592233,
      "grad_norm": 0.9460727572441101,
      "learning_rate": 3.022578295702841e-05,
      "loss": 0.2965,
      "step": 5750
    },
    {
      "epoch": 2.912621359223301,
      "grad_norm": 0.8885589241981506,
      "learning_rate": 2.7191065792668125e-05,
      "loss": 0.2876,
      "step": 6000
    },
    {
      "epoch": 3.033980582524272,
      "grad_norm": 0.9261214733123779,
      "learning_rate": 2.4156348628307843e-05,
      "loss": 0.2759,
      "step": 6250
    },
    {
      "epoch": 3.1553398058252426,
      "grad_norm": 0.9241772294044495,
      "learning_rate": 2.112163146394756e-05,
      "loss": 0.2618,
      "step": 6500
    },
    {
      "epoch": 3.2766990291262137,
      "grad_norm": 0.929602861404419,
      "learning_rate": 1.808691429958728e-05,
      "loss": 0.2578,
      "step": 6750
    },
    {
      "epoch": 3.3980582524271843,
      "grad_norm": 0.9885833263397217,
      "learning_rate": 1.5052197135226997e-05,
      "loss": 0.2547,
      "step": 7000
    },
    {
      "epoch": 3.5194174757281553,
      "grad_norm": 0.9474493861198425,
      "learning_rate": 1.2017479970866715e-05,
      "loss": 0.2528,
      "step": 7250
    },
    {
      "epoch": 3.6407766990291264,
      "grad_norm": 0.9105657935142517,
      "learning_rate": 8.982762806506435e-06,
      "loss": 0.25,
      "step": 7500
    },
    {
      "epoch": 3.762135922330097,
      "grad_norm": 0.9185407161712646,
      "learning_rate": 5.948045642146152e-06,
      "loss": 0.2463,
      "step": 7750
    },
    {
      "epoch": 3.883495145631068,
      "grad_norm": 0.8763870000839233,
      "learning_rate": 2.9133284777858704e-06,
      "loss": 0.2462,
      "step": 8000
    },
    {
      "epoch": 4.0,
      "step": 8240,
      "total_flos": 1.1079720316327956e+18,
      "train_loss": 0.39851856185394585,
      "train_runtime": 97554.2615,
      "train_samples_per_second": 5.406,
      "train_steps_per_second": 0.084
    }
  ],
  "logging_steps": 250,
  "max_steps": 8240,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1079720316327956e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}