qwen-0.5b-tricky_meek_hamster / trainer_state.json
xaobai's picture
End of training
631b7a6 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 20,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 268.125,
"epoch": 0.1,
"grad_norm": 212.015380859375,
"kl": 0.0,
"learning_rate": 4.965903258506806e-07,
"loss": -0.0,
"reward": 2.971410434693098,
"reward_std": 1.660103004425764,
"rewards/concensus_correctness_reward_func": 0.8277499973773956,
"rewards/consensus_reward_func": 0.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.5752854123711586,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.15625,
"rewards/xmlcount_reward_func": 0.5371250007301569,
"step": 2
},
{
"completion_length": 161.84375,
"epoch": 0.2,
"grad_norm": 4049.1455078125,
"kl": 3116631.138454,
"learning_rate": 4.698684378016222e-07,
"loss": 3116.6313,
"reward": 4.441190658137202,
"reward_std": 2.443159078247845,
"rewards/concensus_correctness_reward_func": 1.127562504261732,
"rewards/consensus_reward_func": 1.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.7441594742704183,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.265625,
"rewards/xmlcount_reward_func": 0.9288437515497208,
"step": 4
},
{
"completion_length": 173.96875,
"epoch": 0.3,
"grad_norm": 6921.29296875,
"kl": 18184.31745560351,
"learning_rate": 4.193203929064353e-07,
"loss": 18.1843,
"reward": 5.475079163908958,
"reward_std": 1.6933745071146404,
"rewards/concensus_correctness_reward_func": 1.6708124876022339,
"rewards/consensus_reward_func": 1.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.7829542085528374,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3125,
"rewards/xmlcount_reward_func": 1.0213124975562096,
"step": 6
},
{
"completion_length": 206.09375,
"epoch": 0.4,
"grad_norm": 600.0673217773438,
"kl": 10434.655427431862,
"learning_rate": 3.5042385616324236e-07,
"loss": 10.4347,
"reward": 5.194255158305168,
"reward_std": 1.982654629391618,
"rewards/concensus_correctness_reward_func": 1.5163750085048378,
"rewards/consensus_reward_func": 1.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.8320676572620869,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.28125,
"rewards/xmlcount_reward_func": 1.0020624957978725,
"step": 8
},
{
"completion_length": 154.8125,
"epoch": 0.5,
"grad_norm": 278.44561767578125,
"kl": 26163472.887304425,
"learning_rate": 2.706448363680831e-07,
"loss": 26163.4727,
"reward": 6.384411156177521,
"reward_std": 1.6228036261891248,
"rewards/concensus_correctness_reward_func": 2.0416249968111515,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.8268798030912876,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.296875,
"rewards/xmlcount_reward_func": 1.0315312538295984,
"step": 10
},
{
"completion_length": 231.46875,
"epoch": 0.6,
"grad_norm": 50832.27734375,
"kl": 6071.537940578186,
"learning_rate": 1.886286282148002e-07,
"loss": 6.0715,
"reward": 4.917726576328278,
"reward_std": 1.8517463966272771,
"rewards/concensus_correctness_reward_func": 1.3334374949336052,
"rewards/consensus_reward_func": 1.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.8517578095197678,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25,
"rewards/xmlcount_reward_func": 1.0450312420725822,
"step": 12
},
{
"completion_length": 159.0625,
"epoch": 0.7,
"grad_norm": 71.38750457763672,
"kl": 4.22442401223816,
"learning_rate": 1.1326296046939333e-07,
"loss": 0.0042,
"reward": 5.902323350310326,
"reward_std": 0.8223230724979658,
"rewards/concensus_correctness_reward_func": 1.6347499899566174,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9046983420848846,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.390625,
"rewards/xmlcount_reward_func": 1.1597499996423721,
"step": 14
},
{
"completion_length": 237.5,
"epoch": 0.8,
"grad_norm": 86529.1875,
"kl": 4002.5769818275585,
"learning_rate": 5.271487265090163e-08,
"loss": 4.0026,
"reward": 4.748107491061091,
"reward_std": 1.2591828682525374,
"rewards/concensus_correctness_reward_func": 1.2969374898821115,
"rewards/consensus_reward_func": 1.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.814795003272593,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.265625,
"rewards/xmlcount_reward_func": 0.9332499988377094,
"step": 16
},
{
"completion_length": 169.125,
"epoch": 0.9,
"grad_norm": 37287.53515625,
"kl": 417.28647581877885,
"learning_rate": 1.3545689574841341e-08,
"loss": 0.4173,
"reward": 4.91078519821167,
"reward_std": 1.4972448431071825,
"rewards/concensus_correctness_reward_func": 1.3146874904632568,
"rewards/consensus_reward_func": 1.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.84206647798419,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.328125,
"rewards/xmlcount_reward_func": 1.0509062558412552,
"step": 18
},
{
"completion_length": 174.0625,
"epoch": 1.0,
"grad_norm": 79.22607421875,
"kl": 3.6632585607585497,
"learning_rate": 0.0,
"loss": 0.0037,
"reward": 6.3031881004571915,
"reward_std": 0.9906455368909519,
"rewards/concensus_correctness_reward_func": 2.099812502041459,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.8537818752229214,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.296875,
"rewards/xmlcount_reward_func": 1.0527187511324883,
"step": 20
},
{
"epoch": 1.0,
"step": 20,
"total_flos": 0.0,
"train_loss": 2931.9222262827448,
"train_runtime": 158.7363,
"train_samples_per_second": 2.016,
"train_steps_per_second": 0.126
}
],
"logging_steps": 2,
"max_steps": 20,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}