|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 20, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 268.125, |
|
"epoch": 0.1, |
|
"grad_norm": 212.015380859375, |
|
"kl": 0.0, |
|
"learning_rate": 4.965903258506806e-07, |
|
"loss": -0.0, |
|
"reward": 2.971410434693098, |
|
"reward_std": 1.660103004425764, |
|
"rewards/concensus_correctness_reward_func": 0.8277499973773956, |
|
"rewards/consensus_reward_func": 0.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5752854123711586, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.15625, |
|
"rewards/xmlcount_reward_func": 0.5371250007301569, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 161.84375, |
|
"epoch": 0.2, |
|
"grad_norm": 4049.1455078125, |
|
"kl": 3116631.138454, |
|
"learning_rate": 4.698684378016222e-07, |
|
"loss": 3116.6313, |
|
"reward": 4.441190658137202, |
|
"reward_std": 2.443159078247845, |
|
"rewards/concensus_correctness_reward_func": 1.127562504261732, |
|
"rewards/consensus_reward_func": 1.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.7441594742704183, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.265625, |
|
"rewards/xmlcount_reward_func": 0.9288437515497208, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 173.96875, |
|
"epoch": 0.3, |
|
"grad_norm": 6921.29296875, |
|
"kl": 18184.31745560351, |
|
"learning_rate": 4.193203929064353e-07, |
|
"loss": 18.1843, |
|
"reward": 5.475079163908958, |
|
"reward_std": 1.6933745071146404, |
|
"rewards/concensus_correctness_reward_func": 1.6708124876022339, |
|
"rewards/consensus_reward_func": 1.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.7829542085528374, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.3125, |
|
"rewards/xmlcount_reward_func": 1.0213124975562096, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 206.09375, |
|
"epoch": 0.4, |
|
"grad_norm": 600.0673217773438, |
|
"kl": 10434.655427431862, |
|
"learning_rate": 3.5042385616324236e-07, |
|
"loss": 10.4347, |
|
"reward": 5.194255158305168, |
|
"reward_std": 1.982654629391618, |
|
"rewards/concensus_correctness_reward_func": 1.5163750085048378, |
|
"rewards/consensus_reward_func": 1.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.8320676572620869, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.28125, |
|
"rewards/xmlcount_reward_func": 1.0020624957978725, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 154.8125, |
|
"epoch": 0.5, |
|
"grad_norm": 278.44561767578125, |
|
"kl": 26163472.887304425, |
|
"learning_rate": 2.706448363680831e-07, |
|
"loss": 26163.4727, |
|
"reward": 6.384411156177521, |
|
"reward_std": 1.6228036261891248, |
|
"rewards/concensus_correctness_reward_func": 2.0416249968111515, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.8268798030912876, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.296875, |
|
"rewards/xmlcount_reward_func": 1.0315312538295984, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 231.46875, |
|
"epoch": 0.6, |
|
"grad_norm": 50832.27734375, |
|
"kl": 6071.537940578186, |
|
"learning_rate": 1.886286282148002e-07, |
|
"loss": 6.0715, |
|
"reward": 4.917726576328278, |
|
"reward_std": 1.8517463966272771, |
|
"rewards/concensus_correctness_reward_func": 1.3334374949336052, |
|
"rewards/consensus_reward_func": 1.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.8517578095197678, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.25, |
|
"rewards/xmlcount_reward_func": 1.0450312420725822, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 159.0625, |
|
"epoch": 0.7, |
|
"grad_norm": 71.38750457763672, |
|
"kl": 4.22442401223816, |
|
"learning_rate": 1.1326296046939333e-07, |
|
"loss": 0.0042, |
|
"reward": 5.902323350310326, |
|
"reward_std": 0.8223230724979658, |
|
"rewards/concensus_correctness_reward_func": 1.6347499899566174, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.9046983420848846, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.390625, |
|
"rewards/xmlcount_reward_func": 1.1597499996423721, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 237.5, |
|
"epoch": 0.8, |
|
"grad_norm": 86529.1875, |
|
"kl": 4002.5769818275585, |
|
"learning_rate": 5.271487265090163e-08, |
|
"loss": 4.0026, |
|
"reward": 4.748107491061091, |
|
"reward_std": 1.2591828682525374, |
|
"rewards/concensus_correctness_reward_func": 1.2969374898821115, |
|
"rewards/consensus_reward_func": 1.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.814795003272593, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.265625, |
|
"rewards/xmlcount_reward_func": 0.9332499988377094, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 169.125, |
|
"epoch": 0.9, |
|
"grad_norm": 37287.53515625, |
|
"kl": 417.28647581877885, |
|
"learning_rate": 1.3545689574841341e-08, |
|
"loss": 0.4173, |
|
"reward": 4.91078519821167, |
|
"reward_std": 1.4972448431071825, |
|
"rewards/concensus_correctness_reward_func": 1.3146874904632568, |
|
"rewards/consensus_reward_func": 1.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.84206647798419, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.328125, |
|
"rewards/xmlcount_reward_func": 1.0509062558412552, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 174.0625, |
|
"epoch": 1.0, |
|
"grad_norm": 79.22607421875, |
|
"kl": 3.6632585607585497, |
|
"learning_rate": 0.0, |
|
"loss": 0.0037, |
|
"reward": 6.3031881004571915, |
|
"reward_std": 0.9906455368909519, |
|
"rewards/concensus_correctness_reward_func": 2.099812502041459, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.8537818752229214, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.296875, |
|
"rewards/xmlcount_reward_func": 1.0527187511324883, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 20, |
|
"total_flos": 0.0, |
|
"train_loss": 2931.9222262827448, |
|
"train_runtime": 158.7363, |
|
"train_samples_per_second": 2.016, |
|
"train_steps_per_second": 0.126 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 20, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|