|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 20, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 384.84375, |
|
"epoch": 0.1, |
|
"grad_norm": 45.04591751098633, |
|
"kl": 0.0, |
|
"learning_rate": 4.965903258506806e-07, |
|
"loss": -0.0, |
|
"reward": 3.691977483453229, |
|
"reward_std": 1.1356534652295522, |
|
"rewards/concensus_correctness_reward_func": 0.6882499977946281, |
|
"rewards/consensus_reward_func": 0.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.6260087669361383, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.15625, |
|
"rewards/xmlcount_reward_func": 0.7214687555097044, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 290.28125, |
|
"epoch": 0.2, |
|
"grad_norm": 56.04582595825195, |
|
"kl": 0.048623996786773205, |
|
"learning_rate": 4.698684378016222e-07, |
|
"loss": 0.0, |
|
"reward": 5.547513723373413, |
|
"reward_std": 0.9548060596571304, |
|
"rewards/concensus_correctness_reward_func": 1.1509375181049109, |
|
"rewards/consensus_reward_func": 1.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0, |
|
"rewards/question_recreation_reward_func": 0.9249514192342758, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.21875, |
|
"rewards/xmlcount_reward_func": 1.1903750076889992, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 216.34375, |
|
"epoch": 0.3, |
|
"grad_norm": 23.09837532043457, |
|
"kl": 892.624653175706, |
|
"learning_rate": 4.193203929064353e-07, |
|
"loss": 0.8926, |
|
"reward": 4.4300857884809375, |
|
"reward_std": 1.2345747215877054, |
|
"rewards/concensus_correctness_reward_func": 0.9410000052303076, |
|
"rewards/consensus_reward_func": 0.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.7900232495740056, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.25, |
|
"rewards/xmlcount_reward_func": 1.074062503874302, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 257.6875, |
|
"epoch": 0.4, |
|
"grad_norm": 25.814899444580078, |
|
"kl": 1.4642285024747252, |
|
"learning_rate": 3.5042385616324236e-07, |
|
"loss": 0.0015, |
|
"reward": 4.062486097216606, |
|
"reward_std": 1.364403745283198, |
|
"rewards/concensus_correctness_reward_func": 0.5777500020340085, |
|
"rewards/consensus_reward_func": 0.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.7780798338353634, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.171875, |
|
"rewards/xmlcount_reward_func": 1.0972812492400408, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 286.8125, |
|
"epoch": 0.5, |
|
"grad_norm": 17.246906280517578, |
|
"kl": 2.0072783255018294, |
|
"learning_rate": 2.706448363680831e-07, |
|
"loss": 0.002, |
|
"reward": 6.18870684504509, |
|
"reward_std": 0.7351647131145, |
|
"rewards/concensus_correctness_reward_func": 1.5492499843239784, |
|
"rewards/consensus_reward_func": 1.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.25, |
|
"rewards/question_recreation_reward_func": 0.929519459605217, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.21875, |
|
"rewards/xmlcount_reward_func": 1.2411874905228615, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 222.3125, |
|
"epoch": 0.6, |
|
"grad_norm": 38.915470123291016, |
|
"kl": 3.321804000530392, |
|
"learning_rate": 1.886286282148002e-07, |
|
"loss": 0.0033, |
|
"reward": 4.649261876940727, |
|
"reward_std": 1.167486259713769, |
|
"rewards/concensus_correctness_reward_func": 1.1363750249147415, |
|
"rewards/consensus_reward_func": 0.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.793293122202158, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.28125, |
|
"rewards/xmlcount_reward_func": 1.0633437521755695, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 259.875, |
|
"epoch": 0.7, |
|
"grad_norm": 12.302684783935547, |
|
"kl": 1.9860680536367, |
|
"learning_rate": 1.1326296046939333e-07, |
|
"loss": 0.002, |
|
"reward": 4.9309286857023835, |
|
"reward_std": 0.8483433704241179, |
|
"rewards/concensus_correctness_reward_func": 1.1531875003129244, |
|
"rewards/consensus_reward_func": 0.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.6953349150717258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.21875, |
|
"rewards/xmlcount_reward_func": 1.113656248897314, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 260.1875, |
|
"epoch": 0.8, |
|
"grad_norm": 12.547839164733887, |
|
"kl": 0.4333856268785894, |
|
"learning_rate": 5.271487265090163e-08, |
|
"loss": 0.0004, |
|
"reward": 5.027402684092522, |
|
"reward_std": 1.352033179719001, |
|
"rewards/concensus_correctness_reward_func": 1.0091875093057752, |
|
"rewards/consensus_reward_func": 0.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.75, |
|
"rewards/question_recreation_reward_func": 0.8164025899022818, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.265625, |
|
"rewards/xmlcount_reward_func": 1.2486874982714653, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 258.15625, |
|
"epoch": 0.9, |
|
"grad_norm": 14.867289543151855, |
|
"kl": 0.38399916142225266, |
|
"learning_rate": 1.3545689574841341e-08, |
|
"loss": 0.0004, |
|
"reward": 5.083541095256805, |
|
"reward_std": 1.3432283122092485, |
|
"rewards/concensus_correctness_reward_func": 1.1145625049248338, |
|
"rewards/consensus_reward_func": 0.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0, |
|
"rewards/question_recreation_reward_func": 0.7862598411738873, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.234375, |
|
"rewards/xmlcount_reward_func": 1.1983437463641167, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 269.21875, |
|
"epoch": 1.0, |
|
"grad_norm": 76.03479766845703, |
|
"kl": 0.8027685410343111, |
|
"learning_rate": 0.0, |
|
"loss": 0.0008, |
|
"reward": 4.308647858910263, |
|
"reward_std": 0.8510435534408316, |
|
"rewards/concensus_correctness_reward_func": 0.8827499886974692, |
|
"rewards/consensus_reward_func": 1.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.767147877253592, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.203125, |
|
"rewards/xmlcount_reward_func": 1.0806249976158142, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 20, |
|
"total_flos": 0.0, |
|
"train_loss": 0.0903062883833627, |
|
"train_runtime": 775.6737, |
|
"train_samples_per_second": 0.413, |
|
"train_steps_per_second": 0.026 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 20, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|