{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 344.5, "epoch": 0.20253164556962025, "grad_norm": 9.9772310256958, "kl": 0.0, "learning_rate": 5e-07, "loss": -0.0, "reward": 0.8219375633634627, "reward_std": 0.7066538096405566, "rewards/concensus_correctness_reward_func": 0.05999999865889549, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.2743438226170838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.4094687553588301, "step": 2 }, { "completion_length": 377.8125, "epoch": 0.4050632911392405, "grad_norm": 8.212695121765137, "kl": 0.0010669995590433246, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 0.9275585329160094, "reward_std": 0.9481612900854088, "rewards/concensus_correctness_reward_func": 0.06237500160932541, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.3217460191808641, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4184375060722232, "step": 4 }, { "completion_length": 307.4375, "epoch": 0.6075949367088608, "grad_norm": 9.819817543029785, "kl": 0.001468627218855545, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 0.9682412928668782, "reward_std": 0.59242624102626, "rewards/concensus_correctness_reward_func": 0.17887500301003456, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.29499130306066945, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2912500018719584, "step": 6 }, { "completion_length": 366.21875, "epoch": 0.810126582278481, "grad_norm": 6.490735054016113, "kl": 0.0012780786164512392, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 0.5933836810290813, "reward_std": 1.0359376267297193, "rewards/concensus_correctness_reward_func": 0.12250000238418579, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.24210240540560335, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10378125216811895, "step": 8 }, { "completion_length": 375.8333333333333, "epoch": 1.0, "grad_norm": 6.2283196449279785, "kl": 0.001403550726051132, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 0.9483249210442106, "reward_std": 0.7847038037454088, "rewards/concensus_correctness_reward_func": 0.0665333350499471, "rewards/consensus_reward_func": 0.06666666666666667, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.06666666666666667, "rewards/question_recreation_reward_func": 0.35792491491883993, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3905333320299784, "step": 10 }, { "completion_length": 509.8125, "epoch": 1.2025316455696202, "grad_norm": 6.293169975280762, "kl": 0.0016378157160943374, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 0.40420354809612036, "reward_std": 1.324048607615623, "rewards/concensus_correctness_reward_func": 0.12250000238418579, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.23114103089028504, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07443749927915633, "step": 12 }, { "completion_length": 362.4375, "epoch": 1.4050632911392404, "grad_norm": 9.059679985046387, "kl": 0.002234285893791821, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 0.6384785049594939, "reward_std": 0.7130910204723477, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.3409159923903644, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23506250511854887, "step": 14 }, { "completion_length": 257.625, "epoch": 1.6075949367088609, "grad_norm": 17.0372314453125, "kl": 0.0024873377551557496, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 0.7720209148246795, "reward_std": 0.5560847400847706, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.3203334001591429, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.38918749894946814, "step": 16 }, { "completion_length": 436.65625, "epoch": 1.810126582278481, "grad_norm": 6.472311973571777, "kl": 0.0021311557193257613, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 0.6501477827550843, "reward_std": 0.8022108940640464, "rewards/concensus_correctness_reward_func": 0.05412499979138374, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.2959602870978415, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17506249761208892, "step": 18 }, { "completion_length": 405.1, "epoch": 2.0, "grad_norm": 13.560686111450195, "kl": 0.0018062964198179543, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 0.7393059747914473, "reward_std": 0.6481222245842219, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.428839307030042, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31046666838228704, "step": 20 }, { "epoch": 2.0, "step": 20, "total_flos": 0.0, "train_loss": 1.4645672763435869e-06, "train_runtime": 1759.1185, "train_samples_per_second": 0.182, "train_steps_per_second": 0.011 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }