{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 245.875, "epoch": 0.5714285714285714, "grad_norm": 0.7527321577072144, "kl": 0.021911813993938267, "learning_rate": 5e-07, "loss": 0.0, "reward": 5.7590044140815735, "reward_std": 5.257039688527584, "rewards/concensus_correctness_reward_func": 3.504999991506338, "rewards/consensus_reward_func": 0.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5625, "rewards/question_recreation_reward_func": 0.5125668197870255, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.5383125096559525, "step": 2 }, { "completion_length": 231.0, "epoch": 1.0, "grad_norm": 0.6544833183288574, "kl": 0.020641639983902376, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 7.92664901415507, "reward_std": 7.975609441598256, "rewards/concensus_correctness_reward_func": 4.655416717131932, "rewards/consensus_reward_func": 0.4166666666666667, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.5833333333333333, "rewards/question_recreation_reward_func": 0.5634823093811671, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.020833333333333332, "rewards/xmlcount_reward_func": 0.6869166592756907, "step": 4 }, { "completion_length": 263.6875, "epoch": 1.5714285714285714, "grad_norm": 1.050506353378296, "kl": 0.019874241203069687, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 6.4608331471681595, "reward_std": 6.1422329396009445, "rewards/concensus_correctness_reward_func": 4.200937490910292, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.8125, "rewards/question_recreation_reward_func": 0.5279268752783537, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.4507187530398369, "step": 6 }, { "completion_length": 236.16666666666666, "epoch": 2.0, "grad_norm": 0.5823754668235779, "kl": 0.019788251879314583, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 6.491782108942668, "reward_std": 4.88298628727595, "rewards/concensus_correctness_reward_func": 3.506750007470449, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.4166666666666667, "rewards/question_recreation_reward_func": 0.5224903101722399, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.020833333333333332, "rewards/xmlcount_reward_func": 0.5250416745742162, "step": 8 }, { "completion_length": 227.46875, "epoch": 2.571428571428571, "grad_norm": 0.9334640502929688, "kl": 0.02316682948730886, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 7.54842072725296, "reward_std": 7.575185611844063, "rewards/concensus_correctness_reward_func": 4.861937437206507, "rewards/consensus_reward_func": 0.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.9375, "rewards/question_recreation_reward_func": 0.5451083504594862, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.5320000052452087, "step": 10 }, { "completion_length": 248.58333333333334, "epoch": 3.0, "grad_norm": 0.5745718479156494, "kl": 0.02394568407908082, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 6.296762814124425, "reward_std": 4.437843253215154, "rewards/concensus_correctness_reward_func": 3.5823333263397217, "rewards/consensus_reward_func": 0.3333333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.3333333333333333, "rewards/question_recreation_reward_func": 0.3383881400028865, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.020833333333333332, "rewards/xmlcount_reward_func": 0.6885416607062022, "step": 12 }, { "completion_length": 250.03125, "epoch": 3.571428571428571, "grad_norm": 0.7098527550697327, "kl": 0.02179947926197201, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 7.116845011711121, "reward_std": 6.763956539332867, "rewards/concensus_correctness_reward_func": 4.194062519818544, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.125, "rewards/question_recreation_reward_func": 0.6923136711120605, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.5898437611758709, "step": 14 }, { "completion_length": 202.16666666666666, "epoch": 4.0, "grad_norm": 0.6563175916671753, "kl": 0.024282248069842655, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 7.37228246529897, "reward_std": 5.8389623661835985, "rewards/concensus_correctness_reward_func": 4.730333363016446, "rewards/consensus_reward_func": 0.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.0, "rewards/question_recreation_reward_func": 0.2509909396370252, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.5784583141406378, "step": 16 }, { "completion_length": 289.8125, "epoch": 4.571428571428571, "grad_norm": 0.6700316667556763, "kl": 0.021815102780237794, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 5.917491778731346, "reward_std": 6.260626524686813, "rewards/concensus_correctness_reward_func": 3.528875023126602, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.875, "rewards/question_recreation_reward_func": 0.48792930506169796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.5725625162012875, "step": 18 }, { "completion_length": 217.54166666666666, "epoch": 5.0, "grad_norm": 2.9088988304138184, "kl": 0.023700623928258818, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 4.78704692920049, "reward_std": 6.283455337087314, "rewards/concensus_correctness_reward_func": 2.713333343466123, "rewards/consensus_reward_func": 0.3333333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.9166666666666666, "rewards/question_recreation_reward_func": 0.37579695383707684, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4479166679084301, "step": 20 }, { "epoch": 5.0, "step": 20, "total_flos": 0.0, "train_loss": 1.9282869925518752e-05, "train_runtime": 792.851, "train_samples_per_second": 0.404, "train_steps_per_second": 0.025 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }