{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 326.59375, "epoch": 0.5714285714285714, "grad_norm": 4.934439182281494, "kl": 0.0, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.9843985685729422, "reward_std": 0.7370969671756029, "rewards/concensus_correctness_reward_func": -0.02437499910593033, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.39177357318112627, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3670000056736171, "step": 2 }, { "completion_length": 371.0, "epoch": 1.0, "grad_norm": 3.6815781593322754, "kl": 0.0011949266287653397, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 1.5001778639853, "reward_std": 0.7827915487190088, "rewards/concensus_correctness_reward_func": 0.03458333263794581, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.4009695214529832, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5646250148614248, "step": 4 }, { "completion_length": 359.96875, "epoch": 1.5714285714285714, "grad_norm": 4.556028842926025, "kl": 0.0010278360459778924, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 1.1807272167643532, "reward_std": 0.653216159567819, "rewards/concensus_correctness_reward_func": 0.06068750098347664, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.47382097283843905, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.45871875062584877, "step": 6 }, { "completion_length": 439.2916666666667, "epoch": 2.0, "grad_norm": 2.2755231857299805, "kl": 0.0013096223750229303, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 0.7530886133511862, "reward_std": 0.9865690590813756, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.08333333333333333, "rewards/question_recreation_reward_func": 0.37442193475241464, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.295333335796992, "step": 8 }, { "completion_length": 370.40625, "epoch": 2.571428571428571, "grad_norm": 6.62834358215332, "kl": 0.0012265997065696865, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 0.8175993217155337, "reward_std": 1.0472558625042439, "rewards/concensus_correctness_reward_func": 0.05756250023841858, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4674430672894232, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16759375017136335, "step": 10 }, { "completion_length": 356.4583333333333, "epoch": 3.0, "grad_norm": 3.410081148147583, "kl": 0.001667177789689352, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 1.0313266267379124, "reward_std": 0.6612533178801338, "rewards/concensus_correctness_reward_func": 0.0790833334128062, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.572909953383108, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37933334335684776, "step": 12 }, { "completion_length": 401.03125, "epoch": 3.571428571428571, "grad_norm": 6.091869354248047, "kl": 0.0018705126858549193, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 0.7606599755818024, "reward_std": 0.9554330813698471, "rewards/concensus_correctness_reward_func": 0.06806249916553497, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5348162406007759, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09528125822544098, "step": 14 }, { "completion_length": 407.9166666666667, "epoch": 4.0, "grad_norm": 2.649594306945801, "kl": 0.001604035545218115, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 0.6916571507851282, "reward_std": 1.01990426153255, "rewards/concensus_correctness_reward_func": 0.024666666984558105, "rewards/consensus_reward_func": 0.08333333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4062821501865983, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17737499624490738, "step": 16 }, { "completion_length": 412.125, "epoch": 4.571428571428571, "grad_norm": 119.07859802246094, "kl": 0.0031047585580381565, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 0.8601471782312728, "reward_std": 0.6142369294539094, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.464647185173817, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2705000061541796, "step": 18 }, { "completion_length": 346.3333333333333, "epoch": 5.0, "grad_norm": 4.018592834472656, "kl": 0.0016965343481084953, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 1.0170780109862487, "reward_std": 0.9925470755745968, "rewards/concensus_correctness_reward_func": -0.0019999990860621133, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.08333333333333333, "rewards/question_recreation_reward_func": 0.32816133989642066, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.35758334149916965, "step": 20 }, { "epoch": 5.0, "step": 20, "total_flos": 0.0, "train_loss": 1.2790784239768983e-06, "train_runtime": 291.7059, "train_samples_per_second": 1.097, "train_steps_per_second": 0.069 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }