{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.7272727272727275, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 448.96875, "epoch": 0.7272727272727273, "grad_norm": 0.4079512357711792, "kl": 0.0006960896716918796, "learning_rate": 5e-07, "loss": 0.0, "reward": 1.014976717531681, "reward_std": 0.931159220635891, "rewards/concensus_correctness_reward_func": 0.019187500700354576, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.4754454605281353, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.042156245559453964, "step": 2 }, { "completion_length": 503.35714285714283, "epoch": 1.3636363636363638, "grad_norm": 0.3215738534927368, "kl": 0.0007693321510617222, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 0.7434799671173096, "reward_std": 1.4438068866729736, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.42857142857142855, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.2857142857142857, "rewards/question_recreation_reward_func": 0.545658535190991, "rewards/soft_format_reward_func": 0.017857142857142856, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5343214379889625, "step": 4 }, { "completion_length": 501.5, "epoch": 2.0, "grad_norm": 0.4019313454627991, "kl": 0.0007268055175830211, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 1.6937032427106584, "reward_std": 2.689331420830318, "rewards/concensus_correctness_reward_func": 0.7362142865146909, "rewards/consensus_reward_func": 0.35714285714285715, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.07142857142857142, "rewards/question_recreation_reward_func": 0.5813103488513401, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.05239285635096686, "step": 6 }, { "completion_length": 454.6875, "epoch": 2.7272727272727275, "grad_norm": 0.433057576417923, "kl": 0.0007550113077741116, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 2.1343575585633516, "reward_std": 2.4107902091927826, "rewards/concensus_correctness_reward_func": 0.6973749808967113, "rewards/consensus_reward_func": 0.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.589295007288456, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.050812505185604095, "step": 8 }, { "completion_length": 545.7857142857143, "epoch": 3.3636363636363638, "grad_norm": 0.33662065863609314, "kl": 0.0006870871750704412, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 2.292337281363351, "reward_std": 2.4016739779285023, "rewards/concensus_correctness_reward_func": 0.7142857142857143, "rewards/consensus_reward_func": 0.6428571428571429, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.14285714285714285, "rewards/question_recreation_reward_func": 0.6199443808623722, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.05357142857142857, "rewards/xmlcount_reward_func": 0.11882142403296062, "step": 10 }, { "completion_length": 430.32142857142856, "epoch": 4.0, "grad_norm": 0.37114790081977844, "kl": 0.0007296750430084233, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 0.9510723671742848, "reward_std": 1.1165074642215456, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.2857142857142857, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.14285714285714285, "rewards/question_recreation_reward_func": 0.47832239099911283, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04417857527732849, "step": 12 }, { "completion_length": 552.34375, "epoch": 4.7272727272727275, "grad_norm": 2.1891822814941406, "kl": 0.0006460850017901976, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 0.6809651907533407, "reward_std": 1.454764710739255, "rewards/concensus_correctness_reward_func": 0.03125, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.5482776816934347, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3360625049099326, "step": 14 }, { "completion_length": 487.89285714285717, "epoch": 5.363636363636363, "grad_norm": 0.29302528500556946, "kl": 0.000639179945989911, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 0.9617090416806084, "reward_std": 0.9098760443074363, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.14285714285714285, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.14285714285714285, "rewards/question_recreation_reward_func": 0.4583519186292376, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": 0.1997857200247901, "step": 16 }, { "completion_length": 432.17857142857144, "epoch": 6.0, "grad_norm": 4.430377006530762, "kl": 0.0008528920589014888, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 1.4834006769316537, "reward_std": 1.13985803936209, "rewards/concensus_correctness_reward_func": 0.043857144457953315, "rewards/consensus_reward_func": 0.5714285714285714, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.14285714285714285, "rewards/question_recreation_reward_func": 0.6183649897575378, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": 0.0890357175043651, "step": 18 }, { "completion_length": 538.59375, "epoch": 6.7272727272727275, "grad_norm": 0.4364769458770752, "kl": 0.0006118008568591904, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 1.3893648125231266, "reward_std": 1.4084616005420685, "rewards/concensus_correctness_reward_func": 0.053187502548098564, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.5439273174852133, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.15162499900907278, "step": 20 }, { "epoch": 6.7272727272727275, "step": 20, "total_flos": 0.0, "train_loss": 6.502134226593625e-07, "train_runtime": 650.1533, "train_samples_per_second": 0.492, "train_steps_per_second": 0.031 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }