{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 271.53125, "epoch": 0.5714285714285714, "grad_norm": 0.6156684756278992, "kl": 0.003713510144734755, "learning_rate": 5e-07, "loss": 0.0, "reward": 4.28339558839798, "reward_std": 5.015739507973194, "rewards/concensus_correctness_reward_func": 2.1066250316798687, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.8125, "rewards/question_recreation_reward_func": 0.49977052211761475, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3644999973475933, "step": 2 }, { "completion_length": 237.33333333333334, "epoch": 1.0, "grad_norm": 0.3896828293800354, "kl": 0.0036000795274352035, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 7.742002268632253, "reward_std": 8.532249887784323, "rewards/concensus_correctness_reward_func": 5.1745833332339926, "rewards/consensus_reward_func": 0.4166666666666667, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.25, "rewards/question_recreation_reward_func": 0.4115021179119746, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.041666666666666664, "rewards/xmlcount_reward_func": 0.4475833574930827, "step": 4 }, { "completion_length": 268.1875, "epoch": 1.5714285714285714, "grad_norm": 0.6913865208625793, "kl": 0.003932094754418358, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 5.461809501051903, "reward_std": 5.302938483655453, "rewards/concensus_correctness_reward_func": 3.2423125002533197, "rewards/consensus_reward_func": 0.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.75, "rewards/question_recreation_reward_func": 0.43490335159003735, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2845937521196902, "step": 6 }, { "completion_length": 248.33333333333334, "epoch": 2.0, "grad_norm": 0.40261271595954895, "kl": 0.0035152111668139696, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 6.17376080652078, "reward_std": 5.402185544371605, "rewards/concensus_correctness_reward_func": 3.6611666629711785, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.0833333333333333, "rewards/question_recreation_reward_func": 0.3990109084794919, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.020833333333333332, "rewards/xmlcount_reward_func": 0.5094166745742162, "step": 8 }, { "completion_length": 239.0625, "epoch": 2.571428571428571, "grad_norm": 0.6596890687942505, "kl": 0.004055387194966897, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 7.403021275997162, "reward_std": 7.769663609564304, "rewards/concensus_correctness_reward_func": 4.745562508702278, "rewards/consensus_reward_func": 0.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.125, "rewards/question_recreation_reward_func": 0.42364630103111267, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.46818749513477087, "step": 10 }, { "completion_length": 272.5, "epoch": 3.0, "grad_norm": 0.3865489363670349, "kl": 0.004031806602142751, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 3.3122252325216928, "reward_std": 2.822227214773496, "rewards/concensus_correctness_reward_func": 1.8305832520127296, "rewards/consensus_reward_func": 0.16666666666666666, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.477475106716156, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.020833333333333332, "rewards/xmlcount_reward_func": 0.3166666701436043, "step": 12 }, { "completion_length": 245.59375, "epoch": 3.571428571428571, "grad_norm": 0.7244676351547241, "kl": 0.0035437151964288205, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 6.540969520807266, "reward_std": 6.686426967382431, "rewards/concensus_correctness_reward_func": 3.60187497548759, "rewards/consensus_reward_func": 0.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.25, "rewards/question_recreation_reward_func": 0.5858760215342045, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.4625937454402447, "step": 14 }, { "completion_length": 285.8333333333333, "epoch": 4.0, "grad_norm": 0.4408351480960846, "kl": 0.005074007475438218, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 3.404947509368261, "reward_std": 2.91806568702062, "rewards/concensus_correctness_reward_func": 1.8025833343466122, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5833333333333334, "rewards/question_recreation_reward_func": 0.2744477453331153, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4945833434661229, "step": 16 }, { "completion_length": 245.5625, "epoch": 4.571428571428571, "grad_norm": 1.1728448867797852, "kl": 0.004925011526211165, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 8.376463562250137, "reward_std": 6.909352557733655, "rewards/concensus_correctness_reward_func": 5.674437500536442, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.0, "rewards/question_recreation_reward_func": 0.5359011180698872, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.6348750218749046, "step": 18 }, { "completion_length": 228.45833333333334, "epoch": 5.0, "grad_norm": 0.5003474950790405, "kl": 0.003518054281206181, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 2.8317282050848007, "reward_std": 2.5725097159544625, "rewards/concensus_correctness_reward_func": 1.0097499986489613, "rewards/consensus_reward_func": 0.3333333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.8333333333333334, "rewards/question_recreation_reward_func": 0.2900198350350062, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.36529167120655376, "step": 20 }, { "epoch": 5.0, "step": 20, "total_flos": 0.0, "train_loss": 3.5113219382765236e-06, "train_runtime": 2215.8404, "train_samples_per_second": 0.144, "train_steps_per_second": 0.009 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }