{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.7272727272727275, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 506.6875, "epoch": 0.7272727272727273, "grad_norm": 0.4768719971179962, "kl": 0.0006958687226870097, "learning_rate": 5e-07, "loss": 0.0, "reward": 1.8010706398636103, "reward_std": 2.413410998880863, "rewards/concensus_correctness_reward_func": 1.2969374991953373, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.5273518972098827, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": -0.13259375002235174, "step": 2 }, { "completion_length": 486.7857142857143, "epoch": 1.3636363636363638, "grad_norm": 0.33231866359710693, "kl": 0.0006329953279678843, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 0.918908280985696, "reward_std": 0.9659519238131387, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.2857142857142857, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5362654126116208, "rewards/soft_format_reward_func": 0.017857142857142856, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07907143448080335, "step": 4 }, { "completion_length": 416.42857142857144, "epoch": 2.0, "grad_norm": 0.49434342980384827, "kl": 0.0008667865518613585, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 2.337849646806717, "reward_std": 2.588186740875244, "rewards/concensus_correctness_reward_func": 1.429999999968069, "rewards/consensus_reward_func": 0.21428571428571427, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.14285714285714285, "rewards/question_recreation_reward_func": 0.40942103415727615, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14128571322986058, "step": 6 }, { "completion_length": 475.03125, "epoch": 2.7272727272727275, "grad_norm": 0.3978624641895294, "kl": 0.0007225602130347397, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 0.712943073362112, "reward_std": 0.844877365976572, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5247243288904428, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": -0.13990625087171793, "step": 8 }, { "completion_length": 457.17857142857144, "epoch": 3.3636363636363638, "grad_norm": 0.48263752460479736, "kl": 0.0008737414692794638, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 1.424566850066185, "reward_std": 1.2323028487818581, "rewards/concensus_correctness_reward_func": 0.1630000046321324, "rewards/consensus_reward_func": 0.42857142857142855, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.14285714285714285, "rewards/question_recreation_reward_func": 0.5036740648959365, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1864642830831664, "step": 10 }, { "completion_length": 404.17857142857144, "epoch": 4.0, "grad_norm": 1.2086650133132935, "kl": 0.0008574270676555378, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 1.2912089100905828, "reward_std": 1.2101118607180459, "rewards/concensus_correctness_reward_func": 0.0014285713966403688, "rewards/consensus_reward_func": 0.35714285714285715, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.21428571428571427, "rewards/question_recreation_reward_func": 0.5251731936420713, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": 0.17532142837132728, "step": 12 }, { "completion_length": 478.6875, "epoch": 4.7272727272727275, "grad_norm": 0.45308005809783936, "kl": 0.0007028918153082486, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 2.8731972258538008, "reward_std": 2.0956926345825195, "rewards/concensus_correctness_reward_func": 1.875, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.5900409906171262, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.0800312552601099, "step": 14 }, { "completion_length": 570.0, "epoch": 5.363636363636363, "grad_norm": 0.32726794481277466, "kl": 0.0006596180735089417, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 1.9665765294006892, "reward_std": 2.4709178635052274, "rewards/concensus_correctness_reward_func": 1.429999999968069, "rewards/consensus_reward_func": 0.07142857142857142, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.14285714285714285, "rewards/question_recreation_reward_func": 0.4725050444581679, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15021428252969468, "step": 16 }, { "completion_length": 474.10714285714283, "epoch": 6.0, "grad_norm": 0.3391857445240021, "kl": 0.0007322329412480551, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 0.8521491778748376, "reward_std": 0.8532949728625161, "rewards/concensus_correctness_reward_func": 0.0689285716840199, "rewards/consensus_reward_func": 0.14285714285714285, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5392563108886991, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10110714712313243, "step": 18 }, { "completion_length": 452.90625, "epoch": 6.7272727272727275, "grad_norm": 23.497669219970703, "kl": 0.002060862439975608, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 0.922668170183897, "reward_std": 1.0907279998064041, "rewards/concensus_correctness_reward_func": 0.07512500137090683, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.5273868888616562, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.08578124456107616, "step": 20 }, { "epoch": 6.7272727272727275, "step": 20, "total_flos": 0.0, "train_loss": 8.18494260101943e-07, "train_runtime": 1354.9062, "train_samples_per_second": 0.236, "train_steps_per_second": 0.015 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }