{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.7272727272727275, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 507.25, "epoch": 0.7272727272727273, "grad_norm": 0.5102810263633728, "kl": 0.0006413383634935599, "learning_rate": 5e-07, "loss": 0.0, "reward": 1.2246774435043335, "reward_std": 1.0250092409551144, "rewards/concensus_correctness_reward_func": 0.026750000193715096, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.6202712282538414, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.24953125789761543, "step": 2 }, { "completion_length": 545.5357142857143, "epoch": 1.3636363636363638, "grad_norm": 0.6383160948753357, "kl": 0.0006939700771389263, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 0.9871079708848681, "reward_std": 0.9425153072391238, "rewards/concensus_correctness_reward_func": 0.06157142775399344, "rewards/consensus_reward_func": 0.2857142857142857, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5393222442695073, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10050000729305404, "step": 4 }, { "completion_length": 556.75, "epoch": 2.0, "grad_norm": 0.30501818656921387, "kl": 0.0006923160399310291, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 0.9744588136672974, "reward_std": 0.997994201523917, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.21428571428571427, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.651923132794244, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": 0.09039285566125597, "step": 6 }, { "completion_length": 358.21875, "epoch": 2.7272727272727275, "grad_norm": 0.5563687086105347, "kl": 0.0008131818103720434, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 1.1429598741233349, "reward_std": 1.0274235233664513, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5877098739147186, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.16462498623877764, "step": 8 }, { "completion_length": 722.3214285714286, "epoch": 3.3636363636363638, "grad_norm": 0.537308931350708, "kl": 0.0027192495430686642, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 0.9694735769714627, "reward_std": 0.9623457491397858, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.2857142857142857, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6571164429187775, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02664285898208618, "step": 10 }, { "completion_length": 540.9285714285714, "epoch": 4.0, "grad_norm": 0.345342218875885, "kl": 0.0007426313989396606, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 1.4026029365403312, "reward_std": 1.0735368302890234, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.5714285714285714, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6404600867203304, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19071428264890397, "step": 12 }, { "completion_length": 501.375, "epoch": 4.7272727272727275, "grad_norm": 0.4603041410446167, "kl": 0.0007860659461584873, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 1.0617278590798378, "reward_std": 1.036953940987587, "rewards/concensus_correctness_reward_func": 0.0078125, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5582278668880463, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.27693750336766243, "step": 14 }, { "completion_length": 536.1428571428571, "epoch": 5.363636363636363, "grad_norm": 0.5306868553161621, "kl": 0.0007226377077001546, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 1.5109271705150604, "reward_std": 1.1485101410320826, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.35714285714285715, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6943200017724719, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03571428571428571, "rewards/xmlcount_reward_func": 0.4237499992762293, "step": 16 }, { "completion_length": 618.7142857142857, "epoch": 6.0, "grad_norm": 0.2140144258737564, "kl": 0.00047585583524778485, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 1.1479524033410209, "reward_std": 1.0819738251822335, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.2857142857142857, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.8127023918288094, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0495357130255018, "step": 18 }, { "completion_length": 595.125, "epoch": 6.7272727272727275, "grad_norm": 0.7731544375419617, "kl": 0.0006078763362893369, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 1.2466152980923653, "reward_std": 1.057570207864046, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6501153353601694, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.2683749981224537, "step": 20 }, { "epoch": 6.7272727272727275, "step": 20, "total_flos": 0.0, "train_loss": 8.154420868322632e-07, "train_runtime": 1003.4696, "train_samples_per_second": 0.319, "train_steps_per_second": 0.02 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }