{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.7272727272727275, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 376.28125, "epoch": 0.7272727272727273, "grad_norm": 0.3437902331352234, "kl": 0.000827149415272288, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.9650179520249367, "reward_std": 0.9399564154446125, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.48545546270906925, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.26081249862909317, "step": 2 }, { "completion_length": 451.2857142857143, "epoch": 1.3636363636363638, "grad_norm": 0.5616311430931091, "kl": 0.0006403406815869468, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 1.5441367796489172, "reward_std": 1.3162878240857805, "rewards/concensus_correctness_reward_func": 0.0816428576196943, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6427796483039856, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3197142779827118, "step": 4 }, { "completion_length": 462.10714285714283, "epoch": 2.0, "grad_norm": 0.4168936312198639, "kl": 0.000719171848946384, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 0.8503530536379132, "reward_std": 1.232443732874734, "rewards/concensus_correctness_reward_func": 0.1277142869574683, "rewards/consensus_reward_func": 0.2857142857142857, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5647101657731193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": -0.1456428606595312, "step": 6 }, { "completion_length": 422.0625, "epoch": 2.7272727272727275, "grad_norm": 0.5831401944160461, "kl": 0.0006976408658374567, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 1.1317045465111732, "reward_std": 1.075602475553751, "rewards/concensus_correctness_reward_func": 0.05581250041723251, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5921108238399029, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.3431562567129731, "step": 8 }, { "completion_length": 443.64285714285717, "epoch": 3.3636363636363638, "grad_norm": 0.4295225143432617, "kl": 0.0007067695010586508, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 1.6077298362340247, "reward_std": 1.0551166534423828, "rewards/concensus_correctness_reward_func": 0.1914999932050705, "rewards/consensus_reward_func": 0.5714285714285714, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4601941300289972, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": 0.366749997649874, "step": 10 }, { "completion_length": 512.8571428571429, "epoch": 4.0, "grad_norm": 0.42578762769699097, "kl": 0.0006562869945940163, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 0.4705148552145277, "reward_std": 0.6741891162736076, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.07142857142857142, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.43669344059058596, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03760716106210436, "step": 12 }, { "completion_length": 460.1875, "epoch": 4.7272727272727275, "grad_norm": 0.5053578615188599, "kl": 0.0006863677699584514, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 1.2665400207042694, "reward_std": 1.1816745065152645, "rewards/concensus_correctness_reward_func": 0.1038750009611249, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6119150072336197, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.17575000412762165, "step": 14 }, { "completion_length": 596.8928571428571, "epoch": 5.363636363636363, "grad_norm": 0.40903177857398987, "kl": 0.0006001410407147237, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 0.8541636381830487, "reward_std": 1.013209513255528, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.14285714285714285, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.48498509185654776, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": 0.20846428615706308, "step": 16 }, { "completion_length": 388.64285714285717, "epoch": 6.0, "grad_norm": 0.3834454119205475, "kl": 0.0007274486041361732, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 1.0557954971279417, "reward_std": 1.018987434250968, "rewards/concensus_correctness_reward_func": 0.009000000144754137, "rewards/consensus_reward_func": 0.2857142857142857, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5256883757455009, "rewards/soft_format_reward_func": 0.017857142857142856, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": 0.19967857216085708, "step": 18 }, { "completion_length": 399.28125, "epoch": 6.7272727272727275, "grad_norm": 0.4910911023616791, "kl": 0.0007835296382836532, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 0.9040417112410069, "reward_std": 0.9552651233971119, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5213229823857546, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.25771874852944165, "step": 20 }, { "epoch": 6.7272727272727275, "step": 20, "total_flos": 0.0, "train_loss": 6.35625968925524e-07, "train_runtime": 597.6366, "train_samples_per_second": 0.535, "train_steps_per_second": 0.033 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }