{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.7272727272727275, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 442.46875, "epoch": 0.7272727272727273, "grad_norm": 0.5991543531417847, "kl": 0.0007749500146019273, "learning_rate": 5e-07, "loss": 0.0, "reward": 1.3889684304594994, "reward_std": 1.309515006840229, "rewards/concensus_correctness_reward_func": 0.048500001430511475, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.44768718257546425, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2677812520414591, "step": 2 }, { "completion_length": 505.5357142857143, "epoch": 1.3636363636363638, "grad_norm": 0.3918515145778656, "kl": 0.0007179479746680174, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 1.4672155209950037, "reward_std": 1.435799777507782, "rewards/concensus_correctness_reward_func": 0.09328571387699672, "rewards/consensus_reward_func": 0.6428571428571429, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5747869280832154, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15628571595464433, "step": 4 }, { "completion_length": 403.67857142857144, "epoch": 2.0, "grad_norm": 0.43622881174087524, "kl": 0.0007206426318069654, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 3.182981014251709, "reward_std": 2.6137921043804715, "rewards/concensus_correctness_reward_func": 1.4762142854077476, "rewards/consensus_reward_func": 0.42857142857142855, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.2857142857142857, "rewards/question_recreation_reward_func": 0.5557669103145599, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": 0.41885715403727125, "step": 6 }, { "completion_length": 400.25, "epoch": 2.7272727272727275, "grad_norm": 0.40697142481803894, "kl": 0.0007051618376863189, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 2.549363426864147, "reward_std": 2.5611291229724884, "rewards/concensus_correctness_reward_func": 0.7308750003576279, "rewards/consensus_reward_func": 0.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.5245196153409779, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48146875388920307, "step": 8 }, { "completion_length": 367.85714285714283, "epoch": 3.3636363636363638, "grad_norm": 0.5716286897659302, "kl": 0.000788627308793366, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 1.304913112095424, "reward_std": 0.8771725382123675, "rewards/concensus_correctness_reward_func": 0.01792857050895691, "rewards/consensus_reward_func": 0.2857142857142857, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5422702112368175, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.05357142857142857, "rewards/xmlcount_reward_func": 0.4054285671029772, "step": 10 }, { "completion_length": 476.60714285714283, "epoch": 4.0, "grad_norm": 0.3252517282962799, "kl": 0.0006907685476887439, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 3.995438354355948, "reward_std": 2.7802960106304715, "rewards/concensus_correctness_reward_func": 2.142857142857143, "rewards/consensus_reward_func": 0.7142857142857143, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.2857142857142857, "rewards/question_recreation_reward_func": 0.5197239475590842, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.33285714260169436, "step": 12 }, { "completion_length": 404.0625, "epoch": 4.7272727272727275, "grad_norm": 0.395914763212204, "kl": 0.000713749454007484, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 1.381257563829422, "reward_std": 1.204833347350359, "rewards/concensus_correctness_reward_func": 0.0833749994635582, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.46469501964747906, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.2863125056028366, "step": 14 }, { "completion_length": 431.7857142857143, "epoch": 5.363636363636363, "grad_norm": 0.4147193431854248, "kl": 0.0006908452321243073, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 1.8860733594213213, "reward_std": 1.2617861075060708, "rewards/concensus_correctness_reward_func": 0.12271428959710258, "rewards/consensus_reward_func": 0.6428571428571429, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.21428571428571427, "rewards/question_recreation_reward_func": 0.5446448081306049, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": 0.3437142851097243, "step": 16 }, { "completion_length": 495.57142857142856, "epoch": 6.0, "grad_norm": 0.30478447675704956, "kl": 0.0006883361326929714, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 1.7385921393121992, "reward_std": 1.5500890357153756, "rewards/concensus_correctness_reward_func": 0.11321428418159485, "rewards/consensus_reward_func": 0.35714285714285715, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.14285714285714285, "rewards/question_recreation_reward_func": 0.5604849649327142, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5648928880691528, "step": 18 }, { "completion_length": 387.59375, "epoch": 6.7272727272727275, "grad_norm": 0.4601048231124878, "kl": 0.0007431526173604652, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 2.224214667454362, "reward_std": 2.225543730892241, "rewards/concensus_correctness_reward_func": 0.7394375018775463, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.4345896728336811, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5501874964684248, "step": 20 }, { "epoch": 6.7272727272727275, "step": 20, "total_flos": 0.0, "train_loss": 6.705659131966968e-07, "train_runtime": 586.0026, "train_samples_per_second": 0.546, "train_steps_per_second": 0.034 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }