{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 429.375, "epoch": 0.1, "grad_norm": 6.362526893615723, "kl": 0.0, "learning_rate": 4.965903258506806e-07, "loss": -0.0, "reward": 0.5672355876304209, "reward_std": 0.6572313613723963, "rewards/concensus_correctness_reward_func": 0.015625, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.28067309511243366, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13031249400228262, "step": 2 }, { "completion_length": 383.78125, "epoch": 0.2, "grad_norm": 9.042190551757812, "kl": 0.002136965991667239, "learning_rate": 4.698684378016222e-07, "loss": 0.0, "reward": 0.7526717400178313, "reward_std": 0.6807779443915933, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.4497029990889132, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.16234374791383743, "step": 4 }, { "completion_length": 395.71875, "epoch": 0.3, "grad_norm": 5.500570297241211, "kl": 0.001661403013713425, "learning_rate": 4.193203929064353e-07, "loss": 0.0, "reward": 0.2962013106443919, "reward_std": 0.546006468415726, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.2031700624502264, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09303125087171793, "step": 6 }, { "completion_length": 381.5625, "epoch": 0.4, "grad_norm": 5.596879959106445, "kl": 0.003241013033402851, "learning_rate": 3.5042385616324236e-07, "loss": 0.0, "reward": 0.10878897225484252, "reward_std": 0.47377347096335143, "rewards/concensus_correctness_reward_func": -0.012125000357627869, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.22816396737471223, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10725000081583858, "step": 8 }, { "completion_length": 406.15625, "epoch": 0.5, "grad_norm": 9.993197441101074, "kl": 0.00388924634171417, "learning_rate": 2.706448363680831e-07, "loss": 0.0, "reward": 0.6692488370463252, "reward_std": 0.6855223017628305, "rewards/concensus_correctness_reward_func": 0.09624999761581421, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.2978425864712335, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21265625394880772, "step": 10 }, { "completion_length": 319.53125, "epoch": 0.6, "grad_norm": 51.14741897583008, "kl": 0.0033573091350262985, "learning_rate": 1.886286282148002e-07, "loss": 0.0, "reward": 0.9884568026755005, "reward_std": 1.2231837591389194, "rewards/concensus_correctness_reward_func": 0.625, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.3208317665848881, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04262499767355621, "step": 12 }, { "completion_length": 390.78125, "epoch": 0.7, "grad_norm": 9.56969928741455, "kl": 0.0013517648112610914, "learning_rate": 1.1326296046939333e-07, "loss": 0.0, "reward": 0.6456013559363782, "reward_std": 0.7172366937738843, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.27797637216281146, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05512500088661909, "step": 14 }, { "completion_length": 385.9375, "epoch": 0.8, "grad_norm": 7.131856918334961, "kl": 0.0025733955844771117, "learning_rate": 5.271487265090163e-08, "loss": 0.0, "reward": 0.32755042728967965, "reward_std": 0.5763637216405186, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.25761292362585664, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06993749551475048, "step": 16 }, { "completion_length": 294.0625, "epoch": 0.9, "grad_norm": 77.56391143798828, "kl": 0.004774373170221224, "learning_rate": 1.3545689574841341e-08, "loss": 0.0, "reward": 0.2950056961271912, "reward_std": 0.3544262985305977, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.27097444399259984, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0384687427431345, "step": 18 }, { "completion_length": 320.21875, "epoch": 1.0, "grad_norm": 209.2000274658203, "kl": 0.002910930074904172, "learning_rate": 0.0, "loss": 0.0, "reward": 1.0020002899691463, "reward_std": 1.1356388710410101, "rewards/concensus_correctness_reward_func": 0.6333125000819564, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.2912815499585122, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.061781247379258275, "step": 20 }, { "epoch": 1.0, "step": 20, "total_flos": 0.0, "train_loss": 2.567842602729797e-06, "train_runtime": 216.6255, "train_samples_per_second": 1.477, "train_steps_per_second": 0.092 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }