{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.7272727272727275, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 418.59375, "epoch": 0.7272727272727273, "grad_norm": 0.705902099609375, "kl": 0.0007176655053626746, "learning_rate": 5e-07, "loss": 0.0, "reward": 1.2138379961252213, "reward_std": 0.9511784389615059, "rewards/concensus_correctness_reward_func": 0.052000001072883606, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6546192467212677, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.19471874088048935, "step": 2 }, { "completion_length": 545.6785714285714, "epoch": 1.3636363636363638, "grad_norm": 0.4254428744316101, "kl": 0.0007325080223381519, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 0.7083579770156315, "reward_std": 1.2698204985686712, "rewards/concensus_correctness_reward_func": 0.02971428632736206, "rewards/consensus_reward_func": 0.2857142857142857, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.38753653903092655, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.005392868603978839, "step": 4 }, { "completion_length": 521.8214285714286, "epoch": 2.0, "grad_norm": 0.26565220952033997, "kl": 0.0006595713951225792, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 1.7453854935509818, "reward_std": 1.2103857483182634, "rewards/concensus_correctness_reward_func": 0.0025000000106436865, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.07142857142857142, "rewards/question_recreation_reward_func": 0.7418498141424996, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.429607161453792, "step": 6 }, { "completion_length": 398.34375, "epoch": 2.7272727272727275, "grad_norm": 0.707371175289154, "kl": 0.0008318861873704009, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 0.9764930829405785, "reward_std": 1.182244822382927, "rewards/concensus_correctness_reward_func": 0.026000000536441803, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.494743081741035, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.19012501090765, "step": 8 }, { "completion_length": 586.8571428571429, "epoch": 3.3636363636363638, "grad_norm": 0.4055241346359253, "kl": 0.0006922878757385271, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 0.9346474262752703, "reward_std": 0.8339587066854749, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.21428571428571427, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5591831388218063, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": 0.14332143430198943, "step": 10 }, { "completion_length": 424.0357142857143, "epoch": 4.0, "grad_norm": 0.6968374848365784, "kl": 0.0007245375101254987, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 1.283107191324234, "reward_std": 1.1396433966500419, "rewards/concensus_correctness_reward_func": 0.0675714271409171, "rewards/consensus_reward_func": 0.5714285714285714, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6747857428022793, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.030678574528012956, "step": 12 }, { "completion_length": 482.78125, "epoch": 4.7272727272727275, "grad_norm": 18.94466209411621, "kl": 0.0011837783531518653, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 1.2526901960372925, "reward_std": 0.9441794194281101, "rewards/concensus_correctness_reward_func": 0.05925000109709799, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.542658930644393, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3382812477648258, "step": 14 }, { "completion_length": 439.5357142857143, "epoch": 5.363636363636363, "grad_norm": 0.509993851184845, "kl": 0.0012793634170001106, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 0.9020633314337049, "reward_std": 0.9196936445576804, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.35714285714285715, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.588849025113242, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.043928576367242, "step": 16 }, { "completion_length": 420.57142857142856, "epoch": 6.0, "grad_norm": 0.4443037807941437, "kl": 0.0007548547083778041, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 0.938133961388043, "reward_std": 1.0177770512444633, "rewards/concensus_correctness_reward_func": 0.03785714081355503, "rewards/consensus_reward_func": 0.2857142857142857, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5216696602957589, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09289286392075675, "step": 18 }, { "completion_length": 508.03125, "epoch": 6.7272727272727275, "grad_norm": 0.43586501479148865, "kl": 0.0006341933367366437, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 1.1600369960069656, "reward_std": 1.1098762974143028, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6399120055139065, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0826250072568655, "step": 20 }, { "epoch": 6.7272727272727275, "step": 20, "total_flos": 0.0, "train_loss": 7.721477743416472e-07, "train_runtime": 1665.8377, "train_samples_per_second": 0.192, "train_steps_per_second": 0.012 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }