{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 285.40625, "epoch": 0.20512820512820512, "grad_norm": 0.5498890280723572, "kl": 0.0, "learning_rate": 5e-07, "loss": -0.0, "reward": 1.2953027412295341, "reward_std": 1.2016863971948624, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.424521510489285, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.41765623539686203, "step": 2 }, { "completion_length": 332.71875, "epoch": 0.41025641025641024, "grad_norm": 0.6451846957206726, "kl": 0.0009901017911033705, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 1.095869965851307, "reward_std": 0.8837384283542633, "rewards/concensus_correctness_reward_func": 0.05999999865889549, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.6056824438273907, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.16456249170005322, "step": 4 }, { "completion_length": 305.78125, "epoch": 0.6153846153846154, "grad_norm": 0.6242564916610718, "kl": 0.0010371932803536765, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 1.6818831153213978, "reward_std": 1.1792906485497952, "rewards/concensus_correctness_reward_func": 0.18818750232458115, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.6482893712818623, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.26728123845532537, "step": 6 }, { "completion_length": 272.28125, "epoch": 0.8205128205128205, "grad_norm": 1.0570781230926514, "kl": 0.0010937876504613087, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 1.474930763244629, "reward_std": 1.6228845715522766, "rewards/concensus_correctness_reward_func": 0.21981250122189522, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.392680736258626, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.39368750154972076, "step": 8 }, { "completion_length": 308.85714285714283, "epoch": 1.0, "grad_norm": 0.8395159840583801, "kl": 0.0010394884489609727, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 1.5684541825737273, "reward_std": 1.035864144563675, "rewards/concensus_correctness_reward_func": 0.06871428659984044, "rewards/consensus_reward_func": 0.07142857142857142, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.2857142857142857, "rewards/question_recreation_reward_func": 0.5535256351743426, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03571428571428571, "rewards/xmlcount_reward_func": 0.553357156259673, "step": 10 }, { "completion_length": 288.96875, "epoch": 1.205128205128205, "grad_norm": 0.7363212704658508, "kl": 0.0010777587885968387, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 1.245329074561596, "reward_std": 1.1123233437538147, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.38261033222079277, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.28459377074614167, "step": 12 }, { "completion_length": 347.34375, "epoch": 1.4102564102564101, "grad_norm": 0.7052019834518433, "kl": 0.0010524256431381218, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 1.2003353461623192, "reward_std": 1.4865036271512508, "rewards/concensus_correctness_reward_func": 0.24025000259280205, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.431210333481431, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.13825000263750553, "step": 14 }, { "completion_length": 280.65625, "epoch": 1.6153846153846154, "grad_norm": 0.5906476974487305, "kl": 0.001059539194102399, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 1.1633149608969688, "reward_std": 0.8385320976376534, "rewards/concensus_correctness_reward_func": 0.015625, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.5233462303876877, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.4524687472730875, "step": 16 }, { "completion_length": 264.46875, "epoch": 1.8205128205128205, "grad_norm": 0.6119975447654724, "kl": 0.0011653968540485948, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 1.4025923907756805, "reward_std": 1.28750778734684, "rewards/concensus_correctness_reward_func": 0.05999999865889549, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.5302173933014274, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 0.53112499602139, "step": 18 }, { "completion_length": 401.39285714285717, "epoch": 2.0, "grad_norm": 0.7071681022644043, "kl": 0.0010130427877551743, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 1.219790335212435, "reward_std": 1.6927061762128557, "rewards/concensus_correctness_reward_func": 0.20557143007005965, "rewards/consensus_reward_func": 0.21428571428571427, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.2857142857142857, "rewards/question_recreation_reward_func": 0.49729034304618835, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03571428571428571, "rewards/xmlcount_reward_func": -0.01878571723188673, "step": 20 }, { "epoch": 2.0, "step": 20, "total_flos": 0.0, "train_loss": 9.246089391012901e-07, "train_runtime": 1602.5484, "train_samples_per_second": 0.2, "train_steps_per_second": 0.012 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }