{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 421.84375, "epoch": 0.1, "grad_norm": 7.744514465332031, "kl": 0.0, "learning_rate": 4.965903258506806e-07, "loss": -0.0, "reward": 0.3874333486892283, "reward_std": 0.8486921527073719, "rewards/concensus_correctness_reward_func": 0.03393750078976154, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.3380895941518247, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04709374811500311, "step": 2 }, { "completion_length": 391.84375, "epoch": 0.2, "grad_norm": 5.300667762756348, "kl": 0.0019505722484609578, "learning_rate": 4.698684378016222e-07, "loss": 0.0, "reward": 1.3110849247314036, "reward_std": 1.6469321683980525, "rewards/concensus_correctness_reward_func": 0.625, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.3452412204351276, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21584375109523535, "step": 4 }, { "completion_length": 394.46875, "epoch": 0.3, "grad_norm": 46.67064666748047, "kl": 0.0019083435508946422, "learning_rate": 4.193203929064353e-07, "loss": 0.0, "reward": 0.3914839383214712, "reward_std": 1.3497825153172016, "rewards/concensus_correctness_reward_func": 0.05831250175833702, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.31582766558858566, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10765624674968421, "step": 6 }, { "completion_length": 411.25, "epoch": 0.4, "grad_norm": 16.974180221557617, "kl": 0.00413768243015511, "learning_rate": 3.5042385616324236e-07, "loss": 0.0, "reward": 0.5314379204064608, "reward_std": 0.5463541564531624, "rewards/concensus_correctness_reward_func": 0.0018749999580904841, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.2779379215789959, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0641250021290034, "step": 8 }, { "completion_length": 385.9375, "epoch": 0.5, "grad_norm": 11.258064270019531, "kl": 0.002582737106422428, "learning_rate": 2.706448363680831e-07, "loss": 0.0, "reward": 2.051718756556511, "reward_std": 0.7069938564673066, "rewards/concensus_correctness_reward_func": 1.3228750005364418, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.4470312101766467, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0318124967161566, "step": 10 }, { "completion_length": 446.59375, "epoch": 0.6, "grad_norm": 4.753056526184082, "kl": 0.004790044869878329, "learning_rate": 1.886286282148002e-07, "loss": 0.0, "reward": 0.8615806391462684, "reward_std": 0.8717811293900013, "rewards/concensus_correctness_reward_func": 0.08181250095367432, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.474549381993711, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0072812524158507586, "step": 12 }, { "completion_length": 401.125, "epoch": 0.7, "grad_norm": 6.981433868408203, "kl": 0.0025986589171225205, "learning_rate": 1.1326296046939333e-07, "loss": 0.0, "reward": 2.256255905609578, "reward_std": 2.391559364972636, "rewards/concensus_correctness_reward_func": 1.361625000834465, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.35309968960064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.22903124894946814, "step": 14 }, { "completion_length": 395.125, "epoch": 0.8, "grad_norm": 7.406398773193359, "kl": 0.008011827812879346, "learning_rate": 5.271487265090163e-08, "loss": 0.0, "reward": 0.8543199766427279, "reward_std": 1.0620909905992448, "rewards/concensus_correctness_reward_func": 0.13006250001490116, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.5446637291461229, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.00790624599903822, "step": 16 }, { "completion_length": 370.03125, "epoch": 0.9, "grad_norm": 56.691341400146484, "kl": 0.011987929798124242, "learning_rate": 1.3545689574841341e-08, "loss": 0.0, "reward": 1.4162302482873201, "reward_std": 1.8340731484349817, "rewards/concensus_correctness_reward_func": 0.6328125, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.4869490059791133, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23396874405443668, "step": 18 }, { "completion_length": 429.40625, "epoch": 1.0, "grad_norm": 5.524323463439941, "kl": 0.00813784722413402, "learning_rate": 0.0, "loss": 0.0, "reward": 1.216457948088646, "reward_std": 0.9376593017950654, "rewards/concensus_correctness_reward_func": 0.2383749932050705, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.44552044360898435, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2825624970719218, "step": 20 }, { "epoch": 1.0, "step": 20, "total_flos": 0.0, "train_loss": 4.647485911846161e-06, "train_runtime": 435.1003, "train_samples_per_second": 0.735, "train_steps_per_second": 0.046 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }