{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.7272727272727275, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 493.09375, "epoch": 0.7272727272727273, "grad_norm": 19.217636108398438, "kl": 0.0011702408446581103, "learning_rate": 5e-07, "loss": 0.0, "reward": 1.405498445034027, "reward_std": 0.94932034984231, "rewards/concensus_correctness_reward_func": 0.017999999225139618, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.47096719685941935, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.38528125965967774, "step": 2 }, { "completion_length": 477.2142857142857, "epoch": 1.3636363636363638, "grad_norm": 0.34427568316459656, "kl": 0.0007293328609583634, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 0.7860592518533979, "reward_std": 0.8674239729131971, "rewards/concensus_correctness_reward_func": 0.030142856495720998, "rewards/consensus_reward_func": 0.42857142857142855, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.374666411961828, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": -0.06517856248787471, "step": 4 }, { "completion_length": 464.17857142857144, "epoch": 2.0, "grad_norm": 0.2992457449436188, "kl": 0.0007828814975385155, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 1.2176040070397514, "reward_std": 0.9366523197719029, "rewards/concensus_correctness_reward_func": 0.08085714067731585, "rewards/consensus_reward_func": 0.42857142857142855, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.423675468989781, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2845000113759722, "step": 6 }, { "completion_length": 416.875, "epoch": 2.7272727272727275, "grad_norm": 1.6301639080047607, "kl": 0.001307578495470807, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 1.0240302421152592, "reward_std": 0.9169644303619862, "rewards/concensus_correctness_reward_func": 0.04437499865889549, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.39487397111952305, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.2566562509164214, "step": 8 }, { "completion_length": 488.2857142857143, "epoch": 3.3636363636363638, "grad_norm": 0.41716712713241577, "kl": 0.0012800927756221167, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 1.2444563167435783, "reward_std": 1.2771144764763969, "rewards/concensus_correctness_reward_func": 0.06314285578472274, "rewards/consensus_reward_func": 0.35714285714285715, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.3460634670087269, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": 0.46024998809610096, "step": 10 }, { "completion_length": 451.60714285714283, "epoch": 4.0, "grad_norm": 0.22203044593334198, "kl": 0.000720701870575015, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 1.0469175236565726, "reward_std": 1.0470092296600342, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.42857142857142855, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5656318260090691, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": 0.03485713792698724, "step": 12 }, { "completion_length": 515.53125, "epoch": 4.7272727272727275, "grad_norm": 0.5425270795822144, "kl": 0.0007404747229884379, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 1.1888295784592628, "reward_std": 0.8663323745131493, "rewards/concensus_correctness_reward_func": 0.07324999943375587, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.42651709727942944, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.37656251387670636, "step": 14 }, { "completion_length": 498.82142857142856, "epoch": 5.363636363636363, "grad_norm": 0.4237583577632904, "kl": 0.0007124320171507341, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 1.024352667587144, "reward_std": 0.6125156283378601, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.35714285714285715, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5225669869354793, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14464285969734192, "step": 16 }, { "completion_length": 447.64285714285717, "epoch": 6.0, "grad_norm": 0.24781115353107452, "kl": 0.0006924252333452127, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 1.0331759282520838, "reward_std": 0.9632131372179303, "rewards/concensus_correctness_reward_func": 0.08085714067731585, "rewards/consensus_reward_func": 0.2857142857142857, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.3932830457176481, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2733214327267238, "step": 18 }, { "completion_length": 481.5625, "epoch": 6.7272727272727275, "grad_norm": 0.5104337334632874, "kl": 0.0007607093502883799, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 1.1216800138354301, "reward_std": 0.8741806000471115, "rewards/concensus_correctness_reward_func": 0.05274999886751175, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.3667425373569131, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.23343750461935997, "step": 20 }, { "epoch": 6.7272727272727275, "step": 20, "total_flos": 0.0, "train_loss": 8.081094904355268e-07, "train_runtime": 611.7777, "train_samples_per_second": 0.523, "train_steps_per_second": 0.033 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }