{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.7272727272727275, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 439.75, "epoch": 0.7272727272727273, "grad_norm": 0.33189961314201355, "kl": 0.0005682226837961935, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.8935811221599579, "reward_std": 1.1520937494933605, "rewards/concensus_correctness_reward_func": 0.05575000122189522, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.49667483754456043, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.20053125359117985, "step": 2 }, { "completion_length": 517.5, "epoch": 1.3636363636363638, "grad_norm": 0.34845486283302307, "kl": 0.0006451955580684755, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 2.103976920247078, "reward_std": 2.594899445772171, "rewards/concensus_correctness_reward_func": 1.4285714285714286, "rewards/consensus_reward_func": 0.2857142857142857, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.14285714285714285, "rewards/question_recreation_reward_func": 0.6586555327687945, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4118214558277811, "step": 4 }, { "completion_length": 503.60714285714283, "epoch": 2.0, "grad_norm": 0.3112787902355194, "kl": 0.0006537369460732277, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 1.9017176287514823, "reward_std": 2.761974368776594, "rewards/concensus_correctness_reward_func": 0.7780000013964516, "rewards/consensus_reward_func": 0.5714285714285714, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.07142857142857142, "rewards/question_recreation_reward_func": 0.4881462114197867, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.007285714149475098, "step": 6 }, { "completion_length": 488.09375, "epoch": 2.7272727272727275, "grad_norm": 0.28854992985725403, "kl": 0.0006033929603290744, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 0.5143466033041477, "reward_std": 0.7794156074523926, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.4986591096967459, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": -0.12493750266730785, "step": 8 }, { "completion_length": 445.25, "epoch": 3.3636363636363638, "grad_norm": 0.2975492477416992, "kl": 0.0007233569548199219, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 0.6810689483370099, "reward_std": 1.0756073594093323, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.14285714285714285, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.07142857142857142, "rewards/question_recreation_reward_func": 0.42706896364688873, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": 0.021857144577162608, "step": 10 }, { "completion_length": 436.92857142857144, "epoch": 4.0, "grad_norm": 0.3033583462238312, "kl": 0.0007356918441863465, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 0.7826846497399467, "reward_std": 0.9522724577358791, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.07142857142857142, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.07142857142857142, "rewards/question_recreation_reward_func": 0.48600608323301586, "rewards/soft_format_reward_func": 0.017857142857142856, "rewards/strict_format_reward_func": 0.03571428571428571, "rewards/xmlcount_reward_func": 0.10025001210825783, "step": 12 }, { "completion_length": 447.5, "epoch": 4.7272727272727275, "grad_norm": 0.3856496512889862, "kl": 0.0006589046752196737, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 1.3541742600500584, "reward_std": 1.122757863253355, "rewards/concensus_correctness_reward_func": 0.20525000244379044, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.5546430200338364, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0942812692373991, "step": 14 }, { "completion_length": 478.2142857142857, "epoch": 5.363636363636363, "grad_norm": 0.39702117443084717, "kl": 0.0005860687191930733, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 0.7731724424021584, "reward_std": 1.3566676803997584, "rewards/concensus_correctness_reward_func": 0.06371428711073739, "rewards/consensus_reward_func": 0.35714285714285715, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4409581848553249, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": -0.10649998698915754, "step": 16 }, { "completion_length": 483.17857142857144, "epoch": 6.0, "grad_norm": 0.2792685925960541, "kl": 0.0006680597808943796, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 1.7258293820278985, "reward_std": 2.5234424557004655, "rewards/concensus_correctness_reward_func": 0.7142857142857143, "rewards/consensus_reward_func": 0.21428571428571427, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.21428571428571427, "rewards/question_recreation_reward_func": 0.6093294237341199, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": -0.04421427526644298, "step": 18 }, { "completion_length": 437.4375, "epoch": 6.7272727272727275, "grad_norm": 0.38684436678886414, "kl": 0.0006600186170544475, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 0.6950539969839156, "reward_std": 0.6780839432030916, "rewards/concensus_correctness_reward_func": 0.05575000122189522, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.45280400663614273, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06349999457597733, "step": 20 }, { "epoch": 6.7272727272727275, "step": 20, "total_flos": 0.0, "train_loss": 6.047600265901565e-07, "train_runtime": 611.9145, "train_samples_per_second": 0.523, "train_steps_per_second": 0.033 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }