{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.7272727272727275, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 424.6875, "epoch": 0.7272727272727273, "grad_norm": 0.43405377864837646, "kl": 0.0006831735299783759, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.8635006062686443, "reward_std": 0.9787142872810364, "rewards/concensus_correctness_reward_func": 0.035999998450279236, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4570631179958582, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.1048125009983778, "step": 2 }, { "completion_length": 378.2857142857143, "epoch": 1.3636363636363638, "grad_norm": 1.0889649391174316, "kl": 0.0006943551125004888, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 0.5840824406061854, "reward_std": 0.6718125343322754, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.07142857142857142, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.3689038535313947, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14375001617840358, "step": 4 }, { "completion_length": 408.42857142857144, "epoch": 2.0, "grad_norm": 5.749520778656006, "kl": 0.0008105350176005491, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 1.5108186176845007, "reward_std": 1.2569768513951982, "rewards/concensus_correctness_reward_func": 0.11528571162905012, "rewards/consensus_reward_func": 0.6428571428571429, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5690686276980809, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18360714720828192, "step": 6 }, { "completion_length": 541.09375, "epoch": 2.7272727272727275, "grad_norm": 0.27186667919158936, "kl": 0.0006820280032115988, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 1.114719219505787, "reward_std": 1.1962014511227608, "rewards/concensus_correctness_reward_func": 0.00787500012665987, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.5445942096412182, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18725000135600567, "step": 8 }, { "completion_length": 381.64285714285717, "epoch": 3.3636363636363638, "grad_norm": 0.4672013223171234, "kl": 0.0007658531373765852, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 1.027580979679312, "reward_std": 1.0205590171473367, "rewards/concensus_correctness_reward_func": 0.06171428305762155, "rewards/consensus_reward_func": 0.35714285714285715, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.45865238670791897, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": 0.1322142971413476, "step": 10 }, { "completion_length": 493.4642857142857, "epoch": 4.0, "grad_norm": 0.33820706605911255, "kl": 0.0006163329900508481, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 0.9565395074231284, "reward_std": 1.093166755778449, "rewards/concensus_correctness_reward_func": 0.03142857125827244, "rewards/consensus_reward_func": 0.2857142857142857, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5024323686957359, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.017857142857142856, "rewards/xmlcount_reward_func": 0.11910714794482503, "step": 12 }, { "completion_length": 440.40625, "epoch": 4.7272727272727275, "grad_norm": 1.1928702592849731, "kl": 0.000714650159352459, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 1.2266200836747885, "reward_std": 1.2259398996829987, "rewards/concensus_correctness_reward_func": 0.10087499767541885, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.40987010672688484, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.26275000162422657, "step": 14 }, { "completion_length": 463.17857142857144, "epoch": 5.363636363636363, "grad_norm": 0.31915608048439026, "kl": 0.000624320494742798, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 1.2959183220352446, "reward_std": 1.0866505248206002, "rewards/concensus_correctness_reward_func": 0.05357142857142857, "rewards/consensus_reward_func": 0.42857142857142855, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5370612144470215, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2767142930201122, "step": 16 }, { "completion_length": 362.85714285714283, "epoch": 6.0, "grad_norm": 0.6950757503509521, "kl": 0.0008193998697346874, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 0.9905583645616259, "reward_std": 1.0358542374202184, "rewards/concensus_correctness_reward_func": 0.06171428305762155, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.3102726340293884, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11857143257345472, "step": 18 }, { "completion_length": 394.25, "epoch": 6.7272727272727275, "grad_norm": 0.523013710975647, "kl": 0.0007893772781244479, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 0.7763789296150208, "reward_std": 0.8996854834258556, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.48691016249358654, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03946874663233757, "step": 20 }, { "epoch": 6.7272727272727275, "step": 20, "total_flos": 0.0, "train_loss": 6.585680921489257e-07, "train_runtime": 1575.2293, "train_samples_per_second": 0.203, "train_steps_per_second": 0.013 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }