{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.7272727272727275, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 391.21875, "epoch": 0.7272727272727273, "grad_norm": 2.373386859893799, "kl": 0.0017298507882514969, "learning_rate": 5e-07, "loss": 0.0, "reward": 1.027605127543211, "reward_std": 0.8573780730366707, "rewards/concensus_correctness_reward_func": 0.01549999974668026, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.34066762682050467, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2183124925941229, "step": 2 }, { "completion_length": 441.0, "epoch": 1.3636363636363638, "grad_norm": 0.42142999172210693, "kl": 0.0007273116830869444, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 1.3222310415336065, "reward_std": 0.9695166115249906, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.35714285714285715, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.14285714285714285, "rewards/question_recreation_reward_func": 0.3179096079298428, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.05357142857142857, "rewards/xmlcount_reward_func": 0.4507499805518559, "step": 4 }, { "completion_length": 411.4642857142857, "epoch": 2.0, "grad_norm": 0.31262725591659546, "kl": 0.0007632451763908778, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 1.3550525946276528, "reward_std": 1.371002929551261, "rewards/concensus_correctness_reward_func": 0.07378571374075753, "rewards/consensus_reward_func": 0.5714285714285714, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.3721954588379179, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3376428612640926, "step": 6 }, { "completion_length": 393.34375, "epoch": 2.7272727272727275, "grad_norm": 0.5475780963897705, "kl": 0.0007579198500025086, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 1.18819116987288, "reward_std": 1.0423251129686832, "rewards/concensus_correctness_reward_func": 0.03343749977648258, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.399628683924675, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.34887499548494816, "step": 8 }, { "completion_length": 303.07142857142856, "epoch": 3.3636363636363638, "grad_norm": 0.46060845255851746, "kl": 0.0008165521680244378, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 1.2178367376327515, "reward_std": 1.3261123895645142, "rewards/concensus_correctness_reward_func": 0.024642857057707652, "rewards/consensus_reward_func": 0.5714285714285714, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.33272959504808697, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.28903571729149136, "step": 10 }, { "completion_length": 477.07142857142856, "epoch": 4.0, "grad_norm": 0.4437929689884186, "kl": 0.0007346254590499614, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 0.9648902629102979, "reward_std": 1.0374200471809931, "rewards/concensus_correctness_reward_func": 0.021857142448425293, "rewards/consensus_reward_func": 0.2857142857142857, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.3882473600762231, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03571428571428571, "rewards/xmlcount_reward_func": 0.23335714638233185, "step": 12 }, { "completion_length": 414.28125, "epoch": 4.7272727272727275, "grad_norm": 0.6084716320037842, "kl": 0.000752412059227936, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 1.439010389149189, "reward_std": 1.063487522304058, "rewards/concensus_correctness_reward_func": 0.05024999938905239, "rewards/consensus_reward_func": 0.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.2996041877195239, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.46415624767541885, "step": 14 }, { "completion_length": 367.32142857142856, "epoch": 5.363636363636363, "grad_norm": 0.5186842083930969, "kl": 0.0007599871439327087, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 1.2309029187474931, "reward_std": 1.1545301760946, "rewards/concensus_correctness_reward_func": 0.01771428542477744, "rewards/consensus_reward_func": 0.2857142857142857, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.448474326304027, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4790000042745045, "step": 16 }, { "completion_length": 478.64285714285717, "epoch": 6.0, "grad_norm": 0.4252350628376007, "kl": 0.0007060284011199006, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 1.295845742736544, "reward_std": 0.9719425099236625, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.07142857142857142, "rewards/question_recreation_reward_func": 0.3770243099757603, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34739284004483906, "step": 18 }, { "completion_length": 368.9375, "epoch": 6.7272727272727275, "grad_norm": 1.0040364265441895, "kl": 0.0008240167480835225, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 1.2896132469177246, "reward_std": 1.0858882665634155, "rewards/concensus_correctness_reward_func": 0.06468749791383743, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.3639882355928421, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.4390625096857548, "step": 20 }, { "epoch": 6.7272727272727275, "step": 20, "total_flos": 0.0, "train_loss": 7.97019049514347e-07, "train_runtime": 537.6818, "train_samples_per_second": 0.595, "train_steps_per_second": 0.037 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }