{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 324.4375, "completions/mean_terminated_length": 224.50000762939453, "completions/min_length": 5.5, "completions/min_terminated_length": 5.5, "epoch": 0.08, "frac_reward_zero_std": 0.0, "grad_norm": 9.534358024597168, "kl": -5.265982289115456e-09, "learning_rate": 5e-07, "loss": 0.0881, "num_tokens": 9287.0, "reward": 0.02333822101354599, "reward_std": 0.011800897307693958, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.02333822101354599, "rewards/question_recreation_reward_func/std": 0.014200002886354923, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 831.5, "completions/max_terminated_length": 502.5, "completions/mean_length": 283.4375, "completions/mean_terminated_length": 169.41666793823242, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.16, "frac_reward_zero_std": 0.0, "grad_norm": 12.92307186126709, "kl": 0.0008189797645172803, "learning_rate": 4.864543104251586e-07, "loss": -0.0098, "num_tokens": 17918.0, "reward": 0.08256983105093241, "reward_std": 0.01837824168615043, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.08256983105093241, "rewards/question_recreation_reward_func/std": 0.023414009949192405, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.5, "completions/max_terminated_length": 722.5, "completions/mean_length": 253.75, "completions/mean_terminated_length": 253.75, "completions/min_length": 24.5, "completions/min_terminated_length": 24.5, "epoch": 0.24, "frac_reward_zero_std": 0.0, "grad_norm": 6.703396320343018, "kl": 0.0009663624296081252, "learning_rate": 4.472851273490984e-07, "loss": 0.159, "num_tokens": 26074.0, "reward": 0.026149642653763294, "reward_std": 0.01439021248370409, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.026149642653763294, "rewards/question_recreation_reward_func/std": 0.01520916074514389, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.5, "completions/max_terminated_length": 678.5, "completions/mean_length": 289.75, "completions/mean_terminated_length": 289.75, "completions/min_length": 70.5, "completions/min_terminated_length": 70.5, "epoch": 0.32, "frac_reward_zero_std": 0.0, "grad_norm": 7.256834030151367, "kl": 0.0016369151817343663, "learning_rate": 3.867370395306068e-07, "loss": -0.0972, "num_tokens": 34806.0, "reward": 0.060055448208004236, "reward_std": 0.020912725245580077, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.060055448208004236, "rewards/question_recreation_reward_func/std": 0.034508606884628534, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.5, "completions/max_terminated_length": 555.5, "completions/mean_length": 258.375, "completions/mean_terminated_length": 258.375, "completions/min_length": 3.5, "completions/min_terminated_length": 3.5, "epoch": 0.4, "frac_reward_zero_std": 0.0, "grad_norm": 5.563640594482422, "kl": 0.0017646014493948314, "learning_rate": 3.1137137178519977e-07, "loss": 0.1804, "num_tokens": 43036.0, "reward": 0.1413715137168765, "reward_std": 0.18358006980270147, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.125, "rewards/consensus_reward_func/std": 0.3535533845424652, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.016371519304811954, "rewards/question_recreation_reward_func/std": 0.011078037787228823, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 670.0, "completions/max_terminated_length": 533.5, "completions/mean_length": 258.75, "completions/mean_terminated_length": 214.0, "completions/min_length": 4.5, "completions/min_terminated_length": 4.5, "epoch": 0.48, "frac_reward_zero_std": 0.0, "grad_norm": 6.797975063323975, "kl": 0.003329606697661802, "learning_rate": 2.2935516363191693e-07, "loss": 0.0231, "num_tokens": 51272.0, "reward": 0.30666957050561905, "reward_std": 0.37889517843723297, "rewards/concensus_correctness_reward_func/mean": 0.1197500005364418, "rewards/concensus_correctness_reward_func/std": 0.3387041389942169, "rewards/consensus_reward_func/mean": 0.125, "rewards/consensus_reward_func/std": 0.3535533845424652, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.061919582076370716, "rewards/question_recreation_reward_func/std": 0.04408737272024155, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 285.3125, "completions/mean_terminated_length": 285.3125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.56, "frac_reward_zero_std": 0.0, "grad_norm": 7.043668270111084, "kl": 0.002082884529954754, "learning_rate": 1.4957614383675767e-07, "loss": 0.35, "num_tokens": 59933.0, "reward": 0.06385299749672413, "reward_std": 0.03737428830936551, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.06385299749672413, "rewards/question_recreation_reward_func/std": 0.03331646043807268, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 853.5, "completions/max_terminated_length": 685.0, "completions/mean_length": 341.3125, "completions/mean_terminated_length": 246.9166717529297, "completions/min_length": 64.5, "completions/min_terminated_length": 64.5, "epoch": 0.64, "frac_reward_zero_std": 0.0, "grad_norm": 5.556763172149658, "kl": 0.0018842843419406563, "learning_rate": 8.067960709356478e-08, "loss": 0.0971, "num_tokens": 69490.0, "reward": 0.06051425402984023, "reward_std": 0.019080545171163976, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.06051425402984023, "rewards/question_recreation_reward_func/std": 0.030638275435194373, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.5, "completions/max_terminated_length": 509.5, "completions/mean_length": 192.875, "completions/mean_terminated_length": 192.875, "completions/min_length": 3.5, "completions/min_terminated_length": 3.5, "epoch": 0.72, "frac_reward_zero_std": 0.125, "grad_norm": 6.232974529266357, "kl": 0.006690542242722586, "learning_rate": 3.013156219837776e-08, "loss": -0.0781, "num_tokens": 76672.0, "reward": 2.7677047792822123, "reward_std": 0.006125873536802828, "rewards/concensus_correctness_reward_func/mean": 2.5, "rewards/concensus_correctness_reward_func/std": 4.629100322723389, "rewards/consensus_reward_func/mean": 0.25, "rewards/consensus_reward_func/std": 0.4629100561141968, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.01770483050495386, "rewards/question_recreation_reward_func/std": 0.01125000836327672, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.5, "completions/max_terminated_length": 564.5, "completions/mean_length": 184.0625, "completions/mean_terminated_length": 184.0625, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.8, "frac_reward_zero_std": 0.0, "grad_norm": 9.507555961608887, "kl": 0.002568137046182528, "learning_rate": 3.4096741493194193e-09, "loss": 0.0613, "num_tokens": 83713.0, "reward": 0.14513505343347788, "reward_std": 0.180526792537421, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.125, "rewards/consensus_reward_func/std": 0.3535533845424652, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.02013504970818758, "rewards/question_recreation_reward_func/std": 0.011476744432002306, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 20 }, { "epoch": 0.8, "step": 20, "total_flos": 0.0, "train_loss": 0.0773889608681202, "train_runtime": 1482.6063, "train_samples_per_second": 0.108, "train_steps_per_second": 0.013 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 83713, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }