|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.2, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.8932701796293259, |
|
"epoch": 0.01, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 17.141883850097656, |
|
"learning_rate": 9.649999999999999e-07, |
|
"loss": -0.0, |
|
"num_tokens": 49344.0, |
|
"reward": -459.925, |
|
"reward_std": 173.0284324645996, |
|
"rewards/reward_func/mean": -459.925, |
|
"rewards/reward_func/std": 173.02844009399413, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.5676407814025879, |
|
"epoch": 0.02, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 12.152915000915527, |
|
"learning_rate": 9.15e-07, |
|
"loss": 0.0, |
|
"num_tokens": 104592.0, |
|
"reward": -394.4625, |
|
"reward_std": 221.84682922363282, |
|
"rewards/reward_func/mean": -394.4625, |
|
"rewards/reward_func/std": 221.84682998657226, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.20221070423722268, |
|
"epoch": 0.03, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 6.0900750160217285, |
|
"learning_rate": 8.65e-07, |
|
"loss": 0.0, |
|
"num_tokens": 152208.0, |
|
"reward": -79.8125, |
|
"reward_std": 144.8038761138916, |
|
"rewards/reward_func/mean": -79.8125, |
|
"rewards/reward_func/std": 144.80387077331542, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.09748994875699282, |
|
"epoch": 0.04, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 3.9179723262786865, |
|
"learning_rate": 8.149999999999999e-07, |
|
"loss": -0.0, |
|
"num_tokens": 201216.0, |
|
"reward": 9.3625, |
|
"reward_std": 81.3893858909607, |
|
"rewards/reward_func/mean": 9.3625, |
|
"rewards/reward_func/std": 81.38938302993775, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.08117130994796753, |
|
"epoch": 0.05, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 3.0272433757781982, |
|
"learning_rate": 7.65e-07, |
|
"loss": -0.0, |
|
"num_tokens": 252344.0, |
|
"reward": 60.8, |
|
"reward_std": 22.109106731414794, |
|
"rewards/reward_func/mean": 60.8, |
|
"rewards/reward_func/std": 22.109107208251952, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.0658429590985179, |
|
"epoch": 0.06, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 3.603912115097046, |
|
"learning_rate": 7.149999999999999e-07, |
|
"loss": 0.0, |
|
"num_tokens": 303800.0, |
|
"reward": 65.1875, |
|
"reward_std": 14.921637725830077, |
|
"rewards/reward_func/mean": 65.1875, |
|
"rewards/reward_func/std": 14.921638202667236, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.08412722386419773, |
|
"epoch": 0.07, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 3.701263189315796, |
|
"learning_rate": 6.65e-07, |
|
"loss": 0.0, |
|
"num_tokens": 355936.0, |
|
"reward": 74.2625, |
|
"reward_std": 19.148268938064575, |
|
"rewards/reward_func/mean": 74.2625, |
|
"rewards/reward_func/std": 19.14826898574829, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.08717511333525181, |
|
"epoch": 0.08, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 3.643070936203003, |
|
"learning_rate": 6.149999999999999e-07, |
|
"loss": -0.0, |
|
"num_tokens": 409632.0, |
|
"reward": 71.3, |
|
"reward_std": 19.00277919769287, |
|
"rewards/reward_func/mean": 71.3, |
|
"rewards/reward_func/std": 19.0027795791626, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.10131141170859337, |
|
"epoch": 0.09, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 4.0365071296691895, |
|
"learning_rate": 5.649999999999999e-07, |
|
"loss": 0.0, |
|
"num_tokens": 457048.0, |
|
"reward": 81.2875, |
|
"reward_std": 12.263180470466613, |
|
"rewards/reward_func/mean": 81.2875, |
|
"rewards/reward_func/std": 12.263180875778199, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.07999873682856559, |
|
"epoch": 0.1, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 4.262997150421143, |
|
"learning_rate": 5.149999999999999e-07, |
|
"loss": 0.0, |
|
"num_tokens": 506584.0, |
|
"reward": 73.05, |
|
"reward_std": 16.826302528381348, |
|
"rewards/reward_func/mean": 73.05, |
|
"rewards/reward_func/std": 16.82630310058594, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.07205904349684715, |
|
"epoch": 0.11, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 3.98404598236084, |
|
"learning_rate": 4.65e-07, |
|
"loss": 0.0, |
|
"num_tokens": 556800.0, |
|
"reward": 80.7875, |
|
"reward_std": 11.600197219848633, |
|
"rewards/reward_func/mean": 80.7875, |
|
"rewards/reward_func/std": 11.600197505950927, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.08184156678617001, |
|
"epoch": 0.12, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 2.9209563732147217, |
|
"learning_rate": 4.1499999999999994e-07, |
|
"loss": -0.0, |
|
"num_tokens": 605280.0, |
|
"reward": 82.5625, |
|
"reward_std": 13.004734754562378, |
|
"rewards/reward_func/mean": 82.5625, |
|
"rewards/reward_func/std": 13.004735040664674, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.07969092782586813, |
|
"epoch": 0.13, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 4.116878509521484, |
|
"learning_rate": 3.65e-07, |
|
"loss": 0.0, |
|
"num_tokens": 657160.0, |
|
"reward": 55.6875, |
|
"reward_std": 24.997920417785643, |
|
"rewards/reward_func/mean": 55.6875, |
|
"rewards/reward_func/std": 24.99792127609253, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.08586088940501213, |
|
"epoch": 0.14, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 3.09967041015625, |
|
"learning_rate": 3.15e-07, |
|
"loss": 0.0, |
|
"num_tokens": 707272.0, |
|
"reward": 78.0375, |
|
"reward_std": 13.64857816696167, |
|
"rewards/reward_func/mean": 78.0375, |
|
"rewards/reward_func/std": 13.64857850074768, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.07407695688307285, |
|
"epoch": 0.15, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 3.6434996128082275, |
|
"learning_rate": 2.65e-07, |
|
"loss": 0.0, |
|
"num_tokens": 755712.0, |
|
"reward": 62.6875, |
|
"reward_std": 29.81237063407898, |
|
"rewards/reward_func/mean": 62.6875, |
|
"rewards/reward_func/std": 29.812370777130127, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.08506322149187326, |
|
"epoch": 0.16, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 3.3035037517547607, |
|
"learning_rate": 2.1499999999999998e-07, |
|
"loss": 0.0, |
|
"num_tokens": 806232.0, |
|
"reward": 80.3875, |
|
"reward_std": 12.722473907470704, |
|
"rewards/reward_func/mean": 80.3875, |
|
"rewards/reward_func/std": 12.722473907470704, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.07220943029969931, |
|
"epoch": 0.17, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 4.2111358642578125, |
|
"learning_rate": 1.65e-07, |
|
"loss": 0.0, |
|
"num_tokens": 858016.0, |
|
"reward": 81.125, |
|
"reward_std": 11.847685623168946, |
|
"rewards/reward_func/mean": 81.125, |
|
"rewards/reward_func/std": 11.84768624305725, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.07000144328922034, |
|
"epoch": 0.18, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 3.138835906982422, |
|
"learning_rate": 1.15e-07, |
|
"loss": 0.0, |
|
"num_tokens": 907688.0, |
|
"reward": 72.55, |
|
"reward_std": 17.49347562789917, |
|
"rewards/reward_func/mean": 72.55, |
|
"rewards/reward_func/std": 17.493476104736327, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.06578830443322659, |
|
"epoch": 0.19, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 4.317309379577637, |
|
"learning_rate": 6.5e-08, |
|
"loss": -0.0, |
|
"num_tokens": 960144.0, |
|
"reward": 78.3875, |
|
"reward_std": 13.952195310592652, |
|
"rewards/reward_func/mean": 78.3875, |
|
"rewards/reward_func/std": 13.952195501327514, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 1.0, |
|
"completions/max_length": 256.0, |
|
"completions/max_terminated_length": 0.0, |
|
"completions/mean_length": 256.0, |
|
"completions/mean_terminated_length": 0.0, |
|
"completions/min_length": 256.0, |
|
"completions/min_terminated_length": 0.0, |
|
"entropy": 0.06528270887210966, |
|
"epoch": 0.2, |
|
"frac_reward_zero_std": 0.0, |
|
"grad_norm": 2.896949291229248, |
|
"learning_rate": 1.5e-08, |
|
"loss": -0.0, |
|
"num_tokens": 1006496.0, |
|
"reward": 75.8875, |
|
"reward_std": 12.274623441696168, |
|
"rewards/reward_func/mean": 75.8875, |
|
"rewards/reward_func/std": 12.274623727798462, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 200, |
|
"num_input_tokens_seen": 1006496, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|