|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9984, |
|
"eval_steps": 100, |
|
"global_step": 468, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.328125, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 807.5, |
|
"completions/mean_length": 603.8828125, |
|
"completions/mean_terminated_length": 396.44896697998047, |
|
"completions/min_length": 85.25, |
|
"completions/min_terminated_length": 85.25, |
|
"epoch": 0.0021333333333333334, |
|
"grad_norm": 0.29046246524602504, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": -0.0123, |
|
"num_tokens": 96033.0, |
|
"reward": 0.109375, |
|
"reward_std": 0.18543372116982937, |
|
"rewards/accuracy_reward/mean": 0.109375, |
|
"rewards/accuracy_reward/std": 0.23456304892897606, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.310546875, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 809.125, |
|
"completions/mean_length": 579.490234375, |
|
"completions/mean_terminated_length": 381.3663749694824, |
|
"completions/min_length": 69.09375, |
|
"completions/min_terminated_length": 69.09375, |
|
"epoch": 0.010666666666666666, |
|
"grad_norm": 0.32341920946567715, |
|
"kl": 0.000206679105758667, |
|
"learning_rate": 2.553191489361702e-07, |
|
"loss": 0.0139, |
|
"num_tokens": 463812.0, |
|
"reward": 0.107421875, |
|
"reward_std": 0.18129237182438374, |
|
"rewards/accuracy_reward/mean": 0.107421875, |
|
"rewards/accuracy_reward/std": 0.23479509353637695, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2828125, |
|
"completions/max_length": 1022.75, |
|
"completions/max_terminated_length": 800.325, |
|
"completions/mean_length": 570.1828125, |
|
"completions/mean_terminated_length": 391.86217346191404, |
|
"completions/min_length": 65.05, |
|
"completions/min_terminated_length": 65.05, |
|
"epoch": 0.021333333333333333, |
|
"grad_norm": 0.31691394403397916, |
|
"kl": 0.00028862953186035154, |
|
"learning_rate": 5.74468085106383e-07, |
|
"loss": 0.0054, |
|
"num_tokens": 917313.0, |
|
"reward": 0.1046875, |
|
"reward_std": 0.19559106044471264, |
|
"rewards/accuracy_reward/mean": 0.1046875, |
|
"rewards/accuracy_reward/std": 0.2545704640448093, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.278125, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 825.475, |
|
"completions/mean_length": 561.0359375, |
|
"completions/mean_terminated_length": 383.8768829345703, |
|
"completions/min_length": 75.625, |
|
"completions/min_terminated_length": 75.625, |
|
"epoch": 0.032, |
|
"grad_norm": 0.2712721137539722, |
|
"kl": 0.0003191232681274414, |
|
"learning_rate": 8.936170212765958e-07, |
|
"loss": 0.0012, |
|
"num_tokens": 1370600.0, |
|
"reward": 0.090625, |
|
"reward_std": 0.16034209839999675, |
|
"rewards/accuracy_reward/mean": 0.090625, |
|
"rewards/accuracy_reward/std": 0.20271828323602675, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2703125, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 810.4, |
|
"completions/mean_length": 557.6, |
|
"completions/mean_terminated_length": 387.25951232910154, |
|
"completions/min_length": 68.8, |
|
"completions/min_terminated_length": 68.8, |
|
"epoch": 0.042666666666666665, |
|
"grad_norm": 0.22807478508268822, |
|
"kl": 0.0006281852722167969, |
|
"learning_rate": 1.2127659574468085e-06, |
|
"loss": 0.0119, |
|
"num_tokens": 1827680.0, |
|
"reward": 0.0953125, |
|
"reward_std": 0.1787047166377306, |
|
"rewards/accuracy_reward/mean": 0.0953125, |
|
"rewards/accuracy_reward/std": 0.23710305467247964, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.221875, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 842.425, |
|
"completions/mean_length": 573.5546875, |
|
"completions/mean_terminated_length": 449.37009353637694, |
|
"completions/min_length": 123.775, |
|
"completions/min_terminated_length": 123.775, |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 0.44388579378313797, |
|
"kl": 0.0013070106506347656, |
|
"learning_rate": 1.5319148936170212e-06, |
|
"loss": -0.0003, |
|
"num_tokens": 2288499.0, |
|
"reward": 0.115625, |
|
"reward_std": 0.19537889324128627, |
|
"rewards/accuracy_reward/mean": 0.115625, |
|
"rewards/accuracy_reward/std": 0.24712119549512862, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2125, |
|
"completions/max_length": 1013.025, |
|
"completions/max_terminated_length": 822.925, |
|
"completions/mean_length": 571.5015625, |
|
"completions/mean_terminated_length": 456.50114212036135, |
|
"completions/min_length": 126.625, |
|
"completions/min_terminated_length": 126.625, |
|
"epoch": 0.064, |
|
"grad_norm": 0.29459049413117055, |
|
"kl": 0.0028873443603515624, |
|
"learning_rate": 1.851063829787234e-06, |
|
"loss": 0.0168, |
|
"num_tokens": 2747108.0, |
|
"reward": 0.1703125, |
|
"reward_std": 0.23525591157376766, |
|
"rewards/accuracy_reward/mean": 0.1703125, |
|
"rewards/accuracy_reward/std": 0.30535352900624274, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1921875, |
|
"completions/max_length": 983.125, |
|
"completions/max_terminated_length": 856.15, |
|
"completions/mean_length": 569.5109375, |
|
"completions/mean_terminated_length": 468.74621734619143, |
|
"completions/min_length": 140.675, |
|
"completions/min_terminated_length": 140.675, |
|
"epoch": 0.07466666666666667, |
|
"grad_norm": 0.312902146600806, |
|
"kl": 0.004683685302734375, |
|
"learning_rate": 2.170212765957447e-06, |
|
"loss": -0.0096, |
|
"num_tokens": 3203995.0, |
|
"reward": 0.2921875, |
|
"reward_std": 0.2818045925348997, |
|
"rewards/accuracy_reward/mean": 0.2921875, |
|
"rewards/accuracy_reward/std": 0.35747046694159507, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.20625, |
|
"completions/max_length": 1015.0, |
|
"completions/max_terminated_length": 860.75, |
|
"completions/mean_length": 610.028125, |
|
"completions/mean_terminated_length": 512.4429527282715, |
|
"completions/min_length": 243.85, |
|
"completions/min_terminated_length": 243.85, |
|
"epoch": 0.08533333333333333, |
|
"grad_norm": 0.3492387121634339, |
|
"kl": 0.006721115112304688, |
|
"learning_rate": 2.4893617021276598e-06, |
|
"loss": 0.0155, |
|
"num_tokens": 3683013.0, |
|
"reward": 0.4015625, |
|
"reward_std": 0.30027358755469324, |
|
"rewards/accuracy_reward/mean": 0.4015625, |
|
"rewards/accuracy_reward/std": 0.405949179828167, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.18125, |
|
"completions/max_length": 1003.275, |
|
"completions/max_terminated_length": 807.4, |
|
"completions/mean_length": 574.1125, |
|
"completions/mean_terminated_length": 482.73938674926757, |
|
"completions/min_length": 233.7, |
|
"completions/min_terminated_length": 233.7, |
|
"epoch": 0.096, |
|
"grad_norm": 0.34838562088741554, |
|
"kl": 0.009783935546875, |
|
"learning_rate": 2.8085106382978724e-06, |
|
"loss": 0.0274, |
|
"num_tokens": 4141981.0, |
|
"reward": 0.453125, |
|
"reward_std": 0.26892418451607225, |
|
"rewards/accuracy_reward/mean": 0.453125, |
|
"rewards/accuracy_reward/std": 0.428996454924345, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.24375, |
|
"completions/max_length": 1007.625, |
|
"completions/max_terminated_length": 854.4, |
|
"completions/mean_length": 660.3234375, |
|
"completions/mean_terminated_length": 554.0355278015137, |
|
"completions/min_length": 317.9, |
|
"completions/min_terminated_length": 317.9, |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 0.3583613636029149, |
|
"kl": 0.009557342529296875, |
|
"learning_rate": 2.9998329491279003e-06, |
|
"loss": 0.0343, |
|
"num_tokens": 4648684.0, |
|
"reward": 0.4359375, |
|
"reward_std": 0.26149683743715285, |
|
"rewards/accuracy_reward/mean": 0.4359375, |
|
"rewards/accuracy_reward/std": 0.40705749318003653, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2875, |
|
"completions/max_length": 1015.175, |
|
"completions/max_terminated_length": 881.525, |
|
"completions/mean_length": 678.659375, |
|
"completions/mean_terminated_length": 557.4395378112793, |
|
"completions/min_length": 286.05, |
|
"completions/min_terminated_length": 286.05, |
|
"epoch": 0.11733333333333333, |
|
"grad_norm": 0.2916953090873945, |
|
"kl": 0.01027984619140625, |
|
"learning_rate": 2.9979540541011378e-06, |
|
"loss": 0.0193, |
|
"num_tokens": 5163018.0, |
|
"reward": 0.4265625, |
|
"reward_std": 0.2812244530767202, |
|
"rewards/accuracy_reward/mean": 0.4265625, |
|
"rewards/accuracy_reward/std": 0.39905172660946847, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.31875, |
|
"completions/max_length": 1011.0, |
|
"completions/max_terminated_length": 856.575, |
|
"completions/mean_length": 680.340625, |
|
"completions/mean_terminated_length": 527.7020431518555, |
|
"completions/min_length": 285.125, |
|
"completions/min_terminated_length": 285.125, |
|
"epoch": 0.128, |
|
"grad_norm": 0.3097602989893739, |
|
"kl": 0.01049346923828125, |
|
"learning_rate": 2.993990074515455e-06, |
|
"loss": 0.0374, |
|
"num_tokens": 5687748.0, |
|
"reward": 0.4953125, |
|
"reward_std": 0.27433369904756544, |
|
"rewards/accuracy_reward/mean": 0.4953125, |
|
"rewards/accuracy_reward/std": 0.42715574726462363, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.378125, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 860.525, |
|
"completions/mean_length": 737.828125, |
|
"completions/mean_terminated_length": 572.9206932067871, |
|
"completions/min_length": 346.95, |
|
"completions/min_terminated_length": 346.95, |
|
"epoch": 0.13866666666666666, |
|
"grad_norm": 0.3757665149243777, |
|
"kl": 0.0093902587890625, |
|
"learning_rate": 2.987946528051855e-06, |
|
"loss": 0.0336, |
|
"num_tokens": 6255822.0, |
|
"reward": 0.3546875, |
|
"reward_std": 0.26096867546439173, |
|
"rewards/accuracy_reward/mean": 0.3546875, |
|
"rewards/accuracy_reward/std": 0.3680890344083309, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.3265625, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 794.05, |
|
"completions/mean_length": 666.8078125, |
|
"completions/mean_terminated_length": 498.4326515197754, |
|
"completions/min_length": 271.175, |
|
"completions/min_terminated_length": 271.175, |
|
"epoch": 0.14933333333333335, |
|
"grad_norm": 0.3067868868446902, |
|
"kl": 0.0102752685546875, |
|
"learning_rate": 2.9798318270547456e-06, |
|
"loss": 0.0687, |
|
"num_tokens": 6760139.0, |
|
"reward": 0.5265625, |
|
"reward_std": 0.3408231448382139, |
|
"rewards/accuracy_reward/mean": 0.5265625, |
|
"rewards/accuracy_reward/std": 0.4369465745985508, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2609375, |
|
"completions/max_length": 1020.325, |
|
"completions/max_terminated_length": 863.3, |
|
"completions/mean_length": 670.8140625, |
|
"completions/mean_terminated_length": 550.6407440185546, |
|
"completions/min_length": 294.25, |
|
"completions/min_terminated_length": 294.25, |
|
"epoch": 0.16, |
|
"grad_norm": 0.29577449728572786, |
|
"kl": 0.00943450927734375, |
|
"learning_rate": 2.96965726682234e-06, |
|
"loss": 0.0437, |
|
"num_tokens": 7277532.0, |
|
"reward": 0.359375, |
|
"reward_std": 0.26728419214487076, |
|
"rewards/accuracy_reward/mean": 0.359375, |
|
"rewards/accuracy_reward/std": 0.3770089760422707, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2140625, |
|
"completions/max_length": 992.175, |
|
"completions/max_terminated_length": 824.875, |
|
"completions/mean_length": 624.7859375, |
|
"completions/mean_terminated_length": 528.7103263854981, |
|
"completions/min_length": 286.725, |
|
"completions/min_terminated_length": 286.725, |
|
"epoch": 0.17066666666666666, |
|
"grad_norm": 0.25358782513135053, |
|
"kl": 0.01270294189453125, |
|
"learning_rate": 2.9574370098841073e-06, |
|
"loss": 0.0206, |
|
"num_tokens": 7770987.0, |
|
"reward": 0.446875, |
|
"reward_std": 0.25387620851397513, |
|
"rewards/accuracy_reward/mean": 0.446875, |
|
"rewards/accuracy_reward/std": 0.41064497008919715, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2296875, |
|
"completions/max_length": 1019.075, |
|
"completions/max_terminated_length": 863.475, |
|
"completions/mean_length": 652.8296875, |
|
"completions/mean_terminated_length": 547.2898578643799, |
|
"completions/min_length": 284.5, |
|
"completions/min_terminated_length": 284.5, |
|
"epoch": 0.18133333333333335, |
|
"grad_norm": 0.24694213995347897, |
|
"kl": 0.012436676025390624, |
|
"learning_rate": 2.9431880662871697e-06, |
|
"loss": 0.0418, |
|
"num_tokens": 8277326.0, |
|
"reward": 0.43125, |
|
"reward_std": 0.27223431766033174, |
|
"rewards/accuracy_reward/mean": 0.43125, |
|
"rewards/accuracy_reward/std": 0.40274464786052705, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2734375, |
|
"completions/max_length": 1016.775, |
|
"completions/max_terminated_length": 867.3, |
|
"completions/mean_length": 672.5609375, |
|
"completions/mean_terminated_length": 555.7766807556152, |
|
"completions/min_length": 290.45, |
|
"completions/min_terminated_length": 290.45, |
|
"epoch": 0.192, |
|
"grad_norm": 0.43538797427920595, |
|
"kl": 0.01246337890625, |
|
"learning_rate": 2.926930269919085e-06, |
|
"loss": 0.0397, |
|
"num_tokens": 8791085.0, |
|
"reward": 0.484375, |
|
"reward_std": 0.32977913729846475, |
|
"rewards/accuracy_reward/mean": 0.484375, |
|
"rewards/accuracy_reward/std": 0.4205139525234699, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.21875, |
|
"completions/max_length": 1015.15, |
|
"completions/max_terminated_length": 810.625, |
|
"completions/mean_length": 604.1609375, |
|
"completions/mean_terminated_length": 492.36522674560547, |
|
"completions/min_length": 240.85, |
|
"completions/min_terminated_length": 240.85, |
|
"epoch": 0.20266666666666666, |
|
"grad_norm": 0.363946563847488, |
|
"kl": 0.014809417724609374, |
|
"learning_rate": 2.908686250899966e-06, |
|
"loss": 0.0326, |
|
"num_tokens": 9263084.0, |
|
"reward": 0.4546875, |
|
"reward_std": 0.2643970146775246, |
|
"rewards/accuracy_reward/mean": 0.4546875, |
|
"rewards/accuracy_reward/std": 0.37832572385668756, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 0.40119715842853293, |
|
"learning_rate": 2.8884814040823755e-06, |
|
"loss": 0.0642, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.21333333333333335, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completions/clipped_ratio": 0.254325, |
|
"eval_completions/max_length": 1017.8164, |
|
"eval_completions/max_terminated_length": 844.72, |
|
"eval_completions/mean_length": 641.31905, |
|
"eval_completions/mean_terminated_length": 518.6613545043946, |
|
"eval_completions/min_length": 271.8056, |
|
"eval_completions/min_terminated_length": 271.8056, |
|
"eval_kl": 0.01416409912109375, |
|
"eval_loss": 0.043244268745183945, |
|
"eval_num_tokens": 9739068.0, |
|
"eval_reward": 0.402925, |
|
"eval_reward_std": 0.2727512115895748, |
|
"eval_rewards/accuracy_reward/mean": 0.402925, |
|
"eval_rewards/accuracy_reward/std": 0.4037328534960747, |
|
"eval_rewards/format_reward/mean": 0.0, |
|
"eval_rewards/format_reward/std": 0.0, |
|
"eval_runtime": 17198.7184, |
|
"eval_samples_per_second": 0.291, |
|
"eval_steps_per_second": 0.018, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2265625, |
|
"completions/max_length": 1017.7875, |
|
"completions/max_terminated_length": 829.6, |
|
"completions/mean_length": 614.565625, |
|
"completions/mean_terminated_length": 500.5110694885254, |
|
"completions/min_length": 247.775, |
|
"completions/min_terminated_length": 247.775, |
|
"epoch": 0.224, |
|
"grad_norm": 0.40852771135479854, |
|
"kl": 0.0152984619140625, |
|
"learning_rate": 2.866343853702835e-06, |
|
"loss": 0.0384, |
|
"num_tokens": 10216536.0, |
|
"reward": 0.48515625, |
|
"reward_std": 0.2992698045447469, |
|
"rewards/accuracy_reward/mean": 0.48515625, |
|
"rewards/accuracy_reward/std": 0.43572295978665354, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2515625, |
|
"completions/max_length": 1018.075, |
|
"completions/max_terminated_length": 817.625, |
|
"completions/mean_length": 635.7125, |
|
"completions/mean_terminated_length": 516.0587692260742, |
|
"completions/min_length": 288.85, |
|
"completions/min_terminated_length": 288.85, |
|
"epoch": 0.23466666666666666, |
|
"grad_norm": 0.3358851233027883, |
|
"kl": 0.015325927734375, |
|
"learning_rate": 2.842304414234153e-06, |
|
"loss": 0.0292, |
|
"num_tokens": 10712328.0, |
|
"reward": 0.46875, |
|
"reward_std": 0.254452820494771, |
|
"rewards/accuracy_reward/mean": 0.46875, |
|
"rewards/accuracy_reward/std": 0.3950736179947853, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2625, |
|
"completions/max_length": 1012.575, |
|
"completions/max_terminated_length": 838.725, |
|
"completions/mean_length": 643.7859375, |
|
"completions/mean_terminated_length": 518.7130432128906, |
|
"completions/min_length": 279.9, |
|
"completions/min_terminated_length": 279.9, |
|
"epoch": 0.24533333333333332, |
|
"grad_norm": 0.3712248783220928, |
|
"kl": 0.01651611328125, |
|
"learning_rate": 2.8163965474930673e-06, |
|
"loss": 0.0588, |
|
"num_tokens": 11223503.0, |
|
"reward": 0.4390625, |
|
"reward_std": 0.30547982417047026, |
|
"rewards/accuracy_reward/mean": 0.4390625, |
|
"rewards/accuracy_reward/std": 0.4317807413637638, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.3046875, |
|
"completions/max_length": 1014.375, |
|
"completions/max_terminated_length": 813.85, |
|
"completions/mean_length": 662.615625, |
|
"completions/mean_terminated_length": 519.530436706543, |
|
"completions/min_length": 310.8, |
|
"completions/min_terminated_length": 310.8, |
|
"epoch": 0.256, |
|
"grad_norm": 0.24811201734525618, |
|
"kl": 0.019573974609375, |
|
"learning_rate": 2.7886563160629058e-06, |
|
"loss": 0.0372, |
|
"num_tokens": 11742665.0, |
|
"reward": 0.4453125, |
|
"reward_std": 0.23867536522448063, |
|
"rewards/accuracy_reward/mean": 0.4453125, |
|
"rewards/accuracy_reward/std": 0.417168840020895, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2359375, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 827.8, |
|
"completions/mean_length": 629.021875, |
|
"completions/mean_terminated_length": 511.2683464050293, |
|
"completions/min_length": 278.975, |
|
"completions/min_terminated_length": 278.975, |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 0.344688397601857, |
|
"kl": 0.02204742431640625, |
|
"learning_rate": 2.759122333096093e-06, |
|
"loss": 0.0357, |
|
"num_tokens": 12236375.0, |
|
"reward": 0.4375, |
|
"reward_std": 0.2976874437183142, |
|
"rewards/accuracy_reward/mean": 0.4375, |
|
"rewards/accuracy_reward/std": 0.40365294441580774, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2265625, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 866.45, |
|
"completions/mean_length": 639.3734375, |
|
"completions/mean_terminated_length": 530.9625923156739, |
|
"completions/min_length": 275.025, |
|
"completions/min_terminated_length": 275.025, |
|
"epoch": 0.2773333333333333, |
|
"grad_norm": 0.3373017645799004, |
|
"kl": 0.024224853515625, |
|
"learning_rate": 2.727835708566381e-06, |
|
"loss": 0.0511, |
|
"num_tokens": 12729606.0, |
|
"reward": 0.3921875, |
|
"reward_std": 0.2933184415102005, |
|
"rewards/accuracy_reward/mean": 0.3921875, |
|
"rewards/accuracy_reward/std": 0.3842721916735172, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2015625, |
|
"completions/max_length": 1013.65, |
|
"completions/max_terminated_length": 827.25, |
|
"completions/mean_length": 609.1671875, |
|
"completions/mean_terminated_length": 517.1776489257812, |
|
"completions/min_length": 257.25, |
|
"completions/min_terminated_length": 257.25, |
|
"epoch": 0.288, |
|
"grad_norm": 0.47904116965848426, |
|
"kl": 0.02641754150390625, |
|
"learning_rate": 2.6948399920456223e-06, |
|
"loss": 0.0707, |
|
"num_tokens": 13210177.0, |
|
"reward": 0.4015625, |
|
"reward_std": 0.29006172008812425, |
|
"rewards/accuracy_reward/mean": 0.4015625, |
|
"rewards/accuracy_reward/std": 0.3616370469331741, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2703125, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 813.775, |
|
"completions/mean_length": 630.115625, |
|
"completions/mean_terminated_length": 490.09764099121094, |
|
"completions/min_length": 231.425, |
|
"completions/min_terminated_length": 231.425, |
|
"epoch": 0.2986666666666667, |
|
"grad_norm": 0.4924178489635141, |
|
"kl": 0.0541534423828125, |
|
"learning_rate": 2.6601811120847203e-06, |
|
"loss": 0.1306, |
|
"num_tokens": 13698515.0, |
|
"reward": 0.3859375, |
|
"reward_std": 0.3275134950876236, |
|
"rewards/accuracy_reward/mean": 0.3859375, |
|
"rewards/accuracy_reward/std": 0.4226386234164238, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.3296875, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 825.4, |
|
"completions/mean_length": 655.7046875, |
|
"completions/mean_terminated_length": 479.4415023803711, |
|
"completions/min_length": 163.15, |
|
"completions/min_terminated_length": 163.15, |
|
"epoch": 0.30933333333333335, |
|
"grad_norm": 2.123406819046491, |
|
"kl": 0.1634521484375, |
|
"learning_rate": 2.6239073122831634e-06, |
|
"loss": 0.1303, |
|
"num_tokens": 14208934.0, |
|
"reward": 0.2640625, |
|
"reward_std": 0.31472352109849455, |
|
"rewards/accuracy_reward/mean": 0.2640625, |
|
"rewards/accuracy_reward/std": 0.38663889542222024, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.109375, |
|
"completions/max_length": 876.975, |
|
"completions/max_terminated_length": 738.675, |
|
"completions/mean_length": 424.2921875, |
|
"completions/mean_terminated_length": 355.7382415771484, |
|
"completions/min_length": 43.55, |
|
"completions/min_terminated_length": 43.55, |
|
"epoch": 0.32, |
|
"grad_norm": 1.0873205027125876, |
|
"kl": 0.9138916015625, |
|
"learning_rate": 2.586069084136102e-06, |
|
"loss": 0.013, |
|
"num_tokens": 14563945.0, |
|
"reward": 0.203125, |
|
"reward_std": 0.27037961296737195, |
|
"rewards/accuracy_reward/mean": 0.203125, |
|
"rewards/accuracy_reward/std": 0.35853438526391984, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.015625, |
|
"completions/max_length": 791.55, |
|
"completions/max_terminated_length": 750.375, |
|
"completions/mean_length": 370.10625, |
|
"completions/mean_terminated_length": 359.8678153991699, |
|
"completions/min_length": 44.675, |
|
"completions/min_terminated_length": 44.675, |
|
"epoch": 0.33066666666666666, |
|
"grad_norm": 3.705700983890623, |
|
"kl": 0.6580078125, |
|
"learning_rate": 2.5467190967524655e-06, |
|
"loss": -0.034, |
|
"num_tokens": 14891197.0, |
|
"reward": 0.315625, |
|
"reward_std": 0.3082596454769373, |
|
"rewards/accuracy_reward/mean": 0.315625, |
|
"rewards/accuracy_reward/std": 0.37598869428038595, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.0140625, |
|
"completions/max_length": 730.85, |
|
"completions/max_terminated_length": 714.8, |
|
"completions/mean_length": 380.8421875, |
|
"completions/mean_terminated_length": 372.76271896362306, |
|
"completions/min_length": 87.425, |
|
"completions/min_terminated_length": 87.425, |
|
"epoch": 0.3413333333333333, |
|
"grad_norm": 1.6894860441577937, |
|
"kl": 2.70595703125, |
|
"learning_rate": 2.5059121235419343e-06, |
|
"loss": 0.0725, |
|
"num_tokens": 15224776.0, |
|
"reward": 0.2046875, |
|
"reward_std": 0.27738223411142826, |
|
"rewards/accuracy_reward/mean": 0.2046875, |
|
"rewards/accuracy_reward/std": 0.3607805661857128, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1, |
|
"completions/max_length": 938.025, |
|
"completions/max_terminated_length": 841.95, |
|
"completions/mean_length": 528.2921875, |
|
"completions/mean_terminated_length": 475.2810600280762, |
|
"completions/min_length": 129.55, |
|
"completions/min_terminated_length": 129.55, |
|
"epoch": 0.352, |
|
"grad_norm": 1.528288841175706, |
|
"kl": 1.4619140625, |
|
"learning_rate": 2.4637049659728207e-06, |
|
"loss": 0.065, |
|
"num_tokens": 15654979.0, |
|
"reward": 0.1921875, |
|
"reward_std": 0.28522051945328714, |
|
"rewards/accuracy_reward/mean": 0.1921875, |
|
"rewards/accuracy_reward/std": 0.3629676692187786, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2359375, |
|
"completions/max_length": 1019.875, |
|
"completions/max_terminated_length": 885.0, |
|
"completions/mean_length": 598.4390625, |
|
"completions/mean_terminated_length": 470.77597427368164, |
|
"completions/min_length": 82.1, |
|
"completions/min_terminated_length": 82.1, |
|
"epoch": 0.3626666666666667, |
|
"grad_norm": 2.803962527847425, |
|
"kl": 3.966015625, |
|
"learning_rate": 2.4201563745069844e-06, |
|
"loss": 0.2138, |
|
"num_tokens": 16128140.0, |
|
"reward": 0.159375, |
|
"reward_std": 0.2523420058190823, |
|
"rewards/accuracy_reward/mean": 0.159375, |
|
"rewards/accuracy_reward/std": 0.29368442595005034, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.259375, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 878.375, |
|
"completions/mean_length": 601.2546875, |
|
"completions/mean_terminated_length": 457.18211555480957, |
|
"completions/min_length": 76.55, |
|
"completions/min_terminated_length": 76.55, |
|
"epoch": 0.37333333333333335, |
|
"grad_norm": 5.829506705682438, |
|
"kl": 2.53173828125, |
|
"learning_rate": 2.3753269668218372e-06, |
|
"loss": 0.1753, |
|
"num_tokens": 16595127.0, |
|
"reward": 0.2046875, |
|
"reward_std": 0.3113111235201359, |
|
"rewards/accuracy_reward/mean": 0.2046875, |
|
"rewards/accuracy_reward/std": 0.36455870419740677, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.259375, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 896.85, |
|
"completions/mean_length": 621.9390625, |
|
"completions/mean_terminated_length": 485.3581092834473, |
|
"completions/min_length": 118.325, |
|
"completions/min_terminated_length": 118.325, |
|
"epoch": 0.384, |
|
"grad_norm": 1.5424634966107815, |
|
"kl": 3.1536865234375, |
|
"learning_rate": 2.3292791434332643e-06, |
|
"loss": 0.2008, |
|
"num_tokens": 17087648.0, |
|
"reward": 0.228125, |
|
"reward_std": 0.2769108280539513, |
|
"rewards/accuracy_reward/mean": 0.228125, |
|
"rewards/accuracy_reward/std": 0.36085558757185937, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2984375, |
|
"completions/max_length": 1019.65, |
|
"completions/max_terminated_length": 893.25, |
|
"completions/mean_length": 664.053125, |
|
"completions/mean_terminated_length": 521.8890960693359, |
|
"completions/min_length": 193.275, |
|
"completions/min_terminated_length": 193.275, |
|
"epoch": 0.39466666666666667, |
|
"grad_norm": 3.157457940002755, |
|
"kl": 2.6540283203125, |
|
"learning_rate": 2.2820770008369208e-06, |
|
"loss": 0.2153, |
|
"num_tokens": 17601938.0, |
|
"reward": 0.2890625, |
|
"reward_std": 0.2875839114189148, |
|
"rewards/accuracy_reward/mean": 0.2890625, |
|
"rewards/accuracy_reward/std": 0.37700623869895933, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.303125, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 877.9, |
|
"completions/mean_length": 662.246875, |
|
"completions/mean_terminated_length": 510.23935470581057, |
|
"completions/min_length": 191.375, |
|
"completions/min_terminated_length": 191.375, |
|
"epoch": 0.4053333333333333, |
|
"grad_norm": 5.343900565343268, |
|
"kl": 3.46298828125, |
|
"learning_rate": 2.233786242288795e-06, |
|
"loss": 0.2422, |
|
"num_tokens": 18114840.0, |
|
"reward": 0.259375, |
|
"reward_std": 0.3033699918538332, |
|
"rewards/accuracy_reward/mean": 0.259375, |
|
"rewards/accuracy_reward/std": 0.3962884694337845, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2765625, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 803.6, |
|
"completions/mean_length": 610.2328125, |
|
"completions/mean_terminated_length": 449.8131446838379, |
|
"completions/min_length": 150.925, |
|
"completions/min_terminated_length": 150.925, |
|
"epoch": 0.416, |
|
"grad_norm": 1.4123262482389565, |
|
"kl": 4.473828125, |
|
"learning_rate": 2.184474086349237e-06, |
|
"loss": 0.3007, |
|
"num_tokens": 18590405.0, |
|
"reward": 0.2953125, |
|
"reward_std": 0.329832362011075, |
|
"rewards/accuracy_reward/mean": 0.2953125, |
|
"rewards/accuracy_reward/std": 0.3952501378953457, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 1.335408537990007, |
|
"learning_rate": 2.1342091733177504e-06, |
|
"loss": 0.1653, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4266666666666667, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completions/clipped_ratio": 0.249725, |
|
"eval_completions/max_length": 1020.764, |
|
"eval_completions/max_terminated_length": 864.5468, |
|
"eval_completions/mean_length": 603.358575, |
|
"eval_completions/mean_terminated_length": 465.70587670288086, |
|
"eval_completions/min_length": 101.0624, |
|
"eval_completions/min_terminated_length": 101.0624, |
|
"eval_kl": 4.10953203125, |
|
"eval_loss": 0.22873078286647797, |
|
"eval_num_tokens": 19080613.0, |
|
"eval_reward": 0.181575, |
|
"eval_reward_std": 0.25937469556927684, |
|
"eval_rewards/accuracy_reward/mean": 0.181575, |
|
"eval_rewards/accuracy_reward/std": 0.3261967261195183, |
|
"eval_rewards/format_reward/mean": 0.0, |
|
"eval_rewards/format_reward/std": 0.0, |
|
"eval_runtime": 17076.072, |
|
"eval_samples_per_second": 0.293, |
|
"eval_steps_per_second": 0.018, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.26640625, |
|
"completions/max_length": 1023.3875, |
|
"completions/max_terminated_length": 867.1625, |
|
"completions/mean_length": 619.5703125, |
|
"completions/mean_terminated_length": 475.73999614715575, |
|
"completions/min_length": 108.0875, |
|
"completions/min_terminated_length": 108.0875, |
|
"epoch": 0.43733333333333335, |
|
"grad_norm": 1.9587392151855365, |
|
"kl": 2.8674636840820313, |
|
"learning_rate": 2.0830614696887893e-06, |
|
"loss": 0.1883, |
|
"num_tokens": 19562599.0, |
|
"reward": 0.19609375, |
|
"reward_std": 0.26347289625555276, |
|
"rewards/accuracy_reward/mean": 0.19609375, |
|
"rewards/accuracy_reward/std": 0.34130338206887245, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.225, |
|
"completions/max_length": 1015.2, |
|
"completions/max_terminated_length": 865.525, |
|
"completions/mean_length": 601.5953125, |
|
"completions/mean_terminated_length": 481.98564224243165, |
|
"completions/min_length": 99.225, |
|
"completions/min_terminated_length": 99.225, |
|
"epoch": 0.448, |
|
"grad_norm": 2.694092158079111, |
|
"kl": 2.51298828125, |
|
"learning_rate": 2.0311021707615474e-06, |
|
"loss": 0.1151, |
|
"num_tokens": 20033692.0, |
|
"reward": 0.2125, |
|
"reward_std": 0.2790681302547455, |
|
"rewards/accuracy_reward/mean": 0.2125, |
|
"rewards/accuracy_reward/std": 0.3386077769100666, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.2375, |
|
"completions/max_length": 1010.2, |
|
"completions/max_terminated_length": 868.775, |
|
"completions/mean_length": 599.2375, |
|
"completions/mean_terminated_length": 471.63191413879395, |
|
"completions/min_length": 124.35, |
|
"completions/min_terminated_length": 124.35, |
|
"epoch": 0.45866666666666667, |
|
"grad_norm": 1.5861291482690305, |
|
"kl": 3.3865478515625, |
|
"learning_rate": 1.9784036015393108e-06, |
|
"loss": 0.221, |
|
"num_tokens": 20505108.0, |
|
"reward": 0.203125, |
|
"reward_std": 0.26243593245744706, |
|
"rewards/accuracy_reward/mean": 0.203125, |
|
"rewards/accuracy_reward/std": 0.3315103515982628, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 1001.85, |
|
"completions/max_terminated_length": 826.225, |
|
"completions/mean_length": 562.4015625, |
|
"completions/mean_terminated_length": 460.1457885742187, |
|
"completions/min_length": 126.6, |
|
"completions/min_terminated_length": 126.6, |
|
"epoch": 0.4693333333333333, |
|
"grad_norm": 1.7695739456560913, |
|
"kl": 2.744598388671875, |
|
"learning_rate": 1.9250391160563114e-06, |
|
"loss": 0.1631, |
|
"num_tokens": 20947373.0, |
|
"reward": 0.315625, |
|
"reward_std": 0.3029405642300844, |
|
"rewards/accuracy_reward/mean": 0.315625, |
|
"rewards/accuracy_reward/std": 0.3904874354600906, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1828125, |
|
"completions/max_length": 1019.125, |
|
"completions/max_terminated_length": 890.5, |
|
"completions/mean_length": 564.0375, |
|
"completions/mean_terminated_length": 460.37510833740237, |
|
"completions/min_length": 125.95, |
|
"completions/min_terminated_length": 125.95, |
|
"epoch": 0.48, |
|
"grad_norm": 3.4078593954773204, |
|
"kl": 3.49443359375, |
|
"learning_rate": 1.8710829952722163e-06, |
|
"loss": 0.2053, |
|
"num_tokens": 21400653.0, |
|
"reward": 0.3140625, |
|
"reward_std": 0.32277651987969874, |
|
"rewards/accuracy_reward/mean": 0.3140625, |
|
"rewards/accuracy_reward/std": 0.4073892831802368, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1421875, |
|
"completions/max_length": 984.2, |
|
"completions/max_terminated_length": 844.9, |
|
"completions/mean_length": 538.209375, |
|
"completions/mean_terminated_length": 461.8832672119141, |
|
"completions/min_length": 74.375, |
|
"completions/min_terminated_length": 74.375, |
|
"epoch": 0.49066666666666664, |
|
"grad_norm": 4.5323959558225395, |
|
"kl": 2.24453125, |
|
"learning_rate": 1.8166103436763777e-06, |
|
"loss": 0.1171, |
|
"num_tokens": 21837691.0, |
|
"reward": 0.3359375, |
|
"reward_std": 0.3033065766096115, |
|
"rewards/accuracy_reward/mean": 0.3359375, |
|
"rewards/accuracy_reward/std": 0.42303009480237963, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1921875, |
|
"completions/max_length": 1003.35, |
|
"completions/max_terminated_length": 889.725, |
|
"completions/mean_length": 567.85625, |
|
"completions/mean_terminated_length": 464.1244514465332, |
|
"completions/min_length": 105.775, |
|
"completions/min_terminated_length": 105.775, |
|
"epoch": 0.5013333333333333, |
|
"grad_norm": 5.127794348866437, |
|
"kl": 2.8025146484375, |
|
"learning_rate": 1.7616969847457666e-06, |
|
"loss": 0.1524, |
|
"num_tokens": 22288055.0, |
|
"reward": 0.2484375, |
|
"reward_std": 0.265233264490962, |
|
"rewards/accuracy_reward/mean": 0.2484375, |
|
"rewards/accuracy_reward/std": 0.3456104569137096, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1625, |
|
"completions/max_length": 1004.375, |
|
"completions/max_terminated_length": 885.5, |
|
"completions/mean_length": 578.2203125, |
|
"completions/mean_terminated_length": 495.75377731323243, |
|
"completions/min_length": 116.125, |
|
"completions/min_terminated_length": 116.125, |
|
"epoch": 0.512, |
|
"grad_norm": 2.2253978317000347, |
|
"kl": 2.90908203125, |
|
"learning_rate": 1.7064193554021108e-06, |
|
"loss": 0.138, |
|
"num_tokens": 22747892.0, |
|
"reward": 0.25, |
|
"reward_std": 0.3034702904522419, |
|
"rewards/accuracy_reward/mean": 0.25, |
|
"rewards/accuracy_reward/std": 0.39005310013890265, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1578125, |
|
"completions/max_length": 1012.0, |
|
"completions/max_terminated_length": 860.35, |
|
"completions/mean_length": 565.81875, |
|
"completions/mean_terminated_length": 479.0156246185303, |
|
"completions/min_length": 136.125, |
|
"completions/min_terminated_length": 136.125, |
|
"epoch": 0.5226666666666666, |
|
"grad_norm": 1.4256050986060915, |
|
"kl": 3.39208984375, |
|
"learning_rate": 1.650854399615142e-06, |
|
"loss": 0.2056, |
|
"num_tokens": 23206816.0, |
|
"reward": 0.2375, |
|
"reward_std": 0.2754390574991703, |
|
"rewards/accuracy_reward/mean": 0.2375, |
|
"rewards/accuracy_reward/std": 0.3435414247214794, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1953125, |
|
"completions/max_length": 1013.175, |
|
"completions/max_terminated_length": 904.125, |
|
"completions/mean_length": 600.421875, |
|
"completions/mean_terminated_length": 501.47122344970705, |
|
"completions/min_length": 146.425, |
|
"completions/min_terminated_length": 146.425, |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 8.99884757175663, |
|
"kl": 3.725, |
|
"learning_rate": 1.5950794613000567e-06, |
|
"loss": 0.2221, |
|
"num_tokens": 23687462.0, |
|
"reward": 0.1796875, |
|
"reward_std": 0.24051060006022454, |
|
"rewards/accuracy_reward/mean": 0.1796875, |
|
"rewards/accuracy_reward/std": 0.2946265310049057, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1890625, |
|
"completions/max_length": 1019.725, |
|
"completions/max_terminated_length": 886.175, |
|
"completions/mean_length": 576.121875, |
|
"completions/mean_terminated_length": 469.11180267333987, |
|
"completions/min_length": 121.925, |
|
"completions/min_terminated_length": 121.925, |
|
"epoch": 0.544, |
|
"grad_norm": 2.729106221857157, |
|
"kl": 3.590283203125, |
|
"learning_rate": 1.539172176658271e-06, |
|
"loss": 0.1995, |
|
"num_tokens": 24149060.0, |
|
"reward": 0.1828125, |
|
"reward_std": 0.25070299617946146, |
|
"rewards/accuracy_reward/mean": 0.1828125, |
|
"rewards/accuracy_reward/std": 0.3266736909747124, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1359375, |
|
"completions/max_length": 993.8, |
|
"completions/max_terminated_length": 849.3, |
|
"completions/mean_length": 526.68125, |
|
"completions/mean_terminated_length": 446.4154727935791, |
|
"completions/min_length": 120.825, |
|
"completions/min_terminated_length": 120.825, |
|
"epoch": 0.5546666666666666, |
|
"grad_norm": 1.4990062784258935, |
|
"kl": 2.2947998046875, |
|
"learning_rate": 1.4832103661113236e-06, |
|
"loss": 0.1452, |
|
"num_tokens": 24577128.0, |
|
"reward": 0.2078125, |
|
"reward_std": 0.27170360907912255, |
|
"rewards/accuracy_reward/mean": 0.2078125, |
|
"rewards/accuracy_reward/std": 0.35073904544115064, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1390625, |
|
"completions/max_length": 1008.375, |
|
"completions/max_terminated_length": 875.0, |
|
"completions/mean_length": 541.9234375, |
|
"completions/mean_terminated_length": 467.3186508178711, |
|
"completions/min_length": 130.6, |
|
"completions/min_terminated_length": 130.6, |
|
"epoch": 0.5653333333333334, |
|
"grad_norm": 9.148886998248635, |
|
"kl": 4.06611328125, |
|
"learning_rate": 1.4272719259783586e-06, |
|
"loss": 0.2278, |
|
"num_tokens": 25023999.0, |
|
"reward": 0.221875, |
|
"reward_std": 0.2942699518054724, |
|
"rewards/accuracy_reward/mean": 0.221875, |
|
"rewards/accuracy_reward/std": 0.3605892524123192, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.128125, |
|
"completions/max_length": 978.775, |
|
"completions/max_terminated_length": 867.6, |
|
"completions/mean_length": 528.0546875, |
|
"completions/mean_terminated_length": 461.0882026672363, |
|
"completions/min_length": 116.375, |
|
"completions/min_terminated_length": 116.375, |
|
"epoch": 0.576, |
|
"grad_norm": 1.904370064809824, |
|
"kl": 4.56875, |
|
"learning_rate": 1.3714347200479572e-06, |
|
"loss": 0.216, |
|
"num_tokens": 25454978.0, |
|
"reward": 0.23125, |
|
"reward_std": 0.3002675145864487, |
|
"rewards/accuracy_reward/mean": 0.23125, |
|
"rewards/accuracy_reward/std": 0.37225412726402285, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1484375, |
|
"completions/max_length": 1009.9, |
|
"completions/max_terminated_length": 880.575, |
|
"completions/mean_length": 583.6515625, |
|
"completions/mean_terminated_length": 508.4187866210938, |
|
"completions/min_length": 192.575, |
|
"completions/min_terminated_length": 192.575, |
|
"epoch": 0.5866666666666667, |
|
"grad_norm": 2.863201246372455, |
|
"kl": 1.67100830078125, |
|
"learning_rate": 1.3157764711952488e-06, |
|
"loss": 0.1031, |
|
"num_tokens": 25916763.0, |
|
"reward": 0.296875, |
|
"reward_std": 0.2727500643581152, |
|
"rewards/accuracy_reward/mean": 0.296875, |
|
"rewards/accuracy_reward/std": 0.36488829776644705, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.159375, |
|
"completions/max_length": 1002.75, |
|
"completions/max_terminated_length": 857.4, |
|
"completions/mean_length": 561.0359375, |
|
"completions/mean_terminated_length": 478.21215591430666, |
|
"completions/min_length": 133.1, |
|
"completions/min_terminated_length": 133.1, |
|
"epoch": 0.5973333333333334, |
|
"grad_norm": 6.145508296791858, |
|
"kl": 2.973114013671875, |
|
"learning_rate": 1.2603746531951753e-06, |
|
"loss": 0.1455, |
|
"num_tokens": 26363146.0, |
|
"reward": 0.240625, |
|
"reward_std": 0.27465322241187096, |
|
"rewards/accuracy_reward/mean": 0.240625, |
|
"rewards/accuracy_reward/std": 0.3454093523323536, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.175, |
|
"completions/max_length": 1017.1, |
|
"completions/max_terminated_length": 848.325, |
|
"completions/mean_length": 533.7046875, |
|
"completions/mean_terminated_length": 434.07742347717283, |
|
"completions/min_length": 123.6, |
|
"completions/min_terminated_length": 123.6, |
|
"epoch": 0.608, |
|
"grad_norm": 7.437800094717786, |
|
"kl": 5.8435546875, |
|
"learning_rate": 1.2053063828824825e-06, |
|
"loss": 0.2914, |
|
"num_tokens": 26800613.0, |
|
"reward": 0.1484375, |
|
"reward_std": 0.22240859605371952, |
|
"rewards/accuracy_reward/mean": 0.1484375, |
|
"rewards/accuracy_reward/std": 0.3025800295174122, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1546875, |
|
"completions/max_length": 1008.075, |
|
"completions/max_terminated_length": 873.875, |
|
"completions/mean_length": 522.96875, |
|
"completions/mean_terminated_length": 434.07748947143557, |
|
"completions/min_length": 101.3, |
|
"completions/min_terminated_length": 101.3, |
|
"epoch": 0.6186666666666667, |
|
"grad_norm": 4.1540581465241475, |
|
"kl": 2.5693359375, |
|
"learning_rate": 1.1506483128085632e-06, |
|
"loss": 0.1233, |
|
"num_tokens": 27224441.0, |
|
"reward": 0.1328125, |
|
"reward_std": 0.20410939157009125, |
|
"rewards/accuracy_reward/mean": 0.1328125, |
|
"rewards/accuracy_reward/std": 0.27071708589792254, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.134375, |
|
"completions/max_length": 987.3, |
|
"completions/max_terminated_length": 832.7, |
|
"completions/mean_length": 508.940625, |
|
"completions/mean_terminated_length": 430.16086196899414, |
|
"completions/min_length": 114.6, |
|
"completions/min_terminated_length": 114.6, |
|
"epoch": 0.6293333333333333, |
|
"grad_norm": 3.767225289300394, |
|
"kl": 1.0184326171875, |
|
"learning_rate": 1.0964765245445572e-06, |
|
"loss": 0.0948, |
|
"num_tokens": 27641059.0, |
|
"reward": 0.1984375, |
|
"reward_std": 0.27595735117793085, |
|
"rewards/accuracy_reward/mean": 0.1984375, |
|
"rewards/accuracy_reward/std": 0.3417211093008518, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.6353961367471301, |
|
"learning_rate": 1.042866422779233e-06, |
|
"loss": 0.1214, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completions/clipped_ratio": 0.1834, |
|
"eval_completions/max_length": 1016.308, |
|
"eval_completions/max_terminated_length": 858.028, |
|
"eval_completions/mean_length": 556.6972, |
|
"eval_completions/mean_terminated_length": 453.3012262512207, |
|
"eval_completions/min_length": 128.7608, |
|
"eval_completions/min_terminated_length": 128.7608, |
|
"eval_kl": 3.4017, |
|
"eval_loss": 0.18172034621238708, |
|
"eval_num_tokens": 28075018.0, |
|
"eval_reward": 0.144625, |
|
"eval_reward_std": 0.2177023424565792, |
|
"eval_rewards/accuracy_reward/mean": 0.144625, |
|
"eval_rewards/accuracy_reward/std": 0.28106210635900497, |
|
"eval_rewards/format_reward/mean": 0.0, |
|
"eval_rewards/format_reward/std": 0.0, |
|
"eval_runtime": 16581.1407, |
|
"eval_samples_per_second": 0.302, |
|
"eval_steps_per_second": 0.019, |
|
"step": 300 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.18046875, |
|
"completions/max_length": 1021.1625, |
|
"completions/max_terminated_length": 858.15, |
|
"completions/mean_length": 553.5671875, |
|
"completions/mean_terminated_length": 452.57582130432127, |
|
"completions/min_length": 137.275, |
|
"completions/min_terminated_length": 137.275, |
|
"epoch": 0.6506666666666666, |
|
"grad_norm": 1.5919395722688163, |
|
"kl": 2.91702880859375, |
|
"learning_rate": 9.898926303590562e-07, |
|
"loss": 0.2225, |
|
"num_tokens": 28530073.0, |
|
"reward": 0.17890625, |
|
"reward_std": 0.25828637965023515, |
|
"rewards/accuracy_reward/mean": 0.17890625, |
|
"rewards/accuracy_reward/std": 0.32390750013291836, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.15625, |
|
"completions/max_length": 1006.85, |
|
"completions/max_terminated_length": 822.9, |
|
"completions/mean_length": 533.6984375, |
|
"completions/mean_terminated_length": 447.3911735534668, |
|
"completions/min_length": 134.1, |
|
"completions/min_terminated_length": 134.1, |
|
"epoch": 0.6613333333333333, |
|
"grad_norm": 2.975928575760629, |
|
"kl": 2.889697265625, |
|
"learning_rate": 9.376288844165526e-07, |
|
"loss": 0.1673, |
|
"num_tokens": 28955912.0, |
|
"reward": 0.2828125, |
|
"reward_std": 0.3022126518189907, |
|
"rewards/accuracy_reward/mean": 0.2828125, |
|
"rewards/accuracy_reward/std": 0.3880471274256706, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 1015.55, |
|
"completions/max_terminated_length": 834.25, |
|
"completions/mean_length": 571.515625, |
|
"completions/mean_terminated_length": 467.75330581665037, |
|
"completions/min_length": 144.75, |
|
"completions/min_terminated_length": 144.75, |
|
"epoch": 0.672, |
|
"grad_norm": 6.74008756789244, |
|
"kl": 3.23857421875, |
|
"learning_rate": 8.861479337315364e-07, |
|
"loss": 0.1582, |
|
"num_tokens": 29408978.0, |
|
"reward": 0.2046875, |
|
"reward_std": 0.24277526065707206, |
|
"rewards/accuracy_reward/mean": 0.2046875, |
|
"rewards/accuracy_reward/std": 0.32984848618507384, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 315 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1640625, |
|
"completions/max_length": 998.125, |
|
"completions/max_terminated_length": 869.1, |
|
"completions/mean_length": 573.5921875, |
|
"completions/mean_terminated_length": 491.0239372253418, |
|
"completions/min_length": 142.975, |
|
"completions/min_terminated_length": 142.975, |
|
"epoch": 0.6826666666666666, |
|
"grad_norm": 1.9324158149075346, |
|
"kl": 2.8765625, |
|
"learning_rate": 8.355214374680916e-07, |
|
"loss": 0.148, |
|
"num_tokens": 29870653.0, |
|
"reward": 0.1828125, |
|
"reward_std": 0.2596026983112097, |
|
"rewards/accuracy_reward/mean": 0.1828125, |
|
"rewards/accuracy_reward/std": 0.3082869052886963, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.175, |
|
"completions/max_length": 1012.875, |
|
"completions/max_terminated_length": 871.675, |
|
"completions/mean_length": 568.49375, |
|
"completions/mean_terminated_length": 474.15330467224123, |
|
"completions/min_length": 137.95, |
|
"completions/min_terminated_length": 137.95, |
|
"epoch": 0.6933333333333334, |
|
"grad_norm": 2.9206893390038653, |
|
"kl": 2.656689453125, |
|
"learning_rate": 7.858198654282411e-07, |
|
"loss": 0.1795, |
|
"num_tokens": 30318865.0, |
|
"reward": 0.2359375, |
|
"reward_std": 0.28795090727508066, |
|
"rewards/accuracy_reward/mean": 0.2359375, |
|
"rewards/accuracy_reward/std": 0.37537991255521774, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1640625, |
|
"completions/max_length": 1020.825, |
|
"completions/max_terminated_length": 861.275, |
|
"completions/mean_length": 556.4671875, |
|
"completions/mean_terminated_length": 466.5027053833008, |
|
"completions/min_length": 126.9, |
|
"completions/min_terminated_length": 126.9, |
|
"epoch": 0.704, |
|
"grad_norm": 1.2056546338068763, |
|
"kl": 3.62041015625, |
|
"learning_rate": 7.371123999611557e-07, |
|
"loss": 0.2013, |
|
"num_tokens": 30762364.0, |
|
"reward": 0.23125, |
|
"reward_std": 0.30185016691684724, |
|
"rewards/accuracy_reward/mean": 0.23125, |
|
"rewards/accuracy_reward/std": 0.3574558347463608, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.184375, |
|
"completions/max_length": 1005.925, |
|
"completions/max_terminated_length": 822.675, |
|
"completions/mean_length": 565.3328125, |
|
"completions/mean_terminated_length": 462.0029693603516, |
|
"completions/min_length": 116.5, |
|
"completions/min_terminated_length": 116.5, |
|
"epoch": 0.7146666666666667, |
|
"grad_norm": 1.3742588150692108, |
|
"kl": 3.824609375, |
|
"learning_rate": 6.894668396644439e-07, |
|
"loss": 0.1776, |
|
"num_tokens": 31211257.0, |
|
"reward": 0.215625, |
|
"reward_std": 0.2738583661615849, |
|
"rewards/accuracy_reward/mean": 0.215625, |
|
"rewards/accuracy_reward/std": 0.3528005562722683, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1828125, |
|
"completions/max_length": 1019.475, |
|
"completions/max_terminated_length": 851.825, |
|
"completions/mean_length": 577.940625, |
|
"completions/mean_terminated_length": 479.3160934448242, |
|
"completions/min_length": 123.675, |
|
"completions/min_terminated_length": 123.675, |
|
"epoch": 0.7253333333333334, |
|
"grad_norm": 1.4758651208465319, |
|
"kl": 2.486328125, |
|
"learning_rate": 6.429495050115576e-07, |
|
"loss": 0.1256, |
|
"num_tokens": 31673435.0, |
|
"reward": 0.2078125, |
|
"reward_std": 0.2131578464061022, |
|
"rewards/accuracy_reward/mean": 0.2078125, |
|
"rewards/accuracy_reward/std": 0.32094074562191965, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1515625, |
|
"completions/max_length": 1000.475, |
|
"completions/max_terminated_length": 834.775, |
|
"completions/mean_length": 517.709375, |
|
"completions/mean_terminated_length": 429.16680297851565, |
|
"completions/min_length": 92.775, |
|
"completions/min_terminated_length": 92.775, |
|
"epoch": 0.736, |
|
"grad_norm": 2.818679663631946, |
|
"kl": 1.72802734375, |
|
"learning_rate": 5.976251460366778e-07, |
|
"loss": 0.1104, |
|
"num_tokens": 32104025.0, |
|
"reward": 0.1953125, |
|
"reward_std": 0.24928856678307057, |
|
"rewards/accuracy_reward/mean": 0.1953125, |
|
"rewards/accuracy_reward/std": 0.30930979251861573, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.140625, |
|
"completions/max_length": 1007.825, |
|
"completions/max_terminated_length": 848.925, |
|
"completions/mean_length": 525.865625, |
|
"completions/mean_terminated_length": 444.4057159423828, |
|
"completions/min_length": 111.3, |
|
"completions/min_terminated_length": 111.3, |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 2.579459444619993, |
|
"kl": 2.3644287109375, |
|
"learning_rate": 5.535568522055892e-07, |
|
"loss": 0.15, |
|
"num_tokens": 32533547.0, |
|
"reward": 0.2140625, |
|
"reward_std": 0.26049785539507864, |
|
"rewards/accuracy_reward/mean": 0.2140625, |
|
"rewards/accuracy_reward/std": 0.3133357249200344, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1640625, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 822.4, |
|
"completions/mean_length": 535.765625, |
|
"completions/mean_terminated_length": 441.6015251159668, |
|
"completions/min_length": 126.925, |
|
"completions/min_terminated_length": 126.925, |
|
"epoch": 0.7573333333333333, |
|
"grad_norm": 1.9814797295208133, |
|
"kl": 3.35029296875, |
|
"learning_rate": 5.108059645979812e-07, |
|
"loss": 0.1853, |
|
"num_tokens": 32969357.0, |
|
"reward": 0.21875, |
|
"reward_std": 0.28468980863690374, |
|
"rewards/accuracy_reward/mean": 0.21875, |
|
"rewards/accuracy_reward/std": 0.35949567407369615, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 355 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1578125, |
|
"completions/max_length": 972.05, |
|
"completions/max_terminated_length": 812.65, |
|
"completions/mean_length": 522.775, |
|
"completions/mean_terminated_length": 431.6031944274902, |
|
"completions/min_length": 99.15, |
|
"completions/min_terminated_length": 99.15, |
|
"epoch": 0.768, |
|
"grad_norm": 1.213077412434873, |
|
"kl": 2.9021240234375, |
|
"learning_rate": 4.694319905234322e-07, |
|
"loss": 0.1545, |
|
"num_tokens": 33399765.0, |
|
"reward": 0.2203125, |
|
"reward_std": 0.26060167923569677, |
|
"rewards/accuracy_reward/mean": 0.2203125, |
|
"rewards/accuracy_reward/std": 0.33386615812778475, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.171875, |
|
"completions/max_length": 1012.75, |
|
"completions/max_terminated_length": 848.125, |
|
"completions/mean_length": 540.746875, |
|
"completions/mean_terminated_length": 442.72081604003904, |
|
"completions/min_length": 106.125, |
|
"completions/min_terminated_length": 106.125, |
|
"epoch": 0.7786666666666666, |
|
"grad_norm": 2.7577857686089398, |
|
"kl": 3.180859375, |
|
"learning_rate": 4.2949252068991326e-07, |
|
"loss": 0.1558, |
|
"num_tokens": 33836803.0, |
|
"reward": 0.1703125, |
|
"reward_std": 0.24860655926167965, |
|
"rewards/accuracy_reward/mean": 0.1703125, |
|
"rewards/accuracy_reward/std": 0.3174226224422455, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 365 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1390625, |
|
"completions/max_length": 1014.45, |
|
"completions/max_terminated_length": 857.475, |
|
"completions/mean_length": 531.0265625, |
|
"completions/mean_terminated_length": 451.6034767150879, |
|
"completions/min_length": 87.1, |
|
"completions/min_terminated_length": 87.1, |
|
"epoch": 0.7893333333333333, |
|
"grad_norm": 1.9163359591102496, |
|
"kl": 2.898291015625, |
|
"learning_rate": 3.91043149040118e-07, |
|
"loss": 0.1431, |
|
"num_tokens": 34267244.0, |
|
"reward": 0.15625, |
|
"reward_std": 0.22862185277044772, |
|
"rewards/accuracy_reward/mean": 0.15625, |
|
"rewards/accuracy_reward/std": 0.3019771084189415, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1375, |
|
"completions/max_length": 999.2, |
|
"completions/max_terminated_length": 855.575, |
|
"completions/mean_length": 511.95, |
|
"completions/mean_terminated_length": 432.5935943603516, |
|
"completions/min_length": 115.575, |
|
"completions/min_terminated_length": 115.575, |
|
"epoch": 0.8, |
|
"grad_norm": 7.856282117594133, |
|
"kl": 2.960986328125, |
|
"learning_rate": 3.541373953671986e-07, |
|
"loss": 0.1759, |
|
"num_tokens": 34681772.0, |
|
"reward": 0.2546875, |
|
"reward_std": 0.2616026271134615, |
|
"rewards/accuracy_reward/mean": 0.2546875, |
|
"rewards/accuracy_reward/std": 0.32934130281209945, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1234375, |
|
"completions/max_length": 999.7, |
|
"completions/max_terminated_length": 814.5, |
|
"completions/mean_length": 519.0625, |
|
"completions/mean_terminated_length": 451.72740325927737, |
|
"completions/min_length": 147.0, |
|
"completions/min_terminated_length": 147.0, |
|
"epoch": 0.8106666666666666, |
|
"grad_norm": 1.5846595981949476, |
|
"kl": 1.841064453125, |
|
"learning_rate": 3.18826630817626e-07, |
|
"loss": 0.123, |
|
"num_tokens": 35113548.0, |
|
"reward": 0.3171875, |
|
"reward_std": 0.3077378235757351, |
|
"rewards/accuracy_reward/mean": 0.3171875, |
|
"rewards/accuracy_reward/std": 0.39782319590449333, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.165625, |
|
"completions/max_length": 984.15, |
|
"completions/max_terminated_length": 822.15, |
|
"completions/mean_length": 540.75625, |
|
"completions/mean_terminated_length": 446.8059135437012, |
|
"completions/min_length": 113.4, |
|
"completions/min_terminated_length": 113.4, |
|
"epoch": 0.8213333333333334, |
|
"grad_norm": 2.121434797315098, |
|
"kl": 2.864453125, |
|
"learning_rate": 2.851600063848651e-07, |
|
"loss": 0.1622, |
|
"num_tokens": 35554176.0, |
|
"reward": 0.2546875, |
|
"reward_std": 0.2852169893682003, |
|
"rewards/accuracy_reward/mean": 0.25625, |
|
"rewards/accuracy_reward/std": 0.36226451173424723, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.125, |
|
"completions/max_length": 994.7, |
|
"completions/max_terminated_length": 881.975, |
|
"completions/mean_length": 542.5953125, |
|
"completions/mean_terminated_length": 474.35446014404295, |
|
"completions/min_length": 147.075, |
|
"completions/min_terminated_length": 147.075, |
|
"epoch": 0.832, |
|
"grad_norm": 2.11442210090208, |
|
"kl": 3.627099609375, |
|
"learning_rate": 2.5318438449341047e-07, |
|
"loss": 0.1706, |
|
"num_tokens": 35991597.0, |
|
"reward": 0.2, |
|
"reward_std": 0.26254682019352915, |
|
"rewards/accuracy_reward/mean": 0.2, |
|
"rewards/accuracy_reward/std": 0.3453776828944683, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.18125, |
|
"completions/max_length": 1022.825, |
|
"completions/max_terminated_length": 830.85, |
|
"completions/mean_length": 552.690625, |
|
"completions/mean_terminated_length": 449.2019546508789, |
|
"completions/min_length": 128.5, |
|
"completions/min_terminated_length": 128.5, |
|
"epoch": 0.8426666666666667, |
|
"grad_norm": 1.5077720298338484, |
|
"kl": 3.9841552734375, |
|
"learning_rate": 2.2294427376840055e-07, |
|
"loss": 0.212, |
|
"num_tokens": 36435279.0, |
|
"reward": 0.2421875, |
|
"reward_std": 0.264066056907177, |
|
"rewards/accuracy_reward/mean": 0.2421875, |
|
"rewards/accuracy_reward/std": 0.3508764863014221, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 1.2192246871952803, |
|
"learning_rate": 1.9448176708161513e-07, |
|
"loss": 0.2023, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8533333333333334, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completions/clipped_ratio": 0.150125, |
|
"eval_completions/max_length": 1005.3156, |
|
"eval_completions/max_terminated_length": 839.3408, |
|
"eval_completions/mean_length": 531.305675, |
|
"eval_completions/mean_terminated_length": 446.57037117919924, |
|
"eval_completions/min_length": 132.6728, |
|
"eval_completions/min_terminated_length": 132.6728, |
|
"eval_kl": 3.71179091796875, |
|
"eval_loss": 0.19600652158260345, |
|
"eval_num_tokens": 36868550.0, |
|
"eval_reward": 0.20215, |
|
"eval_reward_std": 0.25261726505756377, |
|
"eval_rewards/accuracy_reward/mean": 0.20215, |
|
"eval_rewards/accuracy_reward/std": 0.3302568965673447, |
|
"eval_rewards/format_reward/mean": 0.0, |
|
"eval_rewards/format_reward/std": 0.0, |
|
"eval_runtime": 17634.5575, |
|
"eval_samples_per_second": 0.284, |
|
"eval_steps_per_second": 0.018, |
|
"step": 400 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.15, |
|
"completions/max_length": 1005.6875, |
|
"completions/max_terminated_length": 833.475, |
|
"completions/mean_length": 535.62890625, |
|
"completions/mean_terminated_length": 452.4154224395752, |
|
"completions/min_length": 145.45, |
|
"completions/min_terminated_length": 145.45, |
|
"epoch": 0.864, |
|
"grad_norm": 4.507654850853612, |
|
"kl": 3.559375, |
|
"learning_rate": 1.6783648296009728e-07, |
|
"loss": 0.1897, |
|
"num_tokens": 37296524.0, |
|
"reward": 0.23984375, |
|
"reward_std": 0.28821724094450474, |
|
"rewards/accuracy_reward/mean": 0.23984375, |
|
"rewards/accuracy_reward/std": 0.36311594024300575, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 405 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1515625, |
|
"completions/max_length": 1024.0, |
|
"completions/max_terminated_length": 885.625, |
|
"completions/mean_length": 544.4328125, |
|
"completions/mean_terminated_length": 461.6532211303711, |
|
"completions/min_length": 137.025, |
|
"completions/min_terminated_length": 137.025, |
|
"epoch": 0.8746666666666667, |
|
"grad_norm": 0.9085864417795622, |
|
"kl": 3.36923828125, |
|
"learning_rate": 1.430455104389463e-07, |
|
"loss": 0.1799, |
|
"num_tokens": 37734025.0, |
|
"reward": 0.215625, |
|
"reward_std": 0.2822155307978392, |
|
"rewards/accuracy_reward/mean": 0.215625, |
|
"rewards/accuracy_reward/std": 0.350068199634552, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1359375, |
|
"completions/max_length": 998.525, |
|
"completions/max_terminated_length": 864.525, |
|
"completions/mean_length": 531.0125, |
|
"completions/mean_terminated_length": 456.2215087890625, |
|
"completions/min_length": 119.025, |
|
"completions/min_terminated_length": 119.025, |
|
"epoch": 0.8853333333333333, |
|
"grad_norm": 1.4991122052195613, |
|
"kl": 3.038134765625, |
|
"learning_rate": 1.2014335743505234e-07, |
|
"loss": 0.132, |
|
"num_tokens": 38162537.0, |
|
"reward": 0.2125, |
|
"reward_std": 0.26438754238188267, |
|
"rewards/accuracy_reward/mean": 0.2125, |
|
"rewards/accuracy_reward/std": 0.35445903837680814, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 415 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1328125, |
|
"completions/max_length": 1004.3, |
|
"completions/max_terminated_length": 819.325, |
|
"completions/mean_length": 519.06875, |
|
"completions/mean_terminated_length": 442.86925506591797, |
|
"completions/min_length": 138.95, |
|
"completions/min_terminated_length": 138.95, |
|
"epoch": 0.896, |
|
"grad_norm": 2.3887742458853194, |
|
"kl": 2.613427734375, |
|
"learning_rate": 9.916190271363112e-08, |
|
"loss": 0.1644, |
|
"num_tokens": 38579869.0, |
|
"reward": 0.2765625, |
|
"reward_std": 0.3201499585062265, |
|
"rewards/accuracy_reward/mean": 0.2765625, |
|
"rewards/accuracy_reward/std": 0.3956493496894836, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.15625, |
|
"completions/max_length": 985.525, |
|
"completions/max_terminated_length": 810.875, |
|
"completions/mean_length": 539.99375, |
|
"completions/mean_terminated_length": 451.690470123291, |
|
"completions/min_length": 120.5, |
|
"completions/min_terminated_length": 120.5, |
|
"epoch": 0.9066666666666666, |
|
"grad_norm": 1.3401864604257274, |
|
"kl": 2.73095703125, |
|
"learning_rate": 8.013035151441828e-08, |
|
"loss": 0.171, |
|
"num_tokens": 39024553.0, |
|
"reward": 0.2234375, |
|
"reward_std": 0.25571046136319636, |
|
"rewards/accuracy_reward/mean": 0.2234375, |
|
"rewards/accuracy_reward/std": 0.31687583178281786, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 425 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1171875, |
|
"completions/max_length": 1005.1, |
|
"completions/max_terminated_length": 851.7, |
|
"completions/mean_length": 518.528125, |
|
"completions/mean_terminated_length": 452.1327270507812, |
|
"completions/min_length": 130.3, |
|
"completions/min_terminated_length": 130.3, |
|
"epoch": 0.9173333333333333, |
|
"grad_norm": 2.167470698605123, |
|
"kl": 3.103076171875, |
|
"learning_rate": 6.307519489929209e-08, |
|
"loss": 0.1795, |
|
"num_tokens": 39445811.0, |
|
"reward": 0.271875, |
|
"reward_std": 0.33798977397382257, |
|
"rewards/accuracy_reward/mean": 0.271875, |
|
"rewards/accuracy_reward/std": 0.3924663387238979, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1578125, |
|
"completions/max_length": 999.3, |
|
"completions/max_terminated_length": 834.65, |
|
"completions/mean_length": 530.0015625, |
|
"completions/mean_terminated_length": 438.1447872161865, |
|
"completions/min_length": 111.7, |
|
"completions/min_terminated_length": 111.7, |
|
"epoch": 0.928, |
|
"grad_norm": 1.846618428525097, |
|
"kl": 3.21512451171875, |
|
"learning_rate": 4.8020172877908494e-08, |
|
"loss": 0.1451, |
|
"num_tokens": 39879828.0, |
|
"reward": 0.19375, |
|
"reward_std": 0.26397562474012376, |
|
"rewards/accuracy_reward/mean": 0.19375, |
|
"rewards/accuracy_reward/std": 0.34237141236662866, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 435 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1546875, |
|
"completions/max_length": 985.5, |
|
"completions/max_terminated_length": 826.025, |
|
"completions/mean_length": 539.29375, |
|
"completions/mean_terminated_length": 453.41730575561525, |
|
"completions/min_length": 141.975, |
|
"completions/min_terminated_length": 141.975, |
|
"epoch": 0.9386666666666666, |
|
"grad_norm": 1.616441959368508, |
|
"kl": 2.7537109375, |
|
"learning_rate": 3.498624136267653e-08, |
|
"loss": 0.1327, |
|
"num_tokens": 40309160.0, |
|
"reward": 0.2390625, |
|
"reward_std": 0.28233275562524796, |
|
"rewards/accuracy_reward/mean": 0.2390625, |
|
"rewards/accuracy_reward/std": 0.3621255189180374, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1234375, |
|
"completions/max_length": 1013.775, |
|
"completions/max_terminated_length": 843.275, |
|
"completions/mean_length": 515.215625, |
|
"completions/mean_terminated_length": 444.84048385620116, |
|
"completions/min_length": 131.525, |
|
"completions/min_terminated_length": 131.525, |
|
"epoch": 0.9493333333333334, |
|
"grad_norm": 1.9458890335400199, |
|
"kl": 2.9161376953125, |
|
"learning_rate": 2.39915429990733e-08, |
|
"loss": 0.166, |
|
"num_tokens": 40735346.0, |
|
"reward": 0.1859375, |
|
"reward_std": 0.26107093654572966, |
|
"rewards/accuracy_reward/mean": 0.1859375, |
|
"rewards/accuracy_reward/std": 0.3356678240001202, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 445 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.14375, |
|
"completions/max_length": 994.675, |
|
"completions/max_terminated_length": 836.625, |
|
"completions/mean_length": 542.1921875, |
|
"completions/mean_terminated_length": 463.8310737609863, |
|
"completions/min_length": 143.95, |
|
"completions/min_terminated_length": 143.95, |
|
"epoch": 0.96, |
|
"grad_norm": 1.801995464485617, |
|
"kl": 3.32373046875, |
|
"learning_rate": 1.5051381911898253e-08, |
|
"loss": 0.1805, |
|
"num_tokens": 41171661.0, |
|
"reward": 0.190625, |
|
"reward_std": 0.24303234927356243, |
|
"rewards/accuracy_reward/mean": 0.190625, |
|
"rewards/accuracy_reward/std": 0.30271864533424375, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.15625, |
|
"completions/max_length": 1004.325, |
|
"completions/max_terminated_length": 831.425, |
|
"completions/mean_length": 526.859375, |
|
"completions/mean_terminated_length": 436.33713760375974, |
|
"completions/min_length": 115.4, |
|
"completions/min_terminated_length": 115.4, |
|
"epoch": 0.9706666666666667, |
|
"grad_norm": 3.175555647761817, |
|
"kl": 3.665087890625, |
|
"learning_rate": 8.178202402621349e-09, |
|
"loss": 0.2182, |
|
"num_tokens": 41601355.0, |
|
"reward": 0.2296875, |
|
"reward_std": 0.2530842948704958, |
|
"rewards/accuracy_reward/mean": 0.2296875, |
|
"rewards/accuracy_reward/std": 0.3312860429286957, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 455 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1375, |
|
"completions/max_length": 1002.925, |
|
"completions/max_terminated_length": 839.05, |
|
"completions/mean_length": 538.9109375, |
|
"completions/mean_terminated_length": 461.0768562316895, |
|
"completions/min_length": 141.975, |
|
"completions/min_terminated_length": 141.975, |
|
"epoch": 0.9813333333333333, |
|
"grad_norm": 1.693673832434065, |
|
"kl": 2.9865234375, |
|
"learning_rate": 3.381571627475488e-09, |
|
"loss": 0.1702, |
|
"num_tokens": 42034314.0, |
|
"reward": 0.221875, |
|
"reward_std": 0.2828045580536127, |
|
"rewards/accuracy_reward/mean": 0.221875, |
|
"rewards/accuracy_reward/std": 0.3636058814823627, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.1609375, |
|
"completions/max_length": 996.125, |
|
"completions/max_terminated_length": 834.275, |
|
"completions/mean_length": 526.0546875, |
|
"completions/mean_terminated_length": 432.3683372497559, |
|
"completions/min_length": 118.925, |
|
"completions/min_terminated_length": 118.925, |
|
"epoch": 0.992, |
|
"grad_norm": 1.2046637410754644, |
|
"kl": 3.300732421875, |
|
"learning_rate": 6.681662804065569e-10, |
|
"loss": 0.1793, |
|
"num_tokens": 42460397.0, |
|
"reward": 0.2484375, |
|
"reward_std": 0.2802219405770302, |
|
"rewards/accuracy_reward/mean": 0.2484375, |
|
"rewards/accuracy_reward/std": 0.36698284819722177, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 465 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completions/clipped_ratio": 0.15104166666666666, |
|
"completions/max_length": 965.4166666666666, |
|
"completions/max_terminated_length": 821.9166666666666, |
|
"completions/mean_length": 551.5199699401855, |
|
"completions/mean_terminated_length": 467.1291364034017, |
|
"completions/min_length": 121.79166666666667, |
|
"completions/min_terminated_length": 121.79166666666667, |
|
"epoch": 0.9984, |
|
"kl": 3.1354166666666665, |
|
"num_tokens": 42724152.0, |
|
"reward": 0.2421875, |
|
"reward_std": 0.28976076406737167, |
|
"rewards/accuracy_reward/mean": 0.2421875, |
|
"rewards/accuracy_reward/std": 0.35445400203267735, |
|
"rewards/format_reward/mean": 0.0, |
|
"rewards/format_reward/std": 0.0, |
|
"step": 468, |
|
"total_flos": 0.0, |
|
"train_loss": 0.12545046692467335, |
|
"train_runtime": 98008.0298, |
|
"train_samples_per_second": 0.077, |
|
"train_steps_per_second": 0.005 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 468, |
|
"num_input_tokens_seen": 42724152, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|