|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.3865546218487395, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 635.7109375, |
|
"epoch": 0.01680672268907563, |
|
"grad_norm": 0.31708475947380066, |
|
"kl": 0.0, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0099, |
|
"num_tokens": 143267.0, |
|
"reward": 0.039062500349245965, |
|
"reward_std": 0.0698821279220283, |
|
"rewards/curriculum_aware_reward_fn": 0.023437500349245965, |
|
"rewards/format_reward": 0.015625, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 527.6328125, |
|
"epoch": 0.03361344537815126, |
|
"grad_norm": 0.43825313448905945, |
|
"kl": 0.0002913475036621094, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0432, |
|
"num_tokens": 270812.0, |
|
"reward": 0.09292763145640492, |
|
"reward_std": 0.12866380205377936, |
|
"rewards/curriculum_aware_reward_fn": 0.06949013145640492, |
|
"rewards/format_reward": 0.0234375, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 608.9921875, |
|
"epoch": 0.05042016806722689, |
|
"grad_norm": 0.4227641224861145, |
|
"kl": 0.0002665519714355469, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0273, |
|
"num_tokens": 410971.0, |
|
"reward": 0.059621710795909166, |
|
"reward_std": 0.07889116508886218, |
|
"rewards/curriculum_aware_reward_fn": 0.059621710795909166, |
|
"rewards/format_reward": 0.0, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 558.921875, |
|
"epoch": 0.06722689075630252, |
|
"grad_norm": 0.4796917140483856, |
|
"kl": 0.0002789497375488281, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0009, |
|
"num_tokens": 542313.0, |
|
"reward": 0.08552631549537182, |
|
"reward_std": 0.12651031091809273, |
|
"rewards/curriculum_aware_reward_fn": 0.06990131689235568, |
|
"rewards/format_reward": 0.015625, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 590.265625, |
|
"epoch": 0.08403361344537816, |
|
"grad_norm": 0.5620821118354797, |
|
"kl": 0.0003027915954589844, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0288, |
|
"num_tokens": 677075.0, |
|
"reward": 0.14925987273454666, |
|
"reward_std": 0.24606542102992535, |
|
"rewards/curriculum_aware_reward_fn": 0.09457236900925636, |
|
"rewards/format_reward": 0.0546875, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 592.5234375, |
|
"epoch": 0.10084033613445378, |
|
"grad_norm": 0.4298699200153351, |
|
"kl": 0.0003509521484375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0151, |
|
"num_tokens": 812710.0, |
|
"reward": 0.08840460644569248, |
|
"reward_std": 0.1141207623295486, |
|
"rewards/curriculum_aware_reward_fn": 0.03371710644569248, |
|
"rewards/format_reward": 0.0546875, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 582.046875, |
|
"epoch": 0.11764705882352941, |
|
"grad_norm": 0.526942253112793, |
|
"kl": 0.0004343986511230469, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0192, |
|
"num_tokens": 943268.0, |
|
"reward": 0.12088815867900848, |
|
"reward_std": 0.17540471255779266, |
|
"rewards/curriculum_aware_reward_fn": 0.07401315867900848, |
|
"rewards/format_reward": 0.046875, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 534.75, |
|
"epoch": 0.13445378151260504, |
|
"grad_norm": 0.44275274872779846, |
|
"kl": 0.0003724098205566406, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0033, |
|
"num_tokens": 1074300.0, |
|
"reward": 0.030016446253284812, |
|
"reward_std": 0.08489933330565691, |
|
"rewards/curriculum_aware_reward_fn": 0.014391447650268674, |
|
"rewards/format_reward": 0.015625, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 539.09375, |
|
"epoch": 0.15126050420168066, |
|
"grad_norm": 0.5494865775108337, |
|
"kl": 0.0007390975952148438, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0036, |
|
"num_tokens": 1197896.0, |
|
"reward": 0.16570723708719015, |
|
"reward_std": 0.21696669608354568, |
|
"rewards/curriculum_aware_reward_fn": 0.05633223685435951, |
|
"rewards/format_reward": 0.109375, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 593.7734375, |
|
"epoch": 0.16806722689075632, |
|
"grad_norm": 0.5171737670898438, |
|
"kl": 0.0006322860717773438, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0193, |
|
"num_tokens": 1336931.0, |
|
"reward": 0.11143092066049576, |
|
"reward_std": 0.19064411148428917, |
|
"rewards/curriculum_aware_reward_fn": 0.017680921009741724, |
|
"rewards/format_reward": 0.09375, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 578.4765625, |
|
"epoch": 0.18487394957983194, |
|
"grad_norm": 0.6088258028030396, |
|
"kl": 0.001346588134765625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.037, |
|
"num_tokens": 1467592.0, |
|
"reward": 0.22944078594446182, |
|
"reward_std": 0.3224767856299877, |
|
"rewards/curriculum_aware_reward_fn": 0.04194079013541341, |
|
"rewards/format_reward": 0.1875, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 601.171875, |
|
"epoch": 0.20168067226890757, |
|
"grad_norm": 0.4451327621936798, |
|
"kl": 0.0010366439819335938, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0148, |
|
"num_tokens": 1607894.0, |
|
"reward": 0.1204769799951464, |
|
"reward_std": 0.1381341191008687, |
|
"rewards/curriculum_aware_reward_fn": 0.018914473825134337, |
|
"rewards/format_reward": 0.1015625, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 526.28125, |
|
"epoch": 0.2184873949579832, |
|
"grad_norm": 0.636314332485199, |
|
"kl": 0.00191497802734375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0125, |
|
"num_tokens": 1735650.0, |
|
"reward": 0.26644736528396606, |
|
"reward_std": 0.30141641572117805, |
|
"rewards/curriculum_aware_reward_fn": 0.03988486935850233, |
|
"rewards/format_reward": 0.2265625, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 507.515625, |
|
"epoch": 0.23529411764705882, |
|
"grad_norm": 0.6864922642707825, |
|
"kl": 0.004413604736328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0802, |
|
"num_tokens": 1856316.0, |
|
"reward": 0.3112664446234703, |
|
"reward_std": 0.31644799932837486, |
|
"rewards/curriculum_aware_reward_fn": 0.05345394683536142, |
|
"rewards/format_reward": 0.2578125, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 554.5859375, |
|
"epoch": 0.25210084033613445, |
|
"grad_norm": 0.6268811225891113, |
|
"kl": 0.0036067962646484375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0044, |
|
"num_tokens": 1987511.0, |
|
"reward": 0.4337993338704109, |
|
"reward_std": 0.32329631969332695, |
|
"rewards/curriculum_aware_reward_fn": 0.050986841320991516, |
|
"rewards/format_reward": 0.3828125, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 584.3828125, |
|
"epoch": 0.2689075630252101, |
|
"grad_norm": 0.5531853437423706, |
|
"kl": 0.003597259521484375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0104, |
|
"num_tokens": 2119768.0, |
|
"reward": 0.3828125037252903, |
|
"reward_std": 0.26145630702376366, |
|
"rewards/curriculum_aware_reward_fn": 0.0546875, |
|
"rewards/format_reward": 0.328125, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 481.8046875, |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 0.6449251174926758, |
|
"kl": 0.005481719970703125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0094, |
|
"num_tokens": 2238911.0, |
|
"reward": 0.4543585404753685, |
|
"reward_std": 0.26075971499085426, |
|
"rewards/curriculum_aware_reward_fn": 0.05592105304822326, |
|
"rewards/format_reward": 0.3984375, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 645.75, |
|
"epoch": 0.3025210084033613, |
|
"grad_norm": 0.37918156385421753, |
|
"kl": 0.001049041748046875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0055, |
|
"num_tokens": 2385767.0, |
|
"reward": 0.1451480264076963, |
|
"reward_std": 0.1290158643387258, |
|
"rewards/curriculum_aware_reward_fn": 0.04358552640769631, |
|
"rewards/format_reward": 0.1015625, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 617.6953125, |
|
"epoch": 0.31932773109243695, |
|
"grad_norm": 0.39814478158950806, |
|
"kl": 0.00528717041015625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0518, |
|
"num_tokens": 2525656.0, |
|
"reward": 0.35115131735801697, |
|
"reward_std": 0.11648409254848957, |
|
"rewards/curriculum_aware_reward_fn": 0.02302631549537182, |
|
"rewards/format_reward": 0.328125, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 459.6015625, |
|
"epoch": 0.33613445378151263, |
|
"grad_norm": 0.7307525873184204, |
|
"kl": 0.005184173583984375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.083, |
|
"num_tokens": 2644077.0, |
|
"reward": 0.47574012726545334, |
|
"reward_std": 0.2815094441175461, |
|
"rewards/curriculum_aware_reward_fn": 0.04605263099074364, |
|
"rewards/format_reward": 0.4296875, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 620.46875, |
|
"epoch": 0.35294117647058826, |
|
"grad_norm": 0.46509799361228943, |
|
"kl": 0.0036363601684570312, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0145, |
|
"num_tokens": 2786169.0, |
|
"reward": 0.24177631677594036, |
|
"reward_std": 0.09853590792044997, |
|
"rewards/curriculum_aware_reward_fn": 0.023026315728202462, |
|
"rewards/format_reward": 0.21875, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 578.9609375, |
|
"epoch": 0.3697478991596639, |
|
"grad_norm": 0.5765166878700256, |
|
"kl": 0.005565643310546875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0042, |
|
"num_tokens": 2917180.0, |
|
"reward": 0.4958881437778473, |
|
"reward_std": 0.10692231869325042, |
|
"rewards/curriculum_aware_reward_fn": 0.0740131582133472, |
|
"rewards/format_reward": 0.421875, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 579.578125, |
|
"epoch": 0.3865546218487395, |
|
"grad_norm": 0.5340356826782227, |
|
"kl": 0.00540924072265625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0083, |
|
"num_tokens": 3053414.0, |
|
"reward": 0.3708881437778473, |
|
"reward_std": 0.11791826784610748, |
|
"rewards/curriculum_aware_reward_fn": 0.06620065867900848, |
|
"rewards/format_reward": 0.3046875, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 507.5625, |
|
"epoch": 0.40336134453781514, |
|
"grad_norm": 0.4752294719219208, |
|
"kl": 0.031703948974609375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0004, |
|
"num_tokens": 3181894.0, |
|
"reward": 0.3700657896697521, |
|
"reward_std": 0.1367718242108822, |
|
"rewards/curriculum_aware_reward_fn": 0.002878289553336799, |
|
"rewards/format_reward": 0.3671875, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 496.4296875, |
|
"epoch": 0.42016806722689076, |
|
"grad_norm": 0.46164318919181824, |
|
"kl": 0.0082855224609375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0091, |
|
"num_tokens": 3304077.0, |
|
"reward": 0.5016447380185127, |
|
"reward_std": 0.09064025245606899, |
|
"rewards/curriculum_aware_reward_fn": 0.017269736621528864, |
|
"rewards/format_reward": 0.484375, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 454.5859375, |
|
"epoch": 0.4369747899159664, |
|
"grad_norm": 0.5706049799919128, |
|
"kl": 0.01887798309326172, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0096, |
|
"num_tokens": 3420488.0, |
|
"reward": 0.6875, |
|
"reward_std": 0.12697386741638184, |
|
"rewards/curriculum_aware_reward_fn": 0.0234375, |
|
"rewards/format_reward": 0.6640625, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 554.46875, |
|
"epoch": 0.453781512605042, |
|
"grad_norm": 0.45473384857177734, |
|
"kl": 0.0068416595458984375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0074, |
|
"num_tokens": 3552340.0, |
|
"reward": 0.34868420753628016, |
|
"reward_std": 0.10102300066500902, |
|
"rewards/curriculum_aware_reward_fn": 0.012746710679493845, |
|
"rewards/format_reward": 0.3359375, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 534.0, |
|
"epoch": 0.47058823529411764, |
|
"grad_norm": 0.348452091217041, |
|
"kl": 0.01036834716796875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0145, |
|
"num_tokens": 3677892.0, |
|
"reward": 0.5571546033024788, |
|
"reward_std": 0.055680982768535614, |
|
"rewards/curriculum_aware_reward_fn": 0.010279605048708618, |
|
"rewards/format_reward": 0.546875, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 584.03125, |
|
"epoch": 0.48739495798319327, |
|
"grad_norm": 0.452033668756485, |
|
"kl": 0.0071258544921875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0115, |
|
"num_tokens": 3813600.0, |
|
"reward": 0.3984375, |
|
"reward_std": 0.08443661965429783, |
|
"rewards/curriculum_aware_reward_fn": 0.046875, |
|
"rewards/format_reward": 0.3515625, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 552.4921875, |
|
"epoch": 0.5042016806722689, |
|
"grad_norm": 0.4926210641860962, |
|
"kl": 0.005392551422119141, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0285, |
|
"num_tokens": 3947807.0, |
|
"reward": 0.4683388201519847, |
|
"reward_std": 0.11112732999026775, |
|
"rewards/curriculum_aware_reward_fn": 0.03865131549537182, |
|
"rewards/format_reward": 0.4296875, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 559.6953125, |
|
"epoch": 0.5210084033613446, |
|
"grad_norm": 0.5463467240333557, |
|
"kl": 0.004418373107910156, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0233, |
|
"num_tokens": 4080704.0, |
|
"reward": 0.22203946067020297, |
|
"reward_std": 0.09257729165256023, |
|
"rewards/curriculum_aware_reward_fn": 0.042351973708719015, |
|
"rewards/format_reward": 0.1796875, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 549.921875, |
|
"epoch": 0.5378151260504201, |
|
"grad_norm": 0.36463335156440735, |
|
"kl": 0.006511688232421875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0032, |
|
"num_tokens": 4214870.0, |
|
"reward": 0.4346217215061188, |
|
"reward_std": 0.03605314111337066, |
|
"rewards/curriculum_aware_reward_fn": 0.004934210563078523, |
|
"rewards/format_reward": 0.4296875, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 478.7890625, |
|
"epoch": 0.5546218487394958, |
|
"grad_norm": 0.5116223692893982, |
|
"kl": 0.008532524108886719, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0153, |
|
"num_tokens": 4338203.0, |
|
"reward": 0.4560032896697521, |
|
"reward_std": 0.12314211018383503, |
|
"rewards/curriculum_aware_reward_fn": 0.08881578966975212, |
|
"rewards/format_reward": 0.3671875, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 476.7890625, |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 0.43187472224235535, |
|
"kl": 0.007843017578125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0134, |
|
"num_tokens": 4461184.0, |
|
"reward": 0.4333881586790085, |
|
"reward_std": 0.12357822060585022, |
|
"rewards/curriculum_aware_reward_fn": 0.02713815774768591, |
|
"rewards/format_reward": 0.40625, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 529.7578125, |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 0.4466142952442169, |
|
"kl": 0.0057315826416015625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.015, |
|
"num_tokens": 4590329.0, |
|
"reward": 0.426809199154377, |
|
"reward_std": 0.10671343095600605, |
|
"rewards/curriculum_aware_reward_fn": 0.059621710097417235, |
|
"rewards/format_reward": 0.3671875, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 521.9453125, |
|
"epoch": 0.6050420168067226, |
|
"grad_norm": 0.5088793635368347, |
|
"kl": 0.00739288330078125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0193, |
|
"num_tokens": 4717658.0, |
|
"reward": 0.5740131624042988, |
|
"reward_std": 0.09916227497160435, |
|
"rewards/curriculum_aware_reward_fn": 0.08182565809693187, |
|
"rewards/format_reward": 0.4921875, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 481.59375, |
|
"epoch": 0.6218487394957983, |
|
"grad_norm": 0.3755647540092468, |
|
"kl": 0.005794525146484375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0106, |
|
"num_tokens": 4837174.0, |
|
"reward": 0.5123355314135551, |
|
"reward_std": 0.023199534974992275, |
|
"rewards/curriculum_aware_reward_fn": 0.0748355258256197, |
|
"rewards/format_reward": 0.4375, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 465.1953125, |
|
"epoch": 0.6386554621848739, |
|
"grad_norm": 0.5442925691604614, |
|
"kl": 0.008731842041015625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0111, |
|
"num_tokens": 4953039.0, |
|
"reward": 0.7232730239629745, |
|
"reward_std": 0.1315580508671701, |
|
"rewards/curriculum_aware_reward_fn": 0.16077302338089794, |
|
"rewards/format_reward": 0.5625, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 501.484375, |
|
"epoch": 0.6554621848739496, |
|
"grad_norm": 0.4446295201778412, |
|
"kl": 0.00624847412109375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0286, |
|
"num_tokens": 5076965.0, |
|
"reward": 0.47327301651239395, |
|
"reward_std": 0.08440816402435303, |
|
"rewards/curriculum_aware_reward_fn": 0.09827302861958742, |
|
"rewards/format_reward": 0.375, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 539.15625, |
|
"epoch": 0.6722689075630253, |
|
"grad_norm": 0.37400856614112854, |
|
"kl": 0.005260467529296875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0044, |
|
"num_tokens": 5207185.0, |
|
"reward": 0.4745065679308027, |
|
"reward_std": 0.07072597183287144, |
|
"rewards/curriculum_aware_reward_fn": 0.09950657887384295, |
|
"rewards/format_reward": 0.375, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 492.0859375, |
|
"epoch": 0.6890756302521008, |
|
"grad_norm": 0.4103780686855316, |
|
"kl": 0.00856781005859375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0049, |
|
"num_tokens": 5328012.0, |
|
"reward": 0.71875, |
|
"reward_std": 0.10247145313769579, |
|
"rewards/curriculum_aware_reward_fn": 0.09375000419095159, |
|
"rewards/format_reward": 0.625, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 405.1328125, |
|
"epoch": 0.7058823529411765, |
|
"grad_norm": 0.6738374829292297, |
|
"kl": 0.0108184814453125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0454, |
|
"num_tokens": 5438933.0, |
|
"reward": 0.757401317358017, |
|
"reward_std": 0.212964728474617, |
|
"rewards/curriculum_aware_reward_fn": 0.1636513164266944, |
|
"rewards/format_reward": 0.59375, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 516.640625, |
|
"epoch": 0.7226890756302521, |
|
"grad_norm": 0.31194940209388733, |
|
"kl": 0.0074005126953125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0205, |
|
"num_tokens": 5563887.0, |
|
"reward": 0.6562500149011612, |
|
"reward_std": 0.04224720690399408, |
|
"rewards/curriculum_aware_reward_fn": 0.15625, |
|
"rewards/format_reward": 0.5, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 477.3984375, |
|
"epoch": 0.7394957983193278, |
|
"grad_norm": 0.38581541180610657, |
|
"kl": 0.00885009765625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0164, |
|
"num_tokens": 5688114.0, |
|
"reward": 0.6402138248085976, |
|
"reward_std": 0.08311590366065502, |
|
"rewards/curriculum_aware_reward_fn": 0.03083881549537182, |
|
"rewards/format_reward": 0.609375, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 521.8046875, |
|
"epoch": 0.7563025210084033, |
|
"grad_norm": 0.36903509497642517, |
|
"kl": 0.0078277587890625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0022, |
|
"num_tokens": 5814153.0, |
|
"reward": 0.5513980239629745, |
|
"reward_std": 0.06967925047501922, |
|
"rewards/curriculum_aware_reward_fn": 0.05139802524354309, |
|
"rewards/format_reward": 0.5, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 416.4296875, |
|
"epoch": 0.773109243697479, |
|
"grad_norm": 0.5821658968925476, |
|
"kl": 0.0094757080078125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0056, |
|
"num_tokens": 5923904.0, |
|
"reward": 0.7257401347160339, |
|
"reward_std": 0.13419464463368058, |
|
"rewards/curriculum_aware_reward_fn": 0.10074013192206621, |
|
"rewards/format_reward": 0.625, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 526.9296875, |
|
"epoch": 0.7899159663865546, |
|
"grad_norm": 0.449553519487381, |
|
"kl": 0.005664825439453125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0001, |
|
"num_tokens": 6053447.0, |
|
"reward": 0.4819078892469406, |
|
"reward_std": 0.09099963493645191, |
|
"rewards/curriculum_aware_reward_fn": 0.10690789669752121, |
|
"rewards/format_reward": 0.375, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 536.5, |
|
"epoch": 0.8067226890756303, |
|
"grad_norm": 0.5381475687026978, |
|
"kl": 0.008424758911132812, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0099, |
|
"num_tokens": 6183559.0, |
|
"reward": 0.46833881735801697, |
|
"reward_std": 0.08668615715578198, |
|
"rewards/curriculum_aware_reward_fn": 0.03865131642669439, |
|
"rewards/format_reward": 0.4296875, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 539.6328125, |
|
"epoch": 0.8235294117647058, |
|
"grad_norm": 0.44155657291412354, |
|
"kl": 0.0077495574951171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0085, |
|
"num_tokens": 6314544.0, |
|
"reward": 0.5526315867900848, |
|
"reward_std": 0.027912108227610588, |
|
"rewards/curriculum_aware_reward_fn": 0.11513157933950424, |
|
"rewards/format_reward": 0.4375, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 554.0546875, |
|
"epoch": 0.8403361344537815, |
|
"grad_norm": 0.4840262532234192, |
|
"kl": 0.0054950714111328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0073, |
|
"num_tokens": 6445087.0, |
|
"reward": 0.33634869009256363, |
|
"reward_std": 0.10334387933835387, |
|
"rewards/curriculum_aware_reward_fn": 0.03166118450462818, |
|
"rewards/format_reward": 0.3046875, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 578.6796875, |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 0.30791598558425903, |
|
"kl": 0.005002021789550781, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0068, |
|
"num_tokens": 6582878.0, |
|
"reward": 0.348684199154377, |
|
"reward_std": 0.07469352334737778, |
|
"rewards/curriculum_aware_reward_fn": 0.036184209398925304, |
|
"rewards/format_reward": 0.3125, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 448.1328125, |
|
"epoch": 0.8739495798319328, |
|
"grad_norm": 0.5027822852134705, |
|
"kl": 0.00795745849609375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0178, |
|
"num_tokens": 6698503.0, |
|
"reward": 0.6311677470803261, |
|
"reward_std": 0.11679959110915661, |
|
"rewards/curriculum_aware_reward_fn": 0.09210526384413242, |
|
"rewards/format_reward": 0.5390625, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 521.1875, |
|
"epoch": 0.8907563025210085, |
|
"grad_norm": 0.4084753394126892, |
|
"kl": 0.00714111328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0193, |
|
"num_tokens": 6823951.0, |
|
"reward": 0.5028782784938812, |
|
"reward_std": 0.059696739073842764, |
|
"rewards/curriculum_aware_reward_fn": 0.06537829001899809, |
|
"rewards/format_reward": 0.4375, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 539.109375, |
|
"epoch": 0.907563025210084, |
|
"grad_norm": 0.2098054140806198, |
|
"kl": 0.007198333740234375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0114, |
|
"num_tokens": 6953317.0, |
|
"reward": 0.46052631735801697, |
|
"reward_std": 0.03168220818042755, |
|
"rewards/curriculum_aware_reward_fn": 0.023026317358016968, |
|
"rewards/format_reward": 0.4375, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 521.2265625, |
|
"epoch": 0.9243697478991597, |
|
"grad_norm": 0.4919142425060272, |
|
"kl": 0.007293701171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0101, |
|
"num_tokens": 7079922.0, |
|
"reward": 0.49547697603702545, |
|
"reward_std": 0.10914274398237467, |
|
"rewards/curriculum_aware_reward_fn": 0.12047697883099318, |
|
"rewards/format_reward": 0.375, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 500.5625, |
|
"epoch": 0.9411764705882353, |
|
"grad_norm": 0.46875280141830444, |
|
"kl": 0.00684356689453125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0266, |
|
"num_tokens": 7206954.0, |
|
"reward": 0.40830591320991516, |
|
"reward_std": 0.1075905729085207, |
|
"rewards/curriculum_aware_reward_fn": 0.04111842066049576, |
|
"rewards/format_reward": 0.3671875, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 482.3046875, |
|
"epoch": 0.957983193277311, |
|
"grad_norm": 0.40924757719039917, |
|
"kl": 0.012725830078125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0041, |
|
"num_tokens": 7327857.0, |
|
"reward": 0.5958059132099152, |
|
"reward_std": 0.06403321353718638, |
|
"rewards/curriculum_aware_reward_fn": 0.04111842007841915, |
|
"rewards/format_reward": 0.5546875, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 445.375, |
|
"epoch": 0.9747899159663865, |
|
"grad_norm": 0.4467240273952484, |
|
"kl": 0.0105743408203125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0061, |
|
"num_tokens": 7440561.0, |
|
"reward": 0.7578125, |
|
"reward_std": 0.057358515448868275, |
|
"rewards/curriculum_aware_reward_fn": 0.13281250069849193, |
|
"rewards/format_reward": 0.625, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 582.3452377319336, |
|
"epoch": 0.9915966386554622, |
|
"grad_norm": 0.5007306933403015, |
|
"kl": 0.007415771484375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0029, |
|
"num_tokens": 7569086.0, |
|
"reward": 0.4514802545309067, |
|
"reward_std": 0.06341935088858008, |
|
"rewards/curriculum_aware_reward_fn": 0.0217927637277171, |
|
"rewards/format_reward": 0.4296875, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 553.09375, |
|
"epoch": 1.0168067226890756, |
|
"grad_norm": 0.4292355179786682, |
|
"kl": 0.005462646484375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0119, |
|
"num_tokens": 7702626.0, |
|
"reward": 0.4325658082962036, |
|
"reward_std": 0.07455102633684874, |
|
"rewards/curriculum_aware_reward_fn": 0.05756579013541341, |
|
"rewards/format_reward": 0.375, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 521.375, |
|
"epoch": 1.0336134453781514, |
|
"grad_norm": 0.41578003764152527, |
|
"kl": 0.008762359619140625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0056, |
|
"num_tokens": 7827818.0, |
|
"reward": 0.5082236900925636, |
|
"reward_std": 0.07253926200792193, |
|
"rewards/curriculum_aware_reward_fn": 0.07072368497028947, |
|
"rewards/format_reward": 0.4375, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 648.2734375, |
|
"epoch": 1.050420168067227, |
|
"grad_norm": 0.48642197251319885, |
|
"kl": 0.0062713623046875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0136, |
|
"num_tokens": 7974333.0, |
|
"reward": 0.3449835442006588, |
|
"reward_std": 0.07259867247194052, |
|
"rewards/curriculum_aware_reward_fn": 0.03248355258256197, |
|
"rewards/format_reward": 0.3125, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 436.9296875, |
|
"epoch": 1.0672268907563025, |
|
"grad_norm": 0.3184286653995514, |
|
"kl": 0.0114593505859375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0139, |
|
"num_tokens": 8084908.0, |
|
"reward": 0.6899671256542206, |
|
"reward_std": 0.0728745711967349, |
|
"rewards/curriculum_aware_reward_fn": 0.0649671049322933, |
|
"rewards/format_reward": 0.625, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 541.53125, |
|
"epoch": 1.084033613445378, |
|
"grad_norm": 0.16483676433563232, |
|
"kl": 0.0060882568359375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0032, |
|
"num_tokens": 8216696.0, |
|
"reward": 0.2627467066049576, |
|
"reward_std": 0.024391429498791695, |
|
"rewards/curriculum_aware_reward_fn": 0.012746710330247879, |
|
"rewards/format_reward": 0.25, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 509.7890625, |
|
"epoch": 1.1008403361344539, |
|
"grad_norm": 0.4256879985332489, |
|
"kl": 0.00730133056640625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0102, |
|
"num_tokens": 8342845.0, |
|
"reward": 0.5197368264198303, |
|
"reward_std": 0.030515023041516542, |
|
"rewards/curriculum_aware_reward_fn": 0.019736842485144734, |
|
"rewards/format_reward": 0.5, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 486.296875, |
|
"epoch": 1.1176470588235294, |
|
"grad_norm": 0.3091375231742859, |
|
"kl": 0.008016586303710938, |
|
"learning_rate": 1e-06, |
|
"loss": -0.003, |
|
"num_tokens": 8462971.0, |
|
"reward": 0.46299341320991516, |
|
"reward_std": 0.04847824294120073, |
|
"rewards/curriculum_aware_reward_fn": 0.025493420660495758, |
|
"rewards/format_reward": 0.4375, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 596.234375, |
|
"epoch": 1.134453781512605, |
|
"grad_norm": 0.4554305076599121, |
|
"kl": 0.006458282470703125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.011, |
|
"num_tokens": 8598337.0, |
|
"reward": 0.3758223643526435, |
|
"reward_std": 0.08455474488437176, |
|
"rewards/curriculum_aware_reward_fn": 0.1258223680779338, |
|
"rewards/format_reward": 0.25, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 444.265625, |
|
"epoch": 1.1512605042016806, |
|
"grad_norm": 0.4700126349925995, |
|
"kl": 0.013336181640625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0065, |
|
"num_tokens": 8715091.0, |
|
"reward": 0.67434211820364, |
|
"reward_std": 0.12386543769389391, |
|
"rewards/curriculum_aware_reward_fn": 0.11965460598003119, |
|
"rewards/format_reward": 0.5546875, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 537.6484375, |
|
"epoch": 1.1680672268907564, |
|
"grad_norm": 0.5387859344482422, |
|
"kl": 0.0084075927734375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0113, |
|
"num_tokens": 8845582.0, |
|
"reward": 0.5822368338704109, |
|
"reward_std": 0.16140672331675887, |
|
"rewards/curriculum_aware_reward_fn": 0.10567433899268508, |
|
"rewards/format_reward": 0.4765625, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 571.25, |
|
"epoch": 1.184873949579832, |
|
"grad_norm": 0.28276559710502625, |
|
"kl": 0.005802154541015625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0147, |
|
"num_tokens": 8979574.0, |
|
"reward": 0.2606907826848328, |
|
"reward_std": 0.051840442698448896, |
|
"rewards/curriculum_aware_reward_fn": 0.018503289436921477, |
|
"rewards/format_reward": 0.2421875, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 495.6953125, |
|
"epoch": 1.2016806722689075, |
|
"grad_norm": 0.3467198312282562, |
|
"kl": 0.007556915283203125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0133, |
|
"num_tokens": 9104143.0, |
|
"reward": 0.5476973727345467, |
|
"reward_std": 0.0878668250516057, |
|
"rewards/curriculum_aware_reward_fn": 0.04769736947491765, |
|
"rewards/format_reward": 0.5, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 628.96875, |
|
"epoch": 1.2184873949579833, |
|
"grad_norm": 0.30438435077667236, |
|
"kl": 0.0047740936279296875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0058, |
|
"num_tokens": 9247579.0, |
|
"reward": 0.25863486528396606, |
|
"reward_std": 0.05783074861392379, |
|
"rewards/curriculum_aware_reward_fn": 0.016447368427179754, |
|
"rewards/format_reward": 0.2421875, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 581.7109375, |
|
"epoch": 1.2352941176470589, |
|
"grad_norm": 0.16290180385112762, |
|
"kl": 0.005523681640625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0069, |
|
"num_tokens": 9383094.0, |
|
"reward": 0.32195723056793213, |
|
"reward_std": 0.014439198188483715, |
|
"rewards/curriculum_aware_reward_fn": 0.009457237087190151, |
|
"rewards/format_reward": 0.3125, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 536.9609375, |
|
"epoch": 1.2521008403361344, |
|
"grad_norm": 1.2357046604156494, |
|
"kl": 0.170867919921875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0054, |
|
"num_tokens": 9513217.0, |
|
"reward": 0.582236819434911, |
|
"reward_std": 0.0510927583090961, |
|
"rewards/curriculum_aware_reward_fn": 0.01973684225231409, |
|
"rewards/format_reward": 0.5625, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 487.8671875, |
|
"epoch": 1.26890756302521, |
|
"grad_norm": 0.46429404616355896, |
|
"kl": 0.0113677978515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0446, |
|
"num_tokens": 9635408.0, |
|
"reward": 0.726973682641983, |
|
"reward_std": 0.11705214250832796, |
|
"rewards/curriculum_aware_reward_fn": 0.10197368450462818, |
|
"rewards/format_reward": 0.625, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 584.296875, |
|
"epoch": 1.2857142857142856, |
|
"grad_norm": 0.42755427956581116, |
|
"kl": 0.00647735595703125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0307, |
|
"num_tokens": 9770998.0, |
|
"reward": 0.49136512726545334, |
|
"reward_std": 0.10772840678691864, |
|
"rewards/curriculum_aware_reward_fn": 0.06167763099074364, |
|
"rewards/format_reward": 0.4296875, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 429.296875, |
|
"epoch": 1.3025210084033614, |
|
"grad_norm": 0.45878008008003235, |
|
"kl": 0.01023101806640625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0156, |
|
"num_tokens": 9886868.0, |
|
"reward": 0.7347861528396606, |
|
"reward_std": 0.10009488789364696, |
|
"rewards/curriculum_aware_reward_fn": 0.06291118392255157, |
|
"rewards/format_reward": 0.671875, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 516.6328125, |
|
"epoch": 1.319327731092437, |
|
"grad_norm": 0.3113223910331726, |
|
"kl": 0.0077972412109375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0078, |
|
"num_tokens": 10011221.0, |
|
"reward": 0.5966282933950424, |
|
"reward_std": 0.041548303328454494, |
|
"rewards/curriculum_aware_reward_fn": 0.09662828780710697, |
|
"rewards/format_reward": 0.5, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 557.4765625, |
|
"epoch": 1.3361344537815127, |
|
"grad_norm": 0.33871227502822876, |
|
"kl": 0.0073699951171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0191, |
|
"num_tokens": 10140970.0, |
|
"reward": 0.5415295958518982, |
|
"reward_std": 0.07458627689629793, |
|
"rewards/curriculum_aware_reward_fn": 0.04152960516512394, |
|
"rewards/format_reward": 0.5, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 564.125, |
|
"epoch": 1.3529411764705883, |
|
"grad_norm": 0.4491986930370331, |
|
"kl": 0.006259918212890625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0074, |
|
"num_tokens": 10271986.0, |
|
"reward": 0.5296052545309067, |
|
"reward_std": 0.1359914354979992, |
|
"rewards/curriculum_aware_reward_fn": 0.09210526291280985, |
|
"rewards/format_reward": 0.4375, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 503.125, |
|
"epoch": 1.3697478991596639, |
|
"grad_norm": 0.2838430404663086, |
|
"kl": 0.00777435302734375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0186, |
|
"num_tokens": 10398090.0, |
|
"reward": 0.6521381586790085, |
|
"reward_std": 0.05697542009875178, |
|
"rewards/curriculum_aware_reward_fn": 0.027138158679008484, |
|
"rewards/format_reward": 0.625, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 530.234375, |
|
"epoch": 1.3865546218487395, |
|
"grad_norm": 0.4765428602695465, |
|
"kl": 0.00778961181640625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0302, |
|
"num_tokens": 10526192.0, |
|
"reward": 0.6208881437778473, |
|
"reward_std": 0.12499829288572073, |
|
"rewards/curriculum_aware_reward_fn": 0.06620065588504076, |
|
"rewards/format_reward": 0.5546875, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 566.84375, |
|
"epoch": 1.403361344537815, |
|
"grad_norm": 0.4760180711746216, |
|
"kl": 0.0066986083984375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0084, |
|
"num_tokens": 10657412.0, |
|
"reward": 0.46916117519140244, |
|
"reward_std": 0.10547287575900555, |
|
"rewards/curriculum_aware_reward_fn": 0.09416118310764432, |
|
"rewards/format_reward": 0.375, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 560.7734375, |
|
"epoch": 1.4201680672268908, |
|
"grad_norm": 0.27778276801109314, |
|
"kl": 0.005718231201171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0127, |
|
"num_tokens": 10788255.0, |
|
"reward": 0.44736841320991516, |
|
"reward_std": 0.06990169547498226, |
|
"rewards/curriculum_aware_reward_fn": 0.07236842112615705, |
|
"rewards/format_reward": 0.375, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 484.9296875, |
|
"epoch": 1.4369747899159664, |
|
"grad_norm": 0.34481725096702576, |
|
"kl": 0.02048492431640625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0032, |
|
"num_tokens": 10911166.0, |
|
"reward": 0.7388980239629745, |
|
"reward_std": 0.08143611438572407, |
|
"rewards/curriculum_aware_reward_fn": 0.1138980237301439, |
|
"rewards/format_reward": 0.625, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 457.71875, |
|
"epoch": 1.453781512605042, |
|
"grad_norm": 0.4829816222190857, |
|
"kl": 0.0100555419921875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0016, |
|
"num_tokens": 11027554.0, |
|
"reward": 0.6735197305679321, |
|
"reward_std": 0.08864451944828033, |
|
"rewards/curriculum_aware_reward_fn": 0.11101973801851273, |
|
"rewards/format_reward": 0.5625, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 508.6953125, |
|
"epoch": 1.4705882352941178, |
|
"grad_norm": 0.5016542077064514, |
|
"kl": 0.00922393798828125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.01, |
|
"num_tokens": 11149275.0, |
|
"reward": 0.6870888322591782, |
|
"reward_std": 0.08495050063356757, |
|
"rewards/curriculum_aware_reward_fn": 0.12458881677594036, |
|
"rewards/format_reward": 0.5625, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 602.9921875, |
|
"epoch": 1.4873949579831933, |
|
"grad_norm": 0.29301658272743225, |
|
"kl": 0.004894256591796875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0249, |
|
"num_tokens": 11288106.0, |
|
"reward": 0.29481907188892365, |
|
"reward_std": 0.0620402698405087, |
|
"rewards/curriculum_aware_reward_fn": 0.04481907875742763, |
|
"rewards/format_reward": 0.25, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 438.71875, |
|
"epoch": 1.504201680672269, |
|
"grad_norm": 0.5715950727462769, |
|
"kl": 0.01503753662109375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0041, |
|
"num_tokens": 11401118.0, |
|
"reward": 0.8972039222717285, |
|
"reward_std": 0.10221139155328274, |
|
"rewards/curriculum_aware_reward_fn": 0.1472039446234703, |
|
"rewards/format_reward": 0.75, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 547.015625, |
|
"epoch": 1.5210084033613445, |
|
"grad_norm": 0.31229323148727417, |
|
"kl": 0.0074462890625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0001, |
|
"num_tokens": 11531400.0, |
|
"reward": 0.5945723652839661, |
|
"reward_std": 0.05676991865038872, |
|
"rewards/curriculum_aware_reward_fn": 0.15707236900925636, |
|
"rewards/format_reward": 0.4375, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 599.9375, |
|
"epoch": 1.53781512605042, |
|
"grad_norm": 0.3754754066467285, |
|
"kl": 0.005001068115234375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0337, |
|
"num_tokens": 11667224.0, |
|
"reward": 0.4358552396297455, |
|
"reward_std": 0.1078398427926004, |
|
"rewards/curriculum_aware_reward_fn": 0.060855261399410665, |
|
"rewards/format_reward": 0.375, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 563.3671875, |
|
"epoch": 1.5546218487394958, |
|
"grad_norm": 0.44682905077934265, |
|
"kl": 0.00695037841796875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0261, |
|
"num_tokens": 11800087.0, |
|
"reward": 0.47820721566677094, |
|
"reward_std": 0.11488656094297767, |
|
"rewards/curriculum_aware_reward_fn": 0.10320723743643612, |
|
"rewards/format_reward": 0.375, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 524.7578125, |
|
"epoch": 1.5714285714285714, |
|
"grad_norm": 0.4093223214149475, |
|
"kl": 0.0079803466796875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0003, |
|
"num_tokens": 11927808.0, |
|
"reward": 0.5555098727345467, |
|
"reward_std": 0.10677139926701784, |
|
"rewards/curriculum_aware_reward_fn": 0.06332236900925636, |
|
"rewards/format_reward": 0.4921875, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 612.375, |
|
"epoch": 1.5882352941176472, |
|
"grad_norm": 0.28754857182502747, |
|
"kl": 0.004489898681640625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0277, |
|
"num_tokens": 12069560.0, |
|
"reward": 0.3371710553765297, |
|
"reward_std": 0.050214093178510666, |
|
"rewards/curriculum_aware_reward_fn": 0.02467105258256197, |
|
"rewards/format_reward": 0.3125, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 383.2578125, |
|
"epoch": 1.6050420168067228, |
|
"grad_norm": 0.47502318024635315, |
|
"kl": 0.0126953125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.016, |
|
"num_tokens": 12176625.0, |
|
"reward": 0.7224506512284279, |
|
"reward_std": 0.10677911480888724, |
|
"rewards/curriculum_aware_reward_fn": 0.10526315728202462, |
|
"rewards/format_reward": 0.6171875, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 578.078125, |
|
"epoch": 1.6218487394957983, |
|
"grad_norm": 0.34693828225135803, |
|
"kl": 0.006988525390625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.006, |
|
"num_tokens": 12310939.0, |
|
"reward": 0.5254934206604958, |
|
"reward_std": 0.06210480257868767, |
|
"rewards/curriculum_aware_reward_fn": 0.08799342112615705, |
|
"rewards/format_reward": 0.4375, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 529.828125, |
|
"epoch": 1.638655462184874, |
|
"grad_norm": 2.9580295085906982, |
|
"kl": 0.21123504638671875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0019, |
|
"num_tokens": 12436949.0, |
|
"reward": 0.5230263099074364, |
|
"reward_std": 0.13364601507782936, |
|
"rewards/curriculum_aware_reward_fn": 0.11677631549537182, |
|
"rewards/format_reward": 0.40625, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 470.8828125, |
|
"epoch": 1.6554621848739495, |
|
"grad_norm": 0.39620673656463623, |
|
"kl": 0.00954437255859375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0048, |
|
"num_tokens": 12558190.0, |
|
"reward": 0.8194901347160339, |
|
"reward_std": 0.09049705043435097, |
|
"rewards/curriculum_aware_reward_fn": 0.26480263471603394, |
|
"rewards/format_reward": 0.5546875, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 495.3515625, |
|
"epoch": 1.6722689075630253, |
|
"grad_norm": 0.5109691619873047, |
|
"kl": 0.007015228271484375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0351, |
|
"num_tokens": 12681859.0, |
|
"reward": 0.4362664595246315, |
|
"reward_std": 0.0971333347260952, |
|
"rewards/curriculum_aware_reward_fn": 0.1237664483487606, |
|
"rewards/format_reward": 0.3125, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 478.0703125, |
|
"epoch": 1.6890756302521008, |
|
"grad_norm": 0.4189630150794983, |
|
"kl": 0.0095977783203125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0011, |
|
"num_tokens": 12801148.0, |
|
"reward": 0.6920230239629745, |
|
"reward_std": 0.10883715003728867, |
|
"rewards/curriculum_aware_reward_fn": 0.19983552768826485, |
|
"rewards/format_reward": 0.4921875, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 460.6875, |
|
"epoch": 1.7058823529411766, |
|
"grad_norm": 0.5282026529312134, |
|
"kl": 0.007904052734375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0042, |
|
"num_tokens": 12921692.0, |
|
"reward": 0.3396381661295891, |
|
"reward_std": 0.11080991290509701, |
|
"rewards/curriculum_aware_reward_fn": 0.04276315798051655, |
|
"rewards/format_reward": 0.296875, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 554.40625, |
|
"epoch": 1.7226890756302522, |
|
"grad_norm": 0.5177521109580994, |
|
"kl": 0.01079559326171875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.009, |
|
"num_tokens": 13052136.0, |
|
"reward": 0.36965460516512394, |
|
"reward_std": 0.10201659612357616, |
|
"rewards/curriculum_aware_reward_fn": 0.01809210516512394, |
|
"rewards/format_reward": 0.3515625, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 524.421875, |
|
"epoch": 1.7394957983193278, |
|
"grad_norm": 0.44328662753105164, |
|
"kl": 0.008655548095703125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0114, |
|
"num_tokens": 13178822.0, |
|
"reward": 0.5349506437778473, |
|
"reward_std": 0.12413342297077179, |
|
"rewards/curriculum_aware_reward_fn": 0.058388158679008484, |
|
"rewards/format_reward": 0.4765625, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 446.46875, |
|
"epoch": 1.7563025210084033, |
|
"grad_norm": 0.647972583770752, |
|
"kl": 0.01692962646484375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0047, |
|
"num_tokens": 13297402.0, |
|
"reward": 0.6476151421666145, |
|
"reward_std": 0.22924628667533398, |
|
"rewards/curriculum_aware_reward_fn": 0.07730263285338879, |
|
"rewards/format_reward": 0.5703125, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 503.703125, |
|
"epoch": 1.773109243697479, |
|
"grad_norm": 0.631151556968689, |
|
"kl": 0.008514404296875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0008, |
|
"num_tokens": 13418340.0, |
|
"reward": 0.46299341320991516, |
|
"reward_std": 0.2022387906908989, |
|
"rewards/curriculum_aware_reward_fn": 0.06455592066049576, |
|
"rewards/format_reward": 0.3984375, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 562.03125, |
|
"epoch": 1.7899159663865545, |
|
"grad_norm": 0.3566150963306427, |
|
"kl": 0.006641387939453125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0002, |
|
"num_tokens": 13550952.0, |
|
"reward": 0.35773025802336633, |
|
"reward_std": 0.09330996312201023, |
|
"rewards/curriculum_aware_reward_fn": 0.05304276151582599, |
|
"rewards/format_reward": 0.3046875, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 539.1796875, |
|
"epoch": 1.8067226890756303, |
|
"grad_norm": 0.4120214581489563, |
|
"kl": 0.00933074951171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0026, |
|
"num_tokens": 13678999.0, |
|
"reward": 0.5435855314135551, |
|
"reward_std": 0.15557273291051388, |
|
"rewards/curriculum_aware_reward_fn": 0.12171052396297455, |
|
"rewards/format_reward": 0.421875, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 542.1796875, |
|
"epoch": 1.8235294117647058, |
|
"grad_norm": 0.36332470178604126, |
|
"kl": 0.00751495361328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0098, |
|
"num_tokens": 13811206.0, |
|
"reward": 0.48643091320991516, |
|
"reward_std": 0.13410842791199684, |
|
"rewards/curriculum_aware_reward_fn": 0.11924342392012477, |
|
"rewards/format_reward": 0.3671875, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 553.953125, |
|
"epoch": 1.8403361344537816, |
|
"grad_norm": 0.3152480721473694, |
|
"kl": 0.00626373291015625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0016, |
|
"num_tokens": 13945808.0, |
|
"reward": 0.3293585479259491, |
|
"reward_std": 0.045257058925926685, |
|
"rewards/curriculum_aware_reward_fn": 0.016858553048223257, |
|
"rewards/format_reward": 0.3125, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 573.59375, |
|
"epoch": 1.8571428571428572, |
|
"grad_norm": 0.2340080589056015, |
|
"kl": 0.00682830810546875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0071, |
|
"num_tokens": 14080460.0, |
|
"reward": 0.3347039520740509, |
|
"reward_std": 0.038679007440805435, |
|
"rewards/curriculum_aware_reward_fn": 0.02220394741743803, |
|
"rewards/format_reward": 0.3125, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 463.0390625, |
|
"epoch": 1.8739495798319328, |
|
"grad_norm": 0.36526933312416077, |
|
"kl": 0.009578704833984375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0039, |
|
"num_tokens": 14201065.0, |
|
"reward": 0.6328125149011612, |
|
"reward_std": 0.05027205403894186, |
|
"rewards/curriculum_aware_reward_fn": 0.07031250046566129, |
|
"rewards/format_reward": 0.5625, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 481.9609375, |
|
"epoch": 1.8907563025210083, |
|
"grad_norm": 0.4954119324684143, |
|
"kl": 0.0100555419921875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0005, |
|
"num_tokens": 14323068.0, |
|
"reward": 0.5254934206604958, |
|
"reward_std": 0.12779070809483528, |
|
"rewards/curriculum_aware_reward_fn": 0.08799342159181833, |
|
"rewards/format_reward": 0.4375, |
|
"step": 112 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 494.96875, |
|
"epoch": 1.907563025210084, |
|
"grad_norm": 0.46778982877731323, |
|
"kl": 0.00978851318359375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0199, |
|
"num_tokens": 14447008.0, |
|
"reward": 0.5370065793395042, |
|
"reward_std": 0.1048955712467432, |
|
"rewards/curriculum_aware_reward_fn": 0.09950657980516553, |
|
"rewards/format_reward": 0.4375, |
|
"step": 113 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 501.6796875, |
|
"epoch": 1.9243697478991597, |
|
"grad_norm": 0.3055194616317749, |
|
"kl": 0.00933074951171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0032, |
|
"num_tokens": 14571103.0, |
|
"reward": 0.5111019909381866, |
|
"reward_std": 0.024554526433348656, |
|
"rewards/curriculum_aware_reward_fn": 0.08141447440721095, |
|
"rewards/format_reward": 0.4296875, |
|
"step": 114 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 508.8203125, |
|
"epoch": 1.9411764705882353, |
|
"grad_norm": 0.4632183611392975, |
|
"kl": 0.012451171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0157, |
|
"num_tokens": 14694424.0, |
|
"reward": 0.6089638024568558, |
|
"reward_std": 0.10860061645507812, |
|
"rewards/curriculum_aware_reward_fn": 0.11677631549537182, |
|
"rewards/format_reward": 0.4921875, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 495.6875, |
|
"epoch": 1.957983193277311, |
|
"grad_norm": 0.41369161009788513, |
|
"kl": 0.0089874267578125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0212, |
|
"num_tokens": 14819792.0, |
|
"reward": 0.4621710479259491, |
|
"reward_std": 0.07010683044791222, |
|
"rewards/curriculum_aware_reward_fn": 0.0871710479259491, |
|
"rewards/format_reward": 0.375, |
|
"step": 116 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 529.359375, |
|
"epoch": 1.9747899159663866, |
|
"grad_norm": 0.40478190779685974, |
|
"kl": 0.012042999267578125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0388, |
|
"num_tokens": 14946718.0, |
|
"reward": 0.48190788179636, |
|
"reward_std": 0.10751516558229923, |
|
"rewards/curriculum_aware_reward_fn": 0.11472039762884378, |
|
"rewards/format_reward": 0.3671875, |
|
"step": 117 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 493.4881134033203, |
|
"epoch": 1.9915966386554622, |
|
"grad_norm": 0.3562357425689697, |
|
"kl": 0.0123748779296875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0141, |
|
"num_tokens": 15064457.0, |
|
"reward": 0.6706414446234703, |
|
"reward_std": 0.101046122610569, |
|
"rewards/curriculum_aware_reward_fn": 0.05345394788309932, |
|
"rewards/format_reward": 0.6171875, |
|
"step": 118 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 517.7578125, |
|
"epoch": 2.0168067226890756, |
|
"grad_norm": 0.3487071394920349, |
|
"kl": 0.0104217529296875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0163, |
|
"num_tokens": 15191538.0, |
|
"reward": 0.5201480090618134, |
|
"reward_std": 0.04716231161728501, |
|
"rewards/curriculum_aware_reward_fn": 0.02014802652411163, |
|
"rewards/format_reward": 0.5, |
|
"step": 119 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 577.765625, |
|
"epoch": 2.033613445378151, |
|
"grad_norm": 0.35752227902412415, |
|
"kl": 0.008148193359375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0108, |
|
"num_tokens": 15327204.0, |
|
"reward": 0.42763157933950424, |
|
"reward_std": 0.09388388879597187, |
|
"rewards/curriculum_aware_reward_fn": 0.05263157933950424, |
|
"rewards/format_reward": 0.375, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 460.53125, |
|
"epoch": 2.0504201680672267, |
|
"grad_norm": 0.5020465850830078, |
|
"kl": 0.014190673828125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0113, |
|
"num_tokens": 15447608.0, |
|
"reward": 0.693256601691246, |
|
"reward_std": 0.12680460885167122, |
|
"rewards/curriculum_aware_reward_fn": 0.06825657840818167, |
|
"rewards/format_reward": 0.625, |
|
"step": 121 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 526.7890625, |
|
"epoch": 2.0672268907563027, |
|
"grad_norm": 0.33090242743492126, |
|
"kl": 0.00830841064453125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0212, |
|
"num_tokens": 15577021.0, |
|
"reward": 0.3022203971631825, |
|
"reward_std": 0.052566134836524725, |
|
"rewards/curriculum_aware_reward_fn": 0.0522203971631825, |
|
"rewards/format_reward": 0.25, |
|
"step": 122 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 465.390625, |
|
"epoch": 2.0840336134453783, |
|
"grad_norm": 0.25564736127853394, |
|
"kl": 0.018894195556640625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.001, |
|
"num_tokens": 15693543.0, |
|
"reward": 0.5879934281110764, |
|
"reward_std": 0.03513536183163524, |
|
"rewards/curriculum_aware_reward_fn": 0.15830592159181833, |
|
"rewards/format_reward": 0.4296875, |
|
"step": 123 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 438.015625, |
|
"epoch": 2.100840336134454, |
|
"grad_norm": 0.5210288763046265, |
|
"kl": 0.0128936767578125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.038, |
|
"num_tokens": 15805441.0, |
|
"reward": 0.7685032784938812, |
|
"reward_std": 0.15490676742047071, |
|
"rewards/curriculum_aware_reward_fn": 0.20600328128784895, |
|
"rewards/format_reward": 0.5625, |
|
"step": 124 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 419.3515625, |
|
"epoch": 2.1176470588235294, |
|
"grad_norm": 0.48274165391921997, |
|
"kl": 0.01959228515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0226, |
|
"num_tokens": 15913862.0, |
|
"reward": 0.671875, |
|
"reward_std": 0.11604671645909548, |
|
"rewards/curriculum_aware_reward_fn": 0.10937500139698386, |
|
"rewards/format_reward": 0.5625, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 520.8203125, |
|
"epoch": 2.134453781512605, |
|
"grad_norm": 0.35000789165496826, |
|
"kl": 0.0090179443359375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0026, |
|
"num_tokens": 16041007.0, |
|
"reward": 0.49794407607987523, |
|
"reward_std": 0.10071868449449539, |
|
"rewards/curriculum_aware_reward_fn": 0.12294407980516553, |
|
"rewards/format_reward": 0.375, |
|
"step": 126 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 557.6328125, |
|
"epoch": 2.1512605042016806, |
|
"grad_norm": 0.5103374719619751, |
|
"kl": 0.0096435546875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0051, |
|
"num_tokens": 16173728.0, |
|
"reward": 0.45641446858644485, |
|
"reward_std": 0.10976400738582015, |
|
"rewards/curriculum_aware_reward_fn": 0.08141447091475129, |
|
"rewards/format_reward": 0.375, |
|
"step": 127 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 414.9140625, |
|
"epoch": 2.168067226890756, |
|
"grad_norm": 0.43994390964508057, |
|
"kl": 0.014190673828125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0002, |
|
"num_tokens": 16285445.0, |
|
"reward": 0.7236842215061188, |
|
"reward_std": 0.11914092372171581, |
|
"rewards/curriculum_aware_reward_fn": 0.09868421289138496, |
|
"rewards/format_reward": 0.625, |
|
"step": 128 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 567.921875, |
|
"epoch": 2.184873949579832, |
|
"grad_norm": 0.319624662399292, |
|
"kl": 0.0082244873046875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0285, |
|
"num_tokens": 16420019.0, |
|
"reward": 0.4259868264198303, |
|
"reward_std": 0.05608854768797755, |
|
"rewards/curriculum_aware_reward_fn": 0.11348683759570122, |
|
"rewards/format_reward": 0.3125, |
|
"step": 129 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 463.53125, |
|
"epoch": 2.2016806722689077, |
|
"grad_norm": 0.359430193901062, |
|
"kl": 0.014495849609375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0185, |
|
"num_tokens": 16541143.0, |
|
"reward": 0.4699835479259491, |
|
"reward_std": 0.08584295958280563, |
|
"rewards/curriculum_aware_reward_fn": 0.0949835516512394, |
|
"rewards/format_reward": 0.375, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 469.609375, |
|
"epoch": 2.2184873949579833, |
|
"grad_norm": 0.41892191767692566, |
|
"kl": 0.0117034912109375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0365, |
|
"num_tokens": 16662909.0, |
|
"reward": 0.5522204041481018, |
|
"reward_std": 0.0973742357455194, |
|
"rewards/curriculum_aware_reward_fn": 0.052220395184122026, |
|
"rewards/format_reward": 0.5, |
|
"step": 131 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 540.5703125, |
|
"epoch": 2.235294117647059, |
|
"grad_norm": 0.48490580916404724, |
|
"kl": 0.0093231201171875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0203, |
|
"num_tokens": 16795070.0, |
|
"reward": 0.41324013471603394, |
|
"reward_std": 0.08475807495415211, |
|
"rewards/curriculum_aware_reward_fn": 0.038240132853388786, |
|
"rewards/format_reward": 0.375, |
|
"step": 132 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 525.1796875, |
|
"epoch": 2.2521008403361344, |
|
"grad_norm": 0.4449516832828522, |
|
"kl": 0.0105438232421875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0023, |
|
"num_tokens": 16923613.0, |
|
"reward": 0.5604440867900848, |
|
"reward_std": 0.1288975402712822, |
|
"rewards/curriculum_aware_reward_fn": 0.12294407933950424, |
|
"rewards/format_reward": 0.4375, |
|
"step": 133 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 476.3125, |
|
"epoch": 2.26890756302521, |
|
"grad_norm": 0.4340604543685913, |
|
"kl": 0.01129150390625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.028, |
|
"num_tokens": 17045693.0, |
|
"reward": 0.5587993413209915, |
|
"reward_std": 0.09385511744767427, |
|
"rewards/curriculum_aware_reward_fn": 0.058799343183636665, |
|
"rewards/format_reward": 0.5, |
|
"step": 134 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 420.9921875, |
|
"epoch": 2.2857142857142856, |
|
"grad_norm": 0.45602235198020935, |
|
"kl": 0.01416015625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0042, |
|
"num_tokens": 17154012.0, |
|
"reward": 0.7602795735001564, |
|
"reward_std": 0.09590415796265006, |
|
"rewards/curriculum_aware_reward_fn": 0.14309210563078523, |
|
"rewards/format_reward": 0.6171875, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 489.6640625, |
|
"epoch": 2.302521008403361, |
|
"grad_norm": 0.4504002332687378, |
|
"kl": 0.0130157470703125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0126, |
|
"num_tokens": 17274481.0, |
|
"reward": 0.6295230239629745, |
|
"reward_std": 0.15420474018901587, |
|
"rewards/curriculum_aware_reward_fn": 0.13733552629128098, |
|
"rewards/format_reward": 0.4921875, |
|
"step": 136 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 492.3046875, |
|
"epoch": 2.3193277310924367, |
|
"grad_norm": 0.3228984773159027, |
|
"kl": 0.0111846923828125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.004, |
|
"num_tokens": 17399360.0, |
|
"reward": 0.5587993413209915, |
|
"reward_std": 0.0586426155641675, |
|
"rewards/curriculum_aware_reward_fn": 0.05879934271797538, |
|
"rewards/format_reward": 0.5, |
|
"step": 137 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 508.5625, |
|
"epoch": 2.3361344537815127, |
|
"grad_norm": 0.3110595643520355, |
|
"kl": 0.015472412109375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0107, |
|
"num_tokens": 17521248.0, |
|
"reward": 0.546875, |
|
"reward_std": 0.07312605157494545, |
|
"rewards/curriculum_aware_reward_fn": 0.0546875, |
|
"rewards/format_reward": 0.4921875, |
|
"step": 138 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 574.015625, |
|
"epoch": 2.3529411764705883, |
|
"grad_norm": 0.4071909487247467, |
|
"kl": 0.0107421875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0034, |
|
"num_tokens": 17659522.0, |
|
"reward": 0.47450655698776245, |
|
"reward_std": 0.07414581999182701, |
|
"rewards/curriculum_aware_reward_fn": 0.03700657980516553, |
|
"rewards/format_reward": 0.4375, |
|
"step": 139 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 522.5546875, |
|
"epoch": 2.369747899159664, |
|
"grad_norm": 0.34431034326553345, |
|
"kl": 0.00946044921875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0068, |
|
"num_tokens": 17788537.0, |
|
"reward": 0.4099506512284279, |
|
"reward_std": 0.05903024738654494, |
|
"rewards/curriculum_aware_reward_fn": 0.0349506571656093, |
|
"rewards/format_reward": 0.375, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 553.2421875, |
|
"epoch": 2.3865546218487395, |
|
"grad_norm": 0.4213170111179352, |
|
"kl": 0.009979248046875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0132, |
|
"num_tokens": 17918288.0, |
|
"reward": 0.4177631586790085, |
|
"reward_std": 0.08044615527614951, |
|
"rewards/curriculum_aware_reward_fn": 0.042763158096931875, |
|
"rewards/format_reward": 0.375, |
|
"step": 141 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 559.3203125, |
|
"epoch": 2.403361344537815, |
|
"grad_norm": 0.23342828452587128, |
|
"kl": 0.008510589599609375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0035, |
|
"num_tokens": 18052169.0, |
|
"reward": 0.3762335553765297, |
|
"reward_std": 0.03740033693611622, |
|
"rewards/curriculum_aware_reward_fn": 0.07154605258256197, |
|
"rewards/format_reward": 0.3046875, |
|
"step": 142 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 493.078125, |
|
"epoch": 2.4201680672268906, |
|
"grad_norm": 0.4362901449203491, |
|
"kl": 0.012481689453125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0386, |
|
"num_tokens": 18177251.0, |
|
"reward": 0.5805921033024788, |
|
"reward_std": 0.12307591829448938, |
|
"rewards/curriculum_aware_reward_fn": 0.08840460516512394, |
|
"rewards/format_reward": 0.4921875, |
|
"step": 143 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 435.65625, |
|
"epoch": 2.4369747899159666, |
|
"grad_norm": 0.6844424605369568, |
|
"kl": 0.0600128173828125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0209, |
|
"num_tokens": 18292007.0, |
|
"reward": 0.6208881735801697, |
|
"reward_std": 0.15131067298352718, |
|
"rewards/curriculum_aware_reward_fn": 0.12088816147297621, |
|
"rewards/format_reward": 0.5, |
|
"step": 144 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 490.296875, |
|
"epoch": 2.453781512605042, |
|
"grad_norm": 0.30699044466018677, |
|
"kl": 0.010986328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0248, |
|
"num_tokens": 18415301.0, |
|
"reward": 0.49424342066049576, |
|
"reward_std": 0.04014611290767789, |
|
"rewards/curriculum_aware_reward_fn": 0.05674342147540301, |
|
"rewards/format_reward": 0.4375, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 471.984375, |
|
"epoch": 2.4705882352941178, |
|
"grad_norm": 0.403209924697876, |
|
"kl": 0.0122528076171875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0353, |
|
"num_tokens": 18532667.0, |
|
"reward": 0.6027960330247879, |
|
"reward_std": 0.0935791190713644, |
|
"rewards/curriculum_aware_reward_fn": 0.1027960553765297, |
|
"rewards/format_reward": 0.5, |
|
"step": 146 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 422.703125, |
|
"epoch": 2.4873949579831933, |
|
"grad_norm": 0.42733973264694214, |
|
"kl": 0.0163116455078125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.002, |
|
"num_tokens": 18645941.0, |
|
"reward": 0.7845394462347031, |
|
"reward_std": 0.0871797576546669, |
|
"rewards/curriculum_aware_reward_fn": 0.0345394741743803, |
|
"rewards/format_reward": 0.75, |
|
"step": 147 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 548.828125, |
|
"epoch": 2.504201680672269, |
|
"grad_norm": 0.2545667290687561, |
|
"kl": 0.01213836669921875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0089, |
|
"num_tokens": 18774111.0, |
|
"reward": 0.539473682641983, |
|
"reward_std": 0.060992954298853874, |
|
"rewards/curriculum_aware_reward_fn": 0.10197368077933788, |
|
"rewards/format_reward": 0.4375, |
|
"step": 148 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 540.0703125, |
|
"epoch": 2.5210084033613445, |
|
"grad_norm": 0.3914143145084381, |
|
"kl": 0.00789642333984375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0052, |
|
"num_tokens": 18904992.0, |
|
"reward": 0.27878287341445684, |
|
"reward_std": 0.06910991575568914, |
|
"rewards/curriculum_aware_reward_fn": 0.02878289413638413, |
|
"rewards/format_reward": 0.25, |
|
"step": 149 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 550.9375, |
|
"epoch": 2.53781512605042, |
|
"grad_norm": 0.2912365794181824, |
|
"kl": 0.00799560546875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0067, |
|
"num_tokens": 19035968.0, |
|
"reward": 0.3215460553765297, |
|
"reward_std": 0.01937512680888176, |
|
"rewards/curriculum_aware_reward_fn": 0.07154605211690068, |
|
"rewards/format_reward": 0.25, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 471.1640625, |
|
"epoch": 2.5546218487394956, |
|
"grad_norm": 0.3965752124786377, |
|
"kl": 0.01221466064453125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0062, |
|
"num_tokens": 19153861.0, |
|
"reward": 0.582648016512394, |
|
"reward_std": 0.08400850929319859, |
|
"rewards/curriculum_aware_reward_fn": 0.08264802652411163, |
|
"rewards/format_reward": 0.5, |
|
"step": 151 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 421.234375, |
|
"epoch": 2.571428571428571, |
|
"grad_norm": 0.6044662594795227, |
|
"kl": 0.026885986328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0165, |
|
"num_tokens": 19265379.0, |
|
"reward": 0.8112664222717285, |
|
"reward_std": 0.1459241509437561, |
|
"rewards/curriculum_aware_reward_fn": 0.19407895021140575, |
|
"rewards/format_reward": 0.6171875, |
|
"step": 152 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 546.8671875, |
|
"epoch": 2.588235294117647, |
|
"grad_norm": 0.4222107231616974, |
|
"kl": 0.01050567626953125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0261, |
|
"num_tokens": 19396626.0, |
|
"reward": 0.38733551651239395, |
|
"reward_std": 0.06776260398328304, |
|
"rewards/curriculum_aware_reward_fn": 0.02014802605845034, |
|
"rewards/format_reward": 0.3671875, |
|
"step": 153 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 548.875, |
|
"epoch": 2.6050420168067228, |
|
"grad_norm": 0.30043891072273254, |
|
"kl": 0.010498046875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0291, |
|
"num_tokens": 19531202.0, |
|
"reward": 0.28166119009256363, |
|
"reward_std": 0.07623977493494749, |
|
"rewards/curriculum_aware_reward_fn": 0.031661184038966894, |
|
"rewards/format_reward": 0.25, |
|
"step": 154 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 560.640625, |
|
"epoch": 2.6218487394957983, |
|
"grad_norm": 0.39753058552742004, |
|
"kl": 0.0109710693359375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0213, |
|
"num_tokens": 19665404.0, |
|
"reward": 0.5197368338704109, |
|
"reward_std": 0.08217737264931202, |
|
"rewards/curriculum_aware_reward_fn": 0.08223684225231409, |
|
"rewards/format_reward": 0.4375, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 481.6640625, |
|
"epoch": 2.638655462184874, |
|
"grad_norm": 0.39810478687286377, |
|
"kl": 0.009063720703125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0044, |
|
"num_tokens": 19787409.0, |
|
"reward": 0.44202302396297455, |
|
"reward_std": 0.08141717128455639, |
|
"rewards/curriculum_aware_reward_fn": 0.12952302768826485, |
|
"rewards/format_reward": 0.3125, |
|
"step": 156 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 389.7890625, |
|
"epoch": 2.6554621848739495, |
|
"grad_norm": 0.4911426305770874, |
|
"kl": 0.02197265625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0028, |
|
"num_tokens": 19896190.0, |
|
"reward": 0.7331414520740509, |
|
"reward_std": 0.17763726785779, |
|
"rewards/curriculum_aware_reward_fn": 0.1784539483487606, |
|
"rewards/format_reward": 0.5546875, |
|
"step": 157 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 518.9609375, |
|
"epoch": 2.6722689075630255, |
|
"grad_norm": 0.2420579046010971, |
|
"kl": 0.011962890625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0303, |
|
"num_tokens": 20022809.0, |
|
"reward": 0.4453125, |
|
"reward_std": 0.01647413382306695, |
|
"rewards/curriculum_aware_reward_fn": 0.007812500232830644, |
|
"rewards/format_reward": 0.4375, |
|
"step": 158 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 424.4453125, |
|
"epoch": 2.689075630252101, |
|
"grad_norm": 0.46578091382980347, |
|
"kl": 0.01375579833984375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0011, |
|
"num_tokens": 20136314.0, |
|
"reward": 0.49095392785966396, |
|
"reward_std": 0.13701673224568367, |
|
"rewards/curriculum_aware_reward_fn": 0.06126644625328481, |
|
"rewards/format_reward": 0.4296875, |
|
"step": 159 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 483.921875, |
|
"epoch": 2.7058823529411766, |
|
"grad_norm": 0.32379522919654846, |
|
"kl": 0.01180267333984375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0065, |
|
"num_tokens": 20257344.0, |
|
"reward": 0.5197368343360722, |
|
"reward_std": 0.07396957790479064, |
|
"rewards/curriculum_aware_reward_fn": 0.0822368417866528, |
|
"rewards/format_reward": 0.4375, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 470.3515625, |
|
"epoch": 2.722689075630252, |
|
"grad_norm": 0.4478832483291626, |
|
"kl": 0.014068603515625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0003, |
|
"num_tokens": 20375685.0, |
|
"reward": 0.5801809281110764, |
|
"reward_std": 0.06543473433703184, |
|
"rewards/curriculum_aware_reward_fn": 0.08018092112615705, |
|
"rewards/format_reward": 0.5, |
|
"step": 161 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 462.3046875, |
|
"epoch": 2.7394957983193278, |
|
"grad_norm": 0.4915456175804138, |
|
"kl": 0.0140838623046875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0286, |
|
"num_tokens": 20491340.0, |
|
"reward": 0.6981907933950424, |
|
"reward_std": 0.1432387800887227, |
|
"rewards/curriculum_aware_reward_fn": 0.13569078594446182, |
|
"rewards/format_reward": 0.5625, |
|
"step": 162 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 460.046875, |
|
"epoch": 2.7563025210084033, |
|
"grad_norm": 0.388621062040329, |
|
"kl": 0.0123138427734375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0144, |
|
"num_tokens": 20613466.0, |
|
"reward": 0.4124177619814873, |
|
"reward_std": 0.07370226783677936, |
|
"rewards/curriculum_aware_reward_fn": 0.037417763262055814, |
|
"rewards/format_reward": 0.375, |
|
"step": 163 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 462.25, |
|
"epoch": 2.773109243697479, |
|
"grad_norm": 0.4878624677658081, |
|
"kl": 0.01593017578125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0006, |
|
"num_tokens": 20729058.0, |
|
"reward": 0.6221217066049576, |
|
"reward_std": 0.12872529029846191, |
|
"rewards/curriculum_aware_reward_fn": 0.12212171033024788, |
|
"rewards/format_reward": 0.5, |
|
"step": 164 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 486.4609375, |
|
"epoch": 2.7899159663865545, |
|
"grad_norm": 0.4500262141227722, |
|
"kl": 0.0099029541015625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0219, |
|
"num_tokens": 20853869.0, |
|
"reward": 0.4050164371728897, |
|
"reward_std": 0.11422262340784073, |
|
"rewards/curriculum_aware_reward_fn": 0.09251644648611546, |
|
"rewards/format_reward": 0.3125, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 448.5859375, |
|
"epoch": 2.80672268907563, |
|
"grad_norm": 0.5006850957870483, |
|
"kl": 0.0168914794921875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0135, |
|
"num_tokens": 20973736.0, |
|
"reward": 0.677631601691246, |
|
"reward_std": 0.0868874522857368, |
|
"rewards/curriculum_aware_reward_fn": 0.12294407980516553, |
|
"rewards/format_reward": 0.5546875, |
|
"step": 166 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 428.7265625, |
|
"epoch": 2.8235294117647056, |
|
"grad_norm": 0.42931458353996277, |
|
"kl": 0.01781463623046875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0042, |
|
"num_tokens": 21086485.0, |
|
"reward": 0.6040295884013176, |
|
"reward_std": 0.05929867131635547, |
|
"rewards/curriculum_aware_reward_fn": 0.041529607493430376, |
|
"rewards/format_reward": 0.5625, |
|
"step": 167 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 388.421875, |
|
"epoch": 2.8403361344537816, |
|
"grad_norm": 0.44046640396118164, |
|
"kl": 0.0161895751953125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.01, |
|
"num_tokens": 21193627.0, |
|
"reward": 0.7483552545309067, |
|
"reward_std": 0.09682157123461366, |
|
"rewards/curriculum_aware_reward_fn": 0.060855262679979205, |
|
"rewards/format_reward": 0.6875, |
|
"step": 168 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 477.6953125, |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 0.36667370796203613, |
|
"kl": 0.0146484375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0002, |
|
"num_tokens": 21313716.0, |
|
"reward": 0.6060855239629745, |
|
"reward_std": 0.10079656913876534, |
|
"rewards/curriculum_aware_reward_fn": 0.11389802675694227, |
|
"rewards/format_reward": 0.4921875, |
|
"step": 169 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 534.4296875, |
|
"epoch": 2.8739495798319328, |
|
"grad_norm": 0.3436344563961029, |
|
"kl": 0.00984954833984375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0022, |
|
"num_tokens": 21445667.0, |
|
"reward": 0.48231907933950424, |
|
"reward_std": 0.08960662921890616, |
|
"rewards/curriculum_aware_reward_fn": 0.10731907980516553, |
|
"rewards/format_reward": 0.375, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 496.21875, |
|
"epoch": 2.8907563025210083, |
|
"grad_norm": 0.48088422417640686, |
|
"kl": 0.0130767822265625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0182, |
|
"num_tokens": 21570871.0, |
|
"reward": 0.4465460618957877, |
|
"reward_std": 0.1538134217262268, |
|
"rewards/curriculum_aware_reward_fn": 0.13404605071991682, |
|
"rewards/format_reward": 0.3125, |
|
"step": 171 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 429.484375, |
|
"epoch": 2.907563025210084, |
|
"grad_norm": 0.5827536582946777, |
|
"kl": 0.016109466552734375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0127, |
|
"num_tokens": 21686093.0, |
|
"reward": 0.4502467103302479, |
|
"reward_std": 0.15407454315572977, |
|
"rewards/curriculum_aware_reward_fn": 0.09087171172723174, |
|
"rewards/format_reward": 0.359375, |
|
"step": 172 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 482.4609375, |
|
"epoch": 2.92436974789916, |
|
"grad_norm": 0.467061311006546, |
|
"kl": 0.013336181640625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.033, |
|
"num_tokens": 21808264.0, |
|
"reward": 0.6632401347160339, |
|
"reward_std": 0.10484125558286905, |
|
"rewards/curriculum_aware_reward_fn": 0.22574013099074364, |
|
"rewards/format_reward": 0.4375, |
|
"step": 173 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 500.96875, |
|
"epoch": 2.9411764705882355, |
|
"grad_norm": 0.41948550939559937, |
|
"kl": 0.009563446044921875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0329, |
|
"num_tokens": 21933084.0, |
|
"reward": 0.400082241743803, |
|
"reward_std": 0.10662292037159204, |
|
"rewards/curriculum_aware_reward_fn": 0.04070723708719015, |
|
"rewards/format_reward": 0.359375, |
|
"step": 174 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 557.640625, |
|
"epoch": 2.957983193277311, |
|
"grad_norm": 0.41708114743232727, |
|
"kl": 0.007190704345703125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.021, |
|
"num_tokens": 22068550.0, |
|
"reward": 0.3005756618222222, |
|
"reward_std": 0.06424513552337885, |
|
"rewards/curriculum_aware_reward_fn": 0.050575657514855266, |
|
"rewards/format_reward": 0.25, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 482.421875, |
|
"epoch": 2.9747899159663866, |
|
"grad_norm": 0.6009016633033752, |
|
"kl": 0.013702392578125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0048, |
|
"num_tokens": 22189356.0, |
|
"reward": 0.6620065569877625, |
|
"reward_std": 0.149446252733469, |
|
"rewards/curriculum_aware_reward_fn": 0.16200657933950424, |
|
"rewards/format_reward": 0.5, |
|
"step": 176 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 490.53572845458984, |
|
"epoch": 2.991596638655462, |
|
"grad_norm": 0.49134695529937744, |
|
"kl": 0.01397705078125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0025, |
|
"num_tokens": 22309028.0, |
|
"reward": 0.6726973652839661, |
|
"reward_std": 0.14456172287464142, |
|
"rewards/curriculum_aware_reward_fn": 0.1101973676122725, |
|
"rewards/format_reward": 0.5625, |
|
"step": 177 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 358.109375, |
|
"epoch": 3.0168067226890756, |
|
"grad_norm": 0.5925723314285278, |
|
"kl": 0.0204315185546875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0117, |
|
"num_tokens": 22410794.0, |
|
"reward": 0.87787826359272, |
|
"reward_std": 0.1721474528312683, |
|
"rewards/curriculum_aware_reward_fn": 0.19819078594446182, |
|
"rewards/format_reward": 0.6796875, |
|
"step": 178 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 523.0078125, |
|
"epoch": 3.033613445378151, |
|
"grad_norm": 0.2975535988807678, |
|
"kl": 0.01165771484375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0697, |
|
"num_tokens": 22539299.0, |
|
"reward": 0.5168585479259491, |
|
"reward_std": 0.048361226450651884, |
|
"rewards/curriculum_aware_reward_fn": 0.08717105106916279, |
|
"rewards/format_reward": 0.4296875, |
|
"step": 179 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 484.7578125, |
|
"epoch": 3.0504201680672267, |
|
"grad_norm": 0.45362988114356995, |
|
"kl": 0.0162200927734375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0023, |
|
"num_tokens": 22660588.0, |
|
"reward": 0.5513980276882648, |
|
"reward_std": 0.1047646040096879, |
|
"rewards/curriculum_aware_reward_fn": 0.06702302722260356, |
|
"rewards/format_reward": 0.484375, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 452.9375, |
|
"epoch": 3.0672268907563027, |
|
"grad_norm": 0.5003635883331299, |
|
"kl": 0.0143890380859375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0104, |
|
"num_tokens": 22778956.0, |
|
"reward": 0.73149673640728, |
|
"reward_std": 0.17891032248735428, |
|
"rewards/curriculum_aware_reward_fn": 0.23149671405553818, |
|
"rewards/format_reward": 0.5, |
|
"step": 181 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 513.0390625, |
|
"epoch": 3.0840336134453783, |
|
"grad_norm": 0.31615540385246277, |
|
"kl": 0.01172637939453125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0235, |
|
"num_tokens": 22905121.0, |
|
"reward": 0.3244243338704109, |
|
"reward_std": 0.03051401791162789, |
|
"rewards/curriculum_aware_reward_fn": 0.011924341786652803, |
|
"rewards/format_reward": 0.3125, |
|
"step": 182 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 411.6015625, |
|
"epoch": 3.100840336134454, |
|
"grad_norm": 0.4836508631706238, |
|
"kl": 0.0144195556640625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0043, |
|
"num_tokens": 23019342.0, |
|
"reward": 0.5826480239629745, |
|
"reward_std": 0.11801502481102943, |
|
"rewards/curriculum_aware_reward_fn": 0.07483552675694227, |
|
"rewards/format_reward": 0.5078125, |
|
"step": 183 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 416.921875, |
|
"epoch": 3.1176470588235294, |
|
"grad_norm": 0.3468119204044342, |
|
"kl": 0.01403045654296875, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0007, |
|
"num_tokens": 23137316.0, |
|
"reward": 0.47574012295808643, |
|
"reward_std": 0.05907326890155673, |
|
"rewards/curriculum_aware_reward_fn": 0.10074013040866703, |
|
"rewards/format_reward": 0.375, |
|
"step": 184 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 414.359375, |
|
"epoch": 3.134453781512605, |
|
"grad_norm": 0.4667985439300537, |
|
"kl": 0.0151519775390625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0299, |
|
"num_tokens": 23249858.0, |
|
"reward": 0.6344572305679321, |
|
"reward_std": 0.15162191167473793, |
|
"rewards/curriculum_aware_reward_fn": 0.13445723662152886, |
|
"rewards/format_reward": 0.5, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 482.109375, |
|
"epoch": 3.1512605042016806, |
|
"grad_norm": 0.4111727774143219, |
|
"kl": 0.013458251953125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0121, |
|
"num_tokens": 23370304.0, |
|
"reward": 0.4806743413209915, |
|
"reward_std": 0.052865433506667614, |
|
"rewards/curriculum_aware_reward_fn": 0.0431743401568383, |
|
"rewards/format_reward": 0.4375, |
|
"step": 186 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 500.5234375, |
|
"epoch": 3.168067226890756, |
|
"grad_norm": 0.4427432715892792, |
|
"kl": 0.01395416259765625, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0198, |
|
"num_tokens": 23496979.0, |
|
"reward": 0.4243420949205756, |
|
"reward_std": 0.07115951599553227, |
|
"rewards/curriculum_aware_reward_fn": 0.11184210795909166, |
|
"rewards/format_reward": 0.3125, |
|
"step": 187 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 450.671875, |
|
"epoch": 3.184873949579832, |
|
"grad_norm": 0.4217956066131592, |
|
"kl": 0.0164947509765625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0361, |
|
"num_tokens": 23613281.0, |
|
"reward": 0.5629111751914024, |
|
"reward_std": 0.07686262531206012, |
|
"rewards/curriculum_aware_reward_fn": 0.12541118264198303, |
|
"rewards/format_reward": 0.4375, |
|
"step": 188 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 399.21875, |
|
"epoch": 3.2016806722689077, |
|
"grad_norm": 0.6111953258514404, |
|
"kl": 0.017852783203125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0248, |
|
"num_tokens": 23723725.0, |
|
"reward": 0.7121710479259491, |
|
"reward_std": 0.15234812535345554, |
|
"rewards/curriculum_aware_reward_fn": 0.08717105351388454, |
|
"rewards/format_reward": 0.625, |
|
"step": 189 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 433.015625, |
|
"epoch": 3.2184873949579833, |
|
"grad_norm": 0.4865033030509949, |
|
"kl": 0.0166778564453125, |
|
"learning_rate": 1e-06, |
|
"loss": -0.009, |
|
"num_tokens": 23835815.0, |
|
"reward": 0.7372532933950424, |
|
"reward_std": 0.13220055866986513, |
|
"rewards/curriculum_aware_reward_fn": 0.17475328966975212, |
|
"rewards/format_reward": 0.5625, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 525.4765625, |
|
"epoch": 3.235294117647059, |
|
"grad_norm": 0.3422640562057495, |
|
"kl": 0.016204833984375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0136, |
|
"num_tokens": 23964596.0, |
|
"reward": 0.43174342811107635, |
|
"reward_std": 0.09182633552700281, |
|
"rewards/curriculum_aware_reward_fn": 0.05674342066049576, |
|
"rewards/format_reward": 0.375, |
|
"step": 191 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 351.4453125, |
|
"epoch": 3.2521008403361344, |
|
"grad_norm": 0.5189781785011292, |
|
"kl": 0.023193359375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0349, |
|
"num_tokens": 24067141.0, |
|
"reward": 0.7643914222717285, |
|
"reward_std": 0.15736807510256767, |
|
"rewards/curriculum_aware_reward_fn": 0.1940789488144219, |
|
"rewards/format_reward": 0.5703125, |
|
"step": 192 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 460.296875, |
|
"epoch": 3.26890756302521, |
|
"grad_norm": 0.36804094910621643, |
|
"kl": 0.012298583984375, |
|
"learning_rate": 1e-06, |
|
"loss": -0.0044, |
|
"num_tokens": 24187067.0, |
|
"reward": 0.48273026943206787, |
|
"reward_std": 0.037970013450831175, |
|
"rewards/curriculum_aware_reward_fn": 0.10773026570677757, |
|
"rewards/format_reward": 0.375, |
|
"step": 193 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 418.078125, |
|
"epoch": 3.2857142857142856, |
|
"grad_norm": 0.4727684259414673, |
|
"kl": 0.01959228515625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0229, |
|
"num_tokens": 24300605.0, |
|
"reward": 0.5563322491943836, |
|
"reward_std": 0.06251880899071693, |
|
"rewards/curriculum_aware_reward_fn": 0.13445723743643612, |
|
"rewards/format_reward": 0.421875, |
|
"step": 194 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 498.9765625, |
|
"epoch": 3.302521008403361, |
|
"grad_norm": 0.5195404887199402, |
|
"kl": 0.01263427734375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0303, |
|
"num_tokens": 24427482.0, |
|
"reward": 0.38569077104330063, |
|
"reward_std": 0.10553359193727374, |
|
"rewards/curriculum_aware_reward_fn": 0.07319078966975212, |
|
"rewards/format_reward": 0.3125, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 448.625, |
|
"epoch": 3.3193277310924367, |
|
"grad_norm": 0.49932360649108887, |
|
"kl": 0.017852783203125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0397, |
|
"num_tokens": 24541746.0, |
|
"reward": 0.5193256512284279, |
|
"reward_std": 0.10704736225306988, |
|
"rewards/curriculum_aware_reward_fn": 0.08182565891183913, |
|
"rewards/format_reward": 0.4375, |
|
"step": 196 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 377.1015625, |
|
"epoch": 3.3361344537815127, |
|
"grad_norm": 0.4484708309173584, |
|
"kl": 0.01934814453125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0074, |
|
"num_tokens": 24647103.0, |
|
"reward": 0.6673519611358643, |
|
"reward_std": 0.06431722524575889, |
|
"rewards/curriculum_aware_reward_fn": 0.16735197603702545, |
|
"rewards/format_reward": 0.5, |
|
"step": 197 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 483.921875, |
|
"epoch": 3.3529411764705883, |
|
"grad_norm": 0.41696909070014954, |
|
"kl": 0.01053619384765625, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0158, |
|
"num_tokens": 24773253.0, |
|
"reward": 0.2717927638441324, |
|
"reward_std": 0.11790546495467424, |
|
"rewards/curriculum_aware_reward_fn": 0.08429276384413242, |
|
"rewards/format_reward": 0.1875, |
|
"step": 198 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 380.0546875, |
|
"epoch": 3.369747899159664, |
|
"grad_norm": 0.45737817883491516, |
|
"kl": 0.022186279296875, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0056, |
|
"num_tokens": 24881188.0, |
|
"reward": 0.7002467140555382, |
|
"reward_std": 0.03508220613002777, |
|
"rewards/curriculum_aware_reward_fn": 0.1377467131242156, |
|
"rewards/format_reward": 0.5625, |
|
"step": 199 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 390.3671875, |
|
"epoch": 3.3865546218487395, |
|
"grad_norm": 0.5029156804084778, |
|
"kl": 0.0277099609375, |
|
"learning_rate": 1e-06, |
|
"loss": 0.004, |
|
"num_tokens": 24991955.0, |
|
"reward": 0.6694078892469406, |
|
"reward_std": 0.10574874095618725, |
|
"rewards/curriculum_aware_reward_fn": 0.10690789762884378, |
|
"rewards/format_reward": 0.5625, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.3865546218487395, |
|
"step": 200, |
|
"total_flos": 0.0, |
|
"train_loss": 0.010024200768093579, |
|
"train_runtime": 35564.3846, |
|
"train_samples_per_second": 0.72, |
|
"train_steps_per_second": 0.006 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|