{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.3865546218487395, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 635.7109375, "epoch": 0.01680672268907563, "grad_norm": 0.31708475947380066, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0099, "num_tokens": 143267.0, "reward": 0.039062500349245965, "reward_std": 0.0698821279220283, "rewards/curriculum_aware_reward_fn": 0.023437500349245965, "rewards/format_reward": 0.015625, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 527.6328125, "epoch": 0.03361344537815126, "grad_norm": 0.43825313448905945, "kl": 0.0002913475036621094, "learning_rate": 1e-06, "loss": 0.0432, "num_tokens": 270812.0, "reward": 0.09292763145640492, "reward_std": 0.12866380205377936, "rewards/curriculum_aware_reward_fn": 0.06949013145640492, "rewards/format_reward": 0.0234375, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 608.9921875, "epoch": 0.05042016806722689, "grad_norm": 0.4227641224861145, "kl": 0.0002665519714355469, "learning_rate": 1e-06, "loss": -0.0273, "num_tokens": 410971.0, "reward": 0.059621710795909166, "reward_std": 0.07889116508886218, "rewards/curriculum_aware_reward_fn": 0.059621710795909166, "rewards/format_reward": 0.0, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 558.921875, "epoch": 0.06722689075630252, "grad_norm": 0.4796917140483856, "kl": 0.0002789497375488281, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 542313.0, "reward": 0.08552631549537182, "reward_std": 0.12651031091809273, "rewards/curriculum_aware_reward_fn": 0.06990131689235568, "rewards/format_reward": 0.015625, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 590.265625, "epoch": 0.08403361344537816, "grad_norm": 0.5620821118354797, "kl": 0.0003027915954589844, "learning_rate": 1e-06, "loss": 0.0288, "num_tokens": 677075.0, "reward": 0.14925987273454666, "reward_std": 0.24606542102992535, "rewards/curriculum_aware_reward_fn": 0.09457236900925636, "rewards/format_reward": 0.0546875, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 592.5234375, "epoch": 0.10084033613445378, "grad_norm": 0.4298699200153351, "kl": 0.0003509521484375, "learning_rate": 1e-06, "loss": -0.0151, "num_tokens": 812710.0, "reward": 0.08840460644569248, "reward_std": 0.1141207623295486, "rewards/curriculum_aware_reward_fn": 0.03371710644569248, "rewards/format_reward": 0.0546875, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 582.046875, "epoch": 0.11764705882352941, "grad_norm": 0.526942253112793, "kl": 0.0004343986511230469, "learning_rate": 1e-06, "loss": 0.0192, "num_tokens": 943268.0, "reward": 0.12088815867900848, "reward_std": 0.17540471255779266, "rewards/curriculum_aware_reward_fn": 0.07401315867900848, "rewards/format_reward": 0.046875, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 534.75, "epoch": 0.13445378151260504, "grad_norm": 0.44275274872779846, "kl": 0.0003724098205566406, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 1074300.0, "reward": 0.030016446253284812, "reward_std": 0.08489933330565691, "rewards/curriculum_aware_reward_fn": 0.014391447650268674, "rewards/format_reward": 0.015625, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 539.09375, "epoch": 0.15126050420168066, "grad_norm": 0.5494865775108337, "kl": 0.0007390975952148438, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 1197896.0, "reward": 0.16570723708719015, "reward_std": 0.21696669608354568, "rewards/curriculum_aware_reward_fn": 0.05633223685435951, "rewards/format_reward": 0.109375, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 593.7734375, "epoch": 0.16806722689075632, "grad_norm": 0.5171737670898438, "kl": 0.0006322860717773438, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 1336931.0, "reward": 0.11143092066049576, "reward_std": 0.19064411148428917, "rewards/curriculum_aware_reward_fn": 0.017680921009741724, "rewards/format_reward": 0.09375, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 578.4765625, "epoch": 0.18487394957983194, "grad_norm": 0.6088258028030396, "kl": 0.001346588134765625, "learning_rate": 1e-06, "loss": 0.037, "num_tokens": 1467592.0, "reward": 0.22944078594446182, "reward_std": 0.3224767856299877, "rewards/curriculum_aware_reward_fn": 0.04194079013541341, "rewards/format_reward": 0.1875, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 601.171875, "epoch": 0.20168067226890757, "grad_norm": 0.4451327621936798, "kl": 0.0010366439819335938, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 1607894.0, "reward": 0.1204769799951464, "reward_std": 0.1381341191008687, "rewards/curriculum_aware_reward_fn": 0.018914473825134337, "rewards/format_reward": 0.1015625, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 526.28125, "epoch": 0.2184873949579832, "grad_norm": 0.636314332485199, "kl": 0.00191497802734375, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 1735650.0, "reward": 0.26644736528396606, "reward_std": 0.30141641572117805, "rewards/curriculum_aware_reward_fn": 0.03988486935850233, "rewards/format_reward": 0.2265625, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 507.515625, "epoch": 0.23529411764705882, "grad_norm": 0.6864922642707825, "kl": 0.004413604736328125, "learning_rate": 1e-06, "loss": 0.0802, "num_tokens": 1856316.0, "reward": 0.3112664446234703, "reward_std": 0.31644799932837486, "rewards/curriculum_aware_reward_fn": 0.05345394683536142, "rewards/format_reward": 0.2578125, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 554.5859375, "epoch": 0.25210084033613445, "grad_norm": 0.6268811225891113, "kl": 0.0036067962646484375, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 1987511.0, "reward": 0.4337993338704109, "reward_std": 0.32329631969332695, "rewards/curriculum_aware_reward_fn": 0.050986841320991516, "rewards/format_reward": 0.3828125, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 584.3828125, "epoch": 0.2689075630252101, "grad_norm": 0.5531853437423706, "kl": 0.003597259521484375, "learning_rate": 1e-06, "loss": 0.0104, "num_tokens": 2119768.0, "reward": 0.3828125037252903, "reward_std": 0.26145630702376366, "rewards/curriculum_aware_reward_fn": 0.0546875, "rewards/format_reward": 0.328125, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 481.8046875, "epoch": 0.2857142857142857, "grad_norm": 0.6449251174926758, "kl": 0.005481719970703125, "learning_rate": 1e-06, "loss": -0.0094, "num_tokens": 2238911.0, "reward": 0.4543585404753685, "reward_std": 0.26075971499085426, "rewards/curriculum_aware_reward_fn": 0.05592105304822326, "rewards/format_reward": 0.3984375, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 645.75, "epoch": 0.3025210084033613, "grad_norm": 0.37918156385421753, "kl": 0.001049041748046875, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 2385767.0, "reward": 0.1451480264076963, "reward_std": 0.1290158643387258, "rewards/curriculum_aware_reward_fn": 0.04358552640769631, "rewards/format_reward": 0.1015625, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 617.6953125, "epoch": 0.31932773109243695, "grad_norm": 0.39814478158950806, "kl": 0.00528717041015625, "learning_rate": 1e-06, "loss": 0.0518, "num_tokens": 2525656.0, "reward": 0.35115131735801697, "reward_std": 0.11648409254848957, "rewards/curriculum_aware_reward_fn": 0.02302631549537182, "rewards/format_reward": 0.328125, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 459.6015625, "epoch": 0.33613445378151263, "grad_norm": 0.7307525873184204, "kl": 0.005184173583984375, "learning_rate": 1e-06, "loss": 0.083, "num_tokens": 2644077.0, "reward": 0.47574012726545334, "reward_std": 0.2815094441175461, "rewards/curriculum_aware_reward_fn": 0.04605263099074364, "rewards/format_reward": 0.4296875, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 620.46875, "epoch": 0.35294117647058826, "grad_norm": 0.46509799361228943, "kl": 0.0036363601684570312, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 2786169.0, "reward": 0.24177631677594036, "reward_std": 0.09853590792044997, "rewards/curriculum_aware_reward_fn": 0.023026315728202462, "rewards/format_reward": 0.21875, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 578.9609375, "epoch": 0.3697478991596639, "grad_norm": 0.5765166878700256, "kl": 0.005565643310546875, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 2917180.0, "reward": 0.4958881437778473, "reward_std": 0.10692231869325042, "rewards/curriculum_aware_reward_fn": 0.0740131582133472, "rewards/format_reward": 0.421875, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 579.578125, "epoch": 0.3865546218487395, "grad_norm": 0.5340356826782227, "kl": 0.00540924072265625, "learning_rate": 1e-06, "loss": -0.0083, "num_tokens": 3053414.0, "reward": 0.3708881437778473, "reward_std": 0.11791826784610748, "rewards/curriculum_aware_reward_fn": 0.06620065867900848, "rewards/format_reward": 0.3046875, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 507.5625, "epoch": 0.40336134453781514, "grad_norm": 0.4752294719219208, "kl": 0.031703948974609375, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 3181894.0, "reward": 0.3700657896697521, "reward_std": 0.1367718242108822, "rewards/curriculum_aware_reward_fn": 0.002878289553336799, "rewards/format_reward": 0.3671875, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 496.4296875, "epoch": 0.42016806722689076, "grad_norm": 0.46164318919181824, "kl": 0.0082855224609375, "learning_rate": 1e-06, "loss": -0.0091, "num_tokens": 3304077.0, "reward": 0.5016447380185127, "reward_std": 0.09064025245606899, "rewards/curriculum_aware_reward_fn": 0.017269736621528864, "rewards/format_reward": 0.484375, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 454.5859375, "epoch": 0.4369747899159664, "grad_norm": 0.5706049799919128, "kl": 0.01887798309326172, "learning_rate": 1e-06, "loss": -0.0096, "num_tokens": 3420488.0, "reward": 0.6875, "reward_std": 0.12697386741638184, "rewards/curriculum_aware_reward_fn": 0.0234375, "rewards/format_reward": 0.6640625, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 554.46875, "epoch": 0.453781512605042, "grad_norm": 0.45473384857177734, "kl": 0.0068416595458984375, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 3552340.0, "reward": 0.34868420753628016, "reward_std": 0.10102300066500902, "rewards/curriculum_aware_reward_fn": 0.012746710679493845, "rewards/format_reward": 0.3359375, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 534.0, "epoch": 0.47058823529411764, "grad_norm": 0.348452091217041, "kl": 0.01036834716796875, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 3677892.0, "reward": 0.5571546033024788, "reward_std": 0.055680982768535614, "rewards/curriculum_aware_reward_fn": 0.010279605048708618, "rewards/format_reward": 0.546875, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 584.03125, "epoch": 0.48739495798319327, "grad_norm": 0.452033668756485, "kl": 0.0071258544921875, "learning_rate": 1e-06, "loss": 0.0115, "num_tokens": 3813600.0, "reward": 0.3984375, "reward_std": 0.08443661965429783, "rewards/curriculum_aware_reward_fn": 0.046875, "rewards/format_reward": 0.3515625, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 552.4921875, "epoch": 0.5042016806722689, "grad_norm": 0.4926210641860962, "kl": 0.005392551422119141, "learning_rate": 1e-06, "loss": 0.0285, "num_tokens": 3947807.0, "reward": 0.4683388201519847, "reward_std": 0.11112732999026775, "rewards/curriculum_aware_reward_fn": 0.03865131549537182, "rewards/format_reward": 0.4296875, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 559.6953125, "epoch": 0.5210084033613446, "grad_norm": 0.5463467240333557, "kl": 0.004418373107910156, "learning_rate": 1e-06, "loss": -0.0233, "num_tokens": 4080704.0, "reward": 0.22203946067020297, "reward_std": 0.09257729165256023, "rewards/curriculum_aware_reward_fn": 0.042351973708719015, "rewards/format_reward": 0.1796875, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 549.921875, "epoch": 0.5378151260504201, "grad_norm": 0.36463335156440735, "kl": 0.006511688232421875, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 4214870.0, "reward": 0.4346217215061188, "reward_std": 0.03605314111337066, "rewards/curriculum_aware_reward_fn": 0.004934210563078523, "rewards/format_reward": 0.4296875, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 478.7890625, "epoch": 0.5546218487394958, "grad_norm": 0.5116223692893982, "kl": 0.008532524108886719, "learning_rate": 1e-06, "loss": -0.0153, "num_tokens": 4338203.0, "reward": 0.4560032896697521, "reward_std": 0.12314211018383503, "rewards/curriculum_aware_reward_fn": 0.08881578966975212, "rewards/format_reward": 0.3671875, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 476.7890625, "epoch": 0.5714285714285714, "grad_norm": 0.43187472224235535, "kl": 0.007843017578125, "learning_rate": 1e-06, "loss": 0.0134, "num_tokens": 4461184.0, "reward": 0.4333881586790085, "reward_std": 0.12357822060585022, "rewards/curriculum_aware_reward_fn": 0.02713815774768591, "rewards/format_reward": 0.40625, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 529.7578125, "epoch": 0.5882352941176471, "grad_norm": 0.4466142952442169, "kl": 0.0057315826416015625, "learning_rate": 1e-06, "loss": -0.015, "num_tokens": 4590329.0, "reward": 0.426809199154377, "reward_std": 0.10671343095600605, "rewards/curriculum_aware_reward_fn": 0.059621710097417235, "rewards/format_reward": 0.3671875, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 521.9453125, "epoch": 0.6050420168067226, "grad_norm": 0.5088793635368347, "kl": 0.00739288330078125, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 4717658.0, "reward": 0.5740131624042988, "reward_std": 0.09916227497160435, "rewards/curriculum_aware_reward_fn": 0.08182565809693187, "rewards/format_reward": 0.4921875, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 481.59375, "epoch": 0.6218487394957983, "grad_norm": 0.3755647540092468, "kl": 0.005794525146484375, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 4837174.0, "reward": 0.5123355314135551, "reward_std": 0.023199534974992275, "rewards/curriculum_aware_reward_fn": 0.0748355258256197, "rewards/format_reward": 0.4375, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 465.1953125, "epoch": 0.6386554621848739, "grad_norm": 0.5442925691604614, "kl": 0.008731842041015625, "learning_rate": 1e-06, "loss": -0.0111, "num_tokens": 4953039.0, "reward": 0.7232730239629745, "reward_std": 0.1315580508671701, "rewards/curriculum_aware_reward_fn": 0.16077302338089794, "rewards/format_reward": 0.5625, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 501.484375, "epoch": 0.6554621848739496, "grad_norm": 0.4446295201778412, "kl": 0.00624847412109375, "learning_rate": 1e-06, "loss": 0.0286, "num_tokens": 5076965.0, "reward": 0.47327301651239395, "reward_std": 0.08440816402435303, "rewards/curriculum_aware_reward_fn": 0.09827302861958742, "rewards/format_reward": 0.375, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 539.15625, "epoch": 0.6722689075630253, "grad_norm": 0.37400856614112854, "kl": 0.005260467529296875, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 5207185.0, "reward": 0.4745065679308027, "reward_std": 0.07072597183287144, "rewards/curriculum_aware_reward_fn": 0.09950657887384295, "rewards/format_reward": 0.375, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 492.0859375, "epoch": 0.6890756302521008, "grad_norm": 0.4103780686855316, "kl": 0.00856781005859375, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 5328012.0, "reward": 0.71875, "reward_std": 0.10247145313769579, "rewards/curriculum_aware_reward_fn": 0.09375000419095159, "rewards/format_reward": 0.625, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 405.1328125, "epoch": 0.7058823529411765, "grad_norm": 0.6738374829292297, "kl": 0.0108184814453125, "learning_rate": 1e-06, "loss": 0.0454, "num_tokens": 5438933.0, "reward": 0.757401317358017, "reward_std": 0.212964728474617, "rewards/curriculum_aware_reward_fn": 0.1636513164266944, "rewards/format_reward": 0.59375, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 516.640625, "epoch": 0.7226890756302521, "grad_norm": 0.31194940209388733, "kl": 0.0074005126953125, "learning_rate": 1e-06, "loss": -0.0205, "num_tokens": 5563887.0, "reward": 0.6562500149011612, "reward_std": 0.04224720690399408, "rewards/curriculum_aware_reward_fn": 0.15625, "rewards/format_reward": 0.5, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 477.3984375, "epoch": 0.7394957983193278, "grad_norm": 0.38581541180610657, "kl": 0.00885009765625, "learning_rate": 1e-06, "loss": -0.0164, "num_tokens": 5688114.0, "reward": 0.6402138248085976, "reward_std": 0.08311590366065502, "rewards/curriculum_aware_reward_fn": 0.03083881549537182, "rewards/format_reward": 0.609375, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 521.8046875, "epoch": 0.7563025210084033, "grad_norm": 0.36903509497642517, "kl": 0.0078277587890625, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 5814153.0, "reward": 0.5513980239629745, "reward_std": 0.06967925047501922, "rewards/curriculum_aware_reward_fn": 0.05139802524354309, "rewards/format_reward": 0.5, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 416.4296875, "epoch": 0.773109243697479, "grad_norm": 0.5821658968925476, "kl": 0.0094757080078125, "learning_rate": 1e-06, "loss": -0.0056, "num_tokens": 5923904.0, "reward": 0.7257401347160339, "reward_std": 0.13419464463368058, "rewards/curriculum_aware_reward_fn": 0.10074013192206621, "rewards/format_reward": 0.625, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 526.9296875, "epoch": 0.7899159663865546, "grad_norm": 0.449553519487381, "kl": 0.005664825439453125, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 6053447.0, "reward": 0.4819078892469406, "reward_std": 0.09099963493645191, "rewards/curriculum_aware_reward_fn": 0.10690789669752121, "rewards/format_reward": 0.375, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 536.5, "epoch": 0.8067226890756303, "grad_norm": 0.5381475687026978, "kl": 0.008424758911132812, "learning_rate": 1e-06, "loss": 0.0099, "num_tokens": 6183559.0, "reward": 0.46833881735801697, "reward_std": 0.08668615715578198, "rewards/curriculum_aware_reward_fn": 0.03865131642669439, "rewards/format_reward": 0.4296875, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 539.6328125, "epoch": 0.8235294117647058, "grad_norm": 0.44155657291412354, "kl": 0.0077495574951171875, "learning_rate": 1e-06, "loss": 0.0085, "num_tokens": 6314544.0, "reward": 0.5526315867900848, "reward_std": 0.027912108227610588, "rewards/curriculum_aware_reward_fn": 0.11513157933950424, "rewards/format_reward": 0.4375, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 554.0546875, "epoch": 0.8403361344537815, "grad_norm": 0.4840262532234192, "kl": 0.0054950714111328125, "learning_rate": 1e-06, "loss": 0.0073, "num_tokens": 6445087.0, "reward": 0.33634869009256363, "reward_std": 0.10334387933835387, "rewards/curriculum_aware_reward_fn": 0.03166118450462818, "rewards/format_reward": 0.3046875, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 578.6796875, "epoch": 0.8571428571428571, "grad_norm": 0.30791598558425903, "kl": 0.005002021789550781, "learning_rate": 1e-06, "loss": 0.0068, "num_tokens": 6582878.0, "reward": 0.348684199154377, "reward_std": 0.07469352334737778, "rewards/curriculum_aware_reward_fn": 0.036184209398925304, "rewards/format_reward": 0.3125, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 448.1328125, "epoch": 0.8739495798319328, "grad_norm": 0.5027822852134705, "kl": 0.00795745849609375, "learning_rate": 1e-06, "loss": -0.0178, "num_tokens": 6698503.0, "reward": 0.6311677470803261, "reward_std": 0.11679959110915661, "rewards/curriculum_aware_reward_fn": 0.09210526384413242, "rewards/format_reward": 0.5390625, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 521.1875, "epoch": 0.8907563025210085, "grad_norm": 0.4084753394126892, "kl": 0.00714111328125, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 6823951.0, "reward": 0.5028782784938812, "reward_std": 0.059696739073842764, "rewards/curriculum_aware_reward_fn": 0.06537829001899809, "rewards/format_reward": 0.4375, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 539.109375, "epoch": 0.907563025210084, "grad_norm": 0.2098054140806198, "kl": 0.007198333740234375, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 6953317.0, "reward": 0.46052631735801697, "reward_std": 0.03168220818042755, "rewards/curriculum_aware_reward_fn": 0.023026317358016968, "rewards/format_reward": 0.4375, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 521.2265625, "epoch": 0.9243697478991597, "grad_norm": 0.4919142425060272, "kl": 0.007293701171875, "learning_rate": 1e-06, "loss": 0.0101, "num_tokens": 7079922.0, "reward": 0.49547697603702545, "reward_std": 0.10914274398237467, "rewards/curriculum_aware_reward_fn": 0.12047697883099318, "rewards/format_reward": 0.375, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 500.5625, "epoch": 0.9411764705882353, "grad_norm": 0.46875280141830444, "kl": 0.00684356689453125, "learning_rate": 1e-06, "loss": 0.0266, "num_tokens": 7206954.0, "reward": 0.40830591320991516, "reward_std": 0.1075905729085207, "rewards/curriculum_aware_reward_fn": 0.04111842066049576, "rewards/format_reward": 0.3671875, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 482.3046875, "epoch": 0.957983193277311, "grad_norm": 0.40924757719039917, "kl": 0.012725830078125, "learning_rate": 1e-06, "loss": 0.0041, "num_tokens": 7327857.0, "reward": 0.5958059132099152, "reward_std": 0.06403321353718638, "rewards/curriculum_aware_reward_fn": 0.04111842007841915, "rewards/format_reward": 0.5546875, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 445.375, "epoch": 0.9747899159663865, "grad_norm": 0.4467240273952484, "kl": 0.0105743408203125, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 7440561.0, "reward": 0.7578125, "reward_std": 0.057358515448868275, "rewards/curriculum_aware_reward_fn": 0.13281250069849193, "rewards/format_reward": 0.625, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 582.3452377319336, "epoch": 0.9915966386554622, "grad_norm": 0.5007306933403015, "kl": 0.007415771484375, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 7569086.0, "reward": 0.4514802545309067, "reward_std": 0.06341935088858008, "rewards/curriculum_aware_reward_fn": 0.0217927637277171, "rewards/format_reward": 0.4296875, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 553.09375, "epoch": 1.0168067226890756, "grad_norm": 0.4292355179786682, "kl": 0.005462646484375, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 7702626.0, "reward": 0.4325658082962036, "reward_std": 0.07455102633684874, "rewards/curriculum_aware_reward_fn": 0.05756579013541341, "rewards/format_reward": 0.375, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 521.375, "epoch": 1.0336134453781514, "grad_norm": 0.41578003764152527, "kl": 0.008762359619140625, "learning_rate": 1e-06, "loss": -0.0056, "num_tokens": 7827818.0, "reward": 0.5082236900925636, "reward_std": 0.07253926200792193, "rewards/curriculum_aware_reward_fn": 0.07072368497028947, "rewards/format_reward": 0.4375, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 648.2734375, "epoch": 1.050420168067227, "grad_norm": 0.48642197251319885, "kl": 0.0062713623046875, "learning_rate": 1e-06, "loss": 0.0136, "num_tokens": 7974333.0, "reward": 0.3449835442006588, "reward_std": 0.07259867247194052, "rewards/curriculum_aware_reward_fn": 0.03248355258256197, "rewards/format_reward": 0.3125, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 436.9296875, "epoch": 1.0672268907563025, "grad_norm": 0.3184286653995514, "kl": 0.0114593505859375, "learning_rate": 1e-06, "loss": 0.0139, "num_tokens": 8084908.0, "reward": 0.6899671256542206, "reward_std": 0.0728745711967349, "rewards/curriculum_aware_reward_fn": 0.0649671049322933, "rewards/format_reward": 0.625, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 541.53125, "epoch": 1.084033613445378, "grad_norm": 0.16483676433563232, "kl": 0.0060882568359375, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 8216696.0, "reward": 0.2627467066049576, "reward_std": 0.024391429498791695, "rewards/curriculum_aware_reward_fn": 0.012746710330247879, "rewards/format_reward": 0.25, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 509.7890625, "epoch": 1.1008403361344539, "grad_norm": 0.4256879985332489, "kl": 0.00730133056640625, "learning_rate": 1e-06, "loss": -0.0102, "num_tokens": 8342845.0, "reward": 0.5197368264198303, "reward_std": 0.030515023041516542, "rewards/curriculum_aware_reward_fn": 0.019736842485144734, "rewards/format_reward": 0.5, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 486.296875, "epoch": 1.1176470588235294, "grad_norm": 0.3091375231742859, "kl": 0.008016586303710938, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 8462971.0, "reward": 0.46299341320991516, "reward_std": 0.04847824294120073, "rewards/curriculum_aware_reward_fn": 0.025493420660495758, "rewards/format_reward": 0.4375, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 596.234375, "epoch": 1.134453781512605, "grad_norm": 0.4554305076599121, "kl": 0.006458282470703125, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 8598337.0, "reward": 0.3758223643526435, "reward_std": 0.08455474488437176, "rewards/curriculum_aware_reward_fn": 0.1258223680779338, "rewards/format_reward": 0.25, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 444.265625, "epoch": 1.1512605042016806, "grad_norm": 0.4700126349925995, "kl": 0.013336181640625, "learning_rate": 1e-06, "loss": -0.0065, "num_tokens": 8715091.0, "reward": 0.67434211820364, "reward_std": 0.12386543769389391, "rewards/curriculum_aware_reward_fn": 0.11965460598003119, "rewards/format_reward": 0.5546875, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 537.6484375, "epoch": 1.1680672268907564, "grad_norm": 0.5387859344482422, "kl": 0.0084075927734375, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 8845582.0, "reward": 0.5822368338704109, "reward_std": 0.16140672331675887, "rewards/curriculum_aware_reward_fn": 0.10567433899268508, "rewards/format_reward": 0.4765625, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 571.25, "epoch": 1.184873949579832, "grad_norm": 0.28276559710502625, "kl": 0.005802154541015625, "learning_rate": 1e-06, "loss": 0.0147, "num_tokens": 8979574.0, "reward": 0.2606907826848328, "reward_std": 0.051840442698448896, "rewards/curriculum_aware_reward_fn": 0.018503289436921477, "rewards/format_reward": 0.2421875, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 495.6953125, "epoch": 1.2016806722689075, "grad_norm": 0.3467198312282562, "kl": 0.007556915283203125, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 9104143.0, "reward": 0.5476973727345467, "reward_std": 0.0878668250516057, "rewards/curriculum_aware_reward_fn": 0.04769736947491765, "rewards/format_reward": 0.5, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 628.96875, "epoch": 1.2184873949579833, "grad_norm": 0.30438435077667236, "kl": 0.0047740936279296875, "learning_rate": 1e-06, "loss": -0.0058, "num_tokens": 9247579.0, "reward": 0.25863486528396606, "reward_std": 0.05783074861392379, "rewards/curriculum_aware_reward_fn": 0.016447368427179754, "rewards/format_reward": 0.2421875, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 581.7109375, "epoch": 1.2352941176470589, "grad_norm": 0.16290180385112762, "kl": 0.005523681640625, "learning_rate": 1e-06, "loss": 0.0069, "num_tokens": 9383094.0, "reward": 0.32195723056793213, "reward_std": 0.014439198188483715, "rewards/curriculum_aware_reward_fn": 0.009457237087190151, "rewards/format_reward": 0.3125, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 536.9609375, "epoch": 1.2521008403361344, "grad_norm": 1.2357046604156494, "kl": 0.170867919921875, "learning_rate": 1e-06, "loss": -0.0054, "num_tokens": 9513217.0, "reward": 0.582236819434911, "reward_std": 0.0510927583090961, "rewards/curriculum_aware_reward_fn": 0.01973684225231409, "rewards/format_reward": 0.5625, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 487.8671875, "epoch": 1.26890756302521, "grad_norm": 0.46429404616355896, "kl": 0.0113677978515625, "learning_rate": 1e-06, "loss": 0.0446, "num_tokens": 9635408.0, "reward": 0.726973682641983, "reward_std": 0.11705214250832796, "rewards/curriculum_aware_reward_fn": 0.10197368450462818, "rewards/format_reward": 0.625, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 584.296875, "epoch": 1.2857142857142856, "grad_norm": 0.42755427956581116, "kl": 0.00647735595703125, "learning_rate": 1e-06, "loss": 0.0307, "num_tokens": 9770998.0, "reward": 0.49136512726545334, "reward_std": 0.10772840678691864, "rewards/curriculum_aware_reward_fn": 0.06167763099074364, "rewards/format_reward": 0.4296875, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 429.296875, "epoch": 1.3025210084033614, "grad_norm": 0.45878008008003235, "kl": 0.01023101806640625, "learning_rate": 1e-06, "loss": 0.0156, "num_tokens": 9886868.0, "reward": 0.7347861528396606, "reward_std": 0.10009488789364696, "rewards/curriculum_aware_reward_fn": 0.06291118392255157, "rewards/format_reward": 0.671875, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 516.6328125, "epoch": 1.319327731092437, "grad_norm": 0.3113223910331726, "kl": 0.0077972412109375, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 10011221.0, "reward": 0.5966282933950424, "reward_std": 0.041548303328454494, "rewards/curriculum_aware_reward_fn": 0.09662828780710697, "rewards/format_reward": 0.5, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 557.4765625, "epoch": 1.3361344537815127, "grad_norm": 0.33871227502822876, "kl": 0.0073699951171875, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 10140970.0, "reward": 0.5415295958518982, "reward_std": 0.07458627689629793, "rewards/curriculum_aware_reward_fn": 0.04152960516512394, "rewards/format_reward": 0.5, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 564.125, "epoch": 1.3529411764705883, "grad_norm": 0.4491986930370331, "kl": 0.006259918212890625, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 10271986.0, "reward": 0.5296052545309067, "reward_std": 0.1359914354979992, "rewards/curriculum_aware_reward_fn": 0.09210526291280985, "rewards/format_reward": 0.4375, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 503.125, "epoch": 1.3697478991596639, "grad_norm": 0.2838430404663086, "kl": 0.00777435302734375, "learning_rate": 1e-06, "loss": 0.0186, "num_tokens": 10398090.0, "reward": 0.6521381586790085, "reward_std": 0.05697542009875178, "rewards/curriculum_aware_reward_fn": 0.027138158679008484, "rewards/format_reward": 0.625, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 530.234375, "epoch": 1.3865546218487395, "grad_norm": 0.4765428602695465, "kl": 0.00778961181640625, "learning_rate": 1e-06, "loss": 0.0302, "num_tokens": 10526192.0, "reward": 0.6208881437778473, "reward_std": 0.12499829288572073, "rewards/curriculum_aware_reward_fn": 0.06620065588504076, "rewards/format_reward": 0.5546875, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 566.84375, "epoch": 1.403361344537815, "grad_norm": 0.4760180711746216, "kl": 0.0066986083984375, "learning_rate": 1e-06, "loss": 0.0084, "num_tokens": 10657412.0, "reward": 0.46916117519140244, "reward_std": 0.10547287575900555, "rewards/curriculum_aware_reward_fn": 0.09416118310764432, "rewards/format_reward": 0.375, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 560.7734375, "epoch": 1.4201680672268908, "grad_norm": 0.27778276801109314, "kl": 0.005718231201171875, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 10788255.0, "reward": 0.44736841320991516, "reward_std": 0.06990169547498226, "rewards/curriculum_aware_reward_fn": 0.07236842112615705, "rewards/format_reward": 0.375, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 484.9296875, "epoch": 1.4369747899159664, "grad_norm": 0.34481725096702576, "kl": 0.02048492431640625, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 10911166.0, "reward": 0.7388980239629745, "reward_std": 0.08143611438572407, "rewards/curriculum_aware_reward_fn": 0.1138980237301439, "rewards/format_reward": 0.625, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 457.71875, "epoch": 1.453781512605042, "grad_norm": 0.4829816222190857, "kl": 0.0100555419921875, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 11027554.0, "reward": 0.6735197305679321, "reward_std": 0.08864451944828033, "rewards/curriculum_aware_reward_fn": 0.11101973801851273, "rewards/format_reward": 0.5625, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 508.6953125, "epoch": 1.4705882352941178, "grad_norm": 0.5016542077064514, "kl": 0.00922393798828125, "learning_rate": 1e-06, "loss": 0.01, "num_tokens": 11149275.0, "reward": 0.6870888322591782, "reward_std": 0.08495050063356757, "rewards/curriculum_aware_reward_fn": 0.12458881677594036, "rewards/format_reward": 0.5625, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 602.9921875, "epoch": 1.4873949579831933, "grad_norm": 0.29301658272743225, "kl": 0.004894256591796875, "learning_rate": 1e-06, "loss": 0.0249, "num_tokens": 11288106.0, "reward": 0.29481907188892365, "reward_std": 0.0620402698405087, "rewards/curriculum_aware_reward_fn": 0.04481907875742763, "rewards/format_reward": 0.25, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 438.71875, "epoch": 1.504201680672269, "grad_norm": 0.5715950727462769, "kl": 0.01503753662109375, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 11401118.0, "reward": 0.8972039222717285, "reward_std": 0.10221139155328274, "rewards/curriculum_aware_reward_fn": 0.1472039446234703, "rewards/format_reward": 0.75, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 547.015625, "epoch": 1.5210084033613445, "grad_norm": 0.31229323148727417, "kl": 0.0074462890625, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 11531400.0, "reward": 0.5945723652839661, "reward_std": 0.05676991865038872, "rewards/curriculum_aware_reward_fn": 0.15707236900925636, "rewards/format_reward": 0.4375, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 599.9375, "epoch": 1.53781512605042, "grad_norm": 0.3754754066467285, "kl": 0.005001068115234375, "learning_rate": 1e-06, "loss": 0.0337, "num_tokens": 11667224.0, "reward": 0.4358552396297455, "reward_std": 0.1078398427926004, "rewards/curriculum_aware_reward_fn": 0.060855261399410665, "rewards/format_reward": 0.375, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 563.3671875, "epoch": 1.5546218487394958, "grad_norm": 0.44682905077934265, "kl": 0.00695037841796875, "learning_rate": 1e-06, "loss": 0.0261, "num_tokens": 11800087.0, "reward": 0.47820721566677094, "reward_std": 0.11488656094297767, "rewards/curriculum_aware_reward_fn": 0.10320723743643612, "rewards/format_reward": 0.375, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 524.7578125, "epoch": 1.5714285714285714, "grad_norm": 0.4093223214149475, "kl": 0.0079803466796875, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 11927808.0, "reward": 0.5555098727345467, "reward_std": 0.10677139926701784, "rewards/curriculum_aware_reward_fn": 0.06332236900925636, "rewards/format_reward": 0.4921875, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 612.375, "epoch": 1.5882352941176472, "grad_norm": 0.28754857182502747, "kl": 0.004489898681640625, "learning_rate": 1e-06, "loss": -0.0277, "num_tokens": 12069560.0, "reward": 0.3371710553765297, "reward_std": 0.050214093178510666, "rewards/curriculum_aware_reward_fn": 0.02467105258256197, "rewards/format_reward": 0.3125, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 383.2578125, "epoch": 1.6050420168067228, "grad_norm": 0.47502318024635315, "kl": 0.0126953125, "learning_rate": 1e-06, "loss": 0.016, "num_tokens": 12176625.0, "reward": 0.7224506512284279, "reward_std": 0.10677911480888724, "rewards/curriculum_aware_reward_fn": 0.10526315728202462, "rewards/format_reward": 0.6171875, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 578.078125, "epoch": 1.6218487394957983, "grad_norm": 0.34693828225135803, "kl": 0.006988525390625, "learning_rate": 1e-06, "loss": 0.006, "num_tokens": 12310939.0, "reward": 0.5254934206604958, "reward_std": 0.06210480257868767, "rewards/curriculum_aware_reward_fn": 0.08799342112615705, "rewards/format_reward": 0.4375, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 529.828125, "epoch": 1.638655462184874, "grad_norm": 2.9580295085906982, "kl": 0.21123504638671875, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 12436949.0, "reward": 0.5230263099074364, "reward_std": 0.13364601507782936, "rewards/curriculum_aware_reward_fn": 0.11677631549537182, "rewards/format_reward": 0.40625, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 470.8828125, "epoch": 1.6554621848739495, "grad_norm": 0.39620673656463623, "kl": 0.00954437255859375, "learning_rate": 1e-06, "loss": -0.0048, "num_tokens": 12558190.0, "reward": 0.8194901347160339, "reward_std": 0.09049705043435097, "rewards/curriculum_aware_reward_fn": 0.26480263471603394, "rewards/format_reward": 0.5546875, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 495.3515625, "epoch": 1.6722689075630253, "grad_norm": 0.5109691619873047, "kl": 0.007015228271484375, "learning_rate": 1e-06, "loss": 0.0351, "num_tokens": 12681859.0, "reward": 0.4362664595246315, "reward_std": 0.0971333347260952, "rewards/curriculum_aware_reward_fn": 0.1237664483487606, "rewards/format_reward": 0.3125, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 478.0703125, "epoch": 1.6890756302521008, "grad_norm": 0.4189630150794983, "kl": 0.0095977783203125, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 12801148.0, "reward": 0.6920230239629745, "reward_std": 0.10883715003728867, "rewards/curriculum_aware_reward_fn": 0.19983552768826485, "rewards/format_reward": 0.4921875, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 460.6875, "epoch": 1.7058823529411766, "grad_norm": 0.5282026529312134, "kl": 0.007904052734375, "learning_rate": 1e-06, "loss": -0.0042, "num_tokens": 12921692.0, "reward": 0.3396381661295891, "reward_std": 0.11080991290509701, "rewards/curriculum_aware_reward_fn": 0.04276315798051655, "rewards/format_reward": 0.296875, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 554.40625, "epoch": 1.7226890756302522, "grad_norm": 0.5177521109580994, "kl": 0.01079559326171875, "learning_rate": 1e-06, "loss": -0.009, "num_tokens": 13052136.0, "reward": 0.36965460516512394, "reward_std": 0.10201659612357616, "rewards/curriculum_aware_reward_fn": 0.01809210516512394, "rewards/format_reward": 0.3515625, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 524.421875, "epoch": 1.7394957983193278, "grad_norm": 0.44328662753105164, "kl": 0.008655548095703125, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 13178822.0, "reward": 0.5349506437778473, "reward_std": 0.12413342297077179, "rewards/curriculum_aware_reward_fn": 0.058388158679008484, "rewards/format_reward": 0.4765625, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 446.46875, "epoch": 1.7563025210084033, "grad_norm": 0.647972583770752, "kl": 0.01692962646484375, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 13297402.0, "reward": 0.6476151421666145, "reward_std": 0.22924628667533398, "rewards/curriculum_aware_reward_fn": 0.07730263285338879, "rewards/format_reward": 0.5703125, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 503.703125, "epoch": 1.773109243697479, "grad_norm": 0.631151556968689, "kl": 0.008514404296875, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 13418340.0, "reward": 0.46299341320991516, "reward_std": 0.2022387906908989, "rewards/curriculum_aware_reward_fn": 0.06455592066049576, "rewards/format_reward": 0.3984375, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 562.03125, "epoch": 1.7899159663865545, "grad_norm": 0.3566150963306427, "kl": 0.006641387939453125, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 13550952.0, "reward": 0.35773025802336633, "reward_std": 0.09330996312201023, "rewards/curriculum_aware_reward_fn": 0.05304276151582599, "rewards/format_reward": 0.3046875, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 539.1796875, "epoch": 1.8067226890756303, "grad_norm": 0.4120214581489563, "kl": 0.00933074951171875, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 13678999.0, "reward": 0.5435855314135551, "reward_std": 0.15557273291051388, "rewards/curriculum_aware_reward_fn": 0.12171052396297455, "rewards/format_reward": 0.421875, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 542.1796875, "epoch": 1.8235294117647058, "grad_norm": 0.36332470178604126, "kl": 0.00751495361328125, "learning_rate": 1e-06, "loss": 0.0098, "num_tokens": 13811206.0, "reward": 0.48643091320991516, "reward_std": 0.13410842791199684, "rewards/curriculum_aware_reward_fn": 0.11924342392012477, "rewards/format_reward": 0.3671875, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 553.953125, "epoch": 1.8403361344537816, "grad_norm": 0.3152480721473694, "kl": 0.00626373291015625, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 13945808.0, "reward": 0.3293585479259491, "reward_std": 0.045257058925926685, "rewards/curriculum_aware_reward_fn": 0.016858553048223257, "rewards/format_reward": 0.3125, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 573.59375, "epoch": 1.8571428571428572, "grad_norm": 0.2340080589056015, "kl": 0.00682830810546875, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 14080460.0, "reward": 0.3347039520740509, "reward_std": 0.038679007440805435, "rewards/curriculum_aware_reward_fn": 0.02220394741743803, "rewards/format_reward": 0.3125, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 463.0390625, "epoch": 1.8739495798319328, "grad_norm": 0.36526933312416077, "kl": 0.009578704833984375, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 14201065.0, "reward": 0.6328125149011612, "reward_std": 0.05027205403894186, "rewards/curriculum_aware_reward_fn": 0.07031250046566129, "rewards/format_reward": 0.5625, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 481.9609375, "epoch": 1.8907563025210083, "grad_norm": 0.4954119324684143, "kl": 0.0100555419921875, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 14323068.0, "reward": 0.5254934206604958, "reward_std": 0.12779070809483528, "rewards/curriculum_aware_reward_fn": 0.08799342159181833, "rewards/format_reward": 0.4375, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 494.96875, "epoch": 1.907563025210084, "grad_norm": 0.46778982877731323, "kl": 0.00978851318359375, "learning_rate": 1e-06, "loss": 0.0199, "num_tokens": 14447008.0, "reward": 0.5370065793395042, "reward_std": 0.1048955712467432, "rewards/curriculum_aware_reward_fn": 0.09950657980516553, "rewards/format_reward": 0.4375, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 501.6796875, "epoch": 1.9243697478991597, "grad_norm": 0.3055194616317749, "kl": 0.00933074951171875, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 14571103.0, "reward": 0.5111019909381866, "reward_std": 0.024554526433348656, "rewards/curriculum_aware_reward_fn": 0.08141447440721095, "rewards/format_reward": 0.4296875, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 508.8203125, "epoch": 1.9411764705882353, "grad_norm": 0.4632183611392975, "kl": 0.012451171875, "learning_rate": 1e-06, "loss": 0.0157, "num_tokens": 14694424.0, "reward": 0.6089638024568558, "reward_std": 0.10860061645507812, "rewards/curriculum_aware_reward_fn": 0.11677631549537182, "rewards/format_reward": 0.4921875, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 495.6875, "epoch": 1.957983193277311, "grad_norm": 0.41369161009788513, "kl": 0.0089874267578125, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 14819792.0, "reward": 0.4621710479259491, "reward_std": 0.07010683044791222, "rewards/curriculum_aware_reward_fn": 0.0871710479259491, "rewards/format_reward": 0.375, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 529.359375, "epoch": 1.9747899159663866, "grad_norm": 0.40478190779685974, "kl": 0.012042999267578125, "learning_rate": 1e-06, "loss": 0.0388, "num_tokens": 14946718.0, "reward": 0.48190788179636, "reward_std": 0.10751516558229923, "rewards/curriculum_aware_reward_fn": 0.11472039762884378, "rewards/format_reward": 0.3671875, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 493.4881134033203, "epoch": 1.9915966386554622, "grad_norm": 0.3562357425689697, "kl": 0.0123748779296875, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 15064457.0, "reward": 0.6706414446234703, "reward_std": 0.101046122610569, "rewards/curriculum_aware_reward_fn": 0.05345394788309932, "rewards/format_reward": 0.6171875, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 517.7578125, "epoch": 2.0168067226890756, "grad_norm": 0.3487071394920349, "kl": 0.0104217529296875, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 15191538.0, "reward": 0.5201480090618134, "reward_std": 0.04716231161728501, "rewards/curriculum_aware_reward_fn": 0.02014802652411163, "rewards/format_reward": 0.5, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 577.765625, "epoch": 2.033613445378151, "grad_norm": 0.35752227902412415, "kl": 0.008148193359375, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 15327204.0, "reward": 0.42763157933950424, "reward_std": 0.09388388879597187, "rewards/curriculum_aware_reward_fn": 0.05263157933950424, "rewards/format_reward": 0.375, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 460.53125, "epoch": 2.0504201680672267, "grad_norm": 0.5020465850830078, "kl": 0.014190673828125, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 15447608.0, "reward": 0.693256601691246, "reward_std": 0.12680460885167122, "rewards/curriculum_aware_reward_fn": 0.06825657840818167, "rewards/format_reward": 0.625, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 526.7890625, "epoch": 2.0672268907563027, "grad_norm": 0.33090242743492126, "kl": 0.00830841064453125, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 15577021.0, "reward": 0.3022203971631825, "reward_std": 0.052566134836524725, "rewards/curriculum_aware_reward_fn": 0.0522203971631825, "rewards/format_reward": 0.25, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 465.390625, "epoch": 2.0840336134453783, "grad_norm": 0.25564736127853394, "kl": 0.018894195556640625, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 15693543.0, "reward": 0.5879934281110764, "reward_std": 0.03513536183163524, "rewards/curriculum_aware_reward_fn": 0.15830592159181833, "rewards/format_reward": 0.4296875, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 438.015625, "epoch": 2.100840336134454, "grad_norm": 0.5210288763046265, "kl": 0.0128936767578125, "learning_rate": 1e-06, "loss": 0.038, "num_tokens": 15805441.0, "reward": 0.7685032784938812, "reward_std": 0.15490676742047071, "rewards/curriculum_aware_reward_fn": 0.20600328128784895, "rewards/format_reward": 0.5625, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 419.3515625, "epoch": 2.1176470588235294, "grad_norm": 0.48274165391921997, "kl": 0.01959228515625, "learning_rate": 1e-06, "loss": 0.0226, "num_tokens": 15913862.0, "reward": 0.671875, "reward_std": 0.11604671645909548, "rewards/curriculum_aware_reward_fn": 0.10937500139698386, "rewards/format_reward": 0.5625, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 520.8203125, "epoch": 2.134453781512605, "grad_norm": 0.35000789165496826, "kl": 0.0090179443359375, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 16041007.0, "reward": 0.49794407607987523, "reward_std": 0.10071868449449539, "rewards/curriculum_aware_reward_fn": 0.12294407980516553, "rewards/format_reward": 0.375, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 557.6328125, "epoch": 2.1512605042016806, "grad_norm": 0.5103374719619751, "kl": 0.0096435546875, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 16173728.0, "reward": 0.45641446858644485, "reward_std": 0.10976400738582015, "rewards/curriculum_aware_reward_fn": 0.08141447091475129, "rewards/format_reward": 0.375, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 414.9140625, "epoch": 2.168067226890756, "grad_norm": 0.43994390964508057, "kl": 0.014190673828125, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 16285445.0, "reward": 0.7236842215061188, "reward_std": 0.11914092372171581, "rewards/curriculum_aware_reward_fn": 0.09868421289138496, "rewards/format_reward": 0.625, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 567.921875, "epoch": 2.184873949579832, "grad_norm": 0.319624662399292, "kl": 0.0082244873046875, "learning_rate": 1e-06, "loss": 0.0285, "num_tokens": 16420019.0, "reward": 0.4259868264198303, "reward_std": 0.05608854768797755, "rewards/curriculum_aware_reward_fn": 0.11348683759570122, "rewards/format_reward": 0.3125, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 463.53125, "epoch": 2.2016806722689077, "grad_norm": 0.359430193901062, "kl": 0.014495849609375, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 16541143.0, "reward": 0.4699835479259491, "reward_std": 0.08584295958280563, "rewards/curriculum_aware_reward_fn": 0.0949835516512394, "rewards/format_reward": 0.375, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 469.609375, "epoch": 2.2184873949579833, "grad_norm": 0.41892191767692566, "kl": 0.0117034912109375, "learning_rate": 1e-06, "loss": 0.0365, "num_tokens": 16662909.0, "reward": 0.5522204041481018, "reward_std": 0.0973742357455194, "rewards/curriculum_aware_reward_fn": 0.052220395184122026, "rewards/format_reward": 0.5, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 540.5703125, "epoch": 2.235294117647059, "grad_norm": 0.48490580916404724, "kl": 0.0093231201171875, "learning_rate": 1e-06, "loss": -0.0203, "num_tokens": 16795070.0, "reward": 0.41324013471603394, "reward_std": 0.08475807495415211, "rewards/curriculum_aware_reward_fn": 0.038240132853388786, "rewards/format_reward": 0.375, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 525.1796875, "epoch": 2.2521008403361344, "grad_norm": 0.4449516832828522, "kl": 0.0105438232421875, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 16923613.0, "reward": 0.5604440867900848, "reward_std": 0.1288975402712822, "rewards/curriculum_aware_reward_fn": 0.12294407933950424, "rewards/format_reward": 0.4375, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 476.3125, "epoch": 2.26890756302521, "grad_norm": 0.4340604543685913, "kl": 0.01129150390625, "learning_rate": 1e-06, "loss": -0.028, "num_tokens": 17045693.0, "reward": 0.5587993413209915, "reward_std": 0.09385511744767427, "rewards/curriculum_aware_reward_fn": 0.058799343183636665, "rewards/format_reward": 0.5, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 420.9921875, "epoch": 2.2857142857142856, "grad_norm": 0.45602235198020935, "kl": 0.01416015625, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 17154012.0, "reward": 0.7602795735001564, "reward_std": 0.09590415796265006, "rewards/curriculum_aware_reward_fn": 0.14309210563078523, "rewards/format_reward": 0.6171875, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 489.6640625, "epoch": 2.302521008403361, "grad_norm": 0.4504002332687378, "kl": 0.0130157470703125, "learning_rate": 1e-06, "loss": -0.0126, "num_tokens": 17274481.0, "reward": 0.6295230239629745, "reward_std": 0.15420474018901587, "rewards/curriculum_aware_reward_fn": 0.13733552629128098, "rewards/format_reward": 0.4921875, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 492.3046875, "epoch": 2.3193277310924367, "grad_norm": 0.3228984773159027, "kl": 0.0111846923828125, "learning_rate": 1e-06, "loss": -0.004, "num_tokens": 17399360.0, "reward": 0.5587993413209915, "reward_std": 0.0586426155641675, "rewards/curriculum_aware_reward_fn": 0.05879934271797538, "rewards/format_reward": 0.5, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 508.5625, "epoch": 2.3361344537815127, "grad_norm": 0.3110595643520355, "kl": 0.015472412109375, "learning_rate": 1e-06, "loss": -0.0107, "num_tokens": 17521248.0, "reward": 0.546875, "reward_std": 0.07312605157494545, "rewards/curriculum_aware_reward_fn": 0.0546875, "rewards/format_reward": 0.4921875, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 574.015625, "epoch": 2.3529411764705883, "grad_norm": 0.4071909487247467, "kl": 0.0107421875, "learning_rate": 1e-06, "loss": -0.0034, "num_tokens": 17659522.0, "reward": 0.47450655698776245, "reward_std": 0.07414581999182701, "rewards/curriculum_aware_reward_fn": 0.03700657980516553, "rewards/format_reward": 0.4375, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 522.5546875, "epoch": 2.369747899159664, "grad_norm": 0.34431034326553345, "kl": 0.00946044921875, "learning_rate": 1e-06, "loss": -0.0068, "num_tokens": 17788537.0, "reward": 0.4099506512284279, "reward_std": 0.05903024738654494, "rewards/curriculum_aware_reward_fn": 0.0349506571656093, "rewards/format_reward": 0.375, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 553.2421875, "epoch": 2.3865546218487395, "grad_norm": 0.4213170111179352, "kl": 0.009979248046875, "learning_rate": 1e-06, "loss": 0.0132, "num_tokens": 17918288.0, "reward": 0.4177631586790085, "reward_std": 0.08044615527614951, "rewards/curriculum_aware_reward_fn": 0.042763158096931875, "rewards/format_reward": 0.375, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 559.3203125, "epoch": 2.403361344537815, "grad_norm": 0.23342828452587128, "kl": 0.008510589599609375, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 18052169.0, "reward": 0.3762335553765297, "reward_std": 0.03740033693611622, "rewards/curriculum_aware_reward_fn": 0.07154605258256197, "rewards/format_reward": 0.3046875, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 493.078125, "epoch": 2.4201680672268906, "grad_norm": 0.4362901449203491, "kl": 0.012481689453125, "learning_rate": 1e-06, "loss": 0.0386, "num_tokens": 18177251.0, "reward": 0.5805921033024788, "reward_std": 0.12307591829448938, "rewards/curriculum_aware_reward_fn": 0.08840460516512394, "rewards/format_reward": 0.4921875, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 435.65625, "epoch": 2.4369747899159666, "grad_norm": 0.6844424605369568, "kl": 0.0600128173828125, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 18292007.0, "reward": 0.6208881735801697, "reward_std": 0.15131067298352718, "rewards/curriculum_aware_reward_fn": 0.12088816147297621, "rewards/format_reward": 0.5, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 490.296875, "epoch": 2.453781512605042, "grad_norm": 0.30699044466018677, "kl": 0.010986328125, "learning_rate": 1e-06, "loss": 0.0248, "num_tokens": 18415301.0, "reward": 0.49424342066049576, "reward_std": 0.04014611290767789, "rewards/curriculum_aware_reward_fn": 0.05674342147540301, "rewards/format_reward": 0.4375, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 471.984375, "epoch": 2.4705882352941178, "grad_norm": 0.403209924697876, "kl": 0.0122528076171875, "learning_rate": 1e-06, "loss": 0.0353, "num_tokens": 18532667.0, "reward": 0.6027960330247879, "reward_std": 0.0935791190713644, "rewards/curriculum_aware_reward_fn": 0.1027960553765297, "rewards/format_reward": 0.5, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 422.703125, "epoch": 2.4873949579831933, "grad_norm": 0.42733973264694214, "kl": 0.0163116455078125, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 18645941.0, "reward": 0.7845394462347031, "reward_std": 0.0871797576546669, "rewards/curriculum_aware_reward_fn": 0.0345394741743803, "rewards/format_reward": 0.75, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 548.828125, "epoch": 2.504201680672269, "grad_norm": 0.2545667290687561, "kl": 0.01213836669921875, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 18774111.0, "reward": 0.539473682641983, "reward_std": 0.060992954298853874, "rewards/curriculum_aware_reward_fn": 0.10197368077933788, "rewards/format_reward": 0.4375, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 540.0703125, "epoch": 2.5210084033613445, "grad_norm": 0.3914143145084381, "kl": 0.00789642333984375, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 18904992.0, "reward": 0.27878287341445684, "reward_std": 0.06910991575568914, "rewards/curriculum_aware_reward_fn": 0.02878289413638413, "rewards/format_reward": 0.25, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 550.9375, "epoch": 2.53781512605042, "grad_norm": 0.2912365794181824, "kl": 0.00799560546875, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 19035968.0, "reward": 0.3215460553765297, "reward_std": 0.01937512680888176, "rewards/curriculum_aware_reward_fn": 0.07154605211690068, "rewards/format_reward": 0.25, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 471.1640625, "epoch": 2.5546218487394956, "grad_norm": 0.3965752124786377, "kl": 0.01221466064453125, "learning_rate": 1e-06, "loss": 0.0062, "num_tokens": 19153861.0, "reward": 0.582648016512394, "reward_std": 0.08400850929319859, "rewards/curriculum_aware_reward_fn": 0.08264802652411163, "rewards/format_reward": 0.5, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 421.234375, "epoch": 2.571428571428571, "grad_norm": 0.6044662594795227, "kl": 0.026885986328125, "learning_rate": 1e-06, "loss": 0.0165, "num_tokens": 19265379.0, "reward": 0.8112664222717285, "reward_std": 0.1459241509437561, "rewards/curriculum_aware_reward_fn": 0.19407895021140575, "rewards/format_reward": 0.6171875, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 546.8671875, "epoch": 2.588235294117647, "grad_norm": 0.4222107231616974, "kl": 0.01050567626953125, "learning_rate": 1e-06, "loss": 0.0261, "num_tokens": 19396626.0, "reward": 0.38733551651239395, "reward_std": 0.06776260398328304, "rewards/curriculum_aware_reward_fn": 0.02014802605845034, "rewards/format_reward": 0.3671875, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 548.875, "epoch": 2.6050420168067228, "grad_norm": 0.30043891072273254, "kl": 0.010498046875, "learning_rate": 1e-06, "loss": 0.0291, "num_tokens": 19531202.0, "reward": 0.28166119009256363, "reward_std": 0.07623977493494749, "rewards/curriculum_aware_reward_fn": 0.031661184038966894, "rewards/format_reward": 0.25, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 560.640625, "epoch": 2.6218487394957983, "grad_norm": 0.39753058552742004, "kl": 0.0109710693359375, "learning_rate": 1e-06, "loss": 0.0213, "num_tokens": 19665404.0, "reward": 0.5197368338704109, "reward_std": 0.08217737264931202, "rewards/curriculum_aware_reward_fn": 0.08223684225231409, "rewards/format_reward": 0.4375, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 481.6640625, "epoch": 2.638655462184874, "grad_norm": 0.39810478687286377, "kl": 0.009063720703125, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 19787409.0, "reward": 0.44202302396297455, "reward_std": 0.08141717128455639, "rewards/curriculum_aware_reward_fn": 0.12952302768826485, "rewards/format_reward": 0.3125, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 389.7890625, "epoch": 2.6554621848739495, "grad_norm": 0.4911426305770874, "kl": 0.02197265625, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 19896190.0, "reward": 0.7331414520740509, "reward_std": 0.17763726785779, "rewards/curriculum_aware_reward_fn": 0.1784539483487606, "rewards/format_reward": 0.5546875, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 518.9609375, "epoch": 2.6722689075630255, "grad_norm": 0.2420579046010971, "kl": 0.011962890625, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 20022809.0, "reward": 0.4453125, "reward_std": 0.01647413382306695, "rewards/curriculum_aware_reward_fn": 0.007812500232830644, "rewards/format_reward": 0.4375, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 424.4453125, "epoch": 2.689075630252101, "grad_norm": 0.46578091382980347, "kl": 0.01375579833984375, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 20136314.0, "reward": 0.49095392785966396, "reward_std": 0.13701673224568367, "rewards/curriculum_aware_reward_fn": 0.06126644625328481, "rewards/format_reward": 0.4296875, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 483.921875, "epoch": 2.7058823529411766, "grad_norm": 0.32379522919654846, "kl": 0.01180267333984375, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 20257344.0, "reward": 0.5197368343360722, "reward_std": 0.07396957790479064, "rewards/curriculum_aware_reward_fn": 0.0822368417866528, "rewards/format_reward": 0.4375, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 470.3515625, "epoch": 2.722689075630252, "grad_norm": 0.4478832483291626, "kl": 0.014068603515625, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 20375685.0, "reward": 0.5801809281110764, "reward_std": 0.06543473433703184, "rewards/curriculum_aware_reward_fn": 0.08018092112615705, "rewards/format_reward": 0.5, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 462.3046875, "epoch": 2.7394957983193278, "grad_norm": 0.4915456175804138, "kl": 0.0140838623046875, "learning_rate": 1e-06, "loss": 0.0286, "num_tokens": 20491340.0, "reward": 0.6981907933950424, "reward_std": 0.1432387800887227, "rewards/curriculum_aware_reward_fn": 0.13569078594446182, "rewards/format_reward": 0.5625, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 460.046875, "epoch": 2.7563025210084033, "grad_norm": 0.388621062040329, "kl": 0.0123138427734375, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 20613466.0, "reward": 0.4124177619814873, "reward_std": 0.07370226783677936, "rewards/curriculum_aware_reward_fn": 0.037417763262055814, "rewards/format_reward": 0.375, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 462.25, "epoch": 2.773109243697479, "grad_norm": 0.4878624677658081, "kl": 0.01593017578125, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 20729058.0, "reward": 0.6221217066049576, "reward_std": 0.12872529029846191, "rewards/curriculum_aware_reward_fn": 0.12212171033024788, "rewards/format_reward": 0.5, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 486.4609375, "epoch": 2.7899159663865545, "grad_norm": 0.4500262141227722, "kl": 0.0099029541015625, "learning_rate": 1e-06, "loss": 0.0219, "num_tokens": 20853869.0, "reward": 0.4050164371728897, "reward_std": 0.11422262340784073, "rewards/curriculum_aware_reward_fn": 0.09251644648611546, "rewards/format_reward": 0.3125, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 448.5859375, "epoch": 2.80672268907563, "grad_norm": 0.5006850957870483, "kl": 0.0168914794921875, "learning_rate": 1e-06, "loss": 0.0135, "num_tokens": 20973736.0, "reward": 0.677631601691246, "reward_std": 0.0868874522857368, "rewards/curriculum_aware_reward_fn": 0.12294407980516553, "rewards/format_reward": 0.5546875, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 428.7265625, "epoch": 2.8235294117647056, "grad_norm": 0.42931458353996277, "kl": 0.01781463623046875, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 21086485.0, "reward": 0.6040295884013176, "reward_std": 0.05929867131635547, "rewards/curriculum_aware_reward_fn": 0.041529607493430376, "rewards/format_reward": 0.5625, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 388.421875, "epoch": 2.8403361344537816, "grad_norm": 0.44046640396118164, "kl": 0.0161895751953125, "learning_rate": 1e-06, "loss": 0.01, "num_tokens": 21193627.0, "reward": 0.7483552545309067, "reward_std": 0.09682157123461366, "rewards/curriculum_aware_reward_fn": 0.060855262679979205, "rewards/format_reward": 0.6875, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 477.6953125, "epoch": 2.857142857142857, "grad_norm": 0.36667370796203613, "kl": 0.0146484375, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 21313716.0, "reward": 0.6060855239629745, "reward_std": 0.10079656913876534, "rewards/curriculum_aware_reward_fn": 0.11389802675694227, "rewards/format_reward": 0.4921875, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 534.4296875, "epoch": 2.8739495798319328, "grad_norm": 0.3436344563961029, "kl": 0.00984954833984375, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 21445667.0, "reward": 0.48231907933950424, "reward_std": 0.08960662921890616, "rewards/curriculum_aware_reward_fn": 0.10731907980516553, "rewards/format_reward": 0.375, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 496.21875, "epoch": 2.8907563025210083, "grad_norm": 0.48088422417640686, "kl": 0.0130767822265625, "learning_rate": 1e-06, "loss": 0.0182, "num_tokens": 21570871.0, "reward": 0.4465460618957877, "reward_std": 0.1538134217262268, "rewards/curriculum_aware_reward_fn": 0.13404605071991682, "rewards/format_reward": 0.3125, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 429.484375, "epoch": 2.907563025210084, "grad_norm": 0.5827536582946777, "kl": 0.016109466552734375, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 21686093.0, "reward": 0.4502467103302479, "reward_std": 0.15407454315572977, "rewards/curriculum_aware_reward_fn": 0.09087171172723174, "rewards/format_reward": 0.359375, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 482.4609375, "epoch": 2.92436974789916, "grad_norm": 0.467061311006546, "kl": 0.013336181640625, "learning_rate": 1e-06, "loss": 0.033, "num_tokens": 21808264.0, "reward": 0.6632401347160339, "reward_std": 0.10484125558286905, "rewards/curriculum_aware_reward_fn": 0.22574013099074364, "rewards/format_reward": 0.4375, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 500.96875, "epoch": 2.9411764705882355, "grad_norm": 0.41948550939559937, "kl": 0.009563446044921875, "learning_rate": 1e-06, "loss": 0.0329, "num_tokens": 21933084.0, "reward": 0.400082241743803, "reward_std": 0.10662292037159204, "rewards/curriculum_aware_reward_fn": 0.04070723708719015, "rewards/format_reward": 0.359375, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 557.640625, "epoch": 2.957983193277311, "grad_norm": 0.41708114743232727, "kl": 0.007190704345703125, "learning_rate": 1e-06, "loss": 0.021, "num_tokens": 22068550.0, "reward": 0.3005756618222222, "reward_std": 0.06424513552337885, "rewards/curriculum_aware_reward_fn": 0.050575657514855266, "rewards/format_reward": 0.25, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 482.421875, "epoch": 2.9747899159663866, "grad_norm": 0.6009016633033752, "kl": 0.013702392578125, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 22189356.0, "reward": 0.6620065569877625, "reward_std": 0.149446252733469, "rewards/curriculum_aware_reward_fn": 0.16200657933950424, "rewards/format_reward": 0.5, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 490.53572845458984, "epoch": 2.991596638655462, "grad_norm": 0.49134695529937744, "kl": 0.01397705078125, "learning_rate": 1e-06, "loss": -0.0025, "num_tokens": 22309028.0, "reward": 0.6726973652839661, "reward_std": 0.14456172287464142, "rewards/curriculum_aware_reward_fn": 0.1101973676122725, "rewards/format_reward": 0.5625, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 358.109375, "epoch": 3.0168067226890756, "grad_norm": 0.5925723314285278, "kl": 0.0204315185546875, "learning_rate": 1e-06, "loss": 0.0117, "num_tokens": 22410794.0, "reward": 0.87787826359272, "reward_std": 0.1721474528312683, "rewards/curriculum_aware_reward_fn": 0.19819078594446182, "rewards/format_reward": 0.6796875, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 523.0078125, "epoch": 3.033613445378151, "grad_norm": 0.2975535988807678, "kl": 0.01165771484375, "learning_rate": 1e-06, "loss": 0.0697, "num_tokens": 22539299.0, "reward": 0.5168585479259491, "reward_std": 0.048361226450651884, "rewards/curriculum_aware_reward_fn": 0.08717105106916279, "rewards/format_reward": 0.4296875, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 484.7578125, "epoch": 3.0504201680672267, "grad_norm": 0.45362988114356995, "kl": 0.0162200927734375, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 22660588.0, "reward": 0.5513980276882648, "reward_std": 0.1047646040096879, "rewards/curriculum_aware_reward_fn": 0.06702302722260356, "rewards/format_reward": 0.484375, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 452.9375, "epoch": 3.0672268907563027, "grad_norm": 0.5003635883331299, "kl": 0.0143890380859375, "learning_rate": 1e-06, "loss": 0.0104, "num_tokens": 22778956.0, "reward": 0.73149673640728, "reward_std": 0.17891032248735428, "rewards/curriculum_aware_reward_fn": 0.23149671405553818, "rewards/format_reward": 0.5, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 513.0390625, "epoch": 3.0840336134453783, "grad_norm": 0.31615540385246277, "kl": 0.01172637939453125, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 22905121.0, "reward": 0.3244243338704109, "reward_std": 0.03051401791162789, "rewards/curriculum_aware_reward_fn": 0.011924341786652803, "rewards/format_reward": 0.3125, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 411.6015625, "epoch": 3.100840336134454, "grad_norm": 0.4836508631706238, "kl": 0.0144195556640625, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 23019342.0, "reward": 0.5826480239629745, "reward_std": 0.11801502481102943, "rewards/curriculum_aware_reward_fn": 0.07483552675694227, "rewards/format_reward": 0.5078125, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 416.921875, "epoch": 3.1176470588235294, "grad_norm": 0.3468119204044342, "kl": 0.01403045654296875, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 23137316.0, "reward": 0.47574012295808643, "reward_std": 0.05907326890155673, "rewards/curriculum_aware_reward_fn": 0.10074013040866703, "rewards/format_reward": 0.375, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 414.359375, "epoch": 3.134453781512605, "grad_norm": 0.4667985439300537, "kl": 0.0151519775390625, "learning_rate": 1e-06, "loss": 0.0299, "num_tokens": 23249858.0, "reward": 0.6344572305679321, "reward_std": 0.15162191167473793, "rewards/curriculum_aware_reward_fn": 0.13445723662152886, "rewards/format_reward": 0.5, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 482.109375, "epoch": 3.1512605042016806, "grad_norm": 0.4111727774143219, "kl": 0.013458251953125, "learning_rate": 1e-06, "loss": 0.0121, "num_tokens": 23370304.0, "reward": 0.4806743413209915, "reward_std": 0.052865433506667614, "rewards/curriculum_aware_reward_fn": 0.0431743401568383, "rewards/format_reward": 0.4375, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 500.5234375, "epoch": 3.168067226890756, "grad_norm": 0.4427432715892792, "kl": 0.01395416259765625, "learning_rate": 1e-06, "loss": -0.0198, "num_tokens": 23496979.0, "reward": 0.4243420949205756, "reward_std": 0.07115951599553227, "rewards/curriculum_aware_reward_fn": 0.11184210795909166, "rewards/format_reward": 0.3125, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 450.671875, "epoch": 3.184873949579832, "grad_norm": 0.4217956066131592, "kl": 0.0164947509765625, "learning_rate": 1e-06, "loss": 0.0361, "num_tokens": 23613281.0, "reward": 0.5629111751914024, "reward_std": 0.07686262531206012, "rewards/curriculum_aware_reward_fn": 0.12541118264198303, "rewards/format_reward": 0.4375, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 399.21875, "epoch": 3.2016806722689077, "grad_norm": 0.6111953258514404, "kl": 0.017852783203125, "learning_rate": 1e-06, "loss": 0.0248, "num_tokens": 23723725.0, "reward": 0.7121710479259491, "reward_std": 0.15234812535345554, "rewards/curriculum_aware_reward_fn": 0.08717105351388454, "rewards/format_reward": 0.625, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 433.015625, "epoch": 3.2184873949579833, "grad_norm": 0.4865033030509949, "kl": 0.0166778564453125, "learning_rate": 1e-06, "loss": -0.009, "num_tokens": 23835815.0, "reward": 0.7372532933950424, "reward_std": 0.13220055866986513, "rewards/curriculum_aware_reward_fn": 0.17475328966975212, "rewards/format_reward": 0.5625, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 525.4765625, "epoch": 3.235294117647059, "grad_norm": 0.3422640562057495, "kl": 0.016204833984375, "learning_rate": 1e-06, "loss": 0.0136, "num_tokens": 23964596.0, "reward": 0.43174342811107635, "reward_std": 0.09182633552700281, "rewards/curriculum_aware_reward_fn": 0.05674342066049576, "rewards/format_reward": 0.375, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 351.4453125, "epoch": 3.2521008403361344, "grad_norm": 0.5189781785011292, "kl": 0.023193359375, "learning_rate": 1e-06, "loss": 0.0349, "num_tokens": 24067141.0, "reward": 0.7643914222717285, "reward_std": 0.15736807510256767, "rewards/curriculum_aware_reward_fn": 0.1940789488144219, "rewards/format_reward": 0.5703125, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 460.296875, "epoch": 3.26890756302521, "grad_norm": 0.36804094910621643, "kl": 0.012298583984375, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 24187067.0, "reward": 0.48273026943206787, "reward_std": 0.037970013450831175, "rewards/curriculum_aware_reward_fn": 0.10773026570677757, "rewards/format_reward": 0.375, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 418.078125, "epoch": 3.2857142857142856, "grad_norm": 0.4727684259414673, "kl": 0.01959228515625, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 24300605.0, "reward": 0.5563322491943836, "reward_std": 0.06251880899071693, "rewards/curriculum_aware_reward_fn": 0.13445723743643612, "rewards/format_reward": 0.421875, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 498.9765625, "epoch": 3.302521008403361, "grad_norm": 0.5195404887199402, "kl": 0.01263427734375, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 24427482.0, "reward": 0.38569077104330063, "reward_std": 0.10553359193727374, "rewards/curriculum_aware_reward_fn": 0.07319078966975212, "rewards/format_reward": 0.3125, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 448.625, "epoch": 3.3193277310924367, "grad_norm": 0.49932360649108887, "kl": 0.017852783203125, "learning_rate": 1e-06, "loss": 0.0397, "num_tokens": 24541746.0, "reward": 0.5193256512284279, "reward_std": 0.10704736225306988, "rewards/curriculum_aware_reward_fn": 0.08182565891183913, "rewards/format_reward": 0.4375, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 377.1015625, "epoch": 3.3361344537815127, "grad_norm": 0.4484708309173584, "kl": 0.01934814453125, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 24647103.0, "reward": 0.6673519611358643, "reward_std": 0.06431722524575889, "rewards/curriculum_aware_reward_fn": 0.16735197603702545, "rewards/format_reward": 0.5, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 483.921875, "epoch": 3.3529411764705883, "grad_norm": 0.41696909070014954, "kl": 0.01053619384765625, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 24773253.0, "reward": 0.2717927638441324, "reward_std": 0.11790546495467424, "rewards/curriculum_aware_reward_fn": 0.08429276384413242, "rewards/format_reward": 0.1875, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 380.0546875, "epoch": 3.369747899159664, "grad_norm": 0.45737817883491516, "kl": 0.022186279296875, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 24881188.0, "reward": 0.7002467140555382, "reward_std": 0.03508220613002777, "rewards/curriculum_aware_reward_fn": 0.1377467131242156, "rewards/format_reward": 0.5625, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 390.3671875, "epoch": 3.3865546218487395, "grad_norm": 0.5029156804084778, "kl": 0.0277099609375, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 24991955.0, "reward": 0.6694078892469406, "reward_std": 0.10574874095618725, "rewards/curriculum_aware_reward_fn": 0.10690789762884378, "rewards/format_reward": 0.5625, "step": 200 }, { "epoch": 3.3865546218487395, "step": 200, "total_flos": 0.0, "train_loss": 0.010024200768093579, "train_runtime": 35564.3846, "train_samples_per_second": 0.72, "train_steps_per_second": 0.006 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }