Qwen2.5-1.5B-CCRL-1 / trainer_state.json
chansung's picture
Model save
56f259c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.3865546218487395,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 635.7109375,
"epoch": 0.01680672268907563,
"grad_norm": 0.31708475947380066,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": 0.0099,
"num_tokens": 143267.0,
"reward": 0.039062500349245965,
"reward_std": 0.0698821279220283,
"rewards/curriculum_aware_reward_fn": 0.023437500349245965,
"rewards/format_reward": 0.015625,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 527.6328125,
"epoch": 0.03361344537815126,
"grad_norm": 0.43825313448905945,
"kl": 0.0002913475036621094,
"learning_rate": 1e-06,
"loss": 0.0432,
"num_tokens": 270812.0,
"reward": 0.09292763145640492,
"reward_std": 0.12866380205377936,
"rewards/curriculum_aware_reward_fn": 0.06949013145640492,
"rewards/format_reward": 0.0234375,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 608.9921875,
"epoch": 0.05042016806722689,
"grad_norm": 0.4227641224861145,
"kl": 0.0002665519714355469,
"learning_rate": 1e-06,
"loss": -0.0273,
"num_tokens": 410971.0,
"reward": 0.059621710795909166,
"reward_std": 0.07889116508886218,
"rewards/curriculum_aware_reward_fn": 0.059621710795909166,
"rewards/format_reward": 0.0,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 558.921875,
"epoch": 0.06722689075630252,
"grad_norm": 0.4796917140483856,
"kl": 0.0002789497375488281,
"learning_rate": 1e-06,
"loss": -0.0009,
"num_tokens": 542313.0,
"reward": 0.08552631549537182,
"reward_std": 0.12651031091809273,
"rewards/curriculum_aware_reward_fn": 0.06990131689235568,
"rewards/format_reward": 0.015625,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 590.265625,
"epoch": 0.08403361344537816,
"grad_norm": 0.5620821118354797,
"kl": 0.0003027915954589844,
"learning_rate": 1e-06,
"loss": 0.0288,
"num_tokens": 677075.0,
"reward": 0.14925987273454666,
"reward_std": 0.24606542102992535,
"rewards/curriculum_aware_reward_fn": 0.09457236900925636,
"rewards/format_reward": 0.0546875,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 592.5234375,
"epoch": 0.10084033613445378,
"grad_norm": 0.4298699200153351,
"kl": 0.0003509521484375,
"learning_rate": 1e-06,
"loss": -0.0151,
"num_tokens": 812710.0,
"reward": 0.08840460644569248,
"reward_std": 0.1141207623295486,
"rewards/curriculum_aware_reward_fn": 0.03371710644569248,
"rewards/format_reward": 0.0546875,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 582.046875,
"epoch": 0.11764705882352941,
"grad_norm": 0.526942253112793,
"kl": 0.0004343986511230469,
"learning_rate": 1e-06,
"loss": 0.0192,
"num_tokens": 943268.0,
"reward": 0.12088815867900848,
"reward_std": 0.17540471255779266,
"rewards/curriculum_aware_reward_fn": 0.07401315867900848,
"rewards/format_reward": 0.046875,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 534.75,
"epoch": 0.13445378151260504,
"grad_norm": 0.44275274872779846,
"kl": 0.0003724098205566406,
"learning_rate": 1e-06,
"loss": -0.0033,
"num_tokens": 1074300.0,
"reward": 0.030016446253284812,
"reward_std": 0.08489933330565691,
"rewards/curriculum_aware_reward_fn": 0.014391447650268674,
"rewards/format_reward": 0.015625,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 539.09375,
"epoch": 0.15126050420168066,
"grad_norm": 0.5494865775108337,
"kl": 0.0007390975952148438,
"learning_rate": 1e-06,
"loss": 0.0036,
"num_tokens": 1197896.0,
"reward": 0.16570723708719015,
"reward_std": 0.21696669608354568,
"rewards/curriculum_aware_reward_fn": 0.05633223685435951,
"rewards/format_reward": 0.109375,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 593.7734375,
"epoch": 0.16806722689075632,
"grad_norm": 0.5171737670898438,
"kl": 0.0006322860717773438,
"learning_rate": 1e-06,
"loss": 0.0193,
"num_tokens": 1336931.0,
"reward": 0.11143092066049576,
"reward_std": 0.19064411148428917,
"rewards/curriculum_aware_reward_fn": 0.017680921009741724,
"rewards/format_reward": 0.09375,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 578.4765625,
"epoch": 0.18487394957983194,
"grad_norm": 0.6088258028030396,
"kl": 0.001346588134765625,
"learning_rate": 1e-06,
"loss": 0.037,
"num_tokens": 1467592.0,
"reward": 0.22944078594446182,
"reward_std": 0.3224767856299877,
"rewards/curriculum_aware_reward_fn": 0.04194079013541341,
"rewards/format_reward": 0.1875,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 601.171875,
"epoch": 0.20168067226890757,
"grad_norm": 0.4451327621936798,
"kl": 0.0010366439819335938,
"learning_rate": 1e-06,
"loss": 0.0148,
"num_tokens": 1607894.0,
"reward": 0.1204769799951464,
"reward_std": 0.1381341191008687,
"rewards/curriculum_aware_reward_fn": 0.018914473825134337,
"rewards/format_reward": 0.1015625,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 526.28125,
"epoch": 0.2184873949579832,
"grad_norm": 0.636314332485199,
"kl": 0.00191497802734375,
"learning_rate": 1e-06,
"loss": 0.0125,
"num_tokens": 1735650.0,
"reward": 0.26644736528396606,
"reward_std": 0.30141641572117805,
"rewards/curriculum_aware_reward_fn": 0.03988486935850233,
"rewards/format_reward": 0.2265625,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 507.515625,
"epoch": 0.23529411764705882,
"grad_norm": 0.6864922642707825,
"kl": 0.004413604736328125,
"learning_rate": 1e-06,
"loss": 0.0802,
"num_tokens": 1856316.0,
"reward": 0.3112664446234703,
"reward_std": 0.31644799932837486,
"rewards/curriculum_aware_reward_fn": 0.05345394683536142,
"rewards/format_reward": 0.2578125,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 554.5859375,
"epoch": 0.25210084033613445,
"grad_norm": 0.6268811225891113,
"kl": 0.0036067962646484375,
"learning_rate": 1e-06,
"loss": -0.0044,
"num_tokens": 1987511.0,
"reward": 0.4337993338704109,
"reward_std": 0.32329631969332695,
"rewards/curriculum_aware_reward_fn": 0.050986841320991516,
"rewards/format_reward": 0.3828125,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 584.3828125,
"epoch": 0.2689075630252101,
"grad_norm": 0.5531853437423706,
"kl": 0.003597259521484375,
"learning_rate": 1e-06,
"loss": 0.0104,
"num_tokens": 2119768.0,
"reward": 0.3828125037252903,
"reward_std": 0.26145630702376366,
"rewards/curriculum_aware_reward_fn": 0.0546875,
"rewards/format_reward": 0.328125,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 481.8046875,
"epoch": 0.2857142857142857,
"grad_norm": 0.6449251174926758,
"kl": 0.005481719970703125,
"learning_rate": 1e-06,
"loss": -0.0094,
"num_tokens": 2238911.0,
"reward": 0.4543585404753685,
"reward_std": 0.26075971499085426,
"rewards/curriculum_aware_reward_fn": 0.05592105304822326,
"rewards/format_reward": 0.3984375,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 645.75,
"epoch": 0.3025210084033613,
"grad_norm": 0.37918156385421753,
"kl": 0.001049041748046875,
"learning_rate": 1e-06,
"loss": 0.0055,
"num_tokens": 2385767.0,
"reward": 0.1451480264076963,
"reward_std": 0.1290158643387258,
"rewards/curriculum_aware_reward_fn": 0.04358552640769631,
"rewards/format_reward": 0.1015625,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 617.6953125,
"epoch": 0.31932773109243695,
"grad_norm": 0.39814478158950806,
"kl": 0.00528717041015625,
"learning_rate": 1e-06,
"loss": 0.0518,
"num_tokens": 2525656.0,
"reward": 0.35115131735801697,
"reward_std": 0.11648409254848957,
"rewards/curriculum_aware_reward_fn": 0.02302631549537182,
"rewards/format_reward": 0.328125,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 459.6015625,
"epoch": 0.33613445378151263,
"grad_norm": 0.7307525873184204,
"kl": 0.005184173583984375,
"learning_rate": 1e-06,
"loss": 0.083,
"num_tokens": 2644077.0,
"reward": 0.47574012726545334,
"reward_std": 0.2815094441175461,
"rewards/curriculum_aware_reward_fn": 0.04605263099074364,
"rewards/format_reward": 0.4296875,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 620.46875,
"epoch": 0.35294117647058826,
"grad_norm": 0.46509799361228943,
"kl": 0.0036363601684570312,
"learning_rate": 1e-06,
"loss": 0.0145,
"num_tokens": 2786169.0,
"reward": 0.24177631677594036,
"reward_std": 0.09853590792044997,
"rewards/curriculum_aware_reward_fn": 0.023026315728202462,
"rewards/format_reward": 0.21875,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 578.9609375,
"epoch": 0.3697478991596639,
"grad_norm": 0.5765166878700256,
"kl": 0.005565643310546875,
"learning_rate": 1e-06,
"loss": 0.0042,
"num_tokens": 2917180.0,
"reward": 0.4958881437778473,
"reward_std": 0.10692231869325042,
"rewards/curriculum_aware_reward_fn": 0.0740131582133472,
"rewards/format_reward": 0.421875,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 579.578125,
"epoch": 0.3865546218487395,
"grad_norm": 0.5340356826782227,
"kl": 0.00540924072265625,
"learning_rate": 1e-06,
"loss": -0.0083,
"num_tokens": 3053414.0,
"reward": 0.3708881437778473,
"reward_std": 0.11791826784610748,
"rewards/curriculum_aware_reward_fn": 0.06620065867900848,
"rewards/format_reward": 0.3046875,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 507.5625,
"epoch": 0.40336134453781514,
"grad_norm": 0.4752294719219208,
"kl": 0.031703948974609375,
"learning_rate": 1e-06,
"loss": -0.0004,
"num_tokens": 3181894.0,
"reward": 0.3700657896697521,
"reward_std": 0.1367718242108822,
"rewards/curriculum_aware_reward_fn": 0.002878289553336799,
"rewards/format_reward": 0.3671875,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 496.4296875,
"epoch": 0.42016806722689076,
"grad_norm": 0.46164318919181824,
"kl": 0.0082855224609375,
"learning_rate": 1e-06,
"loss": -0.0091,
"num_tokens": 3304077.0,
"reward": 0.5016447380185127,
"reward_std": 0.09064025245606899,
"rewards/curriculum_aware_reward_fn": 0.017269736621528864,
"rewards/format_reward": 0.484375,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 454.5859375,
"epoch": 0.4369747899159664,
"grad_norm": 0.5706049799919128,
"kl": 0.01887798309326172,
"learning_rate": 1e-06,
"loss": -0.0096,
"num_tokens": 3420488.0,
"reward": 0.6875,
"reward_std": 0.12697386741638184,
"rewards/curriculum_aware_reward_fn": 0.0234375,
"rewards/format_reward": 0.6640625,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 554.46875,
"epoch": 0.453781512605042,
"grad_norm": 0.45473384857177734,
"kl": 0.0068416595458984375,
"learning_rate": 1e-06,
"loss": 0.0074,
"num_tokens": 3552340.0,
"reward": 0.34868420753628016,
"reward_std": 0.10102300066500902,
"rewards/curriculum_aware_reward_fn": 0.012746710679493845,
"rewards/format_reward": 0.3359375,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 534.0,
"epoch": 0.47058823529411764,
"grad_norm": 0.348452091217041,
"kl": 0.01036834716796875,
"learning_rate": 1e-06,
"loss": 0.0145,
"num_tokens": 3677892.0,
"reward": 0.5571546033024788,
"reward_std": 0.055680982768535614,
"rewards/curriculum_aware_reward_fn": 0.010279605048708618,
"rewards/format_reward": 0.546875,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 584.03125,
"epoch": 0.48739495798319327,
"grad_norm": 0.452033668756485,
"kl": 0.0071258544921875,
"learning_rate": 1e-06,
"loss": 0.0115,
"num_tokens": 3813600.0,
"reward": 0.3984375,
"reward_std": 0.08443661965429783,
"rewards/curriculum_aware_reward_fn": 0.046875,
"rewards/format_reward": 0.3515625,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 552.4921875,
"epoch": 0.5042016806722689,
"grad_norm": 0.4926210641860962,
"kl": 0.005392551422119141,
"learning_rate": 1e-06,
"loss": 0.0285,
"num_tokens": 3947807.0,
"reward": 0.4683388201519847,
"reward_std": 0.11112732999026775,
"rewards/curriculum_aware_reward_fn": 0.03865131549537182,
"rewards/format_reward": 0.4296875,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 559.6953125,
"epoch": 0.5210084033613446,
"grad_norm": 0.5463467240333557,
"kl": 0.004418373107910156,
"learning_rate": 1e-06,
"loss": -0.0233,
"num_tokens": 4080704.0,
"reward": 0.22203946067020297,
"reward_std": 0.09257729165256023,
"rewards/curriculum_aware_reward_fn": 0.042351973708719015,
"rewards/format_reward": 0.1796875,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 549.921875,
"epoch": 0.5378151260504201,
"grad_norm": 0.36463335156440735,
"kl": 0.006511688232421875,
"learning_rate": 1e-06,
"loss": -0.0032,
"num_tokens": 4214870.0,
"reward": 0.4346217215061188,
"reward_std": 0.03605314111337066,
"rewards/curriculum_aware_reward_fn": 0.004934210563078523,
"rewards/format_reward": 0.4296875,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 478.7890625,
"epoch": 0.5546218487394958,
"grad_norm": 0.5116223692893982,
"kl": 0.008532524108886719,
"learning_rate": 1e-06,
"loss": -0.0153,
"num_tokens": 4338203.0,
"reward": 0.4560032896697521,
"reward_std": 0.12314211018383503,
"rewards/curriculum_aware_reward_fn": 0.08881578966975212,
"rewards/format_reward": 0.3671875,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 476.7890625,
"epoch": 0.5714285714285714,
"grad_norm": 0.43187472224235535,
"kl": 0.007843017578125,
"learning_rate": 1e-06,
"loss": 0.0134,
"num_tokens": 4461184.0,
"reward": 0.4333881586790085,
"reward_std": 0.12357822060585022,
"rewards/curriculum_aware_reward_fn": 0.02713815774768591,
"rewards/format_reward": 0.40625,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 529.7578125,
"epoch": 0.5882352941176471,
"grad_norm": 0.4466142952442169,
"kl": 0.0057315826416015625,
"learning_rate": 1e-06,
"loss": -0.015,
"num_tokens": 4590329.0,
"reward": 0.426809199154377,
"reward_std": 0.10671343095600605,
"rewards/curriculum_aware_reward_fn": 0.059621710097417235,
"rewards/format_reward": 0.3671875,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 521.9453125,
"epoch": 0.6050420168067226,
"grad_norm": 0.5088793635368347,
"kl": 0.00739288330078125,
"learning_rate": 1e-06,
"loss": 0.0193,
"num_tokens": 4717658.0,
"reward": 0.5740131624042988,
"reward_std": 0.09916227497160435,
"rewards/curriculum_aware_reward_fn": 0.08182565809693187,
"rewards/format_reward": 0.4921875,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 481.59375,
"epoch": 0.6218487394957983,
"grad_norm": 0.3755647540092468,
"kl": 0.005794525146484375,
"learning_rate": 1e-06,
"loss": 0.0106,
"num_tokens": 4837174.0,
"reward": 0.5123355314135551,
"reward_std": 0.023199534974992275,
"rewards/curriculum_aware_reward_fn": 0.0748355258256197,
"rewards/format_reward": 0.4375,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 465.1953125,
"epoch": 0.6386554621848739,
"grad_norm": 0.5442925691604614,
"kl": 0.008731842041015625,
"learning_rate": 1e-06,
"loss": -0.0111,
"num_tokens": 4953039.0,
"reward": 0.7232730239629745,
"reward_std": 0.1315580508671701,
"rewards/curriculum_aware_reward_fn": 0.16077302338089794,
"rewards/format_reward": 0.5625,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 501.484375,
"epoch": 0.6554621848739496,
"grad_norm": 0.4446295201778412,
"kl": 0.00624847412109375,
"learning_rate": 1e-06,
"loss": 0.0286,
"num_tokens": 5076965.0,
"reward": 0.47327301651239395,
"reward_std": 0.08440816402435303,
"rewards/curriculum_aware_reward_fn": 0.09827302861958742,
"rewards/format_reward": 0.375,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 539.15625,
"epoch": 0.6722689075630253,
"grad_norm": 0.37400856614112854,
"kl": 0.005260467529296875,
"learning_rate": 1e-06,
"loss": 0.0044,
"num_tokens": 5207185.0,
"reward": 0.4745065679308027,
"reward_std": 0.07072597183287144,
"rewards/curriculum_aware_reward_fn": 0.09950657887384295,
"rewards/format_reward": 0.375,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 492.0859375,
"epoch": 0.6890756302521008,
"grad_norm": 0.4103780686855316,
"kl": 0.00856781005859375,
"learning_rate": 1e-06,
"loss": 0.0049,
"num_tokens": 5328012.0,
"reward": 0.71875,
"reward_std": 0.10247145313769579,
"rewards/curriculum_aware_reward_fn": 0.09375000419095159,
"rewards/format_reward": 0.625,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 405.1328125,
"epoch": 0.7058823529411765,
"grad_norm": 0.6738374829292297,
"kl": 0.0108184814453125,
"learning_rate": 1e-06,
"loss": 0.0454,
"num_tokens": 5438933.0,
"reward": 0.757401317358017,
"reward_std": 0.212964728474617,
"rewards/curriculum_aware_reward_fn": 0.1636513164266944,
"rewards/format_reward": 0.59375,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 516.640625,
"epoch": 0.7226890756302521,
"grad_norm": 0.31194940209388733,
"kl": 0.0074005126953125,
"learning_rate": 1e-06,
"loss": -0.0205,
"num_tokens": 5563887.0,
"reward": 0.6562500149011612,
"reward_std": 0.04224720690399408,
"rewards/curriculum_aware_reward_fn": 0.15625,
"rewards/format_reward": 0.5,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 477.3984375,
"epoch": 0.7394957983193278,
"grad_norm": 0.38581541180610657,
"kl": 0.00885009765625,
"learning_rate": 1e-06,
"loss": -0.0164,
"num_tokens": 5688114.0,
"reward": 0.6402138248085976,
"reward_std": 0.08311590366065502,
"rewards/curriculum_aware_reward_fn": 0.03083881549537182,
"rewards/format_reward": 0.609375,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 521.8046875,
"epoch": 0.7563025210084033,
"grad_norm": 0.36903509497642517,
"kl": 0.0078277587890625,
"learning_rate": 1e-06,
"loss": -0.0022,
"num_tokens": 5814153.0,
"reward": 0.5513980239629745,
"reward_std": 0.06967925047501922,
"rewards/curriculum_aware_reward_fn": 0.05139802524354309,
"rewards/format_reward": 0.5,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 416.4296875,
"epoch": 0.773109243697479,
"grad_norm": 0.5821658968925476,
"kl": 0.0094757080078125,
"learning_rate": 1e-06,
"loss": -0.0056,
"num_tokens": 5923904.0,
"reward": 0.7257401347160339,
"reward_std": 0.13419464463368058,
"rewards/curriculum_aware_reward_fn": 0.10074013192206621,
"rewards/format_reward": 0.625,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 526.9296875,
"epoch": 0.7899159663865546,
"grad_norm": 0.449553519487381,
"kl": 0.005664825439453125,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 6053447.0,
"reward": 0.4819078892469406,
"reward_std": 0.09099963493645191,
"rewards/curriculum_aware_reward_fn": 0.10690789669752121,
"rewards/format_reward": 0.375,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 536.5,
"epoch": 0.8067226890756303,
"grad_norm": 0.5381475687026978,
"kl": 0.008424758911132812,
"learning_rate": 1e-06,
"loss": 0.0099,
"num_tokens": 6183559.0,
"reward": 0.46833881735801697,
"reward_std": 0.08668615715578198,
"rewards/curriculum_aware_reward_fn": 0.03865131642669439,
"rewards/format_reward": 0.4296875,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 539.6328125,
"epoch": 0.8235294117647058,
"grad_norm": 0.44155657291412354,
"kl": 0.0077495574951171875,
"learning_rate": 1e-06,
"loss": 0.0085,
"num_tokens": 6314544.0,
"reward": 0.5526315867900848,
"reward_std": 0.027912108227610588,
"rewards/curriculum_aware_reward_fn": 0.11513157933950424,
"rewards/format_reward": 0.4375,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 554.0546875,
"epoch": 0.8403361344537815,
"grad_norm": 0.4840262532234192,
"kl": 0.0054950714111328125,
"learning_rate": 1e-06,
"loss": 0.0073,
"num_tokens": 6445087.0,
"reward": 0.33634869009256363,
"reward_std": 0.10334387933835387,
"rewards/curriculum_aware_reward_fn": 0.03166118450462818,
"rewards/format_reward": 0.3046875,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 578.6796875,
"epoch": 0.8571428571428571,
"grad_norm": 0.30791598558425903,
"kl": 0.005002021789550781,
"learning_rate": 1e-06,
"loss": 0.0068,
"num_tokens": 6582878.0,
"reward": 0.348684199154377,
"reward_std": 0.07469352334737778,
"rewards/curriculum_aware_reward_fn": 0.036184209398925304,
"rewards/format_reward": 0.3125,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 448.1328125,
"epoch": 0.8739495798319328,
"grad_norm": 0.5027822852134705,
"kl": 0.00795745849609375,
"learning_rate": 1e-06,
"loss": -0.0178,
"num_tokens": 6698503.0,
"reward": 0.6311677470803261,
"reward_std": 0.11679959110915661,
"rewards/curriculum_aware_reward_fn": 0.09210526384413242,
"rewards/format_reward": 0.5390625,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 521.1875,
"epoch": 0.8907563025210085,
"grad_norm": 0.4084753394126892,
"kl": 0.00714111328125,
"learning_rate": 1e-06,
"loss": 0.0193,
"num_tokens": 6823951.0,
"reward": 0.5028782784938812,
"reward_std": 0.059696739073842764,
"rewards/curriculum_aware_reward_fn": 0.06537829001899809,
"rewards/format_reward": 0.4375,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 539.109375,
"epoch": 0.907563025210084,
"grad_norm": 0.2098054140806198,
"kl": 0.007198333740234375,
"learning_rate": 1e-06,
"loss": 0.0114,
"num_tokens": 6953317.0,
"reward": 0.46052631735801697,
"reward_std": 0.03168220818042755,
"rewards/curriculum_aware_reward_fn": 0.023026317358016968,
"rewards/format_reward": 0.4375,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 521.2265625,
"epoch": 0.9243697478991597,
"grad_norm": 0.4919142425060272,
"kl": 0.007293701171875,
"learning_rate": 1e-06,
"loss": 0.0101,
"num_tokens": 7079922.0,
"reward": 0.49547697603702545,
"reward_std": 0.10914274398237467,
"rewards/curriculum_aware_reward_fn": 0.12047697883099318,
"rewards/format_reward": 0.375,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 500.5625,
"epoch": 0.9411764705882353,
"grad_norm": 0.46875280141830444,
"kl": 0.00684356689453125,
"learning_rate": 1e-06,
"loss": 0.0266,
"num_tokens": 7206954.0,
"reward": 0.40830591320991516,
"reward_std": 0.1075905729085207,
"rewards/curriculum_aware_reward_fn": 0.04111842066049576,
"rewards/format_reward": 0.3671875,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 482.3046875,
"epoch": 0.957983193277311,
"grad_norm": 0.40924757719039917,
"kl": 0.012725830078125,
"learning_rate": 1e-06,
"loss": 0.0041,
"num_tokens": 7327857.0,
"reward": 0.5958059132099152,
"reward_std": 0.06403321353718638,
"rewards/curriculum_aware_reward_fn": 0.04111842007841915,
"rewards/format_reward": 0.5546875,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 445.375,
"epoch": 0.9747899159663865,
"grad_norm": 0.4467240273952484,
"kl": 0.0105743408203125,
"learning_rate": 1e-06,
"loss": -0.0061,
"num_tokens": 7440561.0,
"reward": 0.7578125,
"reward_std": 0.057358515448868275,
"rewards/curriculum_aware_reward_fn": 0.13281250069849193,
"rewards/format_reward": 0.625,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 582.3452377319336,
"epoch": 0.9915966386554622,
"grad_norm": 0.5007306933403015,
"kl": 0.007415771484375,
"learning_rate": 1e-06,
"loss": 0.0029,
"num_tokens": 7569086.0,
"reward": 0.4514802545309067,
"reward_std": 0.06341935088858008,
"rewards/curriculum_aware_reward_fn": 0.0217927637277171,
"rewards/format_reward": 0.4296875,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 553.09375,
"epoch": 1.0168067226890756,
"grad_norm": 0.4292355179786682,
"kl": 0.005462646484375,
"learning_rate": 1e-06,
"loss": 0.0119,
"num_tokens": 7702626.0,
"reward": 0.4325658082962036,
"reward_std": 0.07455102633684874,
"rewards/curriculum_aware_reward_fn": 0.05756579013541341,
"rewards/format_reward": 0.375,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 521.375,
"epoch": 1.0336134453781514,
"grad_norm": 0.41578003764152527,
"kl": 0.008762359619140625,
"learning_rate": 1e-06,
"loss": -0.0056,
"num_tokens": 7827818.0,
"reward": 0.5082236900925636,
"reward_std": 0.07253926200792193,
"rewards/curriculum_aware_reward_fn": 0.07072368497028947,
"rewards/format_reward": 0.4375,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 648.2734375,
"epoch": 1.050420168067227,
"grad_norm": 0.48642197251319885,
"kl": 0.0062713623046875,
"learning_rate": 1e-06,
"loss": 0.0136,
"num_tokens": 7974333.0,
"reward": 0.3449835442006588,
"reward_std": 0.07259867247194052,
"rewards/curriculum_aware_reward_fn": 0.03248355258256197,
"rewards/format_reward": 0.3125,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 436.9296875,
"epoch": 1.0672268907563025,
"grad_norm": 0.3184286653995514,
"kl": 0.0114593505859375,
"learning_rate": 1e-06,
"loss": 0.0139,
"num_tokens": 8084908.0,
"reward": 0.6899671256542206,
"reward_std": 0.0728745711967349,
"rewards/curriculum_aware_reward_fn": 0.0649671049322933,
"rewards/format_reward": 0.625,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 541.53125,
"epoch": 1.084033613445378,
"grad_norm": 0.16483676433563232,
"kl": 0.0060882568359375,
"learning_rate": 1e-06,
"loss": -0.0032,
"num_tokens": 8216696.0,
"reward": 0.2627467066049576,
"reward_std": 0.024391429498791695,
"rewards/curriculum_aware_reward_fn": 0.012746710330247879,
"rewards/format_reward": 0.25,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 509.7890625,
"epoch": 1.1008403361344539,
"grad_norm": 0.4256879985332489,
"kl": 0.00730133056640625,
"learning_rate": 1e-06,
"loss": -0.0102,
"num_tokens": 8342845.0,
"reward": 0.5197368264198303,
"reward_std": 0.030515023041516542,
"rewards/curriculum_aware_reward_fn": 0.019736842485144734,
"rewards/format_reward": 0.5,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 486.296875,
"epoch": 1.1176470588235294,
"grad_norm": 0.3091375231742859,
"kl": 0.008016586303710938,
"learning_rate": 1e-06,
"loss": -0.003,
"num_tokens": 8462971.0,
"reward": 0.46299341320991516,
"reward_std": 0.04847824294120073,
"rewards/curriculum_aware_reward_fn": 0.025493420660495758,
"rewards/format_reward": 0.4375,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 596.234375,
"epoch": 1.134453781512605,
"grad_norm": 0.4554305076599121,
"kl": 0.006458282470703125,
"learning_rate": 1e-06,
"loss": 0.011,
"num_tokens": 8598337.0,
"reward": 0.3758223643526435,
"reward_std": 0.08455474488437176,
"rewards/curriculum_aware_reward_fn": 0.1258223680779338,
"rewards/format_reward": 0.25,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 444.265625,
"epoch": 1.1512605042016806,
"grad_norm": 0.4700126349925995,
"kl": 0.013336181640625,
"learning_rate": 1e-06,
"loss": -0.0065,
"num_tokens": 8715091.0,
"reward": 0.67434211820364,
"reward_std": 0.12386543769389391,
"rewards/curriculum_aware_reward_fn": 0.11965460598003119,
"rewards/format_reward": 0.5546875,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 537.6484375,
"epoch": 1.1680672268907564,
"grad_norm": 0.5387859344482422,
"kl": 0.0084075927734375,
"learning_rate": 1e-06,
"loss": 0.0113,
"num_tokens": 8845582.0,
"reward": 0.5822368338704109,
"reward_std": 0.16140672331675887,
"rewards/curriculum_aware_reward_fn": 0.10567433899268508,
"rewards/format_reward": 0.4765625,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 571.25,
"epoch": 1.184873949579832,
"grad_norm": 0.28276559710502625,
"kl": 0.005802154541015625,
"learning_rate": 1e-06,
"loss": 0.0147,
"num_tokens": 8979574.0,
"reward": 0.2606907826848328,
"reward_std": 0.051840442698448896,
"rewards/curriculum_aware_reward_fn": 0.018503289436921477,
"rewards/format_reward": 0.2421875,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 495.6953125,
"epoch": 1.2016806722689075,
"grad_norm": 0.3467198312282562,
"kl": 0.007556915283203125,
"learning_rate": 1e-06,
"loss": 0.0133,
"num_tokens": 9104143.0,
"reward": 0.5476973727345467,
"reward_std": 0.0878668250516057,
"rewards/curriculum_aware_reward_fn": 0.04769736947491765,
"rewards/format_reward": 0.5,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 628.96875,
"epoch": 1.2184873949579833,
"grad_norm": 0.30438435077667236,
"kl": 0.0047740936279296875,
"learning_rate": 1e-06,
"loss": -0.0058,
"num_tokens": 9247579.0,
"reward": 0.25863486528396606,
"reward_std": 0.05783074861392379,
"rewards/curriculum_aware_reward_fn": 0.016447368427179754,
"rewards/format_reward": 0.2421875,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 581.7109375,
"epoch": 1.2352941176470589,
"grad_norm": 0.16290180385112762,
"kl": 0.005523681640625,
"learning_rate": 1e-06,
"loss": 0.0069,
"num_tokens": 9383094.0,
"reward": 0.32195723056793213,
"reward_std": 0.014439198188483715,
"rewards/curriculum_aware_reward_fn": 0.009457237087190151,
"rewards/format_reward": 0.3125,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 536.9609375,
"epoch": 1.2521008403361344,
"grad_norm": 1.2357046604156494,
"kl": 0.170867919921875,
"learning_rate": 1e-06,
"loss": -0.0054,
"num_tokens": 9513217.0,
"reward": 0.582236819434911,
"reward_std": 0.0510927583090961,
"rewards/curriculum_aware_reward_fn": 0.01973684225231409,
"rewards/format_reward": 0.5625,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 487.8671875,
"epoch": 1.26890756302521,
"grad_norm": 0.46429404616355896,
"kl": 0.0113677978515625,
"learning_rate": 1e-06,
"loss": 0.0446,
"num_tokens": 9635408.0,
"reward": 0.726973682641983,
"reward_std": 0.11705214250832796,
"rewards/curriculum_aware_reward_fn": 0.10197368450462818,
"rewards/format_reward": 0.625,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 584.296875,
"epoch": 1.2857142857142856,
"grad_norm": 0.42755427956581116,
"kl": 0.00647735595703125,
"learning_rate": 1e-06,
"loss": 0.0307,
"num_tokens": 9770998.0,
"reward": 0.49136512726545334,
"reward_std": 0.10772840678691864,
"rewards/curriculum_aware_reward_fn": 0.06167763099074364,
"rewards/format_reward": 0.4296875,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 429.296875,
"epoch": 1.3025210084033614,
"grad_norm": 0.45878008008003235,
"kl": 0.01023101806640625,
"learning_rate": 1e-06,
"loss": 0.0156,
"num_tokens": 9886868.0,
"reward": 0.7347861528396606,
"reward_std": 0.10009488789364696,
"rewards/curriculum_aware_reward_fn": 0.06291118392255157,
"rewards/format_reward": 0.671875,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 516.6328125,
"epoch": 1.319327731092437,
"grad_norm": 0.3113223910331726,
"kl": 0.0077972412109375,
"learning_rate": 1e-06,
"loss": 0.0078,
"num_tokens": 10011221.0,
"reward": 0.5966282933950424,
"reward_std": 0.041548303328454494,
"rewards/curriculum_aware_reward_fn": 0.09662828780710697,
"rewards/format_reward": 0.5,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 557.4765625,
"epoch": 1.3361344537815127,
"grad_norm": 0.33871227502822876,
"kl": 0.0073699951171875,
"learning_rate": 1e-06,
"loss": 0.0191,
"num_tokens": 10140970.0,
"reward": 0.5415295958518982,
"reward_std": 0.07458627689629793,
"rewards/curriculum_aware_reward_fn": 0.04152960516512394,
"rewards/format_reward": 0.5,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 564.125,
"epoch": 1.3529411764705883,
"grad_norm": 0.4491986930370331,
"kl": 0.006259918212890625,
"learning_rate": 1e-06,
"loss": 0.0074,
"num_tokens": 10271986.0,
"reward": 0.5296052545309067,
"reward_std": 0.1359914354979992,
"rewards/curriculum_aware_reward_fn": 0.09210526291280985,
"rewards/format_reward": 0.4375,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 503.125,
"epoch": 1.3697478991596639,
"grad_norm": 0.2838430404663086,
"kl": 0.00777435302734375,
"learning_rate": 1e-06,
"loss": 0.0186,
"num_tokens": 10398090.0,
"reward": 0.6521381586790085,
"reward_std": 0.05697542009875178,
"rewards/curriculum_aware_reward_fn": 0.027138158679008484,
"rewards/format_reward": 0.625,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 530.234375,
"epoch": 1.3865546218487395,
"grad_norm": 0.4765428602695465,
"kl": 0.00778961181640625,
"learning_rate": 1e-06,
"loss": 0.0302,
"num_tokens": 10526192.0,
"reward": 0.6208881437778473,
"reward_std": 0.12499829288572073,
"rewards/curriculum_aware_reward_fn": 0.06620065588504076,
"rewards/format_reward": 0.5546875,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 566.84375,
"epoch": 1.403361344537815,
"grad_norm": 0.4760180711746216,
"kl": 0.0066986083984375,
"learning_rate": 1e-06,
"loss": 0.0084,
"num_tokens": 10657412.0,
"reward": 0.46916117519140244,
"reward_std": 0.10547287575900555,
"rewards/curriculum_aware_reward_fn": 0.09416118310764432,
"rewards/format_reward": 0.375,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 560.7734375,
"epoch": 1.4201680672268908,
"grad_norm": 0.27778276801109314,
"kl": 0.005718231201171875,
"learning_rate": 1e-06,
"loss": 0.0127,
"num_tokens": 10788255.0,
"reward": 0.44736841320991516,
"reward_std": 0.06990169547498226,
"rewards/curriculum_aware_reward_fn": 0.07236842112615705,
"rewards/format_reward": 0.375,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 484.9296875,
"epoch": 1.4369747899159664,
"grad_norm": 0.34481725096702576,
"kl": 0.02048492431640625,
"learning_rate": 1e-06,
"loss": -0.0032,
"num_tokens": 10911166.0,
"reward": 0.7388980239629745,
"reward_std": 0.08143611438572407,
"rewards/curriculum_aware_reward_fn": 0.1138980237301439,
"rewards/format_reward": 0.625,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 457.71875,
"epoch": 1.453781512605042,
"grad_norm": 0.4829816222190857,
"kl": 0.0100555419921875,
"learning_rate": 1e-06,
"loss": -0.0016,
"num_tokens": 11027554.0,
"reward": 0.6735197305679321,
"reward_std": 0.08864451944828033,
"rewards/curriculum_aware_reward_fn": 0.11101973801851273,
"rewards/format_reward": 0.5625,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 508.6953125,
"epoch": 1.4705882352941178,
"grad_norm": 0.5016542077064514,
"kl": 0.00922393798828125,
"learning_rate": 1e-06,
"loss": 0.01,
"num_tokens": 11149275.0,
"reward": 0.6870888322591782,
"reward_std": 0.08495050063356757,
"rewards/curriculum_aware_reward_fn": 0.12458881677594036,
"rewards/format_reward": 0.5625,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 602.9921875,
"epoch": 1.4873949579831933,
"grad_norm": 0.29301658272743225,
"kl": 0.004894256591796875,
"learning_rate": 1e-06,
"loss": 0.0249,
"num_tokens": 11288106.0,
"reward": 0.29481907188892365,
"reward_std": 0.0620402698405087,
"rewards/curriculum_aware_reward_fn": 0.04481907875742763,
"rewards/format_reward": 0.25,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 438.71875,
"epoch": 1.504201680672269,
"grad_norm": 0.5715950727462769,
"kl": 0.01503753662109375,
"learning_rate": 1e-06,
"loss": -0.0041,
"num_tokens": 11401118.0,
"reward": 0.8972039222717285,
"reward_std": 0.10221139155328274,
"rewards/curriculum_aware_reward_fn": 0.1472039446234703,
"rewards/format_reward": 0.75,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 547.015625,
"epoch": 1.5210084033613445,
"grad_norm": 0.31229323148727417,
"kl": 0.0074462890625,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 11531400.0,
"reward": 0.5945723652839661,
"reward_std": 0.05676991865038872,
"rewards/curriculum_aware_reward_fn": 0.15707236900925636,
"rewards/format_reward": 0.4375,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 599.9375,
"epoch": 1.53781512605042,
"grad_norm": 0.3754754066467285,
"kl": 0.005001068115234375,
"learning_rate": 1e-06,
"loss": 0.0337,
"num_tokens": 11667224.0,
"reward": 0.4358552396297455,
"reward_std": 0.1078398427926004,
"rewards/curriculum_aware_reward_fn": 0.060855261399410665,
"rewards/format_reward": 0.375,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 563.3671875,
"epoch": 1.5546218487394958,
"grad_norm": 0.44682905077934265,
"kl": 0.00695037841796875,
"learning_rate": 1e-06,
"loss": 0.0261,
"num_tokens": 11800087.0,
"reward": 0.47820721566677094,
"reward_std": 0.11488656094297767,
"rewards/curriculum_aware_reward_fn": 0.10320723743643612,
"rewards/format_reward": 0.375,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 524.7578125,
"epoch": 1.5714285714285714,
"grad_norm": 0.4093223214149475,
"kl": 0.0079803466796875,
"learning_rate": 1e-06,
"loss": -0.0003,
"num_tokens": 11927808.0,
"reward": 0.5555098727345467,
"reward_std": 0.10677139926701784,
"rewards/curriculum_aware_reward_fn": 0.06332236900925636,
"rewards/format_reward": 0.4921875,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 612.375,
"epoch": 1.5882352941176472,
"grad_norm": 0.28754857182502747,
"kl": 0.004489898681640625,
"learning_rate": 1e-06,
"loss": -0.0277,
"num_tokens": 12069560.0,
"reward": 0.3371710553765297,
"reward_std": 0.050214093178510666,
"rewards/curriculum_aware_reward_fn": 0.02467105258256197,
"rewards/format_reward": 0.3125,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 383.2578125,
"epoch": 1.6050420168067228,
"grad_norm": 0.47502318024635315,
"kl": 0.0126953125,
"learning_rate": 1e-06,
"loss": 0.016,
"num_tokens": 12176625.0,
"reward": 0.7224506512284279,
"reward_std": 0.10677911480888724,
"rewards/curriculum_aware_reward_fn": 0.10526315728202462,
"rewards/format_reward": 0.6171875,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 578.078125,
"epoch": 1.6218487394957983,
"grad_norm": 0.34693828225135803,
"kl": 0.006988525390625,
"learning_rate": 1e-06,
"loss": 0.006,
"num_tokens": 12310939.0,
"reward": 0.5254934206604958,
"reward_std": 0.06210480257868767,
"rewards/curriculum_aware_reward_fn": 0.08799342112615705,
"rewards/format_reward": 0.4375,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 529.828125,
"epoch": 1.638655462184874,
"grad_norm": 2.9580295085906982,
"kl": 0.21123504638671875,
"learning_rate": 1e-06,
"loss": 0.0019,
"num_tokens": 12436949.0,
"reward": 0.5230263099074364,
"reward_std": 0.13364601507782936,
"rewards/curriculum_aware_reward_fn": 0.11677631549537182,
"rewards/format_reward": 0.40625,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 470.8828125,
"epoch": 1.6554621848739495,
"grad_norm": 0.39620673656463623,
"kl": 0.00954437255859375,
"learning_rate": 1e-06,
"loss": -0.0048,
"num_tokens": 12558190.0,
"reward": 0.8194901347160339,
"reward_std": 0.09049705043435097,
"rewards/curriculum_aware_reward_fn": 0.26480263471603394,
"rewards/format_reward": 0.5546875,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 495.3515625,
"epoch": 1.6722689075630253,
"grad_norm": 0.5109691619873047,
"kl": 0.007015228271484375,
"learning_rate": 1e-06,
"loss": 0.0351,
"num_tokens": 12681859.0,
"reward": 0.4362664595246315,
"reward_std": 0.0971333347260952,
"rewards/curriculum_aware_reward_fn": 0.1237664483487606,
"rewards/format_reward": 0.3125,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 478.0703125,
"epoch": 1.6890756302521008,
"grad_norm": 0.4189630150794983,
"kl": 0.0095977783203125,
"learning_rate": 1e-06,
"loss": 0.0011,
"num_tokens": 12801148.0,
"reward": 0.6920230239629745,
"reward_std": 0.10883715003728867,
"rewards/curriculum_aware_reward_fn": 0.19983552768826485,
"rewards/format_reward": 0.4921875,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 460.6875,
"epoch": 1.7058823529411766,
"grad_norm": 0.5282026529312134,
"kl": 0.007904052734375,
"learning_rate": 1e-06,
"loss": -0.0042,
"num_tokens": 12921692.0,
"reward": 0.3396381661295891,
"reward_std": 0.11080991290509701,
"rewards/curriculum_aware_reward_fn": 0.04276315798051655,
"rewards/format_reward": 0.296875,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 554.40625,
"epoch": 1.7226890756302522,
"grad_norm": 0.5177521109580994,
"kl": 0.01079559326171875,
"learning_rate": 1e-06,
"loss": -0.009,
"num_tokens": 13052136.0,
"reward": 0.36965460516512394,
"reward_std": 0.10201659612357616,
"rewards/curriculum_aware_reward_fn": 0.01809210516512394,
"rewards/format_reward": 0.3515625,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 524.421875,
"epoch": 1.7394957983193278,
"grad_norm": 0.44328662753105164,
"kl": 0.008655548095703125,
"learning_rate": 1e-06,
"loss": 0.0114,
"num_tokens": 13178822.0,
"reward": 0.5349506437778473,
"reward_std": 0.12413342297077179,
"rewards/curriculum_aware_reward_fn": 0.058388158679008484,
"rewards/format_reward": 0.4765625,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 446.46875,
"epoch": 1.7563025210084033,
"grad_norm": 0.647972583770752,
"kl": 0.01692962646484375,
"learning_rate": 1e-06,
"loss": 0.0047,
"num_tokens": 13297402.0,
"reward": 0.6476151421666145,
"reward_std": 0.22924628667533398,
"rewards/curriculum_aware_reward_fn": 0.07730263285338879,
"rewards/format_reward": 0.5703125,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 503.703125,
"epoch": 1.773109243697479,
"grad_norm": 0.631151556968689,
"kl": 0.008514404296875,
"learning_rate": 1e-06,
"loss": 0.0008,
"num_tokens": 13418340.0,
"reward": 0.46299341320991516,
"reward_std": 0.2022387906908989,
"rewards/curriculum_aware_reward_fn": 0.06455592066049576,
"rewards/format_reward": 0.3984375,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 562.03125,
"epoch": 1.7899159663865545,
"grad_norm": 0.3566150963306427,
"kl": 0.006641387939453125,
"learning_rate": 1e-06,
"loss": -0.0002,
"num_tokens": 13550952.0,
"reward": 0.35773025802336633,
"reward_std": 0.09330996312201023,
"rewards/curriculum_aware_reward_fn": 0.05304276151582599,
"rewards/format_reward": 0.3046875,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 539.1796875,
"epoch": 1.8067226890756303,
"grad_norm": 0.4120214581489563,
"kl": 0.00933074951171875,
"learning_rate": 1e-06,
"loss": 0.0026,
"num_tokens": 13678999.0,
"reward": 0.5435855314135551,
"reward_std": 0.15557273291051388,
"rewards/curriculum_aware_reward_fn": 0.12171052396297455,
"rewards/format_reward": 0.421875,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 542.1796875,
"epoch": 1.8235294117647058,
"grad_norm": 0.36332470178604126,
"kl": 0.00751495361328125,
"learning_rate": 1e-06,
"loss": 0.0098,
"num_tokens": 13811206.0,
"reward": 0.48643091320991516,
"reward_std": 0.13410842791199684,
"rewards/curriculum_aware_reward_fn": 0.11924342392012477,
"rewards/format_reward": 0.3671875,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 553.953125,
"epoch": 1.8403361344537816,
"grad_norm": 0.3152480721473694,
"kl": 0.00626373291015625,
"learning_rate": 1e-06,
"loss": 0.0016,
"num_tokens": 13945808.0,
"reward": 0.3293585479259491,
"reward_std": 0.045257058925926685,
"rewards/curriculum_aware_reward_fn": 0.016858553048223257,
"rewards/format_reward": 0.3125,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 573.59375,
"epoch": 1.8571428571428572,
"grad_norm": 0.2340080589056015,
"kl": 0.00682830810546875,
"learning_rate": 1e-06,
"loss": 0.0071,
"num_tokens": 14080460.0,
"reward": 0.3347039520740509,
"reward_std": 0.038679007440805435,
"rewards/curriculum_aware_reward_fn": 0.02220394741743803,
"rewards/format_reward": 0.3125,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 463.0390625,
"epoch": 1.8739495798319328,
"grad_norm": 0.36526933312416077,
"kl": 0.009578704833984375,
"learning_rate": 1e-06,
"loss": 0.0039,
"num_tokens": 14201065.0,
"reward": 0.6328125149011612,
"reward_std": 0.05027205403894186,
"rewards/curriculum_aware_reward_fn": 0.07031250046566129,
"rewards/format_reward": 0.5625,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 481.9609375,
"epoch": 1.8907563025210083,
"grad_norm": 0.4954119324684143,
"kl": 0.0100555419921875,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 14323068.0,
"reward": 0.5254934206604958,
"reward_std": 0.12779070809483528,
"rewards/curriculum_aware_reward_fn": 0.08799342159181833,
"rewards/format_reward": 0.4375,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 494.96875,
"epoch": 1.907563025210084,
"grad_norm": 0.46778982877731323,
"kl": 0.00978851318359375,
"learning_rate": 1e-06,
"loss": 0.0199,
"num_tokens": 14447008.0,
"reward": 0.5370065793395042,
"reward_std": 0.1048955712467432,
"rewards/curriculum_aware_reward_fn": 0.09950657980516553,
"rewards/format_reward": 0.4375,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 501.6796875,
"epoch": 1.9243697478991597,
"grad_norm": 0.3055194616317749,
"kl": 0.00933074951171875,
"learning_rate": 1e-06,
"loss": 0.0032,
"num_tokens": 14571103.0,
"reward": 0.5111019909381866,
"reward_std": 0.024554526433348656,
"rewards/curriculum_aware_reward_fn": 0.08141447440721095,
"rewards/format_reward": 0.4296875,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 508.8203125,
"epoch": 1.9411764705882353,
"grad_norm": 0.4632183611392975,
"kl": 0.012451171875,
"learning_rate": 1e-06,
"loss": 0.0157,
"num_tokens": 14694424.0,
"reward": 0.6089638024568558,
"reward_std": 0.10860061645507812,
"rewards/curriculum_aware_reward_fn": 0.11677631549537182,
"rewards/format_reward": 0.4921875,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 495.6875,
"epoch": 1.957983193277311,
"grad_norm": 0.41369161009788513,
"kl": 0.0089874267578125,
"learning_rate": 1e-06,
"loss": 0.0212,
"num_tokens": 14819792.0,
"reward": 0.4621710479259491,
"reward_std": 0.07010683044791222,
"rewards/curriculum_aware_reward_fn": 0.0871710479259491,
"rewards/format_reward": 0.375,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 529.359375,
"epoch": 1.9747899159663866,
"grad_norm": 0.40478190779685974,
"kl": 0.012042999267578125,
"learning_rate": 1e-06,
"loss": 0.0388,
"num_tokens": 14946718.0,
"reward": 0.48190788179636,
"reward_std": 0.10751516558229923,
"rewards/curriculum_aware_reward_fn": 0.11472039762884378,
"rewards/format_reward": 0.3671875,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 493.4881134033203,
"epoch": 1.9915966386554622,
"grad_norm": 0.3562357425689697,
"kl": 0.0123748779296875,
"learning_rate": 1e-06,
"loss": 0.0141,
"num_tokens": 15064457.0,
"reward": 0.6706414446234703,
"reward_std": 0.101046122610569,
"rewards/curriculum_aware_reward_fn": 0.05345394788309932,
"rewards/format_reward": 0.6171875,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 517.7578125,
"epoch": 2.0168067226890756,
"grad_norm": 0.3487071394920349,
"kl": 0.0104217529296875,
"learning_rate": 1e-06,
"loss": 0.0163,
"num_tokens": 15191538.0,
"reward": 0.5201480090618134,
"reward_std": 0.04716231161728501,
"rewards/curriculum_aware_reward_fn": 0.02014802652411163,
"rewards/format_reward": 0.5,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 577.765625,
"epoch": 2.033613445378151,
"grad_norm": 0.35752227902412415,
"kl": 0.008148193359375,
"learning_rate": 1e-06,
"loss": 0.0108,
"num_tokens": 15327204.0,
"reward": 0.42763157933950424,
"reward_std": 0.09388388879597187,
"rewards/curriculum_aware_reward_fn": 0.05263157933950424,
"rewards/format_reward": 0.375,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 460.53125,
"epoch": 2.0504201680672267,
"grad_norm": 0.5020465850830078,
"kl": 0.014190673828125,
"learning_rate": 1e-06,
"loss": 0.0113,
"num_tokens": 15447608.0,
"reward": 0.693256601691246,
"reward_std": 0.12680460885167122,
"rewards/curriculum_aware_reward_fn": 0.06825657840818167,
"rewards/format_reward": 0.625,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 526.7890625,
"epoch": 2.0672268907563027,
"grad_norm": 0.33090242743492126,
"kl": 0.00830841064453125,
"learning_rate": 1e-06,
"loss": 0.0212,
"num_tokens": 15577021.0,
"reward": 0.3022203971631825,
"reward_std": 0.052566134836524725,
"rewards/curriculum_aware_reward_fn": 0.0522203971631825,
"rewards/format_reward": 0.25,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 465.390625,
"epoch": 2.0840336134453783,
"grad_norm": 0.25564736127853394,
"kl": 0.018894195556640625,
"learning_rate": 1e-06,
"loss": 0.001,
"num_tokens": 15693543.0,
"reward": 0.5879934281110764,
"reward_std": 0.03513536183163524,
"rewards/curriculum_aware_reward_fn": 0.15830592159181833,
"rewards/format_reward": 0.4296875,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 438.015625,
"epoch": 2.100840336134454,
"grad_norm": 0.5210288763046265,
"kl": 0.0128936767578125,
"learning_rate": 1e-06,
"loss": 0.038,
"num_tokens": 15805441.0,
"reward": 0.7685032784938812,
"reward_std": 0.15490676742047071,
"rewards/curriculum_aware_reward_fn": 0.20600328128784895,
"rewards/format_reward": 0.5625,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 419.3515625,
"epoch": 2.1176470588235294,
"grad_norm": 0.48274165391921997,
"kl": 0.01959228515625,
"learning_rate": 1e-06,
"loss": 0.0226,
"num_tokens": 15913862.0,
"reward": 0.671875,
"reward_std": 0.11604671645909548,
"rewards/curriculum_aware_reward_fn": 0.10937500139698386,
"rewards/format_reward": 0.5625,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 520.8203125,
"epoch": 2.134453781512605,
"grad_norm": 0.35000789165496826,
"kl": 0.0090179443359375,
"learning_rate": 1e-06,
"loss": -0.0026,
"num_tokens": 16041007.0,
"reward": 0.49794407607987523,
"reward_std": 0.10071868449449539,
"rewards/curriculum_aware_reward_fn": 0.12294407980516553,
"rewards/format_reward": 0.375,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 557.6328125,
"epoch": 2.1512605042016806,
"grad_norm": 0.5103374719619751,
"kl": 0.0096435546875,
"learning_rate": 1e-06,
"loss": 0.0051,
"num_tokens": 16173728.0,
"reward": 0.45641446858644485,
"reward_std": 0.10976400738582015,
"rewards/curriculum_aware_reward_fn": 0.08141447091475129,
"rewards/format_reward": 0.375,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 414.9140625,
"epoch": 2.168067226890756,
"grad_norm": 0.43994390964508057,
"kl": 0.014190673828125,
"learning_rate": 1e-06,
"loss": -0.0002,
"num_tokens": 16285445.0,
"reward": 0.7236842215061188,
"reward_std": 0.11914092372171581,
"rewards/curriculum_aware_reward_fn": 0.09868421289138496,
"rewards/format_reward": 0.625,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 567.921875,
"epoch": 2.184873949579832,
"grad_norm": 0.319624662399292,
"kl": 0.0082244873046875,
"learning_rate": 1e-06,
"loss": 0.0285,
"num_tokens": 16420019.0,
"reward": 0.4259868264198303,
"reward_std": 0.05608854768797755,
"rewards/curriculum_aware_reward_fn": 0.11348683759570122,
"rewards/format_reward": 0.3125,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 463.53125,
"epoch": 2.2016806722689077,
"grad_norm": 0.359430193901062,
"kl": 0.014495849609375,
"learning_rate": 1e-06,
"loss": 0.0185,
"num_tokens": 16541143.0,
"reward": 0.4699835479259491,
"reward_std": 0.08584295958280563,
"rewards/curriculum_aware_reward_fn": 0.0949835516512394,
"rewards/format_reward": 0.375,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 469.609375,
"epoch": 2.2184873949579833,
"grad_norm": 0.41892191767692566,
"kl": 0.0117034912109375,
"learning_rate": 1e-06,
"loss": 0.0365,
"num_tokens": 16662909.0,
"reward": 0.5522204041481018,
"reward_std": 0.0973742357455194,
"rewards/curriculum_aware_reward_fn": 0.052220395184122026,
"rewards/format_reward": 0.5,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 540.5703125,
"epoch": 2.235294117647059,
"grad_norm": 0.48490580916404724,
"kl": 0.0093231201171875,
"learning_rate": 1e-06,
"loss": -0.0203,
"num_tokens": 16795070.0,
"reward": 0.41324013471603394,
"reward_std": 0.08475807495415211,
"rewards/curriculum_aware_reward_fn": 0.038240132853388786,
"rewards/format_reward": 0.375,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 525.1796875,
"epoch": 2.2521008403361344,
"grad_norm": 0.4449516832828522,
"kl": 0.0105438232421875,
"learning_rate": 1e-06,
"loss": 0.0023,
"num_tokens": 16923613.0,
"reward": 0.5604440867900848,
"reward_std": 0.1288975402712822,
"rewards/curriculum_aware_reward_fn": 0.12294407933950424,
"rewards/format_reward": 0.4375,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 476.3125,
"epoch": 2.26890756302521,
"grad_norm": 0.4340604543685913,
"kl": 0.01129150390625,
"learning_rate": 1e-06,
"loss": -0.028,
"num_tokens": 17045693.0,
"reward": 0.5587993413209915,
"reward_std": 0.09385511744767427,
"rewards/curriculum_aware_reward_fn": 0.058799343183636665,
"rewards/format_reward": 0.5,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 420.9921875,
"epoch": 2.2857142857142856,
"grad_norm": 0.45602235198020935,
"kl": 0.01416015625,
"learning_rate": 1e-06,
"loss": 0.0042,
"num_tokens": 17154012.0,
"reward": 0.7602795735001564,
"reward_std": 0.09590415796265006,
"rewards/curriculum_aware_reward_fn": 0.14309210563078523,
"rewards/format_reward": 0.6171875,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 489.6640625,
"epoch": 2.302521008403361,
"grad_norm": 0.4504002332687378,
"kl": 0.0130157470703125,
"learning_rate": 1e-06,
"loss": -0.0126,
"num_tokens": 17274481.0,
"reward": 0.6295230239629745,
"reward_std": 0.15420474018901587,
"rewards/curriculum_aware_reward_fn": 0.13733552629128098,
"rewards/format_reward": 0.4921875,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 492.3046875,
"epoch": 2.3193277310924367,
"grad_norm": 0.3228984773159027,
"kl": 0.0111846923828125,
"learning_rate": 1e-06,
"loss": -0.004,
"num_tokens": 17399360.0,
"reward": 0.5587993413209915,
"reward_std": 0.0586426155641675,
"rewards/curriculum_aware_reward_fn": 0.05879934271797538,
"rewards/format_reward": 0.5,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 508.5625,
"epoch": 2.3361344537815127,
"grad_norm": 0.3110595643520355,
"kl": 0.015472412109375,
"learning_rate": 1e-06,
"loss": -0.0107,
"num_tokens": 17521248.0,
"reward": 0.546875,
"reward_std": 0.07312605157494545,
"rewards/curriculum_aware_reward_fn": 0.0546875,
"rewards/format_reward": 0.4921875,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 574.015625,
"epoch": 2.3529411764705883,
"grad_norm": 0.4071909487247467,
"kl": 0.0107421875,
"learning_rate": 1e-06,
"loss": -0.0034,
"num_tokens": 17659522.0,
"reward": 0.47450655698776245,
"reward_std": 0.07414581999182701,
"rewards/curriculum_aware_reward_fn": 0.03700657980516553,
"rewards/format_reward": 0.4375,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 522.5546875,
"epoch": 2.369747899159664,
"grad_norm": 0.34431034326553345,
"kl": 0.00946044921875,
"learning_rate": 1e-06,
"loss": -0.0068,
"num_tokens": 17788537.0,
"reward": 0.4099506512284279,
"reward_std": 0.05903024738654494,
"rewards/curriculum_aware_reward_fn": 0.0349506571656093,
"rewards/format_reward": 0.375,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 553.2421875,
"epoch": 2.3865546218487395,
"grad_norm": 0.4213170111179352,
"kl": 0.009979248046875,
"learning_rate": 1e-06,
"loss": 0.0132,
"num_tokens": 17918288.0,
"reward": 0.4177631586790085,
"reward_std": 0.08044615527614951,
"rewards/curriculum_aware_reward_fn": 0.042763158096931875,
"rewards/format_reward": 0.375,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 559.3203125,
"epoch": 2.403361344537815,
"grad_norm": 0.23342828452587128,
"kl": 0.008510589599609375,
"learning_rate": 1e-06,
"loss": -0.0035,
"num_tokens": 18052169.0,
"reward": 0.3762335553765297,
"reward_std": 0.03740033693611622,
"rewards/curriculum_aware_reward_fn": 0.07154605258256197,
"rewards/format_reward": 0.3046875,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 493.078125,
"epoch": 2.4201680672268906,
"grad_norm": 0.4362901449203491,
"kl": 0.012481689453125,
"learning_rate": 1e-06,
"loss": 0.0386,
"num_tokens": 18177251.0,
"reward": 0.5805921033024788,
"reward_std": 0.12307591829448938,
"rewards/curriculum_aware_reward_fn": 0.08840460516512394,
"rewards/format_reward": 0.4921875,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 435.65625,
"epoch": 2.4369747899159666,
"grad_norm": 0.6844424605369568,
"kl": 0.0600128173828125,
"learning_rate": 1e-06,
"loss": 0.0209,
"num_tokens": 18292007.0,
"reward": 0.6208881735801697,
"reward_std": 0.15131067298352718,
"rewards/curriculum_aware_reward_fn": 0.12088816147297621,
"rewards/format_reward": 0.5,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 490.296875,
"epoch": 2.453781512605042,
"grad_norm": 0.30699044466018677,
"kl": 0.010986328125,
"learning_rate": 1e-06,
"loss": 0.0248,
"num_tokens": 18415301.0,
"reward": 0.49424342066049576,
"reward_std": 0.04014611290767789,
"rewards/curriculum_aware_reward_fn": 0.05674342147540301,
"rewards/format_reward": 0.4375,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 471.984375,
"epoch": 2.4705882352941178,
"grad_norm": 0.403209924697876,
"kl": 0.0122528076171875,
"learning_rate": 1e-06,
"loss": 0.0353,
"num_tokens": 18532667.0,
"reward": 0.6027960330247879,
"reward_std": 0.0935791190713644,
"rewards/curriculum_aware_reward_fn": 0.1027960553765297,
"rewards/format_reward": 0.5,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 422.703125,
"epoch": 2.4873949579831933,
"grad_norm": 0.42733973264694214,
"kl": 0.0163116455078125,
"learning_rate": 1e-06,
"loss": -0.002,
"num_tokens": 18645941.0,
"reward": 0.7845394462347031,
"reward_std": 0.0871797576546669,
"rewards/curriculum_aware_reward_fn": 0.0345394741743803,
"rewards/format_reward": 0.75,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 548.828125,
"epoch": 2.504201680672269,
"grad_norm": 0.2545667290687561,
"kl": 0.01213836669921875,
"learning_rate": 1e-06,
"loss": 0.0089,
"num_tokens": 18774111.0,
"reward": 0.539473682641983,
"reward_std": 0.060992954298853874,
"rewards/curriculum_aware_reward_fn": 0.10197368077933788,
"rewards/format_reward": 0.4375,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 540.0703125,
"epoch": 2.5210084033613445,
"grad_norm": 0.3914143145084381,
"kl": 0.00789642333984375,
"learning_rate": 1e-06,
"loss": -0.0052,
"num_tokens": 18904992.0,
"reward": 0.27878287341445684,
"reward_std": 0.06910991575568914,
"rewards/curriculum_aware_reward_fn": 0.02878289413638413,
"rewards/format_reward": 0.25,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 550.9375,
"epoch": 2.53781512605042,
"grad_norm": 0.2912365794181824,
"kl": 0.00799560546875,
"learning_rate": 1e-06,
"loss": 0.0067,
"num_tokens": 19035968.0,
"reward": 0.3215460553765297,
"reward_std": 0.01937512680888176,
"rewards/curriculum_aware_reward_fn": 0.07154605211690068,
"rewards/format_reward": 0.25,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 471.1640625,
"epoch": 2.5546218487394956,
"grad_norm": 0.3965752124786377,
"kl": 0.01221466064453125,
"learning_rate": 1e-06,
"loss": 0.0062,
"num_tokens": 19153861.0,
"reward": 0.582648016512394,
"reward_std": 0.08400850929319859,
"rewards/curriculum_aware_reward_fn": 0.08264802652411163,
"rewards/format_reward": 0.5,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 421.234375,
"epoch": 2.571428571428571,
"grad_norm": 0.6044662594795227,
"kl": 0.026885986328125,
"learning_rate": 1e-06,
"loss": 0.0165,
"num_tokens": 19265379.0,
"reward": 0.8112664222717285,
"reward_std": 0.1459241509437561,
"rewards/curriculum_aware_reward_fn": 0.19407895021140575,
"rewards/format_reward": 0.6171875,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 546.8671875,
"epoch": 2.588235294117647,
"grad_norm": 0.4222107231616974,
"kl": 0.01050567626953125,
"learning_rate": 1e-06,
"loss": 0.0261,
"num_tokens": 19396626.0,
"reward": 0.38733551651239395,
"reward_std": 0.06776260398328304,
"rewards/curriculum_aware_reward_fn": 0.02014802605845034,
"rewards/format_reward": 0.3671875,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 548.875,
"epoch": 2.6050420168067228,
"grad_norm": 0.30043891072273254,
"kl": 0.010498046875,
"learning_rate": 1e-06,
"loss": 0.0291,
"num_tokens": 19531202.0,
"reward": 0.28166119009256363,
"reward_std": 0.07623977493494749,
"rewards/curriculum_aware_reward_fn": 0.031661184038966894,
"rewards/format_reward": 0.25,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 560.640625,
"epoch": 2.6218487394957983,
"grad_norm": 0.39753058552742004,
"kl": 0.0109710693359375,
"learning_rate": 1e-06,
"loss": 0.0213,
"num_tokens": 19665404.0,
"reward": 0.5197368338704109,
"reward_std": 0.08217737264931202,
"rewards/curriculum_aware_reward_fn": 0.08223684225231409,
"rewards/format_reward": 0.4375,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 481.6640625,
"epoch": 2.638655462184874,
"grad_norm": 0.39810478687286377,
"kl": 0.009063720703125,
"learning_rate": 1e-06,
"loss": 0.0044,
"num_tokens": 19787409.0,
"reward": 0.44202302396297455,
"reward_std": 0.08141717128455639,
"rewards/curriculum_aware_reward_fn": 0.12952302768826485,
"rewards/format_reward": 0.3125,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 389.7890625,
"epoch": 2.6554621848739495,
"grad_norm": 0.4911426305770874,
"kl": 0.02197265625,
"learning_rate": 1e-06,
"loss": 0.0028,
"num_tokens": 19896190.0,
"reward": 0.7331414520740509,
"reward_std": 0.17763726785779,
"rewards/curriculum_aware_reward_fn": 0.1784539483487606,
"rewards/format_reward": 0.5546875,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 518.9609375,
"epoch": 2.6722689075630255,
"grad_norm": 0.2420579046010971,
"kl": 0.011962890625,
"learning_rate": 1e-06,
"loss": 0.0303,
"num_tokens": 20022809.0,
"reward": 0.4453125,
"reward_std": 0.01647413382306695,
"rewards/curriculum_aware_reward_fn": 0.007812500232830644,
"rewards/format_reward": 0.4375,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 424.4453125,
"epoch": 2.689075630252101,
"grad_norm": 0.46578091382980347,
"kl": 0.01375579833984375,
"learning_rate": 1e-06,
"loss": 0.0011,
"num_tokens": 20136314.0,
"reward": 0.49095392785966396,
"reward_std": 0.13701673224568367,
"rewards/curriculum_aware_reward_fn": 0.06126644625328481,
"rewards/format_reward": 0.4296875,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 483.921875,
"epoch": 2.7058823529411766,
"grad_norm": 0.32379522919654846,
"kl": 0.01180267333984375,
"learning_rate": 1e-06,
"loss": 0.0065,
"num_tokens": 20257344.0,
"reward": 0.5197368343360722,
"reward_std": 0.07396957790479064,
"rewards/curriculum_aware_reward_fn": 0.0822368417866528,
"rewards/format_reward": 0.4375,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 470.3515625,
"epoch": 2.722689075630252,
"grad_norm": 0.4478832483291626,
"kl": 0.014068603515625,
"learning_rate": 1e-06,
"loss": -0.0003,
"num_tokens": 20375685.0,
"reward": 0.5801809281110764,
"reward_std": 0.06543473433703184,
"rewards/curriculum_aware_reward_fn": 0.08018092112615705,
"rewards/format_reward": 0.5,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 462.3046875,
"epoch": 2.7394957983193278,
"grad_norm": 0.4915456175804138,
"kl": 0.0140838623046875,
"learning_rate": 1e-06,
"loss": 0.0286,
"num_tokens": 20491340.0,
"reward": 0.6981907933950424,
"reward_std": 0.1432387800887227,
"rewards/curriculum_aware_reward_fn": 0.13569078594446182,
"rewards/format_reward": 0.5625,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 460.046875,
"epoch": 2.7563025210084033,
"grad_norm": 0.388621062040329,
"kl": 0.0123138427734375,
"learning_rate": 1e-06,
"loss": 0.0144,
"num_tokens": 20613466.0,
"reward": 0.4124177619814873,
"reward_std": 0.07370226783677936,
"rewards/curriculum_aware_reward_fn": 0.037417763262055814,
"rewards/format_reward": 0.375,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 462.25,
"epoch": 2.773109243697479,
"grad_norm": 0.4878624677658081,
"kl": 0.01593017578125,
"learning_rate": 1e-06,
"loss": -0.0006,
"num_tokens": 20729058.0,
"reward": 0.6221217066049576,
"reward_std": 0.12872529029846191,
"rewards/curriculum_aware_reward_fn": 0.12212171033024788,
"rewards/format_reward": 0.5,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 486.4609375,
"epoch": 2.7899159663865545,
"grad_norm": 0.4500262141227722,
"kl": 0.0099029541015625,
"learning_rate": 1e-06,
"loss": 0.0219,
"num_tokens": 20853869.0,
"reward": 0.4050164371728897,
"reward_std": 0.11422262340784073,
"rewards/curriculum_aware_reward_fn": 0.09251644648611546,
"rewards/format_reward": 0.3125,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 448.5859375,
"epoch": 2.80672268907563,
"grad_norm": 0.5006850957870483,
"kl": 0.0168914794921875,
"learning_rate": 1e-06,
"loss": 0.0135,
"num_tokens": 20973736.0,
"reward": 0.677631601691246,
"reward_std": 0.0868874522857368,
"rewards/curriculum_aware_reward_fn": 0.12294407980516553,
"rewards/format_reward": 0.5546875,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 428.7265625,
"epoch": 2.8235294117647056,
"grad_norm": 0.42931458353996277,
"kl": 0.01781463623046875,
"learning_rate": 1e-06,
"loss": 0.0042,
"num_tokens": 21086485.0,
"reward": 0.6040295884013176,
"reward_std": 0.05929867131635547,
"rewards/curriculum_aware_reward_fn": 0.041529607493430376,
"rewards/format_reward": 0.5625,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 388.421875,
"epoch": 2.8403361344537816,
"grad_norm": 0.44046640396118164,
"kl": 0.0161895751953125,
"learning_rate": 1e-06,
"loss": 0.01,
"num_tokens": 21193627.0,
"reward": 0.7483552545309067,
"reward_std": 0.09682157123461366,
"rewards/curriculum_aware_reward_fn": 0.060855262679979205,
"rewards/format_reward": 0.6875,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 477.6953125,
"epoch": 2.857142857142857,
"grad_norm": 0.36667370796203613,
"kl": 0.0146484375,
"learning_rate": 1e-06,
"loss": -0.0002,
"num_tokens": 21313716.0,
"reward": 0.6060855239629745,
"reward_std": 0.10079656913876534,
"rewards/curriculum_aware_reward_fn": 0.11389802675694227,
"rewards/format_reward": 0.4921875,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 534.4296875,
"epoch": 2.8739495798319328,
"grad_norm": 0.3436344563961029,
"kl": 0.00984954833984375,
"learning_rate": 1e-06,
"loss": 0.0022,
"num_tokens": 21445667.0,
"reward": 0.48231907933950424,
"reward_std": 0.08960662921890616,
"rewards/curriculum_aware_reward_fn": 0.10731907980516553,
"rewards/format_reward": 0.375,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 496.21875,
"epoch": 2.8907563025210083,
"grad_norm": 0.48088422417640686,
"kl": 0.0130767822265625,
"learning_rate": 1e-06,
"loss": 0.0182,
"num_tokens": 21570871.0,
"reward": 0.4465460618957877,
"reward_std": 0.1538134217262268,
"rewards/curriculum_aware_reward_fn": 0.13404605071991682,
"rewards/format_reward": 0.3125,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 429.484375,
"epoch": 2.907563025210084,
"grad_norm": 0.5827536582946777,
"kl": 0.016109466552734375,
"learning_rate": 1e-06,
"loss": 0.0127,
"num_tokens": 21686093.0,
"reward": 0.4502467103302479,
"reward_std": 0.15407454315572977,
"rewards/curriculum_aware_reward_fn": 0.09087171172723174,
"rewards/format_reward": 0.359375,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 482.4609375,
"epoch": 2.92436974789916,
"grad_norm": 0.467061311006546,
"kl": 0.013336181640625,
"learning_rate": 1e-06,
"loss": 0.033,
"num_tokens": 21808264.0,
"reward": 0.6632401347160339,
"reward_std": 0.10484125558286905,
"rewards/curriculum_aware_reward_fn": 0.22574013099074364,
"rewards/format_reward": 0.4375,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 500.96875,
"epoch": 2.9411764705882355,
"grad_norm": 0.41948550939559937,
"kl": 0.009563446044921875,
"learning_rate": 1e-06,
"loss": 0.0329,
"num_tokens": 21933084.0,
"reward": 0.400082241743803,
"reward_std": 0.10662292037159204,
"rewards/curriculum_aware_reward_fn": 0.04070723708719015,
"rewards/format_reward": 0.359375,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 557.640625,
"epoch": 2.957983193277311,
"grad_norm": 0.41708114743232727,
"kl": 0.007190704345703125,
"learning_rate": 1e-06,
"loss": 0.021,
"num_tokens": 22068550.0,
"reward": 0.3005756618222222,
"reward_std": 0.06424513552337885,
"rewards/curriculum_aware_reward_fn": 0.050575657514855266,
"rewards/format_reward": 0.25,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 482.421875,
"epoch": 2.9747899159663866,
"grad_norm": 0.6009016633033752,
"kl": 0.013702392578125,
"learning_rate": 1e-06,
"loss": 0.0048,
"num_tokens": 22189356.0,
"reward": 0.6620065569877625,
"reward_std": 0.149446252733469,
"rewards/curriculum_aware_reward_fn": 0.16200657933950424,
"rewards/format_reward": 0.5,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 490.53572845458984,
"epoch": 2.991596638655462,
"grad_norm": 0.49134695529937744,
"kl": 0.01397705078125,
"learning_rate": 1e-06,
"loss": -0.0025,
"num_tokens": 22309028.0,
"reward": 0.6726973652839661,
"reward_std": 0.14456172287464142,
"rewards/curriculum_aware_reward_fn": 0.1101973676122725,
"rewards/format_reward": 0.5625,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 358.109375,
"epoch": 3.0168067226890756,
"grad_norm": 0.5925723314285278,
"kl": 0.0204315185546875,
"learning_rate": 1e-06,
"loss": 0.0117,
"num_tokens": 22410794.0,
"reward": 0.87787826359272,
"reward_std": 0.1721474528312683,
"rewards/curriculum_aware_reward_fn": 0.19819078594446182,
"rewards/format_reward": 0.6796875,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 523.0078125,
"epoch": 3.033613445378151,
"grad_norm": 0.2975535988807678,
"kl": 0.01165771484375,
"learning_rate": 1e-06,
"loss": 0.0697,
"num_tokens": 22539299.0,
"reward": 0.5168585479259491,
"reward_std": 0.048361226450651884,
"rewards/curriculum_aware_reward_fn": 0.08717105106916279,
"rewards/format_reward": 0.4296875,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 484.7578125,
"epoch": 3.0504201680672267,
"grad_norm": 0.45362988114356995,
"kl": 0.0162200927734375,
"learning_rate": 1e-06,
"loss": 0.0023,
"num_tokens": 22660588.0,
"reward": 0.5513980276882648,
"reward_std": 0.1047646040096879,
"rewards/curriculum_aware_reward_fn": 0.06702302722260356,
"rewards/format_reward": 0.484375,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 452.9375,
"epoch": 3.0672268907563027,
"grad_norm": 0.5003635883331299,
"kl": 0.0143890380859375,
"learning_rate": 1e-06,
"loss": 0.0104,
"num_tokens": 22778956.0,
"reward": 0.73149673640728,
"reward_std": 0.17891032248735428,
"rewards/curriculum_aware_reward_fn": 0.23149671405553818,
"rewards/format_reward": 0.5,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 513.0390625,
"epoch": 3.0840336134453783,
"grad_norm": 0.31615540385246277,
"kl": 0.01172637939453125,
"learning_rate": 1e-06,
"loss": 0.0235,
"num_tokens": 22905121.0,
"reward": 0.3244243338704109,
"reward_std": 0.03051401791162789,
"rewards/curriculum_aware_reward_fn": 0.011924341786652803,
"rewards/format_reward": 0.3125,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 411.6015625,
"epoch": 3.100840336134454,
"grad_norm": 0.4836508631706238,
"kl": 0.0144195556640625,
"learning_rate": 1e-06,
"loss": 0.0043,
"num_tokens": 23019342.0,
"reward": 0.5826480239629745,
"reward_std": 0.11801502481102943,
"rewards/curriculum_aware_reward_fn": 0.07483552675694227,
"rewards/format_reward": 0.5078125,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 416.921875,
"epoch": 3.1176470588235294,
"grad_norm": 0.3468119204044342,
"kl": 0.01403045654296875,
"learning_rate": 1e-06,
"loss": -0.0007,
"num_tokens": 23137316.0,
"reward": 0.47574012295808643,
"reward_std": 0.05907326890155673,
"rewards/curriculum_aware_reward_fn": 0.10074013040866703,
"rewards/format_reward": 0.375,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 414.359375,
"epoch": 3.134453781512605,
"grad_norm": 0.4667985439300537,
"kl": 0.0151519775390625,
"learning_rate": 1e-06,
"loss": 0.0299,
"num_tokens": 23249858.0,
"reward": 0.6344572305679321,
"reward_std": 0.15162191167473793,
"rewards/curriculum_aware_reward_fn": 0.13445723662152886,
"rewards/format_reward": 0.5,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 482.109375,
"epoch": 3.1512605042016806,
"grad_norm": 0.4111727774143219,
"kl": 0.013458251953125,
"learning_rate": 1e-06,
"loss": 0.0121,
"num_tokens": 23370304.0,
"reward": 0.4806743413209915,
"reward_std": 0.052865433506667614,
"rewards/curriculum_aware_reward_fn": 0.0431743401568383,
"rewards/format_reward": 0.4375,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 500.5234375,
"epoch": 3.168067226890756,
"grad_norm": 0.4427432715892792,
"kl": 0.01395416259765625,
"learning_rate": 1e-06,
"loss": -0.0198,
"num_tokens": 23496979.0,
"reward": 0.4243420949205756,
"reward_std": 0.07115951599553227,
"rewards/curriculum_aware_reward_fn": 0.11184210795909166,
"rewards/format_reward": 0.3125,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 450.671875,
"epoch": 3.184873949579832,
"grad_norm": 0.4217956066131592,
"kl": 0.0164947509765625,
"learning_rate": 1e-06,
"loss": 0.0361,
"num_tokens": 23613281.0,
"reward": 0.5629111751914024,
"reward_std": 0.07686262531206012,
"rewards/curriculum_aware_reward_fn": 0.12541118264198303,
"rewards/format_reward": 0.4375,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 399.21875,
"epoch": 3.2016806722689077,
"grad_norm": 0.6111953258514404,
"kl": 0.017852783203125,
"learning_rate": 1e-06,
"loss": 0.0248,
"num_tokens": 23723725.0,
"reward": 0.7121710479259491,
"reward_std": 0.15234812535345554,
"rewards/curriculum_aware_reward_fn": 0.08717105351388454,
"rewards/format_reward": 0.625,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 433.015625,
"epoch": 3.2184873949579833,
"grad_norm": 0.4865033030509949,
"kl": 0.0166778564453125,
"learning_rate": 1e-06,
"loss": -0.009,
"num_tokens": 23835815.0,
"reward": 0.7372532933950424,
"reward_std": 0.13220055866986513,
"rewards/curriculum_aware_reward_fn": 0.17475328966975212,
"rewards/format_reward": 0.5625,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 525.4765625,
"epoch": 3.235294117647059,
"grad_norm": 0.3422640562057495,
"kl": 0.016204833984375,
"learning_rate": 1e-06,
"loss": 0.0136,
"num_tokens": 23964596.0,
"reward": 0.43174342811107635,
"reward_std": 0.09182633552700281,
"rewards/curriculum_aware_reward_fn": 0.05674342066049576,
"rewards/format_reward": 0.375,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 351.4453125,
"epoch": 3.2521008403361344,
"grad_norm": 0.5189781785011292,
"kl": 0.023193359375,
"learning_rate": 1e-06,
"loss": 0.0349,
"num_tokens": 24067141.0,
"reward": 0.7643914222717285,
"reward_std": 0.15736807510256767,
"rewards/curriculum_aware_reward_fn": 0.1940789488144219,
"rewards/format_reward": 0.5703125,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 460.296875,
"epoch": 3.26890756302521,
"grad_norm": 0.36804094910621643,
"kl": 0.012298583984375,
"learning_rate": 1e-06,
"loss": -0.0044,
"num_tokens": 24187067.0,
"reward": 0.48273026943206787,
"reward_std": 0.037970013450831175,
"rewards/curriculum_aware_reward_fn": 0.10773026570677757,
"rewards/format_reward": 0.375,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 418.078125,
"epoch": 3.2857142857142856,
"grad_norm": 0.4727684259414673,
"kl": 0.01959228515625,
"learning_rate": 1e-06,
"loss": 0.0229,
"num_tokens": 24300605.0,
"reward": 0.5563322491943836,
"reward_std": 0.06251880899071693,
"rewards/curriculum_aware_reward_fn": 0.13445723743643612,
"rewards/format_reward": 0.421875,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 498.9765625,
"epoch": 3.302521008403361,
"grad_norm": 0.5195404887199402,
"kl": 0.01263427734375,
"learning_rate": 1e-06,
"loss": 0.0303,
"num_tokens": 24427482.0,
"reward": 0.38569077104330063,
"reward_std": 0.10553359193727374,
"rewards/curriculum_aware_reward_fn": 0.07319078966975212,
"rewards/format_reward": 0.3125,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 448.625,
"epoch": 3.3193277310924367,
"grad_norm": 0.49932360649108887,
"kl": 0.017852783203125,
"learning_rate": 1e-06,
"loss": 0.0397,
"num_tokens": 24541746.0,
"reward": 0.5193256512284279,
"reward_std": 0.10704736225306988,
"rewards/curriculum_aware_reward_fn": 0.08182565891183913,
"rewards/format_reward": 0.4375,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 377.1015625,
"epoch": 3.3361344537815127,
"grad_norm": 0.4484708309173584,
"kl": 0.01934814453125,
"learning_rate": 1e-06,
"loss": 0.0074,
"num_tokens": 24647103.0,
"reward": 0.6673519611358643,
"reward_std": 0.06431722524575889,
"rewards/curriculum_aware_reward_fn": 0.16735197603702545,
"rewards/format_reward": 0.5,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 483.921875,
"epoch": 3.3529411764705883,
"grad_norm": 0.41696909070014954,
"kl": 0.01053619384765625,
"learning_rate": 1e-06,
"loss": 0.0158,
"num_tokens": 24773253.0,
"reward": 0.2717927638441324,
"reward_std": 0.11790546495467424,
"rewards/curriculum_aware_reward_fn": 0.08429276384413242,
"rewards/format_reward": 0.1875,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 380.0546875,
"epoch": 3.369747899159664,
"grad_norm": 0.45737817883491516,
"kl": 0.022186279296875,
"learning_rate": 1e-06,
"loss": 0.0056,
"num_tokens": 24881188.0,
"reward": 0.7002467140555382,
"reward_std": 0.03508220613002777,
"rewards/curriculum_aware_reward_fn": 0.1377467131242156,
"rewards/format_reward": 0.5625,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 390.3671875,
"epoch": 3.3865546218487395,
"grad_norm": 0.5029156804084778,
"kl": 0.0277099609375,
"learning_rate": 1e-06,
"loss": 0.004,
"num_tokens": 24991955.0,
"reward": 0.6694078892469406,
"reward_std": 0.10574874095618725,
"rewards/curriculum_aware_reward_fn": 0.10690789762884378,
"rewards/format_reward": 0.5625,
"step": 200
},
{
"epoch": 3.3865546218487395,
"step": 200,
"total_flos": 0.0,
"train_loss": 0.010024200768093579,
"train_runtime": 35564.3846,
"train_samples_per_second": 0.72,
"train_steps_per_second": 0.006
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}