Qwen2.5-1.5B-CCRL-1 / trainer_state.json

Model save

56f259c verified 8 days ago

94.2 kB

	{
	"best_global_step": null,
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 3.3865546218487395,
	"eval_steps": 500,
	"global_step": 200,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"clip_ratio": 0.0,
	"completion_length": 635.7109375,
	"epoch": 0.01680672268907563,
	"grad_norm": 0.31708475947380066,
	"kl": 0.0,
	"learning_rate": 1e-06,
	"loss": 0.0099,
	"num_tokens": 143267.0,
	"reward": 0.039062500349245965,
	"reward_std": 0.0698821279220283,
	"rewards/curriculum_aware_reward_fn": 0.023437500349245965,
	"rewards/format_reward": 0.015625,
	"step": 1
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 527.6328125,
	"epoch": 0.03361344537815126,
	"grad_norm": 0.43825313448905945,
	"kl": 0.0002913475036621094,
	"learning_rate": 1e-06,
	"loss": 0.0432,
	"num_tokens": 270812.0,
	"reward": 0.09292763145640492,
	"reward_std": 0.12866380205377936,
	"rewards/curriculum_aware_reward_fn": 0.06949013145640492,
	"rewards/format_reward": 0.0234375,
	"step": 2
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 608.9921875,
	"epoch": 0.05042016806722689,
	"grad_norm": 0.4227641224861145,
	"kl": 0.0002665519714355469,
	"learning_rate": 1e-06,
	"loss": -0.0273,
	"num_tokens": 410971.0,
	"reward": 0.059621710795909166,
	"reward_std": 0.07889116508886218,
	"rewards/curriculum_aware_reward_fn": 0.059621710795909166,
	"rewards/format_reward": 0.0,
	"step": 3
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 558.921875,
	"epoch": 0.06722689075630252,
	"grad_norm": 0.4796917140483856,
	"kl": 0.0002789497375488281,
	"learning_rate": 1e-06,
	"loss": -0.0009,
	"num_tokens": 542313.0,
	"reward": 0.08552631549537182,
	"reward_std": 0.12651031091809273,
	"rewards/curriculum_aware_reward_fn": 0.06990131689235568,
	"rewards/format_reward": 0.015625,
	"step": 4
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 590.265625,
	"epoch": 0.08403361344537816,
	"grad_norm": 0.5620821118354797,
	"kl": 0.0003027915954589844,
	"learning_rate": 1e-06,
	"loss": 0.0288,
	"num_tokens": 677075.0,
	"reward": 0.14925987273454666,
	"reward_std": 0.24606542102992535,
	"rewards/curriculum_aware_reward_fn": 0.09457236900925636,
	"rewards/format_reward": 0.0546875,
	"step": 5
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 592.5234375,
	"epoch": 0.10084033613445378,
	"grad_norm": 0.4298699200153351,
	"kl": 0.0003509521484375,
	"learning_rate": 1e-06,
	"loss": -0.0151,
	"num_tokens": 812710.0,
	"reward": 0.08840460644569248,
	"reward_std": 0.1141207623295486,
	"rewards/curriculum_aware_reward_fn": 0.03371710644569248,
	"rewards/format_reward": 0.0546875,
	"step": 6
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 582.046875,
	"epoch": 0.11764705882352941,
	"grad_norm": 0.526942253112793,
	"kl": 0.0004343986511230469,
	"learning_rate": 1e-06,
	"loss": 0.0192,
	"num_tokens": 943268.0,
	"reward": 0.12088815867900848,
	"reward_std": 0.17540471255779266,
	"rewards/curriculum_aware_reward_fn": 0.07401315867900848,
	"rewards/format_reward": 0.046875,
	"step": 7
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 534.75,
	"epoch": 0.13445378151260504,
	"grad_norm": 0.44275274872779846,
	"kl": 0.0003724098205566406,
	"learning_rate": 1e-06,
	"loss": -0.0033,
	"num_tokens": 1074300.0,
	"reward": 0.030016446253284812,
	"reward_std": 0.08489933330565691,
	"rewards/curriculum_aware_reward_fn": 0.014391447650268674,
	"rewards/format_reward": 0.015625,
	"step": 8
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 539.09375,
	"epoch": 0.15126050420168066,
	"grad_norm": 0.5494865775108337,
	"kl": 0.0007390975952148438,
	"learning_rate": 1e-06,
	"loss": 0.0036,
	"num_tokens": 1197896.0,
	"reward": 0.16570723708719015,
	"reward_std": 0.21696669608354568,
	"rewards/curriculum_aware_reward_fn": 0.05633223685435951,
	"rewards/format_reward": 0.109375,
	"step": 9
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 593.7734375,
	"epoch": 0.16806722689075632,
	"grad_norm": 0.5171737670898438,
	"kl": 0.0006322860717773438,
	"learning_rate": 1e-06,
	"loss": 0.0193,
	"num_tokens": 1336931.0,
	"reward": 0.11143092066049576,
	"reward_std": 0.19064411148428917,
	"rewards/curriculum_aware_reward_fn": 0.017680921009741724,
	"rewards/format_reward": 0.09375,
	"step": 10
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 578.4765625,
	"epoch": 0.18487394957983194,
	"grad_norm": 0.6088258028030396,
	"kl": 0.001346588134765625,
	"learning_rate": 1e-06,
	"loss": 0.037,
	"num_tokens": 1467592.0,
	"reward": 0.22944078594446182,
	"reward_std": 0.3224767856299877,
	"rewards/curriculum_aware_reward_fn": 0.04194079013541341,
	"rewards/format_reward": 0.1875,
	"step": 11
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 601.171875,
	"epoch": 0.20168067226890757,
	"grad_norm": 0.4451327621936798,
	"kl": 0.0010366439819335938,
	"learning_rate": 1e-06,
	"loss": 0.0148,
	"num_tokens": 1607894.0,
	"reward": 0.1204769799951464,
	"reward_std": 0.1381341191008687,
	"rewards/curriculum_aware_reward_fn": 0.018914473825134337,
	"rewards/format_reward": 0.1015625,
	"step": 12
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 526.28125,
	"epoch": 0.2184873949579832,
	"grad_norm": 0.636314332485199,
	"kl": 0.00191497802734375,
	"learning_rate": 1e-06,
	"loss": 0.0125,
	"num_tokens": 1735650.0,
	"reward": 0.26644736528396606,
	"reward_std": 0.30141641572117805,
	"rewards/curriculum_aware_reward_fn": 0.03988486935850233,
	"rewards/format_reward": 0.2265625,
	"step": 13
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 507.515625,
	"epoch": 0.23529411764705882,
	"grad_norm": 0.6864922642707825,
	"kl": 0.004413604736328125,
	"learning_rate": 1e-06,
	"loss": 0.0802,
	"num_tokens": 1856316.0,
	"reward": 0.3112664446234703,
	"reward_std": 0.31644799932837486,
	"rewards/curriculum_aware_reward_fn": 0.05345394683536142,
	"rewards/format_reward": 0.2578125,
	"step": 14
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 554.5859375,
	"epoch": 0.25210084033613445,
	"grad_norm": 0.6268811225891113,
	"kl": 0.0036067962646484375,
	"learning_rate": 1e-06,
	"loss": -0.0044,
	"num_tokens": 1987511.0,
	"reward": 0.4337993338704109,
	"reward_std": 0.32329631969332695,
	"rewards/curriculum_aware_reward_fn": 0.050986841320991516,
	"rewards/format_reward": 0.3828125,
	"step": 15
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 584.3828125,
	"epoch": 0.2689075630252101,
	"grad_norm": 0.5531853437423706,
	"kl": 0.003597259521484375,
	"learning_rate": 1e-06,
	"loss": 0.0104,
	"num_tokens": 2119768.0,
	"reward": 0.3828125037252903,
	"reward_std": 0.26145630702376366,
	"rewards/curriculum_aware_reward_fn": 0.0546875,
	"rewards/format_reward": 0.328125,
	"step": 16
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 481.8046875,
	"epoch": 0.2857142857142857,
	"grad_norm": 0.6449251174926758,
	"kl": 0.005481719970703125,
	"learning_rate": 1e-06,
	"loss": -0.0094,
	"num_tokens": 2238911.0,
	"reward": 0.4543585404753685,
	"reward_std": 0.26075971499085426,
	"rewards/curriculum_aware_reward_fn": 0.05592105304822326,
	"rewards/format_reward": 0.3984375,
	"step": 17
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 645.75,
	"epoch": 0.3025210084033613,
	"grad_norm": 0.37918156385421753,
	"kl": 0.001049041748046875,
	"learning_rate": 1e-06,
	"loss": 0.0055,
	"num_tokens": 2385767.0,
	"reward": 0.1451480264076963,
	"reward_std": 0.1290158643387258,
	"rewards/curriculum_aware_reward_fn": 0.04358552640769631,
	"rewards/format_reward": 0.1015625,
	"step": 18
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 617.6953125,
	"epoch": 0.31932773109243695,
	"grad_norm": 0.39814478158950806,
	"kl": 0.00528717041015625,
	"learning_rate": 1e-06,
	"loss": 0.0518,
	"num_tokens": 2525656.0,
	"reward": 0.35115131735801697,
	"reward_std": 0.11648409254848957,
	"rewards/curriculum_aware_reward_fn": 0.02302631549537182,
	"rewards/format_reward": 0.328125,
	"step": 19
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 459.6015625,
	"epoch": 0.33613445378151263,
	"grad_norm": 0.7307525873184204,
	"kl": 0.005184173583984375,
	"learning_rate": 1e-06,
	"loss": 0.083,
	"num_tokens": 2644077.0,
	"reward": 0.47574012726545334,
	"reward_std": 0.2815094441175461,
	"rewards/curriculum_aware_reward_fn": 0.04605263099074364,
	"rewards/format_reward": 0.4296875,
	"step": 20
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 620.46875,
	"epoch": 0.35294117647058826,
	"grad_norm": 0.46509799361228943,
	"kl": 0.0036363601684570312,
	"learning_rate": 1e-06,
	"loss": 0.0145,
	"num_tokens": 2786169.0,
	"reward": 0.24177631677594036,
	"reward_std": 0.09853590792044997,
	"rewards/curriculum_aware_reward_fn": 0.023026315728202462,
	"rewards/format_reward": 0.21875,
	"step": 21
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 578.9609375,
	"epoch": 0.3697478991596639,
	"grad_norm": 0.5765166878700256,
	"kl": 0.005565643310546875,
	"learning_rate": 1e-06,
	"loss": 0.0042,
	"num_tokens": 2917180.0,
	"reward": 0.4958881437778473,
	"reward_std": 0.10692231869325042,
	"rewards/curriculum_aware_reward_fn": 0.0740131582133472,
	"rewards/format_reward": 0.421875,
	"step": 22
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 579.578125,
	"epoch": 0.3865546218487395,
	"grad_norm": 0.5340356826782227,
	"kl": 0.00540924072265625,
	"learning_rate": 1e-06,
	"loss": -0.0083,
	"num_tokens": 3053414.0,
	"reward": 0.3708881437778473,
	"reward_std": 0.11791826784610748,
	"rewards/curriculum_aware_reward_fn": 0.06620065867900848,
	"rewards/format_reward": 0.3046875,
	"step": 23
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 507.5625,
	"epoch": 0.40336134453781514,
	"grad_norm": 0.4752294719219208,
	"kl": 0.031703948974609375,
	"learning_rate": 1e-06,
	"loss": -0.0004,
	"num_tokens": 3181894.0,
	"reward": 0.3700657896697521,
	"reward_std": 0.1367718242108822,
	"rewards/curriculum_aware_reward_fn": 0.002878289553336799,
	"rewards/format_reward": 0.3671875,
	"step": 24
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 496.4296875,
	"epoch": 0.42016806722689076,
	"grad_norm": 0.46164318919181824,
	"kl": 0.0082855224609375,
	"learning_rate": 1e-06,
	"loss": -0.0091,
	"num_tokens": 3304077.0,
	"reward": 0.5016447380185127,
	"reward_std": 0.09064025245606899,
	"rewards/curriculum_aware_reward_fn": 0.017269736621528864,
	"rewards/format_reward": 0.484375,
	"step": 25
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 454.5859375,
	"epoch": 0.4369747899159664,
	"grad_norm": 0.5706049799919128,
	"kl": 0.01887798309326172,
	"learning_rate": 1e-06,
	"loss": -0.0096,
	"num_tokens": 3420488.0,
	"reward": 0.6875,
	"reward_std": 0.12697386741638184,
	"rewards/curriculum_aware_reward_fn": 0.0234375,
	"rewards/format_reward": 0.6640625,
	"step": 26
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 554.46875,
	"epoch": 0.453781512605042,
	"grad_norm": 0.45473384857177734,
	"kl": 0.0068416595458984375,
	"learning_rate": 1e-06,
	"loss": 0.0074,
	"num_tokens": 3552340.0,
	"reward": 0.34868420753628016,
	"reward_std": 0.10102300066500902,
	"rewards/curriculum_aware_reward_fn": 0.012746710679493845,
	"rewards/format_reward": 0.3359375,
	"step": 27
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 534.0,
	"epoch": 0.47058823529411764,
	"grad_norm": 0.348452091217041,
	"kl": 0.01036834716796875,
	"learning_rate": 1e-06,
	"loss": 0.0145,
	"num_tokens": 3677892.0,
	"reward": 0.5571546033024788,
	"reward_std": 0.055680982768535614,
	"rewards/curriculum_aware_reward_fn": 0.010279605048708618,
	"rewards/format_reward": 0.546875,
	"step": 28
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 584.03125,
	"epoch": 0.48739495798319327,
	"grad_norm": 0.452033668756485,
	"kl": 0.0071258544921875,
	"learning_rate": 1e-06,
	"loss": 0.0115,
	"num_tokens": 3813600.0,
	"reward": 0.3984375,
	"reward_std": 0.08443661965429783,
	"rewards/curriculum_aware_reward_fn": 0.046875,
	"rewards/format_reward": 0.3515625,
	"step": 29
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 552.4921875,
	"epoch": 0.5042016806722689,
	"grad_norm": 0.4926210641860962,
	"kl": 0.005392551422119141,
	"learning_rate": 1e-06,
	"loss": 0.0285,
	"num_tokens": 3947807.0,
	"reward": 0.4683388201519847,
	"reward_std": 0.11112732999026775,
	"rewards/curriculum_aware_reward_fn": 0.03865131549537182,
	"rewards/format_reward": 0.4296875,
	"step": 30
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 559.6953125,
	"epoch": 0.5210084033613446,
	"grad_norm": 0.5463467240333557,
	"kl": 0.004418373107910156,
	"learning_rate": 1e-06,
	"loss": -0.0233,
	"num_tokens": 4080704.0,
	"reward": 0.22203946067020297,
	"reward_std": 0.09257729165256023,
	"rewards/curriculum_aware_reward_fn": 0.042351973708719015,
	"rewards/format_reward": 0.1796875,
	"step": 31
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 549.921875,
	"epoch": 0.5378151260504201,
	"grad_norm": 0.36463335156440735,
	"kl": 0.006511688232421875,
	"learning_rate": 1e-06,
	"loss": -0.0032,
	"num_tokens": 4214870.0,
	"reward": 0.4346217215061188,
	"reward_std": 0.03605314111337066,
	"rewards/curriculum_aware_reward_fn": 0.004934210563078523,
	"rewards/format_reward": 0.4296875,
	"step": 32
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 478.7890625,
	"epoch": 0.5546218487394958,
	"grad_norm": 0.5116223692893982,
	"kl": 0.008532524108886719,
	"learning_rate": 1e-06,
	"loss": -0.0153,
	"num_tokens": 4338203.0,
	"reward": 0.4560032896697521,
	"reward_std": 0.12314211018383503,
	"rewards/curriculum_aware_reward_fn": 0.08881578966975212,
	"rewards/format_reward": 0.3671875,
	"step": 33
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 476.7890625,
	"epoch": 0.5714285714285714,
	"grad_norm": 0.43187472224235535,
	"kl": 0.007843017578125,
	"learning_rate": 1e-06,
	"loss": 0.0134,
	"num_tokens": 4461184.0,
	"reward": 0.4333881586790085,
	"reward_std": 0.12357822060585022,
	"rewards/curriculum_aware_reward_fn": 0.02713815774768591,
	"rewards/format_reward": 0.40625,
	"step": 34
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 529.7578125,
	"epoch": 0.5882352941176471,
	"grad_norm": 0.4466142952442169,
	"kl": 0.0057315826416015625,
	"learning_rate": 1e-06,
	"loss": -0.015,
	"num_tokens": 4590329.0,
	"reward": 0.426809199154377,
	"reward_std": 0.10671343095600605,
	"rewards/curriculum_aware_reward_fn": 0.059621710097417235,
	"rewards/format_reward": 0.3671875,
	"step": 35
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 521.9453125,
	"epoch": 0.6050420168067226,
	"grad_norm": 0.5088793635368347,
	"kl": 0.00739288330078125,
	"learning_rate": 1e-06,
	"loss": 0.0193,
	"num_tokens": 4717658.0,
	"reward": 0.5740131624042988,
	"reward_std": 0.09916227497160435,
	"rewards/curriculum_aware_reward_fn": 0.08182565809693187,
	"rewards/format_reward": 0.4921875,
	"step": 36
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 481.59375,
	"epoch": 0.6218487394957983,
	"grad_norm": 0.3755647540092468,
	"kl": 0.005794525146484375,
	"learning_rate": 1e-06,
	"loss": 0.0106,
	"num_tokens": 4837174.0,
	"reward": 0.5123355314135551,
	"reward_std": 0.023199534974992275,
	"rewards/curriculum_aware_reward_fn": 0.0748355258256197,
	"rewards/format_reward": 0.4375,
	"step": 37
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 465.1953125,
	"epoch": 0.6386554621848739,
	"grad_norm": 0.5442925691604614,
	"kl": 0.008731842041015625,
	"learning_rate": 1e-06,
	"loss": -0.0111,
	"num_tokens": 4953039.0,
	"reward": 0.7232730239629745,
	"reward_std": 0.1315580508671701,
	"rewards/curriculum_aware_reward_fn": 0.16077302338089794,
	"rewards/format_reward": 0.5625,
	"step": 38
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 501.484375,
	"epoch": 0.6554621848739496,
	"grad_norm": 0.4446295201778412,
	"kl": 0.00624847412109375,
	"learning_rate": 1e-06,
	"loss": 0.0286,
	"num_tokens": 5076965.0,
	"reward": 0.47327301651239395,
	"reward_std": 0.08440816402435303,
	"rewards/curriculum_aware_reward_fn": 0.09827302861958742,
	"rewards/format_reward": 0.375,
	"step": 39
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 539.15625,
	"epoch": 0.6722689075630253,
	"grad_norm": 0.37400856614112854,
	"kl": 0.005260467529296875,
	"learning_rate": 1e-06,
	"loss": 0.0044,
	"num_tokens": 5207185.0,
	"reward": 0.4745065679308027,
	"reward_std": 0.07072597183287144,
	"rewards/curriculum_aware_reward_fn": 0.09950657887384295,
	"rewards/format_reward": 0.375,
	"step": 40
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 492.0859375,
	"epoch": 0.6890756302521008,
	"grad_norm": 0.4103780686855316,
	"kl": 0.00856781005859375,
	"learning_rate": 1e-06,
	"loss": 0.0049,
	"num_tokens": 5328012.0,
	"reward": 0.71875,
	"reward_std": 0.10247145313769579,
	"rewards/curriculum_aware_reward_fn": 0.09375000419095159,
	"rewards/format_reward": 0.625,
	"step": 41
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 405.1328125,
	"epoch": 0.7058823529411765,
	"grad_norm": 0.6738374829292297,
	"kl": 0.0108184814453125,
	"learning_rate": 1e-06,
	"loss": 0.0454,
	"num_tokens": 5438933.0,
	"reward": 0.757401317358017,
	"reward_std": 0.212964728474617,
	"rewards/curriculum_aware_reward_fn": 0.1636513164266944,
	"rewards/format_reward": 0.59375,
	"step": 42
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 516.640625,
	"epoch": 0.7226890756302521,
	"grad_norm": 0.31194940209388733,
	"kl": 0.0074005126953125,
	"learning_rate": 1e-06,
	"loss": -0.0205,
	"num_tokens": 5563887.0,
	"reward": 0.6562500149011612,
	"reward_std": 0.04224720690399408,
	"rewards/curriculum_aware_reward_fn": 0.15625,
	"rewards/format_reward": 0.5,
	"step": 43
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 477.3984375,
	"epoch": 0.7394957983193278,
	"grad_norm": 0.38581541180610657,
	"kl": 0.00885009765625,
	"learning_rate": 1e-06,
	"loss": -0.0164,
	"num_tokens": 5688114.0,
	"reward": 0.6402138248085976,
	"reward_std": 0.08311590366065502,
	"rewards/curriculum_aware_reward_fn": 0.03083881549537182,
	"rewards/format_reward": 0.609375,
	"step": 44
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 521.8046875,
	"epoch": 0.7563025210084033,
	"grad_norm": 0.36903509497642517,
	"kl": 0.0078277587890625,
	"learning_rate": 1e-06,
	"loss": -0.0022,
	"num_tokens": 5814153.0,
	"reward": 0.5513980239629745,
	"reward_std": 0.06967925047501922,
	"rewards/curriculum_aware_reward_fn": 0.05139802524354309,
	"rewards/format_reward": 0.5,
	"step": 45
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 416.4296875,
	"epoch": 0.773109243697479,
	"grad_norm": 0.5821658968925476,
	"kl": 0.0094757080078125,
	"learning_rate": 1e-06,
	"loss": -0.0056,
	"num_tokens": 5923904.0,
	"reward": 0.7257401347160339,
	"reward_std": 0.13419464463368058,
	"rewards/curriculum_aware_reward_fn": 0.10074013192206621,
	"rewards/format_reward": 0.625,
	"step": 46
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 526.9296875,
	"epoch": 0.7899159663865546,
	"grad_norm": 0.449553519487381,
	"kl": 0.005664825439453125,
	"learning_rate": 1e-06,
	"loss": 0.0001,
	"num_tokens": 6053447.0,
	"reward": 0.4819078892469406,
	"reward_std": 0.09099963493645191,
	"rewards/curriculum_aware_reward_fn": 0.10690789669752121,
	"rewards/format_reward": 0.375,
	"step": 47
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 536.5,
	"epoch": 0.8067226890756303,
	"grad_norm": 0.5381475687026978,
	"kl": 0.008424758911132812,
	"learning_rate": 1e-06,
	"loss": 0.0099,
	"num_tokens": 6183559.0,
	"reward": 0.46833881735801697,
	"reward_std": 0.08668615715578198,
	"rewards/curriculum_aware_reward_fn": 0.03865131642669439,
	"rewards/format_reward": 0.4296875,
	"step": 48
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 539.6328125,
	"epoch": 0.8235294117647058,
	"grad_norm": 0.44155657291412354,
	"kl": 0.0077495574951171875,
	"learning_rate": 1e-06,
	"loss": 0.0085,
	"num_tokens": 6314544.0,
	"reward": 0.5526315867900848,
	"reward_std": 0.027912108227610588,
	"rewards/curriculum_aware_reward_fn": 0.11513157933950424,
	"rewards/format_reward": 0.4375,
	"step": 49
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 554.0546875,
	"epoch": 0.8403361344537815,
	"grad_norm": 0.4840262532234192,
	"kl": 0.0054950714111328125,
	"learning_rate": 1e-06,
	"loss": 0.0073,
	"num_tokens": 6445087.0,
	"reward": 0.33634869009256363,
	"reward_std": 0.10334387933835387,
	"rewards/curriculum_aware_reward_fn": 0.03166118450462818,
	"rewards/format_reward": 0.3046875,
	"step": 50
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 578.6796875,
	"epoch": 0.8571428571428571,
	"grad_norm": 0.30791598558425903,
	"kl": 0.005002021789550781,
	"learning_rate": 1e-06,
	"loss": 0.0068,
	"num_tokens": 6582878.0,
	"reward": 0.348684199154377,
	"reward_std": 0.07469352334737778,
	"rewards/curriculum_aware_reward_fn": 0.036184209398925304,
	"rewards/format_reward": 0.3125,
	"step": 51
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 448.1328125,
	"epoch": 0.8739495798319328,
	"grad_norm": 0.5027822852134705,
	"kl": 0.00795745849609375,
	"learning_rate": 1e-06,
	"loss": -0.0178,
	"num_tokens": 6698503.0,
	"reward": 0.6311677470803261,
	"reward_std": 0.11679959110915661,
	"rewards/curriculum_aware_reward_fn": 0.09210526384413242,
	"rewards/format_reward": 0.5390625,
	"step": 52
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 521.1875,
	"epoch": 0.8907563025210085,
	"grad_norm": 0.4084753394126892,
	"kl": 0.00714111328125,
	"learning_rate": 1e-06,
	"loss": 0.0193,
	"num_tokens": 6823951.0,
	"reward": 0.5028782784938812,
	"reward_std": 0.059696739073842764,
	"rewards/curriculum_aware_reward_fn": 0.06537829001899809,
	"rewards/format_reward": 0.4375,
	"step": 53
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 539.109375,
	"epoch": 0.907563025210084,
	"grad_norm": 0.2098054140806198,
	"kl": 0.007198333740234375,
	"learning_rate": 1e-06,
	"loss": 0.0114,
	"num_tokens": 6953317.0,
	"reward": 0.46052631735801697,
	"reward_std": 0.03168220818042755,
	"rewards/curriculum_aware_reward_fn": 0.023026317358016968,
	"rewards/format_reward": 0.4375,
	"step": 54
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 521.2265625,
	"epoch": 0.9243697478991597,
	"grad_norm": 0.4919142425060272,
	"kl": 0.007293701171875,
	"learning_rate": 1e-06,
	"loss": 0.0101,
	"num_tokens": 7079922.0,
	"reward": 0.49547697603702545,
	"reward_std": 0.10914274398237467,
	"rewards/curriculum_aware_reward_fn": 0.12047697883099318,
	"rewards/format_reward": 0.375,
	"step": 55
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 500.5625,
	"epoch": 0.9411764705882353,
	"grad_norm": 0.46875280141830444,
	"kl": 0.00684356689453125,
	"learning_rate": 1e-06,
	"loss": 0.0266,
	"num_tokens": 7206954.0,
	"reward": 0.40830591320991516,
	"reward_std": 0.1075905729085207,
	"rewards/curriculum_aware_reward_fn": 0.04111842066049576,
	"rewards/format_reward": 0.3671875,
	"step": 56
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 482.3046875,
	"epoch": 0.957983193277311,
	"grad_norm": 0.40924757719039917,
	"kl": 0.012725830078125,
	"learning_rate": 1e-06,
	"loss": 0.0041,
	"num_tokens": 7327857.0,
	"reward": 0.5958059132099152,
	"reward_std": 0.06403321353718638,
	"rewards/curriculum_aware_reward_fn": 0.04111842007841915,
	"rewards/format_reward": 0.5546875,
	"step": 57
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 445.375,
	"epoch": 0.9747899159663865,
	"grad_norm": 0.4467240273952484,
	"kl": 0.0105743408203125,
	"learning_rate": 1e-06,
	"loss": -0.0061,
	"num_tokens": 7440561.0,
	"reward": 0.7578125,
	"reward_std": 0.057358515448868275,
	"rewards/curriculum_aware_reward_fn": 0.13281250069849193,
	"rewards/format_reward": 0.625,
	"step": 58
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 582.3452377319336,
	"epoch": 0.9915966386554622,
	"grad_norm": 0.5007306933403015,
	"kl": 0.007415771484375,
	"learning_rate": 1e-06,
	"loss": 0.0029,
	"num_tokens": 7569086.0,
	"reward": 0.4514802545309067,
	"reward_std": 0.06341935088858008,
	"rewards/curriculum_aware_reward_fn": 0.0217927637277171,
	"rewards/format_reward": 0.4296875,
	"step": 59
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 553.09375,
	"epoch": 1.0168067226890756,
	"grad_norm": 0.4292355179786682,
	"kl": 0.005462646484375,
	"learning_rate": 1e-06,
	"loss": 0.0119,
	"num_tokens": 7702626.0,
	"reward": 0.4325658082962036,
	"reward_std": 0.07455102633684874,
	"rewards/curriculum_aware_reward_fn": 0.05756579013541341,
	"rewards/format_reward": 0.375,
	"step": 60
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 521.375,
	"epoch": 1.0336134453781514,
	"grad_norm": 0.41578003764152527,
	"kl": 0.008762359619140625,
	"learning_rate": 1e-06,
	"loss": -0.0056,
	"num_tokens": 7827818.0,
	"reward": 0.5082236900925636,
	"reward_std": 0.07253926200792193,
	"rewards/curriculum_aware_reward_fn": 0.07072368497028947,
	"rewards/format_reward": 0.4375,
	"step": 61
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 648.2734375,
	"epoch": 1.050420168067227,
	"grad_norm": 0.48642197251319885,
	"kl": 0.0062713623046875,
	"learning_rate": 1e-06,
	"loss": 0.0136,
	"num_tokens": 7974333.0,
	"reward": 0.3449835442006588,
	"reward_std": 0.07259867247194052,
	"rewards/curriculum_aware_reward_fn": 0.03248355258256197,
	"rewards/format_reward": 0.3125,
	"step": 62
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 436.9296875,
	"epoch": 1.0672268907563025,
	"grad_norm": 0.3184286653995514,
	"kl": 0.0114593505859375,
	"learning_rate": 1e-06,
	"loss": 0.0139,
	"num_tokens": 8084908.0,
	"reward": 0.6899671256542206,
	"reward_std": 0.0728745711967349,
	"rewards/curriculum_aware_reward_fn": 0.0649671049322933,
	"rewards/format_reward": 0.625,
	"step": 63
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 541.53125,
	"epoch": 1.084033613445378,
	"grad_norm": 0.16483676433563232,
	"kl": 0.0060882568359375,
	"learning_rate": 1e-06,
	"loss": -0.0032,
	"num_tokens": 8216696.0,
	"reward": 0.2627467066049576,
	"reward_std": 0.024391429498791695,
	"rewards/curriculum_aware_reward_fn": 0.012746710330247879,
	"rewards/format_reward": 0.25,
	"step": 64
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 509.7890625,
	"epoch": 1.1008403361344539,
	"grad_norm": 0.4256879985332489,
	"kl": 0.00730133056640625,
	"learning_rate": 1e-06,
	"loss": -0.0102,
	"num_tokens": 8342845.0,
	"reward": 0.5197368264198303,
	"reward_std": 0.030515023041516542,
	"rewards/curriculum_aware_reward_fn": 0.019736842485144734,
	"rewards/format_reward": 0.5,
	"step": 65
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 486.296875,
	"epoch": 1.1176470588235294,
	"grad_norm": 0.3091375231742859,
	"kl": 0.008016586303710938,
	"learning_rate": 1e-06,
	"loss": -0.003,
	"num_tokens": 8462971.0,
	"reward": 0.46299341320991516,
	"reward_std": 0.04847824294120073,
	"rewards/curriculum_aware_reward_fn": 0.025493420660495758,
	"rewards/format_reward": 0.4375,
	"step": 66
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 596.234375,
	"epoch": 1.134453781512605,
	"grad_norm": 0.4554305076599121,
	"kl": 0.006458282470703125,
	"learning_rate": 1e-06,
	"loss": 0.011,
	"num_tokens": 8598337.0,
	"reward": 0.3758223643526435,
	"reward_std": 0.08455474488437176,
	"rewards/curriculum_aware_reward_fn": 0.1258223680779338,
	"rewards/format_reward": 0.25,
	"step": 67
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 444.265625,
	"epoch": 1.1512605042016806,
	"grad_norm": 0.4700126349925995,
	"kl": 0.013336181640625,
	"learning_rate": 1e-06,
	"loss": -0.0065,
	"num_tokens": 8715091.0,
	"reward": 0.67434211820364,
	"reward_std": 0.12386543769389391,
	"rewards/curriculum_aware_reward_fn": 0.11965460598003119,
	"rewards/format_reward": 0.5546875,
	"step": 68
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 537.6484375,
	"epoch": 1.1680672268907564,
	"grad_norm": 0.5387859344482422,
	"kl": 0.0084075927734375,
	"learning_rate": 1e-06,
	"loss": 0.0113,
	"num_tokens": 8845582.0,
	"reward": 0.5822368338704109,
	"reward_std": 0.16140672331675887,
	"rewards/curriculum_aware_reward_fn": 0.10567433899268508,
	"rewards/format_reward": 0.4765625,
	"step": 69
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 571.25,
	"epoch": 1.184873949579832,
	"grad_norm": 0.28276559710502625,
	"kl": 0.005802154541015625,
	"learning_rate": 1e-06,
	"loss": 0.0147,
	"num_tokens": 8979574.0,
	"reward": 0.2606907826848328,
	"reward_std": 0.051840442698448896,
	"rewards/curriculum_aware_reward_fn": 0.018503289436921477,
	"rewards/format_reward": 0.2421875,
	"step": 70
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 495.6953125,
	"epoch": 1.2016806722689075,
	"grad_norm": 0.3467198312282562,
	"kl": 0.007556915283203125,
	"learning_rate": 1e-06,
	"loss": 0.0133,
	"num_tokens": 9104143.0,
	"reward": 0.5476973727345467,
	"reward_std": 0.0878668250516057,
	"rewards/curriculum_aware_reward_fn": 0.04769736947491765,
	"rewards/format_reward": 0.5,
	"step": 71
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 628.96875,
	"epoch": 1.2184873949579833,
	"grad_norm": 0.30438435077667236,
	"kl": 0.0047740936279296875,
	"learning_rate": 1e-06,
	"loss": -0.0058,
	"num_tokens": 9247579.0,
	"reward": 0.25863486528396606,
	"reward_std": 0.05783074861392379,
	"rewards/curriculum_aware_reward_fn": 0.016447368427179754,
	"rewards/format_reward": 0.2421875,
	"step": 72
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 581.7109375,
	"epoch": 1.2352941176470589,
	"grad_norm": 0.16290180385112762,
	"kl": 0.005523681640625,
	"learning_rate": 1e-06,
	"loss": 0.0069,
	"num_tokens": 9383094.0,
	"reward": 0.32195723056793213,
	"reward_std": 0.014439198188483715,
	"rewards/curriculum_aware_reward_fn": 0.009457237087190151,
	"rewards/format_reward": 0.3125,
	"step": 73
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 536.9609375,
	"epoch": 1.2521008403361344,
	"grad_norm": 1.2357046604156494,
	"kl": 0.170867919921875,
	"learning_rate": 1e-06,
	"loss": -0.0054,
	"num_tokens": 9513217.0,
	"reward": 0.582236819434911,
	"reward_std": 0.0510927583090961,
	"rewards/curriculum_aware_reward_fn": 0.01973684225231409,
	"rewards/format_reward": 0.5625,
	"step": 74
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 487.8671875,
	"epoch": 1.26890756302521,
	"grad_norm": 0.46429404616355896,
	"kl": 0.0113677978515625,
	"learning_rate": 1e-06,
	"loss": 0.0446,
	"num_tokens": 9635408.0,
	"reward": 0.726973682641983,
	"reward_std": 0.11705214250832796,
	"rewards/curriculum_aware_reward_fn": 0.10197368450462818,
	"rewards/format_reward": 0.625,
	"step": 75
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 584.296875,
	"epoch": 1.2857142857142856,
	"grad_norm": 0.42755427956581116,
	"kl": 0.00647735595703125,
	"learning_rate": 1e-06,
	"loss": 0.0307,
	"num_tokens": 9770998.0,
	"reward": 0.49136512726545334,
	"reward_std": 0.10772840678691864,
	"rewards/curriculum_aware_reward_fn": 0.06167763099074364,
	"rewards/format_reward": 0.4296875,
	"step": 76
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 429.296875,
	"epoch": 1.3025210084033614,
	"grad_norm": 0.45878008008003235,
	"kl": 0.01023101806640625,
	"learning_rate": 1e-06,
	"loss": 0.0156,
	"num_tokens": 9886868.0,
	"reward": 0.7347861528396606,
	"reward_std": 0.10009488789364696,
	"rewards/curriculum_aware_reward_fn": 0.06291118392255157,
	"rewards/format_reward": 0.671875,
	"step": 77
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 516.6328125,
	"epoch": 1.319327731092437,
	"grad_norm": 0.3113223910331726,
	"kl": 0.0077972412109375,
	"learning_rate": 1e-06,
	"loss": 0.0078,
	"num_tokens": 10011221.0,
	"reward": 0.5966282933950424,
	"reward_std": 0.041548303328454494,
	"rewards/curriculum_aware_reward_fn": 0.09662828780710697,
	"rewards/format_reward": 0.5,
	"step": 78
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 557.4765625,
	"epoch": 1.3361344537815127,
	"grad_norm": 0.33871227502822876,
	"kl": 0.0073699951171875,
	"learning_rate": 1e-06,
	"loss": 0.0191,
	"num_tokens": 10140970.0,
	"reward": 0.5415295958518982,
	"reward_std": 0.07458627689629793,
	"rewards/curriculum_aware_reward_fn": 0.04152960516512394,
	"rewards/format_reward": 0.5,
	"step": 79
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 564.125,
	"epoch": 1.3529411764705883,
	"grad_norm": 0.4491986930370331,
	"kl": 0.006259918212890625,
	"learning_rate": 1e-06,
	"loss": 0.0074,
	"num_tokens": 10271986.0,
	"reward": 0.5296052545309067,
	"reward_std": 0.1359914354979992,
	"rewards/curriculum_aware_reward_fn": 0.09210526291280985,
	"rewards/format_reward": 0.4375,
	"step": 80
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 503.125,
	"epoch": 1.3697478991596639,
	"grad_norm": 0.2838430404663086,
	"kl": 0.00777435302734375,
	"learning_rate": 1e-06,
	"loss": 0.0186,
	"num_tokens": 10398090.0,
	"reward": 0.6521381586790085,
	"reward_std": 0.05697542009875178,
	"rewards/curriculum_aware_reward_fn": 0.027138158679008484,
	"rewards/format_reward": 0.625,
	"step": 81
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 530.234375,
	"epoch": 1.3865546218487395,
	"grad_norm": 0.4765428602695465,
	"kl": 0.00778961181640625,
	"learning_rate": 1e-06,
	"loss": 0.0302,
	"num_tokens": 10526192.0,
	"reward": 0.6208881437778473,
	"reward_std": 0.12499829288572073,
	"rewards/curriculum_aware_reward_fn": 0.06620065588504076,
	"rewards/format_reward": 0.5546875,
	"step": 82
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 566.84375,
	"epoch": 1.403361344537815,
	"grad_norm": 0.4760180711746216,
	"kl": 0.0066986083984375,
	"learning_rate": 1e-06,
	"loss": 0.0084,
	"num_tokens": 10657412.0,
	"reward": 0.46916117519140244,
	"reward_std": 0.10547287575900555,
	"rewards/curriculum_aware_reward_fn": 0.09416118310764432,
	"rewards/format_reward": 0.375,
	"step": 83
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 560.7734375,
	"epoch": 1.4201680672268908,
	"grad_norm": 0.27778276801109314,
	"kl": 0.005718231201171875,
	"learning_rate": 1e-06,
	"loss": 0.0127,
	"num_tokens": 10788255.0,
	"reward": 0.44736841320991516,
	"reward_std": 0.06990169547498226,
	"rewards/curriculum_aware_reward_fn": 0.07236842112615705,
	"rewards/format_reward": 0.375,
	"step": 84
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 484.9296875,
	"epoch": 1.4369747899159664,
	"grad_norm": 0.34481725096702576,
	"kl": 0.02048492431640625,
	"learning_rate": 1e-06,
	"loss": -0.0032,
	"num_tokens": 10911166.0,
	"reward": 0.7388980239629745,
	"reward_std": 0.08143611438572407,
	"rewards/curriculum_aware_reward_fn": 0.1138980237301439,
	"rewards/format_reward": 0.625,
	"step": 85
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 457.71875,
	"epoch": 1.453781512605042,
	"grad_norm": 0.4829816222190857,
	"kl": 0.0100555419921875,
	"learning_rate": 1e-06,
	"loss": -0.0016,
	"num_tokens": 11027554.0,
	"reward": 0.6735197305679321,
	"reward_std": 0.08864451944828033,
	"rewards/curriculum_aware_reward_fn": 0.11101973801851273,
	"rewards/format_reward": 0.5625,
	"step": 86
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 508.6953125,
	"epoch": 1.4705882352941178,
	"grad_norm": 0.5016542077064514,
	"kl": 0.00922393798828125,
	"learning_rate": 1e-06,
	"loss": 0.01,
	"num_tokens": 11149275.0,
	"reward": 0.6870888322591782,
	"reward_std": 0.08495050063356757,
	"rewards/curriculum_aware_reward_fn": 0.12458881677594036,
	"rewards/format_reward": 0.5625,
	"step": 87
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 602.9921875,
	"epoch": 1.4873949579831933,
	"grad_norm": 0.29301658272743225,
	"kl": 0.004894256591796875,
	"learning_rate": 1e-06,
	"loss": 0.0249,
	"num_tokens": 11288106.0,
	"reward": 0.29481907188892365,
	"reward_std": 0.0620402698405087,
	"rewards/curriculum_aware_reward_fn": 0.04481907875742763,
	"rewards/format_reward": 0.25,
	"step": 88
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 438.71875,
	"epoch": 1.504201680672269,
	"grad_norm": 0.5715950727462769,
	"kl": 0.01503753662109375,
	"learning_rate": 1e-06,
	"loss": -0.0041,
	"num_tokens": 11401118.0,
	"reward": 0.8972039222717285,
	"reward_std": 0.10221139155328274,
	"rewards/curriculum_aware_reward_fn": 0.1472039446234703,
	"rewards/format_reward": 0.75,
	"step": 89
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 547.015625,
	"epoch": 1.5210084033613445,
	"grad_norm": 0.31229323148727417,
	"kl": 0.0074462890625,
	"learning_rate": 1e-06,
	"loss": 0.0001,
	"num_tokens": 11531400.0,
	"reward": 0.5945723652839661,
	"reward_std": 0.05676991865038872,
	"rewards/curriculum_aware_reward_fn": 0.15707236900925636,
	"rewards/format_reward": 0.4375,
	"step": 90
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 599.9375,
	"epoch": 1.53781512605042,
	"grad_norm": 0.3754754066467285,
	"kl": 0.005001068115234375,
	"learning_rate": 1e-06,
	"loss": 0.0337,
	"num_tokens": 11667224.0,
	"reward": 0.4358552396297455,
	"reward_std": 0.1078398427926004,
	"rewards/curriculum_aware_reward_fn": 0.060855261399410665,
	"rewards/format_reward": 0.375,
	"step": 91
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 563.3671875,
	"epoch": 1.5546218487394958,
	"grad_norm": 0.44682905077934265,
	"kl": 0.00695037841796875,
	"learning_rate": 1e-06,
	"loss": 0.0261,
	"num_tokens": 11800087.0,
	"reward": 0.47820721566677094,
	"reward_std": 0.11488656094297767,
	"rewards/curriculum_aware_reward_fn": 0.10320723743643612,
	"rewards/format_reward": 0.375,
	"step": 92
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 524.7578125,
	"epoch": 1.5714285714285714,
	"grad_norm": 0.4093223214149475,
	"kl": 0.0079803466796875,
	"learning_rate": 1e-06,
	"loss": -0.0003,
	"num_tokens": 11927808.0,
	"reward": 0.5555098727345467,
	"reward_std": 0.10677139926701784,
	"rewards/curriculum_aware_reward_fn": 0.06332236900925636,
	"rewards/format_reward": 0.4921875,
	"step": 93
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 612.375,
	"epoch": 1.5882352941176472,
	"grad_norm": 0.28754857182502747,
	"kl": 0.004489898681640625,
	"learning_rate": 1e-06,
	"loss": -0.0277,
	"num_tokens": 12069560.0,
	"reward": 0.3371710553765297,
	"reward_std": 0.050214093178510666,
	"rewards/curriculum_aware_reward_fn": 0.02467105258256197,
	"rewards/format_reward": 0.3125,
	"step": 94
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 383.2578125,
	"epoch": 1.6050420168067228,
	"grad_norm": 0.47502318024635315,
	"kl": 0.0126953125,
	"learning_rate": 1e-06,
	"loss": 0.016,
	"num_tokens": 12176625.0,
	"reward": 0.7224506512284279,
	"reward_std": 0.10677911480888724,
	"rewards/curriculum_aware_reward_fn": 0.10526315728202462,
	"rewards/format_reward": 0.6171875,
	"step": 95
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 578.078125,
	"epoch": 1.6218487394957983,
	"grad_norm": 0.34693828225135803,
	"kl": 0.006988525390625,
	"learning_rate": 1e-06,
	"loss": 0.006,
	"num_tokens": 12310939.0,
	"reward": 0.5254934206604958,
	"reward_std": 0.06210480257868767,
	"rewards/curriculum_aware_reward_fn": 0.08799342112615705,
	"rewards/format_reward": 0.4375,
	"step": 96
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 529.828125,
	"epoch": 1.638655462184874,
	"grad_norm": 2.9580295085906982,
	"kl": 0.21123504638671875,
	"learning_rate": 1e-06,
	"loss": 0.0019,
	"num_tokens": 12436949.0,
	"reward": 0.5230263099074364,
	"reward_std": 0.13364601507782936,
	"rewards/curriculum_aware_reward_fn": 0.11677631549537182,
	"rewards/format_reward": 0.40625,
	"step": 97
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 470.8828125,
	"epoch": 1.6554621848739495,
	"grad_norm": 0.39620673656463623,
	"kl": 0.00954437255859375,
	"learning_rate": 1e-06,
	"loss": -0.0048,
	"num_tokens": 12558190.0,
	"reward": 0.8194901347160339,
	"reward_std": 0.09049705043435097,
	"rewards/curriculum_aware_reward_fn": 0.26480263471603394,
	"rewards/format_reward": 0.5546875,
	"step": 98
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 495.3515625,
	"epoch": 1.6722689075630253,
	"grad_norm": 0.5109691619873047,
	"kl": 0.007015228271484375,
	"learning_rate": 1e-06,
	"loss": 0.0351,
	"num_tokens": 12681859.0,
	"reward": 0.4362664595246315,
	"reward_std": 0.0971333347260952,
	"rewards/curriculum_aware_reward_fn": 0.1237664483487606,
	"rewards/format_reward": 0.3125,
	"step": 99
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 478.0703125,
	"epoch": 1.6890756302521008,
	"grad_norm": 0.4189630150794983,
	"kl": 0.0095977783203125,
	"learning_rate": 1e-06,
	"loss": 0.0011,
	"num_tokens": 12801148.0,
	"reward": 0.6920230239629745,
	"reward_std": 0.10883715003728867,
	"rewards/curriculum_aware_reward_fn": 0.19983552768826485,
	"rewards/format_reward": 0.4921875,
	"step": 100
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 460.6875,
	"epoch": 1.7058823529411766,
	"grad_norm": 0.5282026529312134,
	"kl": 0.007904052734375,
	"learning_rate": 1e-06,
	"loss": -0.0042,
	"num_tokens": 12921692.0,
	"reward": 0.3396381661295891,
	"reward_std": 0.11080991290509701,
	"rewards/curriculum_aware_reward_fn": 0.04276315798051655,
	"rewards/format_reward": 0.296875,
	"step": 101
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 554.40625,
	"epoch": 1.7226890756302522,
	"grad_norm": 0.5177521109580994,
	"kl": 0.01079559326171875,
	"learning_rate": 1e-06,
	"loss": -0.009,
	"num_tokens": 13052136.0,
	"reward": 0.36965460516512394,
	"reward_std": 0.10201659612357616,
	"rewards/curriculum_aware_reward_fn": 0.01809210516512394,
	"rewards/format_reward": 0.3515625,
	"step": 102
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 524.421875,
	"epoch": 1.7394957983193278,
	"grad_norm": 0.44328662753105164,
	"kl": 0.008655548095703125,
	"learning_rate": 1e-06,
	"loss": 0.0114,
	"num_tokens": 13178822.0,
	"reward": 0.5349506437778473,
	"reward_std": 0.12413342297077179,
	"rewards/curriculum_aware_reward_fn": 0.058388158679008484,
	"rewards/format_reward": 0.4765625,
	"step": 103
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 446.46875,
	"epoch": 1.7563025210084033,
	"grad_norm": 0.647972583770752,
	"kl": 0.01692962646484375,
	"learning_rate": 1e-06,
	"loss": 0.0047,
	"num_tokens": 13297402.0,
	"reward": 0.6476151421666145,
	"reward_std": 0.22924628667533398,
	"rewards/curriculum_aware_reward_fn": 0.07730263285338879,
	"rewards/format_reward": 0.5703125,
	"step": 104
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 503.703125,
	"epoch": 1.773109243697479,
	"grad_norm": 0.631151556968689,
	"kl": 0.008514404296875,
	"learning_rate": 1e-06,
	"loss": 0.0008,
	"num_tokens": 13418340.0,
	"reward": 0.46299341320991516,
	"reward_std": 0.2022387906908989,
	"rewards/curriculum_aware_reward_fn": 0.06455592066049576,
	"rewards/format_reward": 0.3984375,
	"step": 105
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 562.03125,
	"epoch": 1.7899159663865545,
	"grad_norm": 0.3566150963306427,
	"kl": 0.006641387939453125,
	"learning_rate": 1e-06,
	"loss": -0.0002,
	"num_tokens": 13550952.0,
	"reward": 0.35773025802336633,
	"reward_std": 0.09330996312201023,
	"rewards/curriculum_aware_reward_fn": 0.05304276151582599,
	"rewards/format_reward": 0.3046875,
	"step": 106
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 539.1796875,
	"epoch": 1.8067226890756303,
	"grad_norm": 0.4120214581489563,
	"kl": 0.00933074951171875,
	"learning_rate": 1e-06,
	"loss": 0.0026,
	"num_tokens": 13678999.0,
	"reward": 0.5435855314135551,
	"reward_std": 0.15557273291051388,
	"rewards/curriculum_aware_reward_fn": 0.12171052396297455,
	"rewards/format_reward": 0.421875,
	"step": 107
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 542.1796875,
	"epoch": 1.8235294117647058,
	"grad_norm": 0.36332470178604126,
	"kl": 0.00751495361328125,
	"learning_rate": 1e-06,
	"loss": 0.0098,
	"num_tokens": 13811206.0,
	"reward": 0.48643091320991516,
	"reward_std": 0.13410842791199684,
	"rewards/curriculum_aware_reward_fn": 0.11924342392012477,
	"rewards/format_reward": 0.3671875,
	"step": 108
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 553.953125,
	"epoch": 1.8403361344537816,
	"grad_norm": 0.3152480721473694,
	"kl": 0.00626373291015625,
	"learning_rate": 1e-06,
	"loss": 0.0016,
	"num_tokens": 13945808.0,
	"reward": 0.3293585479259491,
	"reward_std": 0.045257058925926685,
	"rewards/curriculum_aware_reward_fn": 0.016858553048223257,
	"rewards/format_reward": 0.3125,
	"step": 109
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 573.59375,
	"epoch": 1.8571428571428572,
	"grad_norm": 0.2340080589056015,
	"kl": 0.00682830810546875,
	"learning_rate": 1e-06,
	"loss": 0.0071,
	"num_tokens": 14080460.0,
	"reward": 0.3347039520740509,
	"reward_std": 0.038679007440805435,
	"rewards/curriculum_aware_reward_fn": 0.02220394741743803,
	"rewards/format_reward": 0.3125,
	"step": 110
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 463.0390625,
	"epoch": 1.8739495798319328,
	"grad_norm": 0.36526933312416077,
	"kl": 0.009578704833984375,
	"learning_rate": 1e-06,
	"loss": 0.0039,
	"num_tokens": 14201065.0,
	"reward": 0.6328125149011612,
	"reward_std": 0.05027205403894186,
	"rewards/curriculum_aware_reward_fn": 0.07031250046566129,
	"rewards/format_reward": 0.5625,
	"step": 111
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 481.9609375,
	"epoch": 1.8907563025210083,
	"grad_norm": 0.4954119324684143,
	"kl": 0.0100555419921875,
	"learning_rate": 1e-06,
	"loss": 0.0005,
	"num_tokens": 14323068.0,
	"reward": 0.5254934206604958,
	"reward_std": 0.12779070809483528,
	"rewards/curriculum_aware_reward_fn": 0.08799342159181833,
	"rewards/format_reward": 0.4375,
	"step": 112
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 494.96875,
	"epoch": 1.907563025210084,
	"grad_norm": 0.46778982877731323,
	"kl": 0.00978851318359375,
	"learning_rate": 1e-06,
	"loss": 0.0199,
	"num_tokens": 14447008.0,
	"reward": 0.5370065793395042,
	"reward_std": 0.1048955712467432,
	"rewards/curriculum_aware_reward_fn": 0.09950657980516553,
	"rewards/format_reward": 0.4375,
	"step": 113
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 501.6796875,
	"epoch": 1.9243697478991597,
	"grad_norm": 0.3055194616317749,
	"kl": 0.00933074951171875,
	"learning_rate": 1e-06,
	"loss": 0.0032,
	"num_tokens": 14571103.0,
	"reward": 0.5111019909381866,
	"reward_std": 0.024554526433348656,
	"rewards/curriculum_aware_reward_fn": 0.08141447440721095,
	"rewards/format_reward": 0.4296875,
	"step": 114
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 508.8203125,
	"epoch": 1.9411764705882353,
	"grad_norm": 0.4632183611392975,
	"kl": 0.012451171875,
	"learning_rate": 1e-06,
	"loss": 0.0157,
	"num_tokens": 14694424.0,
	"reward": 0.6089638024568558,
	"reward_std": 0.10860061645507812,
	"rewards/curriculum_aware_reward_fn": 0.11677631549537182,
	"rewards/format_reward": 0.4921875,
	"step": 115
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 495.6875,
	"epoch": 1.957983193277311,
	"grad_norm": 0.41369161009788513,
	"kl": 0.0089874267578125,
	"learning_rate": 1e-06,
	"loss": 0.0212,
	"num_tokens": 14819792.0,
	"reward": 0.4621710479259491,
	"reward_std": 0.07010683044791222,
	"rewards/curriculum_aware_reward_fn": 0.0871710479259491,
	"rewards/format_reward": 0.375,
	"step": 116
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 529.359375,
	"epoch": 1.9747899159663866,
	"grad_norm": 0.40478190779685974,
	"kl": 0.012042999267578125,
	"learning_rate": 1e-06,
	"loss": 0.0388,
	"num_tokens": 14946718.0,
	"reward": 0.48190788179636,
	"reward_std": 0.10751516558229923,
	"rewards/curriculum_aware_reward_fn": 0.11472039762884378,
	"rewards/format_reward": 0.3671875,
	"step": 117
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 493.4881134033203,
	"epoch": 1.9915966386554622,
	"grad_norm": 0.3562357425689697,
	"kl": 0.0123748779296875,
	"learning_rate": 1e-06,
	"loss": 0.0141,
	"num_tokens": 15064457.0,
	"reward": 0.6706414446234703,
	"reward_std": 0.101046122610569,
	"rewards/curriculum_aware_reward_fn": 0.05345394788309932,
	"rewards/format_reward": 0.6171875,
	"step": 118
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 517.7578125,
	"epoch": 2.0168067226890756,
	"grad_norm": 0.3487071394920349,
	"kl": 0.0104217529296875,
	"learning_rate": 1e-06,
	"loss": 0.0163,
	"num_tokens": 15191538.0,
	"reward": 0.5201480090618134,
	"reward_std": 0.04716231161728501,
	"rewards/curriculum_aware_reward_fn": 0.02014802652411163,
	"rewards/format_reward": 0.5,
	"step": 119
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 577.765625,
	"epoch": 2.033613445378151,
	"grad_norm": 0.35752227902412415,
	"kl": 0.008148193359375,
	"learning_rate": 1e-06,
	"loss": 0.0108,
	"num_tokens": 15327204.0,
	"reward": 0.42763157933950424,
	"reward_std": 0.09388388879597187,
	"rewards/curriculum_aware_reward_fn": 0.05263157933950424,
	"rewards/format_reward": 0.375,
	"step": 120
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 460.53125,
	"epoch": 2.0504201680672267,
	"grad_norm": 0.5020465850830078,
	"kl": 0.014190673828125,
	"learning_rate": 1e-06,
	"loss": 0.0113,
	"num_tokens": 15447608.0,
	"reward": 0.693256601691246,
	"reward_std": 0.12680460885167122,
	"rewards/curriculum_aware_reward_fn": 0.06825657840818167,
	"rewards/format_reward": 0.625,
	"step": 121
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 526.7890625,
	"epoch": 2.0672268907563027,
	"grad_norm": 0.33090242743492126,
	"kl": 0.00830841064453125,
	"learning_rate": 1e-06,
	"loss": 0.0212,
	"num_tokens": 15577021.0,
	"reward": 0.3022203971631825,
	"reward_std": 0.052566134836524725,
	"rewards/curriculum_aware_reward_fn": 0.0522203971631825,
	"rewards/format_reward": 0.25,
	"step": 122
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 465.390625,
	"epoch": 2.0840336134453783,
	"grad_norm": 0.25564736127853394,
	"kl": 0.018894195556640625,
	"learning_rate": 1e-06,
	"loss": 0.001,
	"num_tokens": 15693543.0,
	"reward": 0.5879934281110764,
	"reward_std": 0.03513536183163524,
	"rewards/curriculum_aware_reward_fn": 0.15830592159181833,
	"rewards/format_reward": 0.4296875,
	"step": 123
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 438.015625,
	"epoch": 2.100840336134454,
	"grad_norm": 0.5210288763046265,
	"kl": 0.0128936767578125,
	"learning_rate": 1e-06,
	"loss": 0.038,
	"num_tokens": 15805441.0,
	"reward": 0.7685032784938812,
	"reward_std": 0.15490676742047071,
	"rewards/curriculum_aware_reward_fn": 0.20600328128784895,
	"rewards/format_reward": 0.5625,
	"step": 124
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 419.3515625,
	"epoch": 2.1176470588235294,
	"grad_norm": 0.48274165391921997,
	"kl": 0.01959228515625,
	"learning_rate": 1e-06,
	"loss": 0.0226,
	"num_tokens": 15913862.0,
	"reward": 0.671875,
	"reward_std": 0.11604671645909548,
	"rewards/curriculum_aware_reward_fn": 0.10937500139698386,
	"rewards/format_reward": 0.5625,
	"step": 125
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 520.8203125,
	"epoch": 2.134453781512605,
	"grad_norm": 0.35000789165496826,
	"kl": 0.0090179443359375,
	"learning_rate": 1e-06,
	"loss": -0.0026,
	"num_tokens": 16041007.0,
	"reward": 0.49794407607987523,
	"reward_std": 0.10071868449449539,
	"rewards/curriculum_aware_reward_fn": 0.12294407980516553,
	"rewards/format_reward": 0.375,
	"step": 126
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 557.6328125,
	"epoch": 2.1512605042016806,
	"grad_norm": 0.5103374719619751,
	"kl": 0.0096435546875,
	"learning_rate": 1e-06,
	"loss": 0.0051,
	"num_tokens": 16173728.0,
	"reward": 0.45641446858644485,
	"reward_std": 0.10976400738582015,
	"rewards/curriculum_aware_reward_fn": 0.08141447091475129,
	"rewards/format_reward": 0.375,
	"step": 127
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 414.9140625,
	"epoch": 2.168067226890756,
	"grad_norm": 0.43994390964508057,
	"kl": 0.014190673828125,
	"learning_rate": 1e-06,
	"loss": -0.0002,
	"num_tokens": 16285445.0,
	"reward": 0.7236842215061188,
	"reward_std": 0.11914092372171581,
	"rewards/curriculum_aware_reward_fn": 0.09868421289138496,
	"rewards/format_reward": 0.625,
	"step": 128
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 567.921875,
	"epoch": 2.184873949579832,
	"grad_norm": 0.319624662399292,
	"kl": 0.0082244873046875,
	"learning_rate": 1e-06,
	"loss": 0.0285,
	"num_tokens": 16420019.0,
	"reward": 0.4259868264198303,
	"reward_std": 0.05608854768797755,
	"rewards/curriculum_aware_reward_fn": 0.11348683759570122,
	"rewards/format_reward": 0.3125,
	"step": 129
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 463.53125,
	"epoch": 2.2016806722689077,
	"grad_norm": 0.359430193901062,
	"kl": 0.014495849609375,
	"learning_rate": 1e-06,
	"loss": 0.0185,
	"num_tokens": 16541143.0,
	"reward": 0.4699835479259491,
	"reward_std": 0.08584295958280563,
	"rewards/curriculum_aware_reward_fn": 0.0949835516512394,
	"rewards/format_reward": 0.375,
	"step": 130
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 469.609375,
	"epoch": 2.2184873949579833,
	"grad_norm": 0.41892191767692566,
	"kl": 0.0117034912109375,
	"learning_rate": 1e-06,
	"loss": 0.0365,
	"num_tokens": 16662909.0,
	"reward": 0.5522204041481018,
	"reward_std": 0.0973742357455194,
	"rewards/curriculum_aware_reward_fn": 0.052220395184122026,
	"rewards/format_reward": 0.5,
	"step": 131
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 540.5703125,
	"epoch": 2.235294117647059,
	"grad_norm": 0.48490580916404724,
	"kl": 0.0093231201171875,
	"learning_rate": 1e-06,
	"loss": -0.0203,
	"num_tokens": 16795070.0,
	"reward": 0.41324013471603394,
	"reward_std": 0.08475807495415211,
	"rewards/curriculum_aware_reward_fn": 0.038240132853388786,
	"rewards/format_reward": 0.375,
	"step": 132
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 525.1796875,
	"epoch": 2.2521008403361344,
	"grad_norm": 0.4449516832828522,
	"kl": 0.0105438232421875,
	"learning_rate": 1e-06,
	"loss": 0.0023,
	"num_tokens": 16923613.0,
	"reward": 0.5604440867900848,
	"reward_std": 0.1288975402712822,
	"rewards/curriculum_aware_reward_fn": 0.12294407933950424,
	"rewards/format_reward": 0.4375,
	"step": 133
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 476.3125,
	"epoch": 2.26890756302521,
	"grad_norm": 0.4340604543685913,
	"kl": 0.01129150390625,
	"learning_rate": 1e-06,
	"loss": -0.028,
	"num_tokens": 17045693.0,
	"reward": 0.5587993413209915,
	"reward_std": 0.09385511744767427,
	"rewards/curriculum_aware_reward_fn": 0.058799343183636665,
	"rewards/format_reward": 0.5,
	"step": 134
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 420.9921875,
	"epoch": 2.2857142857142856,
	"grad_norm": 0.45602235198020935,
	"kl": 0.01416015625,
	"learning_rate": 1e-06,
	"loss": 0.0042,
	"num_tokens": 17154012.0,
	"reward": 0.7602795735001564,
	"reward_std": 0.09590415796265006,
	"rewards/curriculum_aware_reward_fn": 0.14309210563078523,
	"rewards/format_reward": 0.6171875,
	"step": 135
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 489.6640625,
	"epoch": 2.302521008403361,
	"grad_norm": 0.4504002332687378,
	"kl": 0.0130157470703125,
	"learning_rate": 1e-06,
	"loss": -0.0126,
	"num_tokens": 17274481.0,
	"reward": 0.6295230239629745,
	"reward_std": 0.15420474018901587,
	"rewards/curriculum_aware_reward_fn": 0.13733552629128098,
	"rewards/format_reward": 0.4921875,
	"step": 136
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 492.3046875,
	"epoch": 2.3193277310924367,
	"grad_norm": 0.3228984773159027,
	"kl": 0.0111846923828125,
	"learning_rate": 1e-06,
	"loss": -0.004,
	"num_tokens": 17399360.0,
	"reward": 0.5587993413209915,
	"reward_std": 0.0586426155641675,
	"rewards/curriculum_aware_reward_fn": 0.05879934271797538,
	"rewards/format_reward": 0.5,
	"step": 137
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 508.5625,
	"epoch": 2.3361344537815127,
	"grad_norm": 0.3110595643520355,
	"kl": 0.015472412109375,
	"learning_rate": 1e-06,
	"loss": -0.0107,
	"num_tokens": 17521248.0,
	"reward": 0.546875,
	"reward_std": 0.07312605157494545,
	"rewards/curriculum_aware_reward_fn": 0.0546875,
	"rewards/format_reward": 0.4921875,
	"step": 138
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 574.015625,
	"epoch": 2.3529411764705883,
	"grad_norm": 0.4071909487247467,
	"kl": 0.0107421875,
	"learning_rate": 1e-06,
	"loss": -0.0034,
	"num_tokens": 17659522.0,
	"reward": 0.47450655698776245,
	"reward_std": 0.07414581999182701,
	"rewards/curriculum_aware_reward_fn": 0.03700657980516553,
	"rewards/format_reward": 0.4375,
	"step": 139
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 522.5546875,
	"epoch": 2.369747899159664,
	"grad_norm": 0.34431034326553345,
	"kl": 0.00946044921875,
	"learning_rate": 1e-06,
	"loss": -0.0068,
	"num_tokens": 17788537.0,
	"reward": 0.4099506512284279,
	"reward_std": 0.05903024738654494,
	"rewards/curriculum_aware_reward_fn": 0.0349506571656093,
	"rewards/format_reward": 0.375,
	"step": 140
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 553.2421875,
	"epoch": 2.3865546218487395,
	"grad_norm": 0.4213170111179352,
	"kl": 0.009979248046875,
	"learning_rate": 1e-06,
	"loss": 0.0132,
	"num_tokens": 17918288.0,
	"reward": 0.4177631586790085,
	"reward_std": 0.08044615527614951,
	"rewards/curriculum_aware_reward_fn": 0.042763158096931875,
	"rewards/format_reward": 0.375,
	"step": 141
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 559.3203125,
	"epoch": 2.403361344537815,
	"grad_norm": 0.23342828452587128,
	"kl": 0.008510589599609375,
	"learning_rate": 1e-06,
	"loss": -0.0035,
	"num_tokens": 18052169.0,
	"reward": 0.3762335553765297,
	"reward_std": 0.03740033693611622,
	"rewards/curriculum_aware_reward_fn": 0.07154605258256197,
	"rewards/format_reward": 0.3046875,
	"step": 142
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 493.078125,
	"epoch": 2.4201680672268906,
	"grad_norm": 0.4362901449203491,
	"kl": 0.012481689453125,
	"learning_rate": 1e-06,
	"loss": 0.0386,
	"num_tokens": 18177251.0,
	"reward": 0.5805921033024788,
	"reward_std": 0.12307591829448938,
	"rewards/curriculum_aware_reward_fn": 0.08840460516512394,
	"rewards/format_reward": 0.4921875,
	"step": 143
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 435.65625,
	"epoch": 2.4369747899159666,
	"grad_norm": 0.6844424605369568,
	"kl": 0.0600128173828125,
	"learning_rate": 1e-06,
	"loss": 0.0209,
	"num_tokens": 18292007.0,
	"reward": 0.6208881735801697,
	"reward_std": 0.15131067298352718,
	"rewards/curriculum_aware_reward_fn": 0.12088816147297621,
	"rewards/format_reward": 0.5,
	"step": 144
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 490.296875,
	"epoch": 2.453781512605042,
	"grad_norm": 0.30699044466018677,
	"kl": 0.010986328125,
	"learning_rate": 1e-06,
	"loss": 0.0248,
	"num_tokens": 18415301.0,
	"reward": 0.49424342066049576,
	"reward_std": 0.04014611290767789,
	"rewards/curriculum_aware_reward_fn": 0.05674342147540301,
	"rewards/format_reward": 0.4375,
	"step": 145
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 471.984375,
	"epoch": 2.4705882352941178,
	"grad_norm": 0.403209924697876,
	"kl": 0.0122528076171875,
	"learning_rate": 1e-06,
	"loss": 0.0353,
	"num_tokens": 18532667.0,
	"reward": 0.6027960330247879,
	"reward_std": 0.0935791190713644,
	"rewards/curriculum_aware_reward_fn": 0.1027960553765297,
	"rewards/format_reward": 0.5,
	"step": 146
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 422.703125,
	"epoch": 2.4873949579831933,
	"grad_norm": 0.42733973264694214,
	"kl": 0.0163116455078125,
	"learning_rate": 1e-06,
	"loss": -0.002,
	"num_tokens": 18645941.0,
	"reward": 0.7845394462347031,
	"reward_std": 0.0871797576546669,
	"rewards/curriculum_aware_reward_fn": 0.0345394741743803,
	"rewards/format_reward": 0.75,
	"step": 147
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 548.828125,
	"epoch": 2.504201680672269,
	"grad_norm": 0.2545667290687561,
	"kl": 0.01213836669921875,
	"learning_rate": 1e-06,
	"loss": 0.0089,
	"num_tokens": 18774111.0,
	"reward": 0.539473682641983,
	"reward_std": 0.060992954298853874,
	"rewards/curriculum_aware_reward_fn": 0.10197368077933788,
	"rewards/format_reward": 0.4375,
	"step": 148
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 540.0703125,
	"epoch": 2.5210084033613445,
	"grad_norm": 0.3914143145084381,
	"kl": 0.00789642333984375,
	"learning_rate": 1e-06,
	"loss": -0.0052,
	"num_tokens": 18904992.0,
	"reward": 0.27878287341445684,
	"reward_std": 0.06910991575568914,
	"rewards/curriculum_aware_reward_fn": 0.02878289413638413,
	"rewards/format_reward": 0.25,
	"step": 149
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 550.9375,
	"epoch": 2.53781512605042,
	"grad_norm": 0.2912365794181824,
	"kl": 0.00799560546875,
	"learning_rate": 1e-06,
	"loss": 0.0067,
	"num_tokens": 19035968.0,
	"reward": 0.3215460553765297,
	"reward_std": 0.01937512680888176,
	"rewards/curriculum_aware_reward_fn": 0.07154605211690068,
	"rewards/format_reward": 0.25,
	"step": 150
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 471.1640625,
	"epoch": 2.5546218487394956,
	"grad_norm": 0.3965752124786377,
	"kl": 0.01221466064453125,
	"learning_rate": 1e-06,
	"loss": 0.0062,
	"num_tokens": 19153861.0,
	"reward": 0.582648016512394,
	"reward_std": 0.08400850929319859,
	"rewards/curriculum_aware_reward_fn": 0.08264802652411163,
	"rewards/format_reward": 0.5,
	"step": 151
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 421.234375,
	"epoch": 2.571428571428571,
	"grad_norm": 0.6044662594795227,
	"kl": 0.026885986328125,
	"learning_rate": 1e-06,
	"loss": 0.0165,
	"num_tokens": 19265379.0,
	"reward": 0.8112664222717285,
	"reward_std": 0.1459241509437561,
	"rewards/curriculum_aware_reward_fn": 0.19407895021140575,
	"rewards/format_reward": 0.6171875,
	"step": 152
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 546.8671875,
	"epoch": 2.588235294117647,
	"grad_norm": 0.4222107231616974,
	"kl": 0.01050567626953125,
	"learning_rate": 1e-06,
	"loss": 0.0261,
	"num_tokens": 19396626.0,
	"reward": 0.38733551651239395,
	"reward_std": 0.06776260398328304,
	"rewards/curriculum_aware_reward_fn": 0.02014802605845034,
	"rewards/format_reward": 0.3671875,
	"step": 153
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 548.875,
	"epoch": 2.6050420168067228,
	"grad_norm": 0.30043891072273254,
	"kl": 0.010498046875,
	"learning_rate": 1e-06,
	"loss": 0.0291,
	"num_tokens": 19531202.0,
	"reward": 0.28166119009256363,
	"reward_std": 0.07623977493494749,
	"rewards/curriculum_aware_reward_fn": 0.031661184038966894,
	"rewards/format_reward": 0.25,
	"step": 154
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 560.640625,
	"epoch": 2.6218487394957983,
	"grad_norm": 0.39753058552742004,
	"kl": 0.0109710693359375,
	"learning_rate": 1e-06,
	"loss": 0.0213,
	"num_tokens": 19665404.0,
	"reward": 0.5197368338704109,
	"reward_std": 0.08217737264931202,
	"rewards/curriculum_aware_reward_fn": 0.08223684225231409,
	"rewards/format_reward": 0.4375,
	"step": 155
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 481.6640625,
	"epoch": 2.638655462184874,
	"grad_norm": 0.39810478687286377,
	"kl": 0.009063720703125,
	"learning_rate": 1e-06,
	"loss": 0.0044,
	"num_tokens": 19787409.0,
	"reward": 0.44202302396297455,
	"reward_std": 0.08141717128455639,
	"rewards/curriculum_aware_reward_fn": 0.12952302768826485,
	"rewards/format_reward": 0.3125,
	"step": 156
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 389.7890625,
	"epoch": 2.6554621848739495,
	"grad_norm": 0.4911426305770874,
	"kl": 0.02197265625,
	"learning_rate": 1e-06,
	"loss": 0.0028,
	"num_tokens": 19896190.0,
	"reward": 0.7331414520740509,
	"reward_std": 0.17763726785779,
	"rewards/curriculum_aware_reward_fn": 0.1784539483487606,
	"rewards/format_reward": 0.5546875,
	"step": 157
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 518.9609375,
	"epoch": 2.6722689075630255,
	"grad_norm": 0.2420579046010971,
	"kl": 0.011962890625,
	"learning_rate": 1e-06,
	"loss": 0.0303,
	"num_tokens": 20022809.0,
	"reward": 0.4453125,
	"reward_std": 0.01647413382306695,
	"rewards/curriculum_aware_reward_fn": 0.007812500232830644,
	"rewards/format_reward": 0.4375,
	"step": 158
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 424.4453125,
	"epoch": 2.689075630252101,
	"grad_norm": 0.46578091382980347,
	"kl": 0.01375579833984375,
	"learning_rate": 1e-06,
	"loss": 0.0011,
	"num_tokens": 20136314.0,
	"reward": 0.49095392785966396,
	"reward_std": 0.13701673224568367,
	"rewards/curriculum_aware_reward_fn": 0.06126644625328481,
	"rewards/format_reward": 0.4296875,
	"step": 159
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 483.921875,
	"epoch": 2.7058823529411766,
	"grad_norm": 0.32379522919654846,
	"kl": 0.01180267333984375,
	"learning_rate": 1e-06,
	"loss": 0.0065,
	"num_tokens": 20257344.0,
	"reward": 0.5197368343360722,
	"reward_std": 0.07396957790479064,
	"rewards/curriculum_aware_reward_fn": 0.0822368417866528,
	"rewards/format_reward": 0.4375,
	"step": 160
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 470.3515625,
	"epoch": 2.722689075630252,
	"grad_norm": 0.4478832483291626,
	"kl": 0.014068603515625,
	"learning_rate": 1e-06,
	"loss": -0.0003,
	"num_tokens": 20375685.0,
	"reward": 0.5801809281110764,
	"reward_std": 0.06543473433703184,
	"rewards/curriculum_aware_reward_fn": 0.08018092112615705,
	"rewards/format_reward": 0.5,
	"step": 161
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 462.3046875,
	"epoch": 2.7394957983193278,
	"grad_norm": 0.4915456175804138,
	"kl": 0.0140838623046875,
	"learning_rate": 1e-06,
	"loss": 0.0286,
	"num_tokens": 20491340.0,
	"reward": 0.6981907933950424,
	"reward_std": 0.1432387800887227,
	"rewards/curriculum_aware_reward_fn": 0.13569078594446182,
	"rewards/format_reward": 0.5625,
	"step": 162
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 460.046875,
	"epoch": 2.7563025210084033,
	"grad_norm": 0.388621062040329,
	"kl": 0.0123138427734375,
	"learning_rate": 1e-06,
	"loss": 0.0144,
	"num_tokens": 20613466.0,
	"reward": 0.4124177619814873,
	"reward_std": 0.07370226783677936,
	"rewards/curriculum_aware_reward_fn": 0.037417763262055814,
	"rewards/format_reward": 0.375,
	"step": 163
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 462.25,
	"epoch": 2.773109243697479,
	"grad_norm": 0.4878624677658081,
	"kl": 0.01593017578125,
	"learning_rate": 1e-06,
	"loss": -0.0006,
	"num_tokens": 20729058.0,
	"reward": 0.6221217066049576,
	"reward_std": 0.12872529029846191,
	"rewards/curriculum_aware_reward_fn": 0.12212171033024788,
	"rewards/format_reward": 0.5,
	"step": 164
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 486.4609375,
	"epoch": 2.7899159663865545,
	"grad_norm": 0.4500262141227722,
	"kl": 0.0099029541015625,
	"learning_rate": 1e-06,
	"loss": 0.0219,
	"num_tokens": 20853869.0,
	"reward": 0.4050164371728897,
	"reward_std": 0.11422262340784073,
	"rewards/curriculum_aware_reward_fn": 0.09251644648611546,
	"rewards/format_reward": 0.3125,
	"step": 165
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 448.5859375,
	"epoch": 2.80672268907563,
	"grad_norm": 0.5006850957870483,
	"kl": 0.0168914794921875,
	"learning_rate": 1e-06,
	"loss": 0.0135,
	"num_tokens": 20973736.0,
	"reward": 0.677631601691246,
	"reward_std": 0.0868874522857368,
	"rewards/curriculum_aware_reward_fn": 0.12294407980516553,
	"rewards/format_reward": 0.5546875,
	"step": 166
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 428.7265625,
	"epoch": 2.8235294117647056,
	"grad_norm": 0.42931458353996277,
	"kl": 0.01781463623046875,
	"learning_rate": 1e-06,
	"loss": 0.0042,
	"num_tokens": 21086485.0,
	"reward": 0.6040295884013176,
	"reward_std": 0.05929867131635547,
	"rewards/curriculum_aware_reward_fn": 0.041529607493430376,
	"rewards/format_reward": 0.5625,
	"step": 167
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 388.421875,
	"epoch": 2.8403361344537816,
	"grad_norm": 0.44046640396118164,
	"kl": 0.0161895751953125,
	"learning_rate": 1e-06,
	"loss": 0.01,
	"num_tokens": 21193627.0,
	"reward": 0.7483552545309067,
	"reward_std": 0.09682157123461366,
	"rewards/curriculum_aware_reward_fn": 0.060855262679979205,
	"rewards/format_reward": 0.6875,
	"step": 168
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 477.6953125,
	"epoch": 2.857142857142857,
	"grad_norm": 0.36667370796203613,
	"kl": 0.0146484375,
	"learning_rate": 1e-06,
	"loss": -0.0002,
	"num_tokens": 21313716.0,
	"reward": 0.6060855239629745,
	"reward_std": 0.10079656913876534,
	"rewards/curriculum_aware_reward_fn": 0.11389802675694227,
	"rewards/format_reward": 0.4921875,
	"step": 169
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 534.4296875,
	"epoch": 2.8739495798319328,
	"grad_norm": 0.3436344563961029,
	"kl": 0.00984954833984375,
	"learning_rate": 1e-06,
	"loss": 0.0022,
	"num_tokens": 21445667.0,
	"reward": 0.48231907933950424,
	"reward_std": 0.08960662921890616,
	"rewards/curriculum_aware_reward_fn": 0.10731907980516553,
	"rewards/format_reward": 0.375,
	"step": 170
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 496.21875,
	"epoch": 2.8907563025210083,
	"grad_norm": 0.48088422417640686,
	"kl": 0.0130767822265625,
	"learning_rate": 1e-06,
	"loss": 0.0182,
	"num_tokens": 21570871.0,
	"reward": 0.4465460618957877,
	"reward_std": 0.1538134217262268,
	"rewards/curriculum_aware_reward_fn": 0.13404605071991682,
	"rewards/format_reward": 0.3125,
	"step": 171
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 429.484375,
	"epoch": 2.907563025210084,
	"grad_norm": 0.5827536582946777,
	"kl": 0.016109466552734375,
	"learning_rate": 1e-06,
	"loss": 0.0127,
	"num_tokens": 21686093.0,
	"reward": 0.4502467103302479,
	"reward_std": 0.15407454315572977,
	"rewards/curriculum_aware_reward_fn": 0.09087171172723174,
	"rewards/format_reward": 0.359375,
	"step": 172
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 482.4609375,
	"epoch": 2.92436974789916,
	"grad_norm": 0.467061311006546,
	"kl": 0.013336181640625,
	"learning_rate": 1e-06,
	"loss": 0.033,
	"num_tokens": 21808264.0,
	"reward": 0.6632401347160339,
	"reward_std": 0.10484125558286905,
	"rewards/curriculum_aware_reward_fn": 0.22574013099074364,
	"rewards/format_reward": 0.4375,
	"step": 173
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 500.96875,
	"epoch": 2.9411764705882355,
	"grad_norm": 0.41948550939559937,
	"kl": 0.009563446044921875,
	"learning_rate": 1e-06,
	"loss": 0.0329,
	"num_tokens": 21933084.0,
	"reward": 0.400082241743803,
	"reward_std": 0.10662292037159204,
	"rewards/curriculum_aware_reward_fn": 0.04070723708719015,
	"rewards/format_reward": 0.359375,
	"step": 174
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 557.640625,
	"epoch": 2.957983193277311,
	"grad_norm": 0.41708114743232727,
	"kl": 0.007190704345703125,
	"learning_rate": 1e-06,
	"loss": 0.021,
	"num_tokens": 22068550.0,
	"reward": 0.3005756618222222,
	"reward_std": 0.06424513552337885,
	"rewards/curriculum_aware_reward_fn": 0.050575657514855266,
	"rewards/format_reward": 0.25,
	"step": 175
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 482.421875,
	"epoch": 2.9747899159663866,
	"grad_norm": 0.6009016633033752,
	"kl": 0.013702392578125,
	"learning_rate": 1e-06,
	"loss": 0.0048,
	"num_tokens": 22189356.0,
	"reward": 0.6620065569877625,
	"reward_std": 0.149446252733469,
	"rewards/curriculum_aware_reward_fn": 0.16200657933950424,
	"rewards/format_reward": 0.5,
	"step": 176
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 490.53572845458984,
	"epoch": 2.991596638655462,
	"grad_norm": 0.49134695529937744,
	"kl": 0.01397705078125,
	"learning_rate": 1e-06,
	"loss": -0.0025,
	"num_tokens": 22309028.0,
	"reward": 0.6726973652839661,
	"reward_std": 0.14456172287464142,
	"rewards/curriculum_aware_reward_fn": 0.1101973676122725,
	"rewards/format_reward": 0.5625,
	"step": 177
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 358.109375,
	"epoch": 3.0168067226890756,
	"grad_norm": 0.5925723314285278,
	"kl": 0.0204315185546875,
	"learning_rate": 1e-06,
	"loss": 0.0117,
	"num_tokens": 22410794.0,
	"reward": 0.87787826359272,
	"reward_std": 0.1721474528312683,
	"rewards/curriculum_aware_reward_fn": 0.19819078594446182,
	"rewards/format_reward": 0.6796875,
	"step": 178
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 523.0078125,
	"epoch": 3.033613445378151,
	"grad_norm": 0.2975535988807678,
	"kl": 0.01165771484375,
	"learning_rate": 1e-06,
	"loss": 0.0697,
	"num_tokens": 22539299.0,
	"reward": 0.5168585479259491,
	"reward_std": 0.048361226450651884,
	"rewards/curriculum_aware_reward_fn": 0.08717105106916279,
	"rewards/format_reward": 0.4296875,
	"step": 179
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 484.7578125,
	"epoch": 3.0504201680672267,
	"grad_norm": 0.45362988114356995,
	"kl": 0.0162200927734375,
	"learning_rate": 1e-06,
	"loss": 0.0023,
	"num_tokens": 22660588.0,
	"reward": 0.5513980276882648,
	"reward_std": 0.1047646040096879,
	"rewards/curriculum_aware_reward_fn": 0.06702302722260356,
	"rewards/format_reward": 0.484375,
	"step": 180
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 452.9375,
	"epoch": 3.0672268907563027,
	"grad_norm": 0.5003635883331299,
	"kl": 0.0143890380859375,
	"learning_rate": 1e-06,
	"loss": 0.0104,
	"num_tokens": 22778956.0,
	"reward": 0.73149673640728,
	"reward_std": 0.17891032248735428,
	"rewards/curriculum_aware_reward_fn": 0.23149671405553818,
	"rewards/format_reward": 0.5,
	"step": 181
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 513.0390625,
	"epoch": 3.0840336134453783,
	"grad_norm": 0.31615540385246277,
	"kl": 0.01172637939453125,
	"learning_rate": 1e-06,
	"loss": 0.0235,
	"num_tokens": 22905121.0,
	"reward": 0.3244243338704109,
	"reward_std": 0.03051401791162789,
	"rewards/curriculum_aware_reward_fn": 0.011924341786652803,
	"rewards/format_reward": 0.3125,
	"step": 182
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 411.6015625,
	"epoch": 3.100840336134454,
	"grad_norm": 0.4836508631706238,
	"kl": 0.0144195556640625,
	"learning_rate": 1e-06,
	"loss": 0.0043,
	"num_tokens": 23019342.0,
	"reward": 0.5826480239629745,
	"reward_std": 0.11801502481102943,
	"rewards/curriculum_aware_reward_fn": 0.07483552675694227,
	"rewards/format_reward": 0.5078125,
	"step": 183
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 416.921875,
	"epoch": 3.1176470588235294,
	"grad_norm": 0.3468119204044342,
	"kl": 0.01403045654296875,
	"learning_rate": 1e-06,
	"loss": -0.0007,
	"num_tokens": 23137316.0,
	"reward": 0.47574012295808643,
	"reward_std": 0.05907326890155673,
	"rewards/curriculum_aware_reward_fn": 0.10074013040866703,
	"rewards/format_reward": 0.375,
	"step": 184
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 414.359375,
	"epoch": 3.134453781512605,
	"grad_norm": 0.4667985439300537,
	"kl": 0.0151519775390625,
	"learning_rate": 1e-06,
	"loss": 0.0299,
	"num_tokens": 23249858.0,
	"reward": 0.6344572305679321,
	"reward_std": 0.15162191167473793,
	"rewards/curriculum_aware_reward_fn": 0.13445723662152886,
	"rewards/format_reward": 0.5,
	"step": 185
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 482.109375,
	"epoch": 3.1512605042016806,
	"grad_norm": 0.4111727774143219,
	"kl": 0.013458251953125,
	"learning_rate": 1e-06,
	"loss": 0.0121,
	"num_tokens": 23370304.0,
	"reward": 0.4806743413209915,
	"reward_std": 0.052865433506667614,
	"rewards/curriculum_aware_reward_fn": 0.0431743401568383,
	"rewards/format_reward": 0.4375,
	"step": 186
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 500.5234375,
	"epoch": 3.168067226890756,
	"grad_norm": 0.4427432715892792,
	"kl": 0.01395416259765625,
	"learning_rate": 1e-06,
	"loss": -0.0198,
	"num_tokens": 23496979.0,
	"reward": 0.4243420949205756,
	"reward_std": 0.07115951599553227,
	"rewards/curriculum_aware_reward_fn": 0.11184210795909166,
	"rewards/format_reward": 0.3125,
	"step": 187
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 450.671875,
	"epoch": 3.184873949579832,
	"grad_norm": 0.4217956066131592,
	"kl": 0.0164947509765625,
	"learning_rate": 1e-06,
	"loss": 0.0361,
	"num_tokens": 23613281.0,
	"reward": 0.5629111751914024,
	"reward_std": 0.07686262531206012,
	"rewards/curriculum_aware_reward_fn": 0.12541118264198303,
	"rewards/format_reward": 0.4375,
	"step": 188
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 399.21875,
	"epoch": 3.2016806722689077,
	"grad_norm": 0.6111953258514404,
	"kl": 0.017852783203125,
	"learning_rate": 1e-06,
	"loss": 0.0248,
	"num_tokens": 23723725.0,
	"reward": 0.7121710479259491,
	"reward_std": 0.15234812535345554,
	"rewards/curriculum_aware_reward_fn": 0.08717105351388454,
	"rewards/format_reward": 0.625,
	"step": 189
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 433.015625,
	"epoch": 3.2184873949579833,
	"grad_norm": 0.4865033030509949,
	"kl": 0.0166778564453125,
	"learning_rate": 1e-06,
	"loss": -0.009,
	"num_tokens": 23835815.0,
	"reward": 0.7372532933950424,
	"reward_std": 0.13220055866986513,
	"rewards/curriculum_aware_reward_fn": 0.17475328966975212,
	"rewards/format_reward": 0.5625,
	"step": 190
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 525.4765625,
	"epoch": 3.235294117647059,
	"grad_norm": 0.3422640562057495,
	"kl": 0.016204833984375,
	"learning_rate": 1e-06,
	"loss": 0.0136,
	"num_tokens": 23964596.0,
	"reward": 0.43174342811107635,
	"reward_std": 0.09182633552700281,
	"rewards/curriculum_aware_reward_fn": 0.05674342066049576,
	"rewards/format_reward": 0.375,
	"step": 191
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 351.4453125,
	"epoch": 3.2521008403361344,
	"grad_norm": 0.5189781785011292,
	"kl": 0.023193359375,
	"learning_rate": 1e-06,
	"loss": 0.0349,
	"num_tokens": 24067141.0,
	"reward": 0.7643914222717285,
	"reward_std": 0.15736807510256767,
	"rewards/curriculum_aware_reward_fn": 0.1940789488144219,
	"rewards/format_reward": 0.5703125,
	"step": 192
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 460.296875,
	"epoch": 3.26890756302521,
	"grad_norm": 0.36804094910621643,
	"kl": 0.012298583984375,
	"learning_rate": 1e-06,
	"loss": -0.0044,
	"num_tokens": 24187067.0,
	"reward": 0.48273026943206787,
	"reward_std": 0.037970013450831175,
	"rewards/curriculum_aware_reward_fn": 0.10773026570677757,
	"rewards/format_reward": 0.375,
	"step": 193
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 418.078125,
	"epoch": 3.2857142857142856,
	"grad_norm": 0.4727684259414673,
	"kl": 0.01959228515625,
	"learning_rate": 1e-06,
	"loss": 0.0229,
	"num_tokens": 24300605.0,
	"reward": 0.5563322491943836,
	"reward_std": 0.06251880899071693,
	"rewards/curriculum_aware_reward_fn": 0.13445723743643612,
	"rewards/format_reward": 0.421875,
	"step": 194
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 498.9765625,
	"epoch": 3.302521008403361,
	"grad_norm": 0.5195404887199402,
	"kl": 0.01263427734375,
	"learning_rate": 1e-06,
	"loss": 0.0303,
	"num_tokens": 24427482.0,
	"reward": 0.38569077104330063,
	"reward_std": 0.10553359193727374,
	"rewards/curriculum_aware_reward_fn": 0.07319078966975212,
	"rewards/format_reward": 0.3125,
	"step": 195
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 448.625,
	"epoch": 3.3193277310924367,
	"grad_norm": 0.49932360649108887,
	"kl": 0.017852783203125,
	"learning_rate": 1e-06,
	"loss": 0.0397,
	"num_tokens": 24541746.0,
	"reward": 0.5193256512284279,
	"reward_std": 0.10704736225306988,
	"rewards/curriculum_aware_reward_fn": 0.08182565891183913,
	"rewards/format_reward": 0.4375,
	"step": 196
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 377.1015625,
	"epoch": 3.3361344537815127,
	"grad_norm": 0.4484708309173584,
	"kl": 0.01934814453125,
	"learning_rate": 1e-06,
	"loss": 0.0074,
	"num_tokens": 24647103.0,
	"reward": 0.6673519611358643,
	"reward_std": 0.06431722524575889,
	"rewards/curriculum_aware_reward_fn": 0.16735197603702545,
	"rewards/format_reward": 0.5,
	"step": 197
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 483.921875,
	"epoch": 3.3529411764705883,
	"grad_norm": 0.41696909070014954,
	"kl": 0.01053619384765625,
	"learning_rate": 1e-06,
	"loss": 0.0158,
	"num_tokens": 24773253.0,
	"reward": 0.2717927638441324,
	"reward_std": 0.11790546495467424,
	"rewards/curriculum_aware_reward_fn": 0.08429276384413242,
	"rewards/format_reward": 0.1875,
	"step": 198
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 380.0546875,
	"epoch": 3.369747899159664,
	"grad_norm": 0.45737817883491516,
	"kl": 0.022186279296875,
	"learning_rate": 1e-06,
	"loss": 0.0056,
	"num_tokens": 24881188.0,
	"reward": 0.7002467140555382,
	"reward_std": 0.03508220613002777,
	"rewards/curriculum_aware_reward_fn": 0.1377467131242156,
	"rewards/format_reward": 0.5625,
	"step": 199
	},
	{
	"clip_ratio": 0.0,
	"completion_length": 390.3671875,
	"epoch": 3.3865546218487395,
	"grad_norm": 0.5029156804084778,
	"kl": 0.0277099609375,
	"learning_rate": 1e-06,
	"loss": 0.004,
	"num_tokens": 24991955.0,
	"reward": 0.6694078892469406,
	"reward_std": 0.10574874095618725,
	"rewards/curriculum_aware_reward_fn": 0.10690789762884378,
	"rewards/format_reward": 0.5625,
	"step": 200
	},
	{
	"epoch": 3.3865546218487395,
	"step": 200,
	"total_flos": 0.0,
	"train_loss": 0.010024200768093579,
	"train_runtime": 35564.3846,
	"train_samples_per_second": 0.72,
	"train_steps_per_second": 0.006
	}
	],
	"logging_steps": 1,
	"max_steps": 200,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 4,
	"save_steps": 50,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": true
	},
	"attributes": {}
	}
	},
	"total_flos": 0.0,
	"train_batch_size": 8,
	"trial_name": null,
	"trial_params": null
	}