{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.997849462365592, "eval_steps": 500, "global_step": 870, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 5.555124839106474e-10, "advantages/std": 0.4191276431083679, "advantages/var": 0.17566798121757543, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "epoch": 0.005734767025089606, "grad_norm": 0.0661654509620333, "learning_rate": 2e-06, "loss": 0.0, "num_tokens": 567472.0, "reward": 0.05078125, "reward_std": 0.0628461092710495, "rewards/drgrpo_math_reward/mean": 0.05078125, "rewards/drgrpo_math_reward/std": 0.21976542472839355, "step": 1 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.8864816796594247e-09, "advantages/std": 0.4839746057987213, "advantages/var": 0.2342314190580277, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "epoch": 0.011469534050179211, "grad_norm": 0.07893257755147583, "learning_rate": 1.9999935545509886e-06, "loss": 0.0, "num_tokens": 1137683.0, "reward": 0.041015625, "reward_std": 0.08736887574195862, "rewards/drgrpo_math_reward/mean": 0.041015625, "rewards/drgrpo_math_reward/std": 0.19852031767368317, "step": 2 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 6.085015117959159e-10, "advantages/std": 0.38262951374053955, "advantages/var": 0.14640534478532174, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "epoch": 0.017204301075268817, "grad_norm": 0.05273261487651427, "learning_rate": 1.999974218287042e-06, "loss": 0.0, "num_tokens": 1716151.0, "reward": 0.033203125, "reward_std": 0.06166848540306091, "rewards/drgrpo_math_reward/mean": 0.033203125, "rewards/drgrpo_math_reward/std": 0.17934183776378632, "step": 3 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.163413262831454e-09, "advantages/std": 0.5411086678504944, "advantages/var": 0.29279859042293666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8828125, "epoch": 0.022939068100358423, "grad_norm": 0.08951759926581193, "learning_rate": 1.999941991457422e-06, "loss": 0.0, "num_tokens": 2291009.0, "reward": 0.080078125, "reward_std": 0.11910437047481537, "rewards/drgrpo_math_reward/mean": 0.080078125, "rewards/drgrpo_math_reward/std": 0.271679550409317, "step": 4 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 6.123178716914558e-09, "advantages/std": 0.3422202467918396, "advantages/var": 0.1171146973142676, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "epoch": 0.02867383512544803, "grad_norm": 0.0556359742689093, "learning_rate": 1.999896874477561e-06, "loss": 0.0, "num_tokens": 2877221.0, "reward": 0.02734375, "reward_std": 0.044233135879039764, "rewards/drgrpo_math_reward/mean": 0.02734375, "rewards/drgrpo_math_reward/std": 0.16324250400066376, "step": 5 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6410115441812772e-09, "advantages/std": 0.567529559135437, "advantages/var": 0.3220898004924635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "epoch": 0.034408602150537634, "grad_norm": 0.08678292003542855, "learning_rate": 1.999838867929058e-06, "loss": 0.0, "num_tokens": 3441013.0, "reward": 0.0703125, "reward_std": 0.13341236114501953, "rewards/drgrpo_math_reward/mean": 0.0703125, "rewards/drgrpo_math_reward/std": 0.25592297315597534, "step": 6 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.810849309709882e-10, "advantages/std": 0.48396992683410645, "advantages/var": 0.23422689007981035, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "epoch": 0.04014336917562724, "grad_norm": 0.06964260359021238, "learning_rate": 1.9997679725596696e-06, "loss": 0.0, "num_tokens": 4012338.0, "reward": 0.05078125, "reward_std": 0.08257714658975601, "rewards/drgrpo_math_reward/mean": 0.05078125, "rewards/drgrpo_math_reward/std": 0.21976542472839355, "step": 7 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.217055255215444e-09, "advantages/std": 0.38261309266090393, "advantages/var": 0.14639277867554146, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "epoch": 0.045878136200716846, "grad_norm": 0.05660229204058055, "learning_rate": 1.9996841892832997e-06, "loss": 0.0, "num_tokens": 4589742.0, "reward": 0.021484375, "reward_std": 0.05287160724401474, "rewards/drgrpo_math_reward/mean": 0.021484375, "rewards/drgrpo_math_reward/std": 0.14513419568538666, "step": 8 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.205523083682543e-10, "advantages/std": 0.5674973726272583, "advantages/var": 0.32205326793884126, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8984375, "epoch": 0.05161290322580645, "grad_norm": 0.10223105213325116, "learning_rate": 1.9995875191799916e-06, "loss": 0.0, "num_tokens": 5158187.0, "reward": 0.0625, "reward_std": 0.11211784183979034, "rewards/drgrpo_math_reward/mean": 0.0625, "rewards/drgrpo_math_reward/std": 0.2422981858253479, "step": 9 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.442394194841287e-09, "advantages/std": 0.5410900115966797, "advantages/var": 0.29277840064969496, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "epoch": 0.05734767025089606, "grad_norm": 0.0873740292220268, "learning_rate": 1.999477963495908e-06, "loss": 0.0, "num_tokens": 5745432.0, "reward": 0.046875, "reward_std": 0.10312044620513916, "rewards/drgrpo_math_reward/mean": 0.046875, "rewards/drgrpo_math_reward/std": 0.21157780289649963, "step": 10 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 4.905956743521581e-09, "advantages/std": 0.6169639229774475, "advantages/var": 0.3806444822557218, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.796875, "epoch": 0.06308243727598567, "grad_norm": 0.09985166634793106, "learning_rate": 1.999355523643321e-06, "loss": 0.0, "num_tokens": 6302284.0, "reward": 0.09765625, "reward_std": 0.15776723623275757, "rewards/drgrpo_math_reward/mean": 0.09765625, "rewards/drgrpo_math_reward/std": 0.29713961482048035, "step": 11 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.356785814154949e-09, "advantages/std": 0.5927495956420898, "advantages/var": 0.351352083133861, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "epoch": 0.06881720430107527, "grad_norm": 0.09746683104396865, "learning_rate": 1.9992202012005907e-06, "loss": 0.0, "num_tokens": 6847911.0, "reward": 0.13671875, "reward_std": 0.134835883975029, "rewards/drgrpo_math_reward/mean": 0.13671875, "rewards/drgrpo_math_reward/std": 0.3438861668109894, "step": 12 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.272964464066904e-10, "advantages/std": 0.6402633786201477, "advantages/var": 0.4099371940020866, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6015625, "epoch": 0.07455197132616488, "grad_norm": 0.12241159362046089, "learning_rate": 1.999071997912144e-06, "loss": 0.0, "num_tokens": 7397104.0, "reward": 0.140625, "reward_std": 0.17550916969776154, "rewards/drgrpo_math_reward/mean": 0.140625, "rewards/drgrpo_math_reward/std": 0.3479743003845215, "step": 13 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 8.782758630205541e-10, "advantages/std": 0.662749171257019, "advantages/var": 0.43923646400186556, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "epoch": 0.08028673835125448, "grad_norm": 0.11842970024947452, "learning_rate": 1.9989109156884548e-06, "loss": -0.0, "num_tokens": 7919711.0, "reward": 0.24609375, "reward_std": 0.19800353050231934, "rewards/drgrpo_math_reward/mean": 0.24609375, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "step": 14 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.547832120915635e-10, "advantages/std": 0.6169470548629761, "advantages/var": 0.3806236685041, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "epoch": 0.08602150537634409, "grad_norm": 0.10544054381115398, "learning_rate": 1.9987369566060176e-06, "loss": 0.0, "num_tokens": 8486377.0, "reward": 0.138671875, "reward_std": 0.14182433485984802, "rewards/drgrpo_math_reward/mean": 0.138671875, "rewards/drgrpo_math_reward/std": 0.34594178199768066, "step": 15 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.026547517507542e-10, "advantages/std": 0.6627170443534851, "advantages/var": 0.43919388087661915, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "epoch": 0.09175627240143369, "grad_norm": 0.1029013554395571, "learning_rate": 1.998550122907321e-06, "loss": 0.0, "num_tokens": 9010906.0, "reward": 0.2109375, "reward_std": 0.1751890480518341, "rewards/drgrpo_math_reward/mean": 0.2109375, "rewards/drgrpo_math_reward/std": 0.4083731174468994, "step": 16 }, { "advantages/mean": -2.2118911147117615e-09, "advantages/snr": 3.4546925328041916e-09, "advantages/std": 0.6402570009231567, "advantages/var": 0.40992902723111513, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "epoch": 0.0974910394265233, "grad_norm": 0.09333975019661332, "learning_rate": 1.9983504170008193e-06, "loss": 0.0, "num_tokens": 9483444.0, "reward": 0.4296875, "reward_std": 0.1654340773820877, "rewards/drgrpo_math_reward/mean": 0.4296875, "rewards/drgrpo_math_reward/std": 0.4955156147480011, "step": 17 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 1.7675378169219039e-09, "advantages/std": 0.5927668809890747, "advantages/var": 0.35137257519751586, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "epoch": 0.1032258064516129, "grad_norm": 0.09414256579238753, "learning_rate": 1.998137841460901e-06, "loss": 0.0, "num_tokens": 9947381.0, "reward": 0.435546875, "reward_std": 0.1488201767206192, "rewards/drgrpo_math_reward/mean": 0.435546875, "rewards/drgrpo_math_reward/std": 0.49631330370903015, "step": 18 }, { "advantages/mean": 1.5133991837501526e-09, "advantages/snr": 2.084621108725022e-09, "advantages/std": 0.7259828448295593, "advantages/var": 0.52705109098682, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.2578125, "epoch": 0.10896057347670252, "grad_norm": 0.1324490145644826, "learning_rate": 1.9979123990278553e-06, "loss": 0.0, "num_tokens": 10410104.0, "reward": 0.470703125, "reward_std": 0.21412275731563568, "rewards/drgrpo_math_reward/mean": 0.470703125, "rewards/drgrpo_math_reward/std": 0.49962911009788513, "step": 19 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 2.777472395498241e-10, "advantages/std": 0.8382824659347534, "advantages/var": 0.702717492693651, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.6015625, "epoch": 0.11469534050179211, "grad_norm": 0.13092408067934705, "learning_rate": 1.9976740926078385e-06, "loss": 0.0, "num_tokens": 10882005.0, "reward": 0.484375, "reward_std": 0.27759891748428345, "rewards/drgrpo_math_reward/mean": 0.484375, "rewards/drgrpo_math_reward/std": 0.5002445578575134, "step": 20 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.151134813747075e-09, "advantages/std": 0.6169727444648743, "advantages/var": 0.38065536741251904, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5234375, "epoch": 0.12043010752688173, "grad_norm": 0.0966143983061611, "learning_rate": 1.997422925272834e-06, "loss": -0.0, "num_tokens": 11319967.0, "reward": 0.48046875, "reward_std": 0.15780691802501678, "rewards/drgrpo_math_reward/mean": 0.48046875, "rewards/drgrpo_math_reward/std": 0.5001069903373718, "step": 21 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6844605207443237, "advantages/var": 0.4684862044575908, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0390625, "epoch": 0.12616487455197134, "grad_norm": 0.09441767628743837, "learning_rate": 1.997158900260614e-06, "loss": 0.0, "num_tokens": 11825678.0, "reward": 0.380859375, "reward_std": 0.19172291457653046, "rewards/drgrpo_math_reward/mean": 0.380859375, "rewards/drgrpo_math_reward/std": 0.48607301712036133, "step": 22 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 3.346778951902237e-09, "advantages/std": 0.7652543187141418, "advantages/var": 0.5856141723106454, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.8984375, "epoch": 0.13189964157706094, "grad_norm": 0.10348862604357899, "learning_rate": 1.996882020974698e-06, "loss": -0.0, "num_tokens": 12263166.0, "reward": 0.5625, "reward_std": 0.24365317821502686, "rewards/drgrpo_math_reward/mean": 0.5625, "rewards/drgrpo_math_reward/std": 0.49656352400779724, "step": 23 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 1.5212517850802946e-10, "advantages/std": 0.7652600407600403, "advantages/var": 0.5856229299840585, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.421875, "epoch": 0.13763440860215054, "grad_norm": 0.09742798893790104, "learning_rate": 1.996592290984309e-06, "loss": 0.0, "num_tokens": 12768940.0, "reward": 0.427734375, "reward_std": 0.25115451216697693, "rewards/drgrpo_math_reward/mean": 0.427734375, "rewards/drgrpo_math_reward/std": 0.4952339828014374, "step": 24 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 1.0648821384124561e-09, "advantages/std": 0.7652558088302612, "advantages/var": 0.5856164529484573, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.6484375, "epoch": 0.14336917562724014, "grad_norm": 0.0891449302596231, "learning_rate": 1.9962897140243264e-06, "loss": -0.0, "num_tokens": 13234632.0, "reward": 0.478515625, "reward_std": 0.2462756335735321, "rewards/drgrpo_math_reward/mean": 0.478515625, "rewards/drgrpo_math_reward/std": 0.5000267624855042, "step": 25 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.700835604131341e-09, "advantages/std": 0.6844595670700073, "advantages/var": 0.46848489895366185, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.7734375, "epoch": 0.14910394265232976, "grad_norm": 0.1000599497656262, "learning_rate": 1.995974293995239e-06, "loss": 0.0, "num_tokens": 13674481.0, "reward": 0.439453125, "reward_std": 0.19191381335258484, "rewards/drgrpo_math_reward/mean": 0.439453125, "rewards/drgrpo_math_reward/std": 0.49680593609809875, "step": 26 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 4.810568122441904e-10, "advantages/std": 0.7259973287582397, "advantages/var": 0.5270721213640996, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.8125, "epoch": 0.15483870967741936, "grad_norm": 0.09867820945315903, "learning_rate": 1.995646034963094e-06, "loss": -0.0, "num_tokens": 14115991.0, "reward": 0.529296875, "reward_std": 0.23284542560577393, "rewards/drgrpo_math_reward/mean": 0.529296875, "rewards/drgrpo_math_reward/std": 0.49962911009788513, "step": 27 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 4.4538390717672717e-10, "advantages/std": 0.7841458916664124, "advantages/var": 0.6148847794173129, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0078125, "epoch": 0.16057347670250896, "grad_norm": 0.1207204324090634, "learning_rate": 1.995304941159446e-06, "loss": -0.0, "num_tokens": 14531825.0, "reward": 0.5, "reward_std": 0.24823318421840668, "rewards/drgrpo_math_reward/mean": 0.5, "rewards/drgrpo_math_reward/std": 0.5004889965057373, "step": 28 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.1876815718671333e-09, "advantages/std": 0.7841517329216003, "advantages/var": 0.6148939402439488, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.6484375, "epoch": 0.16630824372759856, "grad_norm": 0.11285563183533347, "learning_rate": 1.9949510169813e-06, "loss": 0.0, "num_tokens": 15001109.0, "reward": 0.421875, "reward_std": 0.2550915479660034, "rewards/drgrpo_math_reward/mean": 0.421875, "rewards/drgrpo_math_reward/std": 0.49434176087379456, "step": 29 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.4185804544034511e-09, "advantages/std": 0.8206465840339661, "advantages/var": 0.6734608158866173, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.7890625, "epoch": 0.17204301075268819, "grad_norm": 0.11098559090255534, "learning_rate": 1.9945842669910563e-06, "loss": 0.0, "num_tokens": 15460163.0, "reward": 0.455078125, "reward_std": 0.28239789605140686, "rewards/drgrpo_math_reward/mean": 0.455078125, "rewards/drgrpo_math_reward/std": 0.4984649419784546, "step": 30 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 2.8105087963957554e-09, "advantages/std": 0.6627430319786072, "advantages/var": 0.43922832643619714, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.734375, "epoch": 0.17777777777777778, "grad_norm": 0.09039190942215285, "learning_rate": 1.994204695916451e-06, "loss": 0.0, "num_tokens": 15919182.0, "reward": 0.4921875, "reward_std": 0.19295790791511536, "rewards/drgrpo_math_reward/mean": 0.4921875, "rewards/drgrpo_math_reward/std": 0.5004279017448425, "step": 31 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 2.3752997705491423e-09, "advantages/std": 0.7841726541519165, "advantages/var": 0.6149267515196613, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.328125, "epoch": 0.18351254480286738, "grad_norm": 0.11794572459622037, "learning_rate": 1.9938123086504976e-06, "loss": -0.0, "num_tokens": 16312727.0, "reward": 0.6640625, "reward_std": 0.2775266766548157, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4727790653705597, "step": 32 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.2486359733038545e-09, "advantages/std": 0.7458719611167908, "advantages/var": 0.5563249823802074, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0078125, "epoch": 0.18924731182795698, "grad_norm": 0.1128510782822976, "learning_rate": 1.9934071102514193e-06, "loss": -0.0, "num_tokens": 16770608.0, "reward": 0.400390625, "reward_std": 0.22752927243709564, "rewards/drgrpo_math_reward/mean": 0.400390625, "rewards/drgrpo_math_reward/std": 0.4904567301273346, "step": 33 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.2486075361171953e-09, "advantages/std": 0.7458889484405518, "advantages/var": 0.5563503234057521, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.578125, "epoch": 0.1949820788530466, "grad_norm": 0.13288767484270153, "learning_rate": 1.9929891059425876e-06, "loss": -0.0, "num_tokens": 17165874.0, "reward": 0.66015625, "reward_std": 0.2408919632434845, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.4741191864013672, "step": 34 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.181918182738839e-09, "advantages/std": 0.6402549147605896, "advantages/var": 0.40992635587508985, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.984375, "epoch": 0.2007168458781362, "grad_norm": 0.10206535063663684, "learning_rate": 1.9925583011124534e-06, "loss": 0.0, "num_tokens": 17596694.0, "reward": 0.486328125, "reward_std": 0.16860288381576538, "rewards/drgrpo_math_reward/mean": 0.486328125, "rewards/drgrpo_math_reward/std": 0.5003018379211426, "step": 35 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 3.859984473210753e-09, "advantages/std": 0.7841477990150452, "advantages/var": 0.6148877707001397, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.1875, "epoch": 0.2064516129032258, "grad_norm": 0.11745032669252742, "learning_rate": 1.9921147013144777e-06, "loss": 0.0, "num_tokens": 18027171.0, "reward": 0.583984375, "reward_std": 0.2518920302391052, "rewards/drgrpo_math_reward/mean": 0.583984375, "rewards/drgrpo_math_reward/std": 0.493378221988678, "step": 36 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 4.539493392414977e-09, "advantages/std": 0.8206400871276855, "advantages/var": 0.6734501526009353, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.2109375, "epoch": 0.2121863799283154, "grad_norm": 0.11616739287880208, "learning_rate": 1.9916583122670605e-06, "loss": -0.0, "num_tokens": 18462309.0, "reward": 0.505859375, "reward_std": 0.27702218294143677, "rewards/drgrpo_math_reward/mean": 0.505859375, "rewards/drgrpo_math_reward/std": 0.5004546642303467, "step": 37 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 3.3329545514275437e-09, "advantages/std": 0.8382855653762817, "advantages/var": 0.7027226891182323, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.2734375, "epoch": 0.21792114695340503, "grad_norm": 0.12608654853243934, "learning_rate": 1.9911891398534664e-06, "loss": 0.0, "num_tokens": 18890055.0, "reward": 0.5625, "reward_std": 0.282077431678772, "rewards/drgrpo_math_reward/mean": 0.5625, "rewards/drgrpo_math_reward/std": 0.49656352400779724, "step": 38 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.5607508143992436e-09, "advantages/std": 0.7458930611610413, "advantages/var": 0.5563564586881888, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.9921875, "epoch": 0.22365591397849463, "grad_norm": 0.11047244469548782, "learning_rate": 1.990707190121749e-06, "loss": 0.0, "num_tokens": 19321223.0, "reward": 0.4921875, "reward_std": 0.2421843558549881, "rewards/drgrpo_math_reward/mean": 0.4921875, "rewards/drgrpo_math_reward/std": 0.5004279017448425, "step": 39 }, { "advantages/mean": 3.4924596548080444e-09, "advantages/snr": 4.453833655037098e-09, "advantages/std": 0.7841468453407288, "advantages/var": 0.6148862750578168, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5234375, "epoch": 0.22939068100358423, "grad_norm": 0.12354020784375054, "learning_rate": 1.990212469284673e-06, "loss": -0.0, "num_tokens": 19714405.0, "reward": 0.59765625, "reward_std": 0.25085198879241943, "rewards/drgrpo_math_reward/mean": 0.59765625, "rewards/drgrpo_math_reward/std": 0.4908501207828522, "step": 40 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 5.8186564958217324e-09, "advantages/std": 0.6402320265769958, "advantages/var": 0.4098970478548871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.046875, "epoch": 0.23512544802867383, "grad_norm": 0.09343775744994841, "learning_rate": 1.9897049837196347e-06, "loss": 0.0, "num_tokens": 20151920.0, "reward": 0.5703125, "reward_std": 0.150799959897995, "rewards/drgrpo_math_reward/mean": 0.5703125, "rewards/drgrpo_math_reward/std": 0.4955156147480011, "step": 41 }, { "advantages/mean": 3.4924596548080444e-09, "advantages/snr": 5.269918221737605e-09, "advantages/std": 0.6627160906791687, "advantages/var": 0.43919261684508015, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0859375, "epoch": 0.24086021505376345, "grad_norm": 0.09928140749624612, "learning_rate": 1.9891847399685785e-06, "loss": 0.0, "num_tokens": 20579925.0, "reward": 0.537109375, "reward_std": 0.17244303226470947, "rewards/drgrpo_math_reward/mean": 0.537109375, "rewards/drgrpo_math_reward/std": 0.4991086423397064, "step": 42 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 2.8863204942206493e-09, "advantages/std": 0.7260024547576904, "advantages/var": 0.5270795643141923, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0703125, "epoch": 0.24659498207885305, "grad_norm": 0.08654344017350685, "learning_rate": 1.988651744737914e-06, "loss": 0.0, "num_tokens": 20995510.0, "reward": 0.615234375, "reward_std": 0.23697268962860107, "rewards/drgrpo_math_reward/mean": 0.615234375, "rewards/drgrpo_math_reward/std": 0.4870156943798065, "step": 43 }, { "advantages/mean": -4.423782229423523e-09, "advantages/snr": 5.641639701081546e-09, "advantages/std": 0.7841305732727051, "advantages/var": 0.6148607559409811, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.2578125, "epoch": 0.2523297491039427, "grad_norm": 0.11850323929354498, "learning_rate": 1.9881060048984273e-06, "loss": 0.0, "num_tokens": 21405806.0, "reward": 0.578125, "reward_std": 0.23255254328250885, "rewards/drgrpo_math_reward/mean": 0.578125, "rewards/drgrpo_math_reward/std": 0.49434176087379456, "step": 44 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.040977664461505e-09, "advantages/std": 0.6844679713249207, "advantages/var": 0.4684964037696524, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.2265625, "epoch": 0.25806451612903225, "grad_norm": 0.08907125101421681, "learning_rate": 1.9875475274851963e-06, "loss": 0.0, "num_tokens": 21831608.0, "reward": 0.529296875, "reward_std": 0.20224958658218384, "rewards/drgrpo_math_reward/mean": 0.529296875, "rewards/drgrpo_math_reward/std": 0.49962911009788513, "step": 45 }, { "advantages/mean": 2.444721758365631e-09, "advantages/snr": 3.4650237237236235e-09, "advantages/std": 0.7055425643920898, "advantages/var": 0.49779031016896624, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4296875, "epoch": 0.2637992831541219, "grad_norm": 0.11397161388206502, "learning_rate": 1.9869763196974956e-06, "loss": -0.0, "num_tokens": 22270465.0, "reward": 0.537109375, "reward_std": 0.21935221552848816, "rewards/drgrpo_math_reward/mean": 0.537109375, "rewards/drgrpo_math_reward/std": 0.4991086423397064, "step": 46 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 9.621307655971707e-10, "advantages/std": 0.7259843945503235, "advantages/var": 0.5270533411305998, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "epoch": 0.26953405017921145, "grad_norm": 0.11913278545264834, "learning_rate": 1.9863923888987067e-06, "loss": 0.0, "num_tokens": 22646765.0, "reward": 0.73828125, "reward_std": 0.2220982313156128, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "step": 47 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.513146738112728e-10, "advantages/std": 0.6627410054206848, "advantages/var": 0.4392256402660202, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.515625, "epoch": 0.2752688172043011, "grad_norm": 0.08812395981287387, "learning_rate": 1.9857957426162217e-06, "loss": 0.0, "num_tokens": 23020678.0, "reward": 0.732421875, "reward_std": 0.18776722252368927, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "step": 48 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.5455387472946387e-09, "advantages/std": 0.6402630805969238, "advantages/var": 0.409936812375463, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.265625, "epoch": 0.2810035842293907, "grad_norm": 0.08861789349104914, "learning_rate": 1.9851863885413475e-06, "loss": 0.0, "num_tokens": 23411396.0, "reward": 0.64453125, "reward_std": 0.1792823225259781, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.47912323474884033, "step": 49 }, { "advantages/mean": -2.444721758365631e-09, "advantages/snr": 3.117702991142957e-09, "advantages/std": 0.7841419577598572, "advantages/var": 0.6148786099194616, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.3671875, "epoch": 0.2867383512544803, "grad_norm": 0.10727839524678742, "learning_rate": 1.9845643345292055e-06, "loss": 0.0, "num_tokens": 23800347.0, "reward": 0.6171875, "reward_std": 0.24549835920333862, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.486548513174057, "step": 50 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.8555752635002136, "advantages/var": 0.73200903151346, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.296875, "epoch": 0.2924731182795699, "grad_norm": 0.14064487825049946, "learning_rate": 1.9839295885986295e-06, "loss": 0.0, "num_tokens": 24208654.0, "reward": 0.568359375, "reward_std": 0.30204230546951294, "rewards/drgrpo_math_reward/mean": 0.568359375, "rewards/drgrpo_math_reward/std": 0.4957893490791321, "step": 51 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.2828189032237875e-09, "advantages/std": 0.7259969115257263, "advantages/var": 0.5270715155448933, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "epoch": 0.2982078853046595, "grad_norm": 0.10144077245571843, "learning_rate": 1.9832821589320657e-06, "loss": 0.0, "num_tokens": 24585811.0, "reward": 0.708984375, "reward_std": 0.2281283736228943, "rewards/drgrpo_math_reward/mean": 0.708984375, "rewards/drgrpo_math_reward/std": 0.45467492938041687, "step": 52 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.1850493940763188e-09, "advantages/std": 0.745893657207489, "advantages/var": 0.5563573478623631, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6796875, "epoch": 0.3039426523297491, "grad_norm": 0.11075616094101441, "learning_rate": 1.9826220538754633e-06, "loss": 0.0, "num_tokens": 24976125.0, "reward": 0.638671875, "reward_std": 0.2438090443611145, "rewards/drgrpo_math_reward/mean": 0.638671875, "rewards/drgrpo_math_reward/std": 0.48085519671440125, "step": 53 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.2728135953284436e-09, "advantages/std": 0.6402673721313477, "advantages/var": 0.4099423078159816, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.453125, "epoch": 0.3096774193548387, "grad_norm": 0.1041979680791278, "learning_rate": 1.981949281938169e-06, "loss": 0.0, "num_tokens": 25351535.0, "reward": 0.716796875, "reward_std": 0.17717355489730835, "rewards/drgrpo_math_reward/mean": 0.716796875, "rewards/drgrpo_math_reward/std": 0.4509948492050171, "step": 54 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 2.9699709063628248e-09, "advantages/std": 0.7055543065071106, "advantages/var": 0.49780687943072977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.3828125, "epoch": 0.3154121863799283, "grad_norm": 0.1004346935563218, "learning_rate": 1.981263851792818e-06, "loss": 0.0, "num_tokens": 25758805.0, "reward": 0.55859375, "reward_std": 0.23311826586723328, "rewards/drgrpo_math_reward/mean": 0.55859375, "rewards/drgrpo_math_reward/std": 0.4970405399799347, "step": 55 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 3.1215390453330702e-09, "advantages/std": 0.745884120464325, "advantages/var": 0.5563431211608396, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4296875, "epoch": 0.3211469534050179, "grad_norm": 0.1123178122977563, "learning_rate": 1.98056577227522e-06, "loss": 0.0, "num_tokens": 26171238.0, "reward": 0.638671875, "reward_std": 0.23839645087718964, "rewards/drgrpo_math_reward/mean": 0.638671875, "rewards/drgrpo_math_reward/std": 0.48085519671440125, "step": 56 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 1.825474558646876e-09, "advantages/std": 0.7652716040611267, "advantages/var": 0.5856406279822899, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5078125, "epoch": 0.32688172043010755, "grad_norm": 0.14210326752271707, "learning_rate": 1.9798550523842466e-06, "loss": 0.0, "num_tokens": 26556243.0, "reward": 0.5546875, "reward_std": 0.2571454644203186, "rewards/drgrpo_math_reward/mean": 0.5546875, "rewards/drgrpo_math_reward/std": 0.49748632311820984, "step": 57 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.0909617320145493e-09, "advantages/std": 0.6402533650398254, "advantages/var": 0.40992437144481997, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 0.3326164874551971, "grad_norm": 0.09242518859172717, "learning_rate": 1.9791317012817163e-06, "loss": -0.0, "num_tokens": 26893836.0, "reward": 0.734375, "reward_std": 0.17055149376392365, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44209739565849304, "step": 58 }, { "advantages/mean": 3.4924596548080444e-09, "advantages/snr": 6.153658863105484e-09, "advantages/std": 0.5675419569015503, "advantages/var": 0.32210387284364117, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "epoch": 0.33835125448028674, "grad_norm": 0.08581509501870313, "learning_rate": 1.9783957282922735e-06, "loss": 0.0, "num_tokens": 27256040.0, "reward": 0.767578125, "reward_std": 0.14563170075416565, "rewards/drgrpo_math_reward/mean": 0.767578125, "rewards/drgrpo_math_reward/std": 0.42278963327407837, "step": 59 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.5607348503658015e-09, "advantages/std": 0.7459006905555725, "advantages/var": 0.5563678401712799, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.46875, "epoch": 0.34408602150537637, "grad_norm": 0.12524857775816534, "learning_rate": 1.9776471429032713e-06, "loss": -0.0, "num_tokens": 27653752.0, "reward": 0.615234375, "reward_std": 0.25137394666671753, "rewards/drgrpo_math_reward/mean": 0.615234375, "rewards/drgrpo_math_reward/std": 0.4870156943798065, "step": 60 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 9.90017651051043e-10, "advantages/std": 0.705534815788269, "advantages/var": 0.49777937628938673, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.2890625, "epoch": 0.34982078853046594, "grad_norm": 0.1111828856567375, "learning_rate": 1.9768859547646473e-06, "loss": -0.0, "num_tokens": 28065669.0, "reward": 0.533203125, "reward_std": 0.21243801712989807, "rewards/drgrpo_math_reward/mean": 0.533203125, "rewards/drgrpo_math_reward/std": 0.4993842542171478, "step": 61 }, { "advantages/mean": 1.979060471057892e-09, "advantages/snr": 3.207756752593712e-09, "advantages/std": 0.616960883140564, "advantages/var": 0.3806407313255846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4609375, "epoch": 0.35555555555555557, "grad_norm": 0.10791702722200502, "learning_rate": 1.9761121736888013e-06, "loss": 0.0, "num_tokens": 28426726.0, "reward": 0.611328125, "reward_std": 0.14953625202178955, "rewards/drgrpo_math_reward/mean": 0.611328125, "rewards/drgrpo_math_reward/std": 0.4879252314567566, "step": 62 }, { "advantages/mean": 3.14321368932724e-09, "advantages/snr": 4.2141231733245396e-09, "advantages/std": 0.7458760738372803, "advantages/var": 0.556331117522916, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.3828125, "epoch": 0.36129032258064514, "grad_norm": 0.12558155240872126, "learning_rate": 1.9753258096504644e-06, "loss": -0.0, "num_tokens": 28819995.0, "reward": 0.669921875, "reward_std": 0.2335052341222763, "rewards/drgrpo_math_reward/mean": 0.669921875, "rewards/drgrpo_math_reward/std": 0.47070086002349854, "step": 63 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6627410054206848, "advantages/var": 0.4392256402660202, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "epoch": 0.36702508960573477, "grad_norm": 0.0959694131610515, "learning_rate": 1.974526872786577e-06, "loss": 0.0, "num_tokens": 29208423.0, "reward": 0.681640625, "reward_std": 0.19183336198329926, "rewards/drgrpo_math_reward/mean": 0.681640625, "rewards/drgrpo_math_reward/std": 0.46629536151885986, "step": 64 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.3100353311278406e-09, "advantages/std": 0.7055366039276123, "advantages/var": 0.4977818994817085, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.1953125, "epoch": 0.3727598566308244, "grad_norm": 0.11289072632248913, "learning_rate": 1.973715373396152e-06, "loss": -0.0, "num_tokens": 29641798.0, "reward": 0.501953125, "reward_std": 0.21054047346115112, "rewards/drgrpo_math_reward/mean": 0.501953125, "rewards/drgrpo_math_reward/std": 0.5004851818084717, "step": 65 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 4.682357958938268e-10, "advantages/std": 0.7458762526512146, "advantages/var": 0.5563313842690185, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7265625, "epoch": 0.37849462365591396, "grad_norm": 0.12374670412263673, "learning_rate": 1.9728913219401447e-06, "loss": -0.0, "num_tokens": 29984772.0, "reward": 0.724609375, "reward_std": 0.22908951342105865, "rewards/drgrpo_math_reward/mean": 0.724609375, "rewards/drgrpo_math_reward/std": 0.44714778661727905, "step": 66 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.9639257732332665e-09, "advantages/std": 0.5927684307098389, "advantages/var": 0.35137441244620504, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.3671875, "epoch": 0.3842293906810036, "grad_norm": 0.08870404660567285, "learning_rate": 1.9720547290413193e-06, "loss": 0.0, "num_tokens": 30377090.0, "reward": 0.56640625, "reward_std": 0.14843884110450745, "rewards/drgrpo_math_reward/mean": 0.56640625, "rewards/drgrpo_math_reward/std": 0.4960552453994751, "step": 67 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.4845764618497307e-09, "advantages/std": 0.7841652035713196, "advantages/var": 0.6149150664920491, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.3125, "epoch": 0.3899641577060932, "grad_norm": 0.12278493841069396, "learning_rate": 1.971205605484109e-06, "loss": -0.0, "num_tokens": 30793062.0, "reward": 0.625, "reward_std": 0.26963385939598083, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4845963716506958, "step": 68 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 1.3747107188159735e-09, "advantages/std": 0.5927845239639282, "advantages/var": 0.351393491851141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6484375, "epoch": 0.3956989247311828, "grad_norm": 0.10932699774049387, "learning_rate": 1.9703439622144798e-06, "loss": 0.0, "num_tokens": 31130078.0, "reward": 0.75390625, "reward_std": 0.16250109672546387, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "step": 69 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5411006212234497, "advantages/var": 0.2927898822884032, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6484375, "epoch": 0.4014336917562724, "grad_norm": 0.0973369113971032, "learning_rate": 1.969469810339786e-06, "loss": 0.0, "num_tokens": 31501694.0, "reward": 0.712890625, "reward_std": 0.10982859134674072, "rewards/drgrpo_math_reward/mean": 0.712890625, "rewards/drgrpo_math_reward/std": 0.45285552740097046, "step": 70 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.6035814254396043e-09, "advantages/std": 0.7259707450866699, "advantages/var": 0.5270335227216947, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4140625, "epoch": 0.407168458781362, "grad_norm": 0.1193934165987424, "learning_rate": 1.968583161128631e-06, "loss": 0.0, "num_tokens": 31877570.0, "reward": 0.6640625, "reward_std": 0.20957423746585846, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4727790653705597, "step": 71 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6627310514450073, "advantages/var": 0.43921244654940494, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5703125, "epoch": 0.4129032258064516, "grad_norm": 0.10450942981189178, "learning_rate": 1.9676840260107193e-06, "loss": -0.0, "num_tokens": 32263693.0, "reward": 0.7421875, "reward_std": 0.17906413972377777, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "step": 72 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.4592062554395867e-09, "advantages/std": 0.6627400517463684, "advantages/var": 0.4392243761887791, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.2734375, "epoch": 0.41863799283154124, "grad_norm": 0.10201143119169823, "learning_rate": 1.9667724165767103e-06, "loss": -0.0, "num_tokens": 32659714.0, "reward": 0.603515625, "reward_std": 0.18938496708869934, "rewards/drgrpo_math_reward/mean": 0.603515625, "rewards/drgrpo_math_reward/std": 0.4896455705165863, "step": 73 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 1.9242545798625515e-09, "advantages/std": 0.7259870171546936, "advantages/var": 0.5270571490771694, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.125, "epoch": 0.4243727598566308, "grad_norm": 0.10628123510140317, "learning_rate": 1.9658483445780673e-06, "loss": 0.0, "num_tokens": 33074833.0, "reward": 0.580078125, "reward_std": 0.22558510303497314, "rewards/drgrpo_math_reward/mean": 0.580078125, "rewards/drgrpo_math_reward/std": 0.4940285086631775, "step": 74 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 3.207120720412668e-09, "advantages/std": 0.725980281829834, "advantages/var": 0.5270473696057252, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4453125, "epoch": 0.43010752688172044, "grad_norm": 0.13379472077895976, "learning_rate": 1.964911821926909e-06, "loss": 0.0, "num_tokens": 33443741.0, "reward": 0.6875, "reward_std": 0.21779169142246246, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4639657139778137, "step": 75 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 1.8150665549813358e-09, "advantages/std": 0.7055215239524841, "advantages/var": 0.49776062076023564, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.1953125, "epoch": 0.43584229390681006, "grad_norm": 0.1124066466648034, "learning_rate": 1.9639628606958534e-06, "loss": 0.0, "num_tokens": 33827717.0, "reward": 0.5546875, "reward_std": 0.19697046279907227, "rewards/drgrpo_math_reward/mean": 0.5546875, "rewards/drgrpo_math_reward/std": 0.49748632311820984, "step": 76 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.513254799865995e-10, "advantages/std": 0.6627206206321716, "advantages/var": 0.43919862101109075, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.15625, "epoch": 0.44157706093189963, "grad_norm": 0.11017788928598075, "learning_rate": 1.9630014731178623e-06, "loss": 0.0, "num_tokens": 34224624.0, "reward": 0.587890625, "reward_std": 0.17120419442653656, "rewards/drgrpo_math_reward/mean": 0.587890625, "rewards/drgrpo_math_reward/std": 0.49269601702690125, "step": 77 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.4422747502017864e-09, "advantages/std": 0.5411087870597839, "advantages/var": 0.2927987194333106, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.359375, "epoch": 0.44731182795698926, "grad_norm": 0.10372848528138051, "learning_rate": 1.962027671586086e-06, "loss": -0.0, "num_tokens": 34590278.0, "reward": 0.64453125, "reward_std": 0.11615687608718872, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.47912323474884033, "step": 78 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 3.7419168605135744e-09, "advantages/std": 0.6844452023506165, "advantages/var": 0.4684652350207763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 0.45304659498207883, "grad_norm": 0.09963318509983374, "learning_rate": 1.9610414686536994e-06, "loss": -0.0, "num_tokens": 34960944.0, "reward": 0.783203125, "reward_std": 0.17722684144973755, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "step": 79 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 9.900380591882187e-10, "advantages/std": 0.7055202722549438, "advantages/var": 0.4977588545626901, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.03125, "epoch": 0.45878136200716846, "grad_norm": 0.10252246045287623, "learning_rate": 1.9600428770337452e-06, "loss": 0.0, "num_tokens": 35380048.0, "reward": 0.57421875, "reward_std": 0.19990062713623047, "rewards/drgrpo_math_reward/mean": 0.57421875, "rewards/drgrpo_math_reward/std": 0.4949444830417633, "step": 80 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 2.565590413342101e-09, "advantages/std": 0.7260103225708008, "advantages/var": 0.5270909884793582, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4140625, "epoch": 0.4645161290322581, "grad_norm": 0.123558718808345, "learning_rate": 1.9590319095989656e-06, "loss": -0.0, "num_tokens": 35777503.0, "reward": 0.625, "reward_std": 0.24006178975105286, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4845963716506958, "step": 81 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5927636027336121, "advantages/var": 0.35136868872573146, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.234375, "epoch": 0.47025089605734766, "grad_norm": 0.07762963539699437, "learning_rate": 1.9580085793816383e-06, "loss": 0.0, "num_tokens": 36164017.0, "reward": 0.630859375, "reward_std": 0.14719420671463013, "rewards/drgrpo_math_reward/mean": 0.630859375, "rewards/drgrpo_math_reward/std": 0.4830440282821655, "step": 82 }, { "advantages/mean": 3.841705620288849e-09, "advantages/snr": 5.79668898988052e-09, "advantages/std": 0.6627413630485535, "advantages/var": 0.43922611429545455, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5078125, "epoch": 0.4759856630824373, "grad_norm": 0.09479804524220217, "learning_rate": 1.9569728995734097e-06, "loss": -0.0, "num_tokens": 36555130.0, "reward": 0.69921875, "reward_std": 0.19331128895282745, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "step": 83 }, { "advantages/mean": 4.307366907596588e-09, "advantages/snr": 5.77493445417106e-09, "advantages/std": 0.7458728551864624, "advantages/var": 0.5563263161040055, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.546875, "epoch": 0.4817204301075269, "grad_norm": 0.12625445014462494, "learning_rate": 1.955924883525122e-06, "loss": 0.0, "num_tokens": 36939699.0, "reward": 0.654296875, "reward_std": 0.23397575318813324, "rewards/drgrpo_math_reward/mean": 0.654296875, "rewards/drgrpo_math_reward/std": 0.4760620892047882, "step": 84 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.547905772152612e-10, "advantages/std": 0.6169410347938538, "advantages/var": 0.3806162404125111, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 0.4874551971326165, "grad_norm": 0.09628359839227178, "learning_rate": 1.954864544746643e-06, "loss": -0.0, "num_tokens": 37279417.0, "reward": 0.75, "reward_std": 0.1396407037973404, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.43343618512153625, "step": 85 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5095467356875e-09, "advantages/std": 0.6169551014900208, "advantages/var": 0.3806335972545618, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6015625, "epoch": 0.4931899641577061, "grad_norm": 0.10806892744670954, "learning_rate": 1.953791896906692e-06, "loss": 0.0, "num_tokens": 37627086.0, "reward": 0.712890625, "reward_std": 0.14958953857421875, "rewards/drgrpo_math_reward/mean": 0.712890625, "rewards/drgrpo_math_reward/std": 0.45285552740097046, "step": 86 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.1080091890165633e-09, "advantages/std": 0.6627029180526733, "advantages/var": 0.4391751575955283, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.3125, "epoch": 0.4989247311827957, "grad_norm": 0.11168792285973307, "learning_rate": 1.952706953832663e-06, "loss": 0.0, "num_tokens": 38001431.0, "reward": 0.650390625, "reward_std": 0.16051658987998962, "rewards/drgrpo_math_reward/mean": 0.650390625, "rewards/drgrpo_math_reward/std": 0.47731292247772217, "step": 87 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.545519078575167e-09, "advantages/std": 0.6402680277824402, "advantages/var": 0.4099431474004156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.2890625, "epoch": 0.5046594982078854, "grad_norm": 0.10161661705130945, "learning_rate": 1.9516097295104467e-06, "loss": -0.0, "num_tokens": 38397264.0, "reward": 0.619140625, "reward_std": 0.17731529474258423, "rewards/drgrpo_math_reward/mean": 0.619140625, "rewards/drgrpo_math_reward/std": 0.48607301712036133, "step": 88 }, { "advantages/mean": -4.0745362639427185e-09, "advantages/snr": 6.873695280222318e-09, "advantages/std": 0.5927723050117493, "advantages/var": 0.3513790055889423, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.28125, "epoch": 0.5103942652329749, "grad_norm": 0.08613456921037321, "learning_rate": 1.9505002380842493e-06, "loss": 0.0, "num_tokens": 38753353.0, "reward": 0.732421875, "reward_std": 0.15670263767242432, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "step": 89 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.7259822487831116, "advantages/var": 0.5270502255481837, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "epoch": 0.5161290322580645, "grad_norm": 0.10380223264458875, "learning_rate": 1.9493784938564127e-06, "loss": -0.0, "num_tokens": 39134170.0, "reward": 0.666015625, "reward_std": 0.2191292941570282, "rewards/drgrpo_math_reward/mean": 0.666015625, "rewards/drgrpo_math_reward/std": 0.47209542989730835, "step": 90 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 2.909128642129434e-09, "advantages/std": 0.6402759552001953, "advantages/var": 0.4099532988075225, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.359375, "epoch": 0.5218637992831541, "grad_norm": 0.09683974585509025, "learning_rate": 1.948244511287226e-06, "loss": 0.0, "num_tokens": 39533908.0, "reward": 0.64453125, "reward_std": 0.18237385153770447, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.47912323474884033, "step": 91 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.1784057046447742e-09, "advantages/std": 0.5927431583404541, "advantages/var": 0.35134445175941664, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4921875, "epoch": 0.5275985663082438, "grad_norm": 0.09273021928980656, "learning_rate": 1.9470983049947442e-06, "loss": 0.0, "num_tokens": 39887396.0, "reward": 0.583984375, "reward_std": 0.1294998973608017, "rewards/drgrpo_math_reward/mean": 0.583984375, "rewards/drgrpo_math_reward/std": 0.493378221988678, "step": 92 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.2643627621857223e-09, "advantages/std": 0.6169434785842896, "advantages/var": 0.38061925576768374, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 0.5333333333333333, "grad_norm": 0.10723717702600626, "learning_rate": 1.945939889754595e-06, "loss": -0.0, "num_tokens": 40218582.0, "reward": 0.775390625, "reward_std": 0.13985365629196167, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "step": 93 }, { "advantages/mean": 3.4924596548080444e-09, "advantages/snr": 5.2697082586837295e-09, "advantages/std": 0.6627424955368042, "advantages/var": 0.43922761539035093, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5859375, "epoch": 0.5390681003584229, "grad_norm": 0.09781963450649286, "learning_rate": 1.944769280499791e-06, "loss": -0.0, "num_tokens": 40577421.0, "reward": 0.708984375, "reward_std": 0.19246044754981995, "rewards/drgrpo_math_reward/mean": 0.708984375, "rewards/drgrpo_math_reward/std": 0.45467492938041687, "step": 94 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.855574338978251e-10, "advantages/std": 0.5927781462669373, "advantages/var": 0.35138593069166646, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4296875, "epoch": 0.5448028673835126, "grad_norm": 0.09352421489197817, "learning_rate": 1.9435864923205368e-06, "loss": -0.0, "num_tokens": 40967686.0, "reward": 0.51171875, "reward_std": 0.15877464413642883, "rewards/drgrpo_math_reward/mean": 0.51171875, "rewards/drgrpo_math_reward/std": 0.5003514885902405, "step": 95 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 6.122999110340186e-09, "advantages/std": 0.6844605803489685, "advantages/var": 0.46848628605164677, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7421875, "epoch": 0.5505376344086022, "grad_norm": 0.10892418912338672, "learning_rate": 1.9423915404640348e-06, "loss": -0.0, "num_tokens": 41296890.0, "reward": 0.798828125, "reward_std": 0.19373659789562225, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "step": 96 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 2.721296443360608e-09, "advantages/std": 0.6844697594642639, "advantages/var": 0.4684988516210673, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 0.5562724014336917, "grad_norm": 0.1139790080845369, "learning_rate": 1.9411844403342867e-06, "loss": -0.0, "num_tokens": 41653784.0, "reward": 0.6875, "reward_std": 0.1988215148448944, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4639657139778137, "step": 97 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.46158324069184e-09, "advantages/std": 0.5675143599510193, "advantages/var": 0.3220725487506151, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8046875, "epoch": 0.5620071684587814, "grad_norm": 0.09593953458003912, "learning_rate": 1.9399652074918976e-06, "loss": -0.0, "num_tokens": 41977368.0, "reward": 0.642578125, "reward_std": 0.12758207321166992, "rewards/drgrpo_math_reward/mean": 0.642578125, "rewards/drgrpo_math_reward/std": 0.4797092080116272, "step": 98 }, { "advantages/mean": 2.2118911147117615e-09, "advantages/snr": 3.73141094295914e-09, "advantages/std": 0.5927760601043701, "advantages/var": 0.3513834574328598, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "epoch": 0.567741935483871, "grad_norm": 0.09361504854760651, "learning_rate": 1.938733857653874e-06, "loss": -0.0, "num_tokens": 42310064.0, "reward": 0.771484375, "reward_std": 0.15729552507400513, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "step": 99 }, { "advantages/mean": 3.958120942115784e-09, "advantages/snr": 5.452144616454889e-09, "advantages/std": 0.7259750366210938, "advantages/var": 0.5270397537969984, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7109375, "epoch": 0.5734767025089605, "grad_norm": 0.12028417825023664, "learning_rate": 1.9374904066934204e-06, "loss": -0.0, "num_tokens": 42656656.0, "reward": 0.748046875, "reward_std": 0.20609426498413086, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "step": 100 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.528461269917769e-09, "advantages/std": 0.6169794797897339, "advantages/var": 0.38066367848161065, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5859375, "epoch": 0.5792114695340502, "grad_norm": 0.0914960391071336, "learning_rate": 1.936234870639737e-06, "loss": 0.0, "num_tokens": 42999270.0, "reward": 0.673828125, "reward_std": 0.16729867458343506, "rewards/drgrpo_math_reward/mean": 0.673828125, "rewards/drgrpo_math_reward/std": 0.4692695140838623, "step": 101 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 6.600113212973575e-10, "advantages/std": 0.7055352926254272, "advantages/var": 0.49778004914004725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 0.5849462365591398, "grad_norm": 0.1272033650123926, "learning_rate": 1.934967265677811e-06, "loss": -0.0, "num_tokens": 43312096.0, "reward": 0.822265625, "reward_std": 0.2132958173751831, "rewards/drgrpo_math_reward/mean": 0.822265625, "rewards/drgrpo_math_reward/std": 0.3826628625392914, "step": 102 }, { "advantages/mean": 1.5133991837501526e-09, "advantages/snr": 2.3636813243364914e-09, "advantages/std": 0.6402720808982849, "advantages/var": 0.4099483375778199, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "epoch": 0.5906810035842294, "grad_norm": 0.12027193570521513, "learning_rate": 1.933687608148208e-06, "loss": -0.0, "num_tokens": 43657071.0, "reward": 0.6328125, "reward_std": 0.17918440699577332, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48250964283943176, "step": 103 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 5.454654287891141e-10, "advantages/std": 0.6402714848518372, "advantages/var": 0.40994757431437634, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4609375, "epoch": 0.596415770609319, "grad_norm": 0.10324535660350227, "learning_rate": 1.9323959145468632e-06, "loss": -0.0, "num_tokens": 44003848.0, "reward": 0.6484375, "reward_std": 0.18055561184883118, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4779251217842102, "step": 104 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.5455112585663302e-09, "advantages/std": 0.6402699947357178, "advantages/var": 0.40994566615887607, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 0.6021505376344086, "grad_norm": 0.11480327046972252, "learning_rate": 1.9310922015248674e-06, "loss": 0.0, "num_tokens": 44352512.0, "reward": 0.677734375, "reward_std": 0.17984820902347565, "rewards/drgrpo_math_reward/mean": 0.677734375, "rewards/drgrpo_math_reward/std": 0.46780112385749817, "step": 105 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5675099492073059, "advantages/var": 0.32206754244927893, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6796875, "epoch": 0.6078853046594982, "grad_norm": 0.07441490218367734, "learning_rate": 1.929776485888251e-06, "loss": -0.0, "num_tokens": 44692314.0, "reward": 0.8046875, "reward_std": 0.11877965182065964, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "step": 106 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.547650551233487e-10, "advantages/std": 0.6169618964195251, "advantages/var": 0.3806419816335769, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 0.6136200716845878, "grad_norm": 0.09874488875111885, "learning_rate": 1.928448784597772e-06, "loss": -0.0, "num_tokens": 45038009.0, "reward": 0.79296875, "reward_std": 0.1528840959072113, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40557438135147095, "step": 107 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.4615498902498422e-09, "advantages/std": 0.5675220489501953, "advantages/var": 0.3220812760446279, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.3359375, "epoch": 0.6193548387096774, "grad_norm": 0.08777871837206135, "learning_rate": 1.927109114768691e-06, "loss": -0.0, "num_tokens": 45416676.0, "reward": 0.615234375, "reward_std": 0.13126249611377716, "rewards/drgrpo_math_reward/mean": 0.615234375, "rewards/drgrpo_math_reward/std": 0.4870156943798065, "step": 108 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.401642178264875e-10, "advantages/std": 0.6844654083251953, "advantages/var": 0.46849289519377635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5703125, "epoch": 0.625089605734767, "grad_norm": 0.11756416212628888, "learning_rate": 1.925757493670555e-06, "loss": -0.0, "num_tokens": 45758130.0, "reward": 0.712890625, "reward_std": 0.19373658299446106, "rewards/drgrpo_math_reward/mean": 0.712890625, "rewards/drgrpo_math_reward/std": 0.45285552740097046, "step": 109 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.300071103815819e-10, "advantages/std": 0.7055321931838989, "advantages/var": 0.4977756756188825, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.453125, "epoch": 0.6308243727598566, "grad_norm": 0.11680201348953667, "learning_rate": 1.9243939387269745e-06, "loss": 0.0, "num_tokens": 46120187.0, "reward": 0.7421875, "reward_std": 0.2054899036884308, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "step": 110 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.3606930113424405e-09, "advantages/std": 0.6844472289085388, "advantages/var": 0.46846800916057774, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5234375, "epoch": 0.6365591397849463, "grad_norm": 0.09902559205119116, "learning_rate": 1.9230184675153973e-06, "loss": 0.0, "num_tokens": 46491791.0, "reward": 0.78125, "reward_std": 0.18210561573505402, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41380295157432556, "step": 111 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.7496185642272747e-09, "advantages/std": 0.5927420258522034, "advantages/var": 0.3513431092113741, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 0.6422939068100358, "grad_norm": 0.12233732295442115, "learning_rate": 1.9216310977668816e-06, "loss": -0.0, "num_tokens": 46805356.0, "reward": 0.859375, "reward_std": 0.13099107146263123, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3479743003845215, "step": 112 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.205355891720779e-10, "advantages/std": 0.5675089359283447, "advantages/var": 0.3220663923585221, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5703125, "epoch": 0.6480286738351254, "grad_norm": 0.09696126625101334, "learning_rate": 1.9202318473658702e-06, "loss": -0.0, "num_tokens": 47163162.0, "reward": 0.833984375, "reward_std": 0.11850585043430328, "rewards/drgrpo_math_reward/mean": 0.833984375, "rewards/drgrpo_math_reward/std": 0.3724585771560669, "step": 113 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 5.280237331390249e-09, "advantages/std": 0.7055156826972961, "advantages/var": 0.49775237853183185, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.09375, "epoch": 0.6537634408602151, "grad_norm": 0.09366751212359639, "learning_rate": 1.918820734349957e-06, "loss": 0.0, "num_tokens": 47544793.0, "reward": 0.60546875, "reward_std": 0.19315844774246216, "rewards/drgrpo_math_reward/mean": 0.60546875, "rewards/drgrpo_math_reward/std": 0.4892277717590332, "step": 114 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.026718784559063e-10, "advantages/std": 0.662700891494751, "advantages/var": 0.4391724715879377, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.453125, "epoch": 0.6594982078853047, "grad_norm": 0.09878548424447173, "learning_rate": 1.917397776909656e-06, "loss": -0.0, "num_tokens": 47889140.0, "reward": 0.740234375, "reward_std": 0.15437430143356323, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "step": 115 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5133464932441711, "advantages/var": 0.26352462212608785, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "epoch": 0.6652329749103942, "grad_norm": 0.07187607742215389, "learning_rate": 1.9159629933881667e-06, "loss": 0.0, "num_tokens": 48240298.0, "reward": 0.681640625, "reward_std": 0.10774585604667664, "rewards/drgrpo_math_reward/mean": 0.681640625, "rewards/drgrpo_math_reward/std": 0.46629536151885986, "step": 116 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.090915928790097e-09, "advantages/std": 0.6402802467346191, "advantages/var": 0.40995879435854476, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5859375, "epoch": 0.6709677419354839, "grad_norm": 0.09483870955676035, "learning_rate": 1.9145164022811366e-06, "loss": -0.0, "num_tokens": 48616023.0, "reward": 0.767578125, "reward_std": 0.18949955701828003, "rewards/drgrpo_math_reward/mean": 0.767578125, "rewards/drgrpo_math_reward/std": 0.42278963327407837, "step": 117 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.640996894833302e-09, "advantages/std": 0.5675346255302429, "advantages/var": 0.32209555117575306, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.34375, "epoch": 0.6767025089605735, "grad_norm": 0.0799103957884236, "learning_rate": 1.9130580222364246e-06, "loss": 0.0, "num_tokens": 49002177.0, "reward": 0.634765625, "reward_std": 0.13567525148391724, "rewards/drgrpo_math_reward/mean": 0.634765625, "rewards/drgrpo_math_reward/std": 0.4819667339324951, "step": 118 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 9.091187807058312e-10, "advantages/std": 0.640264630317688, "advantages/var": 0.40993879683584566, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.453125, "epoch": 0.6824372759856631, "grad_norm": 0.09784687923225988, "learning_rate": 1.9115878720538587e-06, "loss": -0.0, "num_tokens": 49345197.0, "reward": 0.787109375, "reward_std": 0.17407254874706268, "rewards/drgrpo_math_reward/mean": 0.787109375, "rewards/drgrpo_math_reward/std": 0.409751296043396, "step": 119 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.4017351944972693e-10, "advantages/std": 0.6844466924667358, "advantages/var": 0.46846727482865447, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.25, "epoch": 0.6881720430107527, "grad_norm": 0.10343894185472298, "learning_rate": 1.9101059706849955e-06, "loss": 0.0, "num_tokens": 49713161.0, "reward": 0.740234375, "reward_std": 0.17628221213817596, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "step": 120 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.1321283327139503e-09, "advantages/std": 0.6169723868370056, "advantages/var": 0.3806549261193517, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.171875, "epoch": 0.6939068100358423, "grad_norm": 0.09800192750888861, "learning_rate": 1.9086123372328743e-06, "loss": -0.0, "num_tokens": 50082740.0, "reward": 0.69921875, "reward_std": 0.16283372044563293, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "step": 121 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5411226153373718, "advantages/var": 0.2928136848295573, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0078125, "epoch": 0.6996415770609319, "grad_norm": 0.08144816305836884, "learning_rate": 1.9071069909517714e-06, "loss": 0.0, "num_tokens": 50483979.0, "reward": 0.626953125, "reward_std": 0.12679103016853333, "rewards/drgrpo_math_reward/mean": 0.626953125, "rewards/drgrpo_math_reward/std": 0.48408737778663635, "step": 122 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.8869485505068507e-09, "advantages/std": 0.6169501543045044, "advantages/var": 0.3806274928963518, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.3046875, "epoch": 0.7053763440860215, "grad_norm": 0.06859208850463304, "learning_rate": 1.9055899512469525e-06, "loss": 0.0, "num_tokens": 50880518.0, "reward": 0.693359375, "reward_std": 0.1467098444700241, "rewards/drgrpo_math_reward/mean": 0.693359375, "rewards/drgrpo_math_reward/std": 0.4615498185157776, "step": 123 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.963958950203137e-09, "advantages/std": 0.5927584171295166, "advantages/var": 0.35136254107789, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0390625, "epoch": 0.7111111111111111, "grad_norm": 0.08720312256200466, "learning_rate": 1.9040612376744214e-06, "loss": -0.0, "num_tokens": 51286789.0, "reward": 0.564453125, "reward_std": 0.1403670608997345, "rewards/drgrpo_math_reward/mean": 0.564453125, "rewards/drgrpo_math_reward/std": 0.49631330370903015, "step": 124 }, { "advantages/mean": -1.979060471057892e-09, "advantages/snr": 2.653391730814346e-09, "advantages/std": 0.7458606362342834, "advantages/var": 0.5563080886838101, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.2109375, "epoch": 0.7168458781362007, "grad_norm": 0.1263612403114818, "learning_rate": 1.9025208699406693e-06, "loss": 0.0, "num_tokens": 51674991.0, "reward": 0.623046875, "reward_std": 0.2112126499414444, "rewards/drgrpo_math_reward/mean": 0.623046875, "rewards/drgrpo_math_reward/std": 0.4850969910621643, "step": 125 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.273058577869462e-10, "advantages/std": 0.6402550935745239, "advantages/var": 0.4099265848481224, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "epoch": 0.7225806451612903, "grad_norm": 0.11243042207957041, "learning_rate": 1.9009688679024189e-06, "loss": -0.0, "num_tokens": 52046776.0, "reward": 0.677734375, "reward_std": 0.17193621397018433, "rewards/drgrpo_math_reward/mean": 0.677734375, "rewards/drgrpo_math_reward/std": 0.46780112385749817, "step": 126 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 2.267730371168527e-10, "advantages/std": 0.51335608959198, "advantages/var": 0.263534474721169, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.1484375, "epoch": 0.72831541218638, "grad_norm": 0.06588552852843944, "learning_rate": 1.899405251566371e-06, "loss": -0.0, "num_tokens": 52433141.0, "reward": 0.673828125, "reward_std": 0.11631764471530914, "rewards/drgrpo_math_reward/mean": 0.673828125, "rewards/drgrpo_math_reward/std": 0.4692695140838623, "step": 127 }, { "advantages/mean": 2.444721758365631e-09, "advantages/snr": 4.517915937092211e-09, "advantages/std": 0.5411171317100525, "advantages/var": 0.2928077502301143, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4140625, "epoch": 0.7340501792114695, "grad_norm": 0.09175884619532558, "learning_rate": 1.8978300410889436e-06, "loss": 0.0, "num_tokens": 52779666.0, "reward": 0.69921875, "reward_std": 0.12427500635385513, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "step": 128 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.070705582851116e-10, "advantages/std": 0.5133683085441589, "advantages/var": 0.26354702021749077, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7109375, "epoch": 0.7397849462365591, "grad_norm": 0.09992818871073707, "learning_rate": 1.896243256776015e-06, "loss": 0.0, "num_tokens": 53119925.0, "reward": 0.810546875, "reward_std": 0.12409625202417374, "rewards/drgrpo_math_reward/mean": 0.810546875, "rewards/drgrpo_math_reward/std": 0.3922513723373413, "step": 129 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.1512346735605115e-09, "advantages/std": 0.6169579029083252, "advantages/var": 0.3806370539610384, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 0.7455197132616488, "grad_norm": 0.11278414036562853, "learning_rate": 1.8946449190826594e-06, "loss": 0.0, "num_tokens": 53438010.0, "reward": 0.728515625, "reward_std": 0.1502682864665985, "rewards/drgrpo_math_reward/mean": 0.728515625, "rewards/drgrpo_math_reward/std": 0.44516023993492126, "step": 130 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.40163121808843e-10, "advantages/std": 0.684467613697052, "advantages/var": 0.4684959142001368, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4609375, "epoch": 0.7512544802867384, "grad_norm": 0.11641857128749868, "learning_rate": 1.8930350486128855e-06, "loss": 0.0, "num_tokens": 53798932.0, "reward": 0.662109375, "reward_std": 0.19508743286132812, "rewards/drgrpo_math_reward/mean": 0.662109375, "rewards/drgrpo_math_reward/std": 0.4734536409378052, "step": 131 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 4.682191455771794e-10, "advantages/std": 0.7459027767181396, "advantages/var": 0.5563709523158309, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.515625, "epoch": 0.7569892473118279, "grad_norm": 0.13444117617435852, "learning_rate": 1.8914136661193688e-06, "loss": 0.0, "num_tokens": 54161779.0, "reward": 0.69921875, "reward_std": 0.2540428340435028, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "step": 132 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5927889943122864, "advantages/var": 0.3513987917777719, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6171875, "epoch": 0.7627240143369176, "grad_norm": 0.08342336332356025, "learning_rate": 1.8897807925031862e-06, "loss": -0.0, "num_tokens": 54503425.0, "reward": 0.74609375, "reward_std": 0.16517478227615356, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "step": 133 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 2.366517489989152e-09, "advantages/std": 0.541119396686554, "advantages/var": 0.29281020147042014, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "epoch": 0.7684587813620072, "grad_norm": 0.09201317646884366, "learning_rate": 1.8881364488135445e-06, "loss": 0.0, "num_tokens": 54839971.0, "reward": 0.744140625, "reward_std": 0.12286502122879028, "rewards/drgrpo_math_reward/mean": 0.744140625, "rewards/drgrpo_math_reward/std": 0.43676990270614624, "step": 134 }, { "advantages/mean": 1.5133991837501526e-09, "advantages/snr": 2.2110739623964127e-09, "advantages/std": 0.684463381767273, "advantages/var": 0.46849012098029164, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6171875, "epoch": 0.7741935483870968, "grad_norm": 0.11601588537123636, "learning_rate": 1.8864806562475108e-06, "loss": -0.0, "num_tokens": 55205295.0, "reward": 0.748046875, "reward_std": 0.1935114860534668, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "step": 135 }, { "advantages/mean": -3.841705620288849e-09, "advantages/snr": 7.099511805150324e-09, "advantages/std": 0.5411224961280823, "advantages/var": 0.2928135558158864, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.546875, "epoch": 0.7799283154121864, "grad_norm": 0.10069275859365387, "learning_rate": 1.8848134361497382e-06, "loss": 0.0, "num_tokens": 55550192.0, "reward": 0.7109375, "reward_std": 0.12544146180152893, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45377036929130554, "step": 136 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 1.2727750610572488e-09, "advantages/std": 0.6402602195739746, "advantages/var": 0.4099331487689142, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.59375, "epoch": 0.785663082437276, "grad_norm": 0.13652324103982894, "learning_rate": 1.883134810012191e-06, "loss": -0.0, "num_tokens": 55855452.0, "reward": 0.7734375, "reward_std": 0.1709846556186676, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4190165400505066, "step": 137 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 3.300016460701913e-09, "advantages/std": 0.7055438756942749, "advantages/var": 0.49779216052969844, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 0.7913978494623656, "grad_norm": 0.14581603699194623, "learning_rate": 1.8814447994738676e-06, "loss": 0.0, "num_tokens": 56222620.0, "reward": 0.71875, "reward_std": 0.2189446985721588, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45004892349243164, "step": 138 }, { "advantages/mean": 3.3760443329811096e-09, "advantages/snr": 5.2729291799930736e-09, "advantages/std": 0.6402597427368164, "advantages/var": 0.40993253816941433, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6171875, "epoch": 0.7971326164874551, "grad_norm": 0.1401222345717879, "learning_rate": 1.8797434263205215e-06, "loss": 0.0, "num_tokens": 56553416.0, "reward": 0.728515625, "reward_std": 0.16807040572166443, "rewards/drgrpo_math_reward/mean": 0.728515625, "rewards/drgrpo_math_reward/std": 0.44516023993492126, "step": 139 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.207020138399704e-10, "advantages/std": 0.7260030508041382, "advantages/var": 0.527080429776916, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5546875, "epoch": 0.8028673835125448, "grad_norm": 0.14751235076577096, "learning_rate": 1.8780307124843801e-06, "loss": -0.0, "num_tokens": 56905340.0, "reward": 0.66796875, "reward_std": 0.24067938327789307, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.47140273451805115, "step": 140 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.513218146596493e-10, "advantages/std": 0.6627275347709656, "advantages/var": 0.4392077853436014, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 0.8086021505376344, "grad_norm": 0.11795374165805851, "learning_rate": 1.8763066800438634e-06, "loss": -0.0, "num_tokens": 57229974.0, "reward": 0.7421875, "reward_std": 0.18216341733932495, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "step": 141 }, { "advantages/mean": 1.7462298274040222e-09, "advantages/snr": 2.830483806902887e-09, "advantages/std": 0.6169368624687195, "advantages/var": 0.3806110922727477, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7578125, "epoch": 0.814336917562724, "grad_norm": 0.1198373257457826, "learning_rate": 1.8745713512232975e-06, "loss": -0.0, "num_tokens": 57580175.0, "reward": 0.7578125, "reward_std": 0.13412389159202576, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "step": 142 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.3200122715556054e-09, "advantages/std": 0.7055408358573914, "advantages/var": 0.49778787106234645, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "epoch": 0.8200716845878137, "grad_norm": 0.1277159915414159, "learning_rate": 1.872824748392629e-06, "loss": 0.0, "num_tokens": 57929621.0, "reward": 0.728515625, "reward_std": 0.21116498112678528, "rewards/drgrpo_math_reward/mean": 0.728515625, "rewards/drgrpo_math_reward/std": 0.44516023993492126, "step": 143 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.6409944820246048e-09, "advantages/std": 0.5675354599952698, "advantages/var": 0.32209649835204246, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6015625, "epoch": 0.8258064516129032, "grad_norm": 0.1105144555250896, "learning_rate": 1.8710668940671375e-06, "loss": -0.0, "num_tokens": 58282766.0, "reward": 0.6484375, "reward_std": 0.13549649715423584, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4779251217842102, "step": 144 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.7055389285087585, "advantages/var": 0.4977851796412871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7421875, "epoch": 0.8315412186379928, "grad_norm": 0.12585201733350596, "learning_rate": 1.8692978109071436e-06, "loss": -0.0, "num_tokens": 58622039.0, "reward": 0.650390625, "reward_std": 0.2173701673746109, "rewards/drgrpo_math_reward/mean": 0.650390625, "rewards/drgrpo_math_reward/std": 0.47731292247772217, "step": 145 }, { "advantages/mean": 2.9103830456733704e-09, "advantages/snr": 5.378396706710231e-09, "advantages/std": 0.5411246418952942, "advantages/var": 0.2928158780663104, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 0.8372759856630825, "grad_norm": 0.11153986094147587, "learning_rate": 1.8675175217177175e-06, "loss": -0.0, "num_tokens": 58929432.0, "reward": 0.83984375, "reward_std": 0.12820197641849518, "rewards/drgrpo_math_reward/mean": 0.83984375, "rewards/drgrpo_math_reward/std": 0.3671095669269562, "step": 146 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.217018206111108e-09, "advantages/std": 0.765249490737915, "advantages/var": 0.5856067830746383, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6015625, "epoch": 0.843010752688172, "grad_norm": 0.1588873673073329, "learning_rate": 1.8657260494483857e-06, "loss": 0.0, "num_tokens": 59269313.0, "reward": 0.6640625, "reward_std": 0.23562897741794586, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4727790653705597, "step": 147 }, { "advantages/mean": 1.5133991837501526e-09, "advantages/snr": 2.3638036737036475e-09, "advantages/std": 0.6402389407157898, "advantages/var": 0.4099059012088766, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7109375, "epoch": 0.8487455197132616, "grad_norm": 0.12523162439180782, "learning_rate": 1.863923417192835e-06, "loss": -0.0, "num_tokens": 59594229.0, "reward": 0.755859375, "reward_std": 0.15289105474948883, "rewards/drgrpo_math_reward/mean": 0.755859375, "rewards/drgrpo_math_reward/std": 0.42999663949012756, "step": 148 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.4016063357878736e-10, "advantages/std": 0.6844726204872131, "advantages/var": 0.4685027681966325, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 0.8544802867383513, "grad_norm": 0.12469169098064292, "learning_rate": 1.862109648188614e-06, "loss": -0.0, "num_tokens": 59906940.0, "reward": 0.83984375, "reward_std": 0.20101869106292725, "rewards/drgrpo_math_reward/mean": 0.83984375, "rewards/drgrpo_math_reward/std": 0.3671095669269562, "step": 149 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 3.6364913724873166e-09, "advantages/std": 0.6402617692947388, "advantages/var": 0.4099351332204293, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 0.8602150537634409, "grad_norm": 0.11307951931045364, "learning_rate": 1.8602847658168334e-06, "loss": -0.0, "num_tokens": 60205003.0, "reward": 0.78125, "reward_std": 0.17063409090042114, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41380295157432556, "step": 150 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.0189870128005075e-09, "advantages/std": 0.6169768571853638, "advantages/var": 0.38066044230232876, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 0.8659498207885304, "grad_norm": 0.10587022976228867, "learning_rate": 1.858448793601866e-06, "loss": 0.0, "num_tokens": 60556602.0, "reward": 0.685546875, "reward_std": 0.16561943292617798, "rewards/drgrpo_math_reward/mean": 0.685546875, "rewards/drgrpo_math_reward/std": 0.4647517800331116, "step": 151 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 2.0000916675106025e-09, "advantages/std": 0.6402549147605896, "advantages/var": 0.40992635587508985, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6796875, "epoch": 0.8716845878136201, "grad_norm": 0.12526811062534735, "learning_rate": 1.8566017552110425e-06, "loss": 0.0, "num_tokens": 60887201.0, "reward": 0.71875, "reward_std": 0.17155036330223083, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45004892349243164, "step": 152 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5675363540649414, "advantages/var": 0.32209751318532653, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 0.8774193548387097, "grad_norm": 0.10211996564125343, "learning_rate": 1.8547436744543466e-06, "loss": -0.0, "num_tokens": 61185286.0, "reward": 0.814453125, "reward_std": 0.14386671781539917, "rewards/drgrpo_math_reward/mean": 0.814453125, "rewards/drgrpo_math_reward/std": 0.38912075757980347, "step": 153 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.1078899386312626e-09, "advantages/std": 0.6627404093742371, "advantages/var": 0.4392248502175313, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5859375, "epoch": 0.8831541218637993, "grad_norm": 0.11397862432403714, "learning_rate": 1.8528745752841072e-06, "loss": 0.0, "num_tokens": 61542710.0, "reward": 0.6875, "reward_std": 0.19087517261505127, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4639657139778137, "step": 154 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.2642229790524877e-09, "advantages/std": 0.616981565952301, "advantages/var": 0.3806662527249536, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8046875, "epoch": 0.8888888888888888, "grad_norm": 0.125203555152859, "learning_rate": 1.850994481794692e-06, "loss": -0.0, "num_tokens": 61853890.0, "reward": 0.7890625, "reward_std": 0.17024031281471252, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4083731174468994, "step": 155 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 1.8150253068009446e-09, "advantages/std": 0.7055375576019287, "advantages/var": 0.4977832451868949, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7109375, "epoch": 0.8946236559139785, "grad_norm": 0.1340954863610576, "learning_rate": 1.8491034182221936e-06, "loss": -0.0, "num_tokens": 62208673.0, "reward": 0.7421875, "reward_std": 0.21991004049777985, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "step": 156 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.396342015106288e-09, "advantages/std": 0.6169801950454712, "advantages/var": 0.3806645610783477, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6171875, "epoch": 0.9003584229390681, "grad_norm": 0.11944949047236865, "learning_rate": 1.84720140894412e-06, "loss": -0.0, "num_tokens": 62550600.0, "reward": 0.67578125, "reward_std": 0.16847631335258484, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.4685399830341339, "step": 157 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.7008059819477141e-09, "advantages/std": 0.6844714879989624, "advantages/var": 0.46850121788351373, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4765625, "epoch": 0.9060931899641577, "grad_norm": 0.11198139544631054, "learning_rate": 1.845288478479079e-06, "loss": 0.0, "num_tokens": 62902673.0, "reward": 0.669921875, "reward_std": 0.20164872705936432, "rewards/drgrpo_math_reward/mean": 0.669921875, "rewards/drgrpo_math_reward/std": 0.47070086002349854, "step": 158 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.6415939859533683e-09, "advantages/std": 0.6169814467430115, "advantages/var": 0.3806661056250995, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5078125, "epoch": 0.9118279569892473, "grad_norm": 0.13382309533152528, "learning_rate": 1.8433646514864622e-06, "loss": 0.0, "num_tokens": 63231192.0, "reward": 0.734375, "reward_std": 0.16853412985801697, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44209739565849304, "step": 159 }, { "advantages/mean": 3.14321368932724e-09, "advantages/snr": 4.909345618527959e-09, "advantages/std": 0.6402510404586792, "advantages/var": 0.40992139480842127, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5703125, "epoch": 0.9175627240143369, "grad_norm": 0.12102508099304998, "learning_rate": 1.841429952766127e-06, "loss": -0.0, "num_tokens": 63559189.0, "reward": 0.796875, "reward_std": 0.16372573375701904, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.4027182459831238, "step": 160 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.319995767442294e-09, "advantages/std": 0.7055496573448181, "advantages/var": 0.49780031897939025, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.578125, "epoch": 0.9232974910394265, "grad_norm": 0.12875192786845846, "learning_rate": 1.8394844072580772e-06, "loss": 0.0, "num_tokens": 63918176.0, "reward": 0.740234375, "reward_std": 0.2258191555738449, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "step": 161 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7211301707837319e-09, "advantages/std": 0.5411110520362854, "advantages/var": 0.29280117063581557, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6015625, "epoch": 0.9290322580645162, "grad_norm": 0.0890862094590982, "learning_rate": 1.8375280400421418e-06, "loss": -0.0, "num_tokens": 64252958.0, "reward": 0.76953125, "reward_std": 0.11795367300510406, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.42154473066329956, "step": 162 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.6922530955107685e-09, "advantages/std": 0.5675330758094788, "advantages/var": 0.32209379213776757, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.578125, "epoch": 0.9347670250896057, "grad_norm": 0.09661514305502454, "learning_rate": 1.8355608763376506e-06, "loss": -0.0, "num_tokens": 64587413.0, "reward": 0.7578125, "reward_std": 0.137538880109787, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "step": 163 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.174870072540035e-09, "advantages/std": 0.513348400592804, "advantages/var": 0.2635265803911899, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7265625, "epoch": 0.9405017921146953, "grad_norm": 0.11058602387885622, "learning_rate": 1.833582941503111e-06, "loss": 0.0, "num_tokens": 64891317.0, "reward": 0.849609375, "reward_std": 0.10914888978004456, "rewards/drgrpo_math_reward/mean": 0.849609375, "rewards/drgrpo_math_reward/std": 0.35780346393585205, "step": 164 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.151054608540737e-09, "advantages/std": 0.6169846653938293, "advantages/var": 0.38067007733113556, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5390625, "epoch": 0.946236559139785, "grad_norm": 0.1256546281676702, "learning_rate": 1.8315942610358788e-06, "loss": -0.0, "num_tokens": 65221328.0, "reward": 0.6953125, "reward_std": 0.17429867386817932, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4607250988483429, "step": 165 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.026560788825669e-10, "advantages/std": 0.6627157926559448, "advantages/var": 0.43919222183559725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6015625, "epoch": 0.9519713261648746, "grad_norm": 0.13519274214441523, "learning_rate": 1.8295948605718311e-06, "loss": 0.0, "num_tokens": 65531110.0, "reward": 0.822265625, "reward_std": 0.1700795441865921, "rewards/drgrpo_math_reward/mean": 0.822265625, "rewards/drgrpo_math_reward/std": 0.3826628625392914, "step": 166 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 8.783251543820981e-10, "advantages/std": 0.6627119779586792, "advantages/var": 0.4391871657299049, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 0.9577060931899641, "grad_norm": 0.12126412504321249, "learning_rate": 1.8275847658850357e-06, "loss": -0.0, "num_tokens": 65833008.0, "reward": 0.873046875, "reward_std": 0.17139101028442383, "rewards/drgrpo_math_reward/mean": 0.873046875, "rewards/drgrpo_math_reward/std": 0.33324605226516724, "step": 167 }, { "advantages/mean": -3.14321368932724e-09, "advantages/snr": 5.302685966288546e-09, "advantages/std": 0.5927587747573853, "advantages/var": 0.3513629650518766, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.46875, "epoch": 0.9634408602150538, "grad_norm": 0.10986845676979375, "learning_rate": 1.8255640028874178e-06, "loss": 0.0, "num_tokens": 66172603.0, "reward": 0.677734375, "reward_std": 0.14212869107723236, "rewards/drgrpo_math_reward/mean": 0.677734375, "rewards/drgrpo_math_reward/std": 0.46780112385749817, "step": 168 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 4.5672185058770585e-09, "advantages/std": 0.6627224683761597, "advantages/var": 0.43920107009058995, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4296875, "epoch": 0.9691756272401434, "grad_norm": 0.12521137355202885, "learning_rate": 1.8235325976284273e-06, "loss": -0.0, "num_tokens": 66514331.0, "reward": 0.671875, "reward_std": 0.17781181633472443, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4699897766113281, "step": 169 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.4546471957707234e-09, "advantages/std": 0.6402394771575928, "advantages/var": 0.40990658811102776, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "epoch": 0.974910394265233, "grad_norm": 0.1326433956566627, "learning_rate": 1.8214905762947024e-06, "loss": -0.0, "num_tokens": 66841132.0, "reward": 0.6171875, "reward_std": 0.15705665946006775, "rewards/drgrpo_math_reward/mean": 0.6171875, "rewards/drgrpo_math_reward/std": 0.486548513174057, "step": 170 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.810670975825566e-10, "advantages/std": 0.48398786783218384, "advantages/var": 0.23424425620874345, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 0.9806451612903225, "grad_norm": 0.10042911751327677, "learning_rate": 1.8194379652097318e-06, "loss": 0.0, "num_tokens": 67126856.0, "reward": 0.9296875, "reward_std": 0.09655161201953888, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.25592297315597534, "step": 171 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.54735816219516e-10, "advantages/std": 0.6169857978820801, "advantages/var": 0.38067147478818697, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 0.9863799283154122, "grad_norm": 0.09758493639928423, "learning_rate": 1.8173747908335156e-06, "loss": -0.0, "num_tokens": 67445582.0, "reward": 0.78125, "reward_std": 0.17511680722236633, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41380295157432556, "step": 172 }, { "advantages/mean": 4.0745362639427185e-09, "advantages/snr": 6.363551256461308e-09, "advantages/std": 0.6402928233146667, "advantages/var": 0.40997489958826705, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.328125, "epoch": 0.9921146953405018, "grad_norm": 0.11343140673500957, "learning_rate": 1.8153010797622244e-06, "loss": -0.0, "num_tokens": 67825849.0, "reward": 0.58203125, "reward_std": 0.20513556897640228, "rewards/drgrpo_math_reward/mean": 0.58203125, "rewards/drgrpo_math_reward/std": 0.4937073290348053, "step": 173 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.7737607444699495e-10, "advantages/std": 0.6169724464416504, "advantages/var": 0.38065499966819516, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5859375, "epoch": 0.9978494623655914, "grad_norm": 0.1345832158223139, "learning_rate": 1.813216858727856e-06, "loss": 0.0, "num_tokens": 68156887.0, "reward": 0.759765625, "reward_std": 0.1600971221923828, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "step": 174 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.535495498358237e-09, "advantages/std": 0.5133521556854248, "advantages/var": 0.2635304357468726, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 1.0057347670250896, "grad_norm": 0.08846530436560635, "learning_rate": 1.8111221545978911e-06, "loss": 0.0, "num_tokens": 68463713.0, "reward": 0.783203125, "reward_std": 0.11179311573505402, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "step": 175 }, { "advantages/mean": -1.979060471057892e-09, "advantages/snr": 3.207702210829852e-09, "advantages/std": 0.6169713735580444, "advantages/var": 0.3806536757901, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 1.0114695340501791, "grad_norm": 0.12162713670165774, "learning_rate": 1.8090169943749474e-06, "loss": -0.0, "num_tokens": 68793970.0, "reward": 0.716796875, "reward_std": 0.16441097855567932, "rewards/drgrpo_math_reward/mean": 0.716796875, "rewards/drgrpo_math_reward/std": 0.4509948492050171, "step": 176 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 2.7967154580859883e-09, "advantages/std": 0.5411344766616821, "advantages/var": 0.2928265218319126, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 1.0172043010752687, "grad_norm": 0.11588209727467447, "learning_rate": 1.8069014051964305e-06, "loss": 0.0, "num_tokens": 69120286.0, "reward": 0.712890625, "reward_std": 0.13468407094478607, "rewards/drgrpo_math_reward/mean": 0.712890625, "rewards/drgrpo_math_reward/std": 0.45285552740097046, "step": 177 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 3.773762202776577e-09, "advantages/std": 0.6169722080230713, "advantages/var": 0.38065470547286395, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 1.0229390681003585, "grad_norm": 0.11652252068154983, "learning_rate": 1.8047754143341844e-06, "loss": -0.0, "num_tokens": 69464225.0, "reward": 0.74609375, "reward_std": 0.16258057951927185, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "step": 178 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 5.891888107292995e-10, "advantages/std": 0.5927572846412659, "advantages/var": 0.3513611984952867, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 1.028673835125448, "grad_norm": 0.11567745012452299, "learning_rate": 1.8026390491941412e-06, "loss": 0.0, "num_tokens": 69800101.0, "reward": 0.76953125, "reward_std": 0.14152011275291443, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.42154473066329956, "step": 179 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 2.494605362666458e-09, "advantages/std": 0.513335108757019, "advantages/var": 0.26351293388258057, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 1.0344086021505376, "grad_norm": 0.1176150203158601, "learning_rate": 1.8004923373159655e-06, "loss": -0.0, "num_tokens": 70091201.0, "reward": 0.884765625, "reward_std": 0.10205584019422531, "rewards/drgrpo_math_reward/mean": 0.884765625, "rewards/drgrpo_math_reward/std": 0.3196168541908264, "step": 180 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 9.434476599952727e-10, "advantages/std": 0.6169675588607788, "advantages/var": 0.38064896868662856, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "epoch": 1.0401433691756272, "grad_norm": 0.11385245873200502, "learning_rate": 1.7983353063727014e-06, "loss": 0.0, "num_tokens": 70448648.0, "reward": 0.642578125, "reward_std": 0.1600867509841919, "rewards/drgrpo_math_reward/mean": 0.642578125, "rewards/drgrpo_math_reward/std": 0.4797092080116272, "step": 181 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.7494731357015733e-09, "advantages/std": 0.5927733778953552, "advantages/var": 0.3513802775414696, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.46875, "epoch": 1.0458781362007168, "grad_norm": 0.12129793606221752, "learning_rate": 1.796167984170415e-06, "loss": 0.0, "num_tokens": 70780836.0, "reward": 0.716796875, "reward_std": 0.15710300207138062, "rewards/drgrpo_math_reward/mean": 0.716796875, "rewards/drgrpo_math_reward/std": 0.4509948492050171, "step": 182 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.7774705849796182e-09, "advantages/std": 0.4191415011882782, "advantages/var": 0.17567959801836341, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.515625, "epoch": 1.0516129032258064, "grad_norm": 0.0788480088934583, "learning_rate": 1.7939903986478354e-06, "loss": 0.0, "num_tokens": 71109621.0, "reward": 0.74609375, "reward_std": 0.07053204625844955, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "step": 183 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.5455413540152107e-09, "advantages/std": 0.6402624249458313, "advantages/var": 0.40993597279751626, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6484375, "epoch": 1.0573476702508962, "grad_norm": 0.1196317613297228, "learning_rate": 1.7918025778759956e-06, "loss": 0.0, "num_tokens": 71460780.0, "reward": 0.8203125, "reward_std": 0.17331230640411377, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38430243730545044, "step": 184 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 5.106281460934552e-09, "advantages/std": 0.5927597880363464, "advantages/var": 0.35136416631289435, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4140625, "epoch": 1.0630824372759857, "grad_norm": 0.10067744849563154, "learning_rate": 1.7896045500578705e-06, "loss": -0.0, "num_tokens": 71804156.0, "reward": 0.75390625, "reward_std": 0.14507855474948883, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "step": 185 }, { "advantages/mean": 4.889443516731262e-09, "advantages/snr": 6.930326720457966e-09, "advantages/std": 0.705514132976532, "advantages/var": 0.49775019182962765, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.203125, "epoch": 1.0688172043010753, "grad_norm": 0.118786945507082, "learning_rate": 1.787396343528012e-06, "loss": -0.0, "num_tokens": 72190914.0, "reward": 0.611328125, "reward_std": 0.1938217133283615, "rewards/drgrpo_math_reward/mean": 0.611328125, "rewards/drgrpo_math_reward/std": 0.4879252314567566, "step": 186 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.60358735013618e-09, "advantages/std": 0.725968062877655, "advantages/var": 0.5270296283183349, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.28125, "epoch": 1.0745519713261649, "grad_norm": 0.14993946274012712, "learning_rate": 1.7851779867521854e-06, "loss": -0.0, "num_tokens": 72565241.0, "reward": 0.8125, "reward_std": 0.2053028643131256, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39069411158561707, "step": 187 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.2908518937981486e-09, "advantages/std": 0.5411092638969421, "advantages/var": 0.29279923547509057, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.1171875, "epoch": 1.0802867383512544, "grad_norm": 0.09825626473693042, "learning_rate": 1.782949508327002e-06, "loss": -0.0, "num_tokens": 72938932.0, "reward": 0.7265625, "reward_std": 0.11591032892465591, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.4461594223976135, "step": 188 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 5.269865610913928e-10, "advantages/std": 0.6627227067947388, "advantages/var": 0.4392013861013453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.265625, "epoch": 1.086021505376344, "grad_norm": 0.1285357379135228, "learning_rate": 1.7807109369795494e-06, "loss": -0.0, "num_tokens": 73315124.0, "reward": 0.76953125, "reward_std": 0.17832425236701965, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.42154473066329956, "step": 189 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.8183443748961302e-09, "advantages/std": 0.6402270197868347, "advantages/var": 0.40989063686513205, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.9453125, "epoch": 1.0917562724014336, "grad_norm": 0.10725149822657516, "learning_rate": 1.7784623015670235e-06, "loss": 0.0, "num_tokens": 73720544.0, "reward": 0.685546875, "reward_std": 0.1424422264099121, "rewards/drgrpo_math_reward/mean": 0.685546875, "rewards/drgrpo_math_reward/std": 0.4647517800331116, "step": 190 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.026354141130688e-10, "advantages/std": 0.6627352833747864, "advantages/var": 0.4392180558298584, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.203125, "epoch": 1.0974910394265234, "grad_norm": 0.11610833341146351, "learning_rate": 1.776203631076353e-06, "loss": -0.0, "num_tokens": 74090640.0, "reward": 0.751953125, "reward_std": 0.18688389658927917, "rewards/drgrpo_math_reward/mean": 0.751953125, "rewards/drgrpo_math_reward/std": 0.4323015511035919, "step": 191 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.290818906372107e-09, "advantages/std": 0.54112309217453, "advantages/var": 0.2928142008845249, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.109375, "epoch": 1.103225806451613, "grad_norm": 0.08538677631007051, "learning_rate": 1.7739349546238294e-06, "loss": -0.0, "num_tokens": 74466506.0, "reward": 0.736328125, "reward_std": 0.12616653740406036, "rewards/drgrpo_math_reward/mean": 0.736328125, "rewards/drgrpo_math_reward/std": 0.4410543739795685, "step": 192 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.81090826351961e-10, "advantages/std": 0.4839639961719513, "advantages/var": 0.2342211495907245, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 1.1089605734767025, "grad_norm": 0.08127147810312718, "learning_rate": 1.7716563014547295e-06, "loss": -0.0, "num_tokens": 74802558.0, "reward": 0.853515625, "reward_std": 0.0840085819363594, "rewards/drgrpo_math_reward/mean": 0.853515625, "rewards/drgrpo_math_reward/std": 0.35393697023391724, "step": 193 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.9639107649722726e-09, "advantages/std": 0.5927729606628418, "advantages/var": 0.351379782892991, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "epoch": 1.114695340501792, "grad_norm": 0.10981419844773183, "learning_rate": 1.7693677009429386e-06, "loss": 0.0, "num_tokens": 75158840.0, "reward": 0.775390625, "reward_std": 0.15448637306690216, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "step": 194 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 1.7008246438031396e-10, "advantages/std": 0.6844639778137207, "advantages/var": 0.46849093692458155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "epoch": 1.1204301075268817, "grad_norm": 0.12474326099494516, "learning_rate": 1.767069182590573e-06, "loss": 0.0, "num_tokens": 75506297.0, "reward": 0.755859375, "reward_std": 0.1931663453578949, "rewards/drgrpo_math_reward/mean": 0.755859375, "rewards/drgrpo_math_reward/std": 0.42999663949012756, "step": 195 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 8.782909500132189e-10, "advantages/std": 0.6627377867698669, "advantages/var": 0.4392213740126216, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5390625, "epoch": 1.1261648745519715, "grad_norm": 0.1065692005467982, "learning_rate": 1.7647607760275985e-06, "loss": 0.0, "num_tokens": 75870075.0, "reward": 0.732421875, "reward_std": 0.18927854299545288, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "step": 196 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 6.154131970797732e-10, "advantages/std": 0.5674983263015747, "advantages/var": 0.32205435035508856, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4765625, "epoch": 1.131899641577061, "grad_norm": 0.10817758669915627, "learning_rate": 1.7624425110114479e-06, "loss": -0.0, "num_tokens": 76228472.0, "reward": 0.783203125, "reward_std": 0.1127297431230545, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "step": 197 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.3566739604607385e-09, "advantages/std": 0.5927777290344238, "advantages/var": 0.3513854360392088, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.078125, "epoch": 1.1376344086021506, "grad_norm": 0.10047330172499264, "learning_rate": 1.7601144174266397e-06, "loss": 0.0, "num_tokens": 76622738.0, "reward": 0.66796875, "reward_std": 0.1594453752040863, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.47140273451805115, "step": 198 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5927481651306152, "advantages/var": 0.3513503872657111, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.359375, "epoch": 1.1433691756272402, "grad_norm": 0.10913937011699498, "learning_rate": 1.7577765252843907e-06, "loss": 0.0, "num_tokens": 76982836.0, "reward": 0.71875, "reward_std": 0.13594119250774384, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45004892349243164, "step": 199 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 5.454944758585228e-10, "advantages/std": 0.6402373909950256, "advantages/var": 0.40990391682811733, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0546875, "epoch": 1.1491039426523297, "grad_norm": 0.11378900313126608, "learning_rate": 1.7554288647222299e-06, "loss": -0.0, "num_tokens": 77332601.0, "reward": 0.697265625, "reward_std": 0.15084770321846008, "rewards/drgrpo_math_reward/mean": 0.697265625, "rewards/drgrpo_math_reward/std": 0.45989060401916504, "step": 200 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 2.000191661310961e-09, "advantages/std": 0.6402229070663452, "advantages/var": 0.4098853707324821, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.296875, "epoch": 1.1548387096774193, "grad_norm": 0.11910528525763692, "learning_rate": 1.753071466003611e-06, "loss": -0.0, "num_tokens": 77706422.0, "reward": 0.72265625, "reward_std": 0.14356109499931335, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4481254518032074, "step": 201 }, { "advantages/mean": 1.7462298274040222e-09, "advantages/snr": 2.9458545951805715e-09, "advantages/std": 0.592775285243988, "advantages/var": 0.3513825387960914, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.515625, "epoch": 1.1605734767025089, "grad_norm": 0.095311577064199, "learning_rate": 1.7507043595175194e-06, "loss": -0.0, "num_tokens": 78082808.0, "reward": 0.74609375, "reward_std": 0.15666288137435913, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "step": 202 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 9.434435584623148e-10, "advantages/std": 0.6169702410697937, "advantages/var": 0.38065227836571935, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.421875, "epoch": 1.1663082437275984, "grad_norm": 0.11997442039347711, "learning_rate": 1.7483275757780845e-06, "loss": -0.0, "num_tokens": 78414271.0, "reward": 0.755859375, "reward_std": 0.16046330332756042, "rewards/drgrpo_math_reward/mean": 0.755859375, "rewards/drgrpo_math_reward/std": 0.42999663949012756, "step": 203 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.63662950049489e-10, "advantages/std": 0.6402374505996704, "advantages/var": 0.4099039931503654, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.2421875, "epoch": 1.1720430107526882, "grad_norm": 0.11316051328669278, "learning_rate": 1.7459411454241822e-06, "loss": 0.0, "num_tokens": 78787535.0, "reward": 0.703125, "reward_std": 0.15316343307495117, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45732781291007996, "step": 204 }, { "advantages/mean": 3.958120942115784e-09, "advantages/snr": 6.415644286351711e-09, "advantages/std": 0.6169483065605164, "advantages/var": 0.38062521296788887, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.125, "epoch": 1.1777777777777778, "grad_norm": 0.11763953198911904, "learning_rate": 1.7435450992190433e-06, "loss": 0.0, "num_tokens": 79169667.0, "reward": 0.720703125, "reward_std": 0.14475145936012268, "rewards/drgrpo_math_reward/mean": 0.720703125, "rewards/drgrpo_math_reward/std": 0.44909247756004333, "step": 205 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.542871858279876e-09, "advantages/std": 0.4527219235897064, "advantages/var": 0.20495714009876398, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.3359375, "epoch": 1.1835125448028674, "grad_norm": 0.085193050070482, "learning_rate": 1.7411394680498548e-06, "loss": 0.0, "num_tokens": 79520376.0, "reward": 0.734375, "reward_std": 0.07850531488656998, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44209739565849304, "step": 206 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.2727816045321094e-09, "advantages/std": 0.6402736306190491, "advantages/var": 0.4099503220660985, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6484375, "epoch": 1.189247311827957, "grad_norm": 0.10257919489205741, "learning_rate": 1.7387242829273632e-06, "loss": -0.0, "num_tokens": 79878454.0, "reward": 0.8671875, "reward_std": 0.18469981849193573, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.33970388770103455, "step": 207 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.5428349906959549e-09, "advantages/std": 0.45273274183273315, "advantages/var": 0.2049669355273842, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4453125, "epoch": 1.1949820788530465, "grad_norm": 0.0784678825502076, "learning_rate": 1.7362995749854732e-06, "loss": 0.0, "num_tokens": 80208714.0, "reward": 0.748046875, "reward_std": 0.08446772396564484, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "step": 208 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.151374085923811e-09, "advantages/std": 0.5411207675933838, "advantages/var": 0.29281168512085287, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.3671875, "epoch": 1.2007168458781363, "grad_norm": 0.1200006321749552, "learning_rate": 1.7338653754808478e-06, "loss": 0.0, "num_tokens": 80552015.0, "reward": 0.67578125, "reward_std": 0.12241831421852112, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.4685399830341339, "step": 209 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.640988277691964e-09, "advantages/std": 0.5675376057624817, "advantages/var": 0.3220989339546101, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.359375, "epoch": 1.206451612903226, "grad_norm": 0.09318624890473866, "learning_rate": 1.7314217157925047e-06, "loss": -0.0, "num_tokens": 80922817.0, "reward": 0.6875, "reward_std": 0.13930147886276245, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4639657139778137, "step": 210 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.535437045355292e-09, "advantages/std": 0.5133587718009949, "advantages/var": 0.26353722858502593, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 1.2121863799283155, "grad_norm": 0.09283272269770848, "learning_rate": 1.7289686274214115e-06, "loss": 0.0, "num_tokens": 81258104.0, "reward": 0.791015625, "reward_std": 0.11877173185348511, "rewards/drgrpo_math_reward/mean": 0.791015625, "rewards/drgrpo_math_reward/std": 0.40698084235191345, "step": 211 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.0571223937653763e-09, "advantages/std": 0.4527307450771332, "advantages/var": 0.20496512753809615, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5546875, "epoch": 1.217921146953405, "grad_norm": 0.07092035506603638, "learning_rate": 1.7265061419900803e-06, "loss": -0.0, "num_tokens": 81570956.0, "reward": 0.77734375, "reward_std": 0.08666947484016418, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41643625497817993, "step": 212 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.1783252504846608e-09, "advantages/std": 0.5927836298942566, "advantages/var": 0.351392431870611, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 1.2236559139784946, "grad_norm": 0.10275672109081843, "learning_rate": 1.7240342912421596e-06, "loss": -0.0, "num_tokens": 81929448.0, "reward": 0.77734375, "reward_std": 0.1595107614994049, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41643625497817993, "step": 213 }, { "advantages/mean": 3.14321368932724e-09, "advantages/snr": 5.30285339941009e-09, "advantages/std": 0.5927400588989258, "advantages/var": 0.351340777423502, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6484375, "epoch": 1.2293906810035842, "grad_norm": 0.10177505312597472, "learning_rate": 1.721553107042026e-06, "loss": 0.0, "num_tokens": 82277177.0, "reward": 0.7734375, "reward_std": 0.1298515498638153, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4190165400505066, "step": 214 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.3963397183334816e-09, "advantages/std": 0.6169806122779846, "advantages/var": 0.3806650759269168, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 1.2351254480286737, "grad_norm": 0.11516499191731107, "learning_rate": 1.719062621374371e-06, "loss": -0.0, "num_tokens": 82613023.0, "reward": 0.806640625, "reward_std": 0.162800133228302, "rewards/drgrpo_math_reward/mean": 0.806640625, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "step": 215 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.4616845902163285e-09, "advantages/std": 0.5674909949302673, "advantages/var": 0.3220460293269447, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 1.2408602150537635, "grad_norm": 0.10541414157657623, "learning_rate": 1.716562866343792e-06, "loss": -0.0, "num_tokens": 82953297.0, "reward": 0.810546875, "reward_std": 0.10854348540306091, "rewards/drgrpo_math_reward/mean": 0.810546875, "rewards/drgrpo_math_reward/std": 0.3922513723373413, "step": 216 }, { "advantages/mean": 5.122274160385132e-09, "advantages/snr": 8.641663558617698e-09, "advantages/std": 0.5927416682243347, "advantages/var": 0.3513426852493673, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5546875, "epoch": 1.246594982078853, "grad_norm": 0.10613394268696186, "learning_rate": 1.7140538741743761e-06, "loss": -0.0, "num_tokens": 83287525.0, "reward": 0.732421875, "reward_std": 0.1317683309316635, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "step": 217 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 5.283169087461682e-09, "advantages/std": 0.6169836521148682, "advantages/var": 0.38066882697700066, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.2523297491039427, "grad_norm": 0.18363264943966723, "learning_rate": 1.7115356772092855e-06, "loss": 0.0, "num_tokens": 83593222.0, "reward": 0.80859375, "reward_std": 0.16916778683662415, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.3937928080558777, "step": 218 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 1.5308064329270049e-09, "advantages/std": 0.684435248374939, "advantages/var": 0.4684516092180644, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8046875, "epoch": 1.2580645161290323, "grad_norm": 0.14972630520833002, "learning_rate": 1.7090083079103398e-06, "loss": -0.0, "num_tokens": 83956841.0, "reward": 0.716796875, "reward_std": 0.16752377152442932, "rewards/drgrpo_math_reward/mean": 0.716796875, "rewards/drgrpo_math_reward/std": 0.4509948492050171, "step": 219 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.509501527011544e-09, "advantages/std": 0.6169735789299011, "advantages/var": 0.38065639709757093, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6171875, "epoch": 1.2637992831541218, "grad_norm": 0.10774499555376024, "learning_rate": 1.7064717988576e-06, "loss": 0.0, "num_tokens": 84294960.0, "reward": 0.80078125, "reward_std": 0.16471639275550842, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "step": 220 }, { "advantages/mean": 1.7462298274040222e-09, "advantages/snr": 3.4015651457391856e-09, "advantages/std": 0.5133606791496277, "advantages/var": 0.263539186896967, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "epoch": 1.2695340501792114, "grad_norm": 0.11889023408621924, "learning_rate": 1.7039261827489448e-06, "loss": -0.0, "num_tokens": 84584816.0, "reward": 0.845703125, "reward_std": 0.12083513289690018, "rewards/drgrpo_math_reward/mean": 0.845703125, "rewards/drgrpo_math_reward/std": 0.36158639192581177, "step": 221 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.2052981517498e-10, "advantages/std": 0.5675129294395447, "advantages/var": 0.3220709250810536, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 1.2752688172043012, "grad_norm": 0.10400581914675953, "learning_rate": 1.7013714923996524e-06, "loss": 0.0, "num_tokens": 84940732.0, "reward": 0.671875, "reward_std": 0.1220255121588707, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4699897766113281, "step": 222 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.8863650839346726e-09, "advantages/std": 0.48399415612220764, "advantages/var": 0.2342503431604479, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4921875, "epoch": 1.2810035842293908, "grad_norm": 0.09338526706615954, "learning_rate": 1.6988077607419752e-06, "loss": 0.0, "num_tokens": 85311837.0, "reward": 0.658203125, "reward_std": 0.10242104530334473, "rewards/drgrpo_math_reward/mean": 0.658203125, "rewards/drgrpo_math_reward/std": 0.4747757613658905, "step": 223 }, { "advantages/mean": 5.820766091346741e-09, "advantages/snr": 9.434677124475458e-09, "advantages/std": 0.6169544458389282, "advantages/var": 0.380632788240419, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6328125, "epoch": 1.2867383512544803, "grad_norm": 0.10577161017698553, "learning_rate": 1.6962350208247167e-06, "loss": -0.0, "num_tokens": 85661568.0, "reward": 0.802734375, "reward_std": 0.15140779316425323, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "step": 224 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.44229712174148e-09, "advantages/std": 0.5411052703857422, "advantages/var": 0.29279491363922716, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 1.29247311827957, "grad_norm": 0.11875574904240402, "learning_rate": 1.6936533058128049e-06, "loss": 0.0, "num_tokens": 85972152.0, "reward": 0.771484375, "reward_std": 0.11509227007627487, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "step": 225 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 2.8106073788705395e-09, "advantages/std": 0.6627197861671448, "advantages/var": 0.4391975149774261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5234375, "epoch": 1.2982078853046595, "grad_norm": 0.13578877806891682, "learning_rate": 1.6910626489868648e-06, "loss": -0.0, "num_tokens": 86318612.0, "reward": 0.767578125, "reward_std": 0.17392629384994507, "rewards/drgrpo_math_reward/mean": 0.767578125, "rewards/drgrpo_math_reward/std": 0.42278963327407837, "step": 226 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.988950628733241e-09, "advantages/std": 0.5133618712425232, "advantages/var": 0.26354041084562496, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 1.303942652329749, "grad_norm": 0.10376306260311771, "learning_rate": 1.6884630837427888e-06, "loss": -0.0, "num_tokens": 86652473.0, "reward": 0.7890625, "reward_std": 0.11784161627292633, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4083731174468994, "step": 227 }, { "advantages/mean": 1.979060471057892e-09, "advantages/snr": 3.4870532137228623e-09, "advantages/std": 0.5675452351570129, "advantages/var": 0.3221075939494291, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 1.3096774193548386, "grad_norm": 0.11089030384660872, "learning_rate": 1.685854643591308e-06, "loss": -0.0, "num_tokens": 86987538.0, "reward": 0.830078125, "reward_std": 0.14660954475402832, "rewards/drgrpo_math_reward/mean": 0.830078125, "rewards/drgrpo_math_reward/std": 0.3759314715862274, "step": 228 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.547490864636814e-10, "advantages/std": 0.616974949836731, "advantages/var": 0.3806580887260367, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5703125, "epoch": 1.3154121863799282, "grad_norm": 0.13452286867989058, "learning_rate": 1.6832373621575581e-06, "loss": -0.0, "num_tokens": 87345584.0, "reward": 0.720703125, "reward_std": 0.1643172949552536, "rewards/drgrpo_math_reward/mean": 0.720703125, "rewards/drgrpo_math_reward/std": 0.44909247756004333, "step": 229 }, { "advantages/mean": -3.14321368932724e-09, "advantages/snr": 4.9094397704003384e-09, "advantages/std": 0.6402387619018555, "advantages/var": 0.4099056722416208, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 1.321146953405018, "grad_norm": 0.12065475955406939, "learning_rate": 1.6806112731806471e-06, "loss": -0.0, "num_tokens": 87712890.0, "reward": 0.748046875, "reward_std": 0.15587902069091797, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "step": 230 }, { "advantages/mean": 3.14321368932724e-09, "advantages/snr": 4.909346989645819e-09, "advantages/std": 0.6402508616447449, "advantages/var": 0.40992116583683824, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6640625, "epoch": 1.3268817204301075, "grad_norm": 0.14713488219513188, "learning_rate": 1.677976410513221e-06, "loss": -0.0, "num_tokens": 88036204.0, "reward": 0.767578125, "reward_std": 0.16333986818790436, "rewards/drgrpo_math_reward/mean": 0.767578125, "rewards/drgrpo_math_reward/std": 0.42278963327407837, "step": 231 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.3566045311990682e-09, "advantages/std": 0.592795193195343, "advantages/var": 0.35140614107550405, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 1.3326164874551971, "grad_norm": 0.12236957050780332, "learning_rate": 1.6753328081210244e-06, "loss": -0.0, "num_tokens": 88377468.0, "reward": 0.740234375, "reward_std": 0.17170454561710358, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "step": 232 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.810691119189989e-10, "advantages/std": 0.4839858412742615, "advantages/var": 0.23424229455395462, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6484375, "epoch": 1.3383512544802867, "grad_norm": 0.09100034420982667, "learning_rate": 1.672680500082467e-06, "loss": -0.0, "num_tokens": 88714912.0, "reward": 0.75390625, "reward_std": 0.09720437228679657, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "step": 233 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 9.819764140889662e-10, "advantages/std": 0.5927602648735046, "advantages/var": 0.3513647316129074, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 1.3440860215053765, "grad_norm": 0.1254882832658587, "learning_rate": 1.6700195205881811e-06, "loss": -0.0, "num_tokens": 89066352.0, "reward": 0.76953125, "reward_std": 0.14567705988883972, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.42154473066329956, "step": 234 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.621424895284137e-10, "advantages/std": 0.48398369550704956, "advantages/var": 0.23424021751666046, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7421875, "epoch": 1.349820788530466, "grad_norm": 0.08781737003905032, "learning_rate": 1.667349903940582e-06, "loss": -0.0, "num_tokens": 89378668.0, "reward": 0.8203125, "reward_std": 0.09611941128969193, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38430243730545044, "step": 235 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.0615190173497912e-09, "advantages/std": 0.6844562292098999, "advantages/var": 0.46848032970423503, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 1.3555555555555556, "grad_norm": 0.156887010897483, "learning_rate": 1.6646716845534256e-06, "loss": 0.0, "num_tokens": 89703790.0, "reward": 0.75390625, "reward_std": 0.18814034759998322, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "step": 236 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 2.3637020084117984e-09, "advantages/std": 0.640266478061676, "advantages/var": 0.40994116292950267, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 1.3612903225806452, "grad_norm": 0.12184794404851976, "learning_rate": 1.661984896951365e-06, "loss": 0.0, "num_tokens": 90043631.0, "reward": 0.693359375, "reward_std": 0.17868411540985107, "rewards/drgrpo_math_reward/mean": 0.693359375, "rewards/drgrpo_math_reward/std": 0.4615498185157776, "step": 237 }, { "advantages/mean": 2.6775524020195007e-09, "advantages/snr": 4.040107848228005e-09, "advantages/std": 0.6627427935600281, "advantages/var": 0.43922801041575, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6640625, "epoch": 1.3670250896057348, "grad_norm": 0.13810493233753124, "learning_rate": 1.6592895757695052e-06, "loss": 0.0, "num_tokens": 90372461.0, "reward": 0.734375, "reward_std": 0.1929323822259903, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44209739565849304, "step": 238 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7211491296425828e-09, "advantages/std": 0.5411050915718079, "advantages/var": 0.29279472012493457, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 1.3727598566308243, "grad_norm": 0.1195543824573462, "learning_rate": 1.6565857557529564e-06, "loss": -0.0, "num_tokens": 90709076.0, "reward": 0.689453125, "reward_std": 0.11443497240543365, "rewards/drgrpo_math_reward/mean": 0.689453125, "rewards/drgrpo_math_reward/std": 0.46317005157470703, "step": 239 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.6923740854357398e-09, "advantages/std": 0.5675144791603088, "advantages/var": 0.3220726840565966, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.378494623655914, "grad_norm": 0.11978689375308976, "learning_rate": 1.653873471756387e-06, "loss": -0.0, "num_tokens": 91014076.0, "reward": 0.82421875, "reward_std": 0.12686356902122498, "rewards/drgrpo_math_reward/mean": 0.82421875, "rewards/drgrpo_math_reward/std": 0.3810062110424042, "step": 240 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 4.329563888276925e-09, "advantages/std": 0.483992338180542, "advantages/var": 0.23424858341746813, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 1.3842293906810035, "grad_norm": 0.09668002730261871, "learning_rate": 1.6511527587435735e-06, "loss": -0.0, "num_tokens": 91308978.0, "reward": 0.802734375, "reward_std": 0.10006578266620636, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "step": 241 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7211654345952558e-09, "advantages/std": 0.5410999655723572, "advantages/var": 0.2927891727424061, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 1.3899641577060933, "grad_norm": 0.11978959048312579, "learning_rate": 1.6484236517869496e-06, "loss": -0.0, "num_tokens": 91645355.0, "reward": 0.77734375, "reward_std": 0.10975531488656998, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41643625497817993, "step": 242 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 6.8031042252281955e-09, "advantages/std": 0.5133626461029053, "advantages/var": 0.26354120641377676, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 1.3956989247311828, "grad_norm": 0.11021201732188858, "learning_rate": 1.645686186067155e-06, "loss": -0.0, "num_tokens": 91972936.0, "reward": 0.79296875, "reward_std": 0.11602336168289185, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40557438135147095, "step": 243 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 5.660876278326729e-10, "advantages/std": 0.616946816444397, "advantages/var": 0.38062337432087645, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 1.4014336917562724, "grad_norm": 0.12693376456130956, "learning_rate": 1.642940396872581e-06, "loss": -0.0, "num_tokens": 92284566.0, "reward": 0.8515625, "reward_std": 0.14205874502658844, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.35588082671165466, "step": 244 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.1783259613716923e-09, "advantages/std": 0.5927832722663879, "advantages/var": 0.3513920078788466, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7578125, "epoch": 1.407168458781362, "grad_norm": 0.13008023128536025, "learning_rate": 1.640186319598916e-06, "loss": -0.0, "num_tokens": 92589945.0, "reward": 0.798828125, "reward_std": 0.1603584885597229, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "step": 245 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 9.43414757590889e-10, "advantages/std": 0.6169890761375427, "advantages/var": 0.3806755200730585, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 1.4129032258064516, "grad_norm": 0.1093101952621721, "learning_rate": 1.6374239897486897e-06, "loss": 0.0, "num_tokens": 92917612.0, "reward": 0.740234375, "reward_std": 0.1752377450466156, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "step": 246 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.547386598039818e-10, "advantages/std": 0.6169834733009338, "advantages/var": 0.38066860632648414, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 1.4186379928315414, "grad_norm": 0.12908774109218704, "learning_rate": 1.6346534429308141e-06, "loss": -0.0, "num_tokens": 93230679.0, "reward": 0.8046875, "reward_std": 0.1726028323173523, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "step": 247 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.5355671185058926e-10, "advantages/std": 0.5133440494537354, "advantages/var": 0.2635221131095591, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 1.424372759856631, "grad_norm": 0.0956546402740197, "learning_rate": 1.6318747148601257e-06, "loss": 0.0, "num_tokens": 93547156.0, "reward": 0.78125, "reward_std": 0.10622049123048782, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41380295157432556, "step": 248 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.019088804540415e-09, "advantages/std": 0.6169560551643372, "advantages/var": 0.38063477400394063, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 1.4301075268817205, "grad_norm": 0.1324576934358323, "learning_rate": 1.6290878413569251e-06, "loss": -0.0, "num_tokens": 93862895.0, "reward": 0.8203125, "reward_std": 0.1507083922624588, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38430243730545044, "step": 249 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.9640203700520696e-09, "advantages/std": 0.5927398800849915, "advantages/var": 0.35134056544317005, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.43584229390681, "grad_norm": 0.12693635524342287, "learning_rate": 1.6262928583465141e-06, "loss": 0.0, "num_tokens": 94182526.0, "reward": 0.70703125, "reward_std": 0.13044346868991852, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.455569326877594, "step": 250 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 6.023510386753999e-09, "advantages/std": 0.5411510467529297, "advantages/var": 0.2928444554017915, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6015625, "epoch": 1.4415770609318996, "grad_norm": 0.10718139478731144, "learning_rate": 1.6234898018587336e-06, "loss": 0.0, "num_tokens": 94505852.0, "reward": 0.685546875, "reward_std": 0.148565411567688, "rewards/drgrpo_math_reward/mean": 0.685546875, "rewards/drgrpo_math_reward/std": 0.4647517800331116, "step": 251 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.320905644247716e-09, "advantages/std": 0.5927315354347229, "advantages/var": 0.35133067309880417, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9140625, "epoch": 1.4473118279569892, "grad_norm": 0.13562192546429738, "learning_rate": 1.620678708027499e-06, "loss": -0.0, "num_tokens": 94801238.0, "reward": 0.830078125, "reward_std": 0.12356997281312943, "rewards/drgrpo_math_reward/mean": 0.830078125, "rewards/drgrpo_math_reward/std": 0.3759314715862274, "step": 252 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.810576778213242e-10, "advantages/std": 0.4839973449707031, "advantages/var": 0.2342534299386898, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 1.4530465949820788, "grad_norm": 0.09921367049420339, "learning_rate": 1.6178596130903343e-06, "loss": -0.0, "num_tokens": 95089221.0, "reward": 0.876953125, "reward_std": 0.10134430229663849, "rewards/drgrpo_math_reward/mean": 0.876953125, "rewards/drgrpo_math_reward/std": 0.32881227135658264, "step": 253 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.535737752088846e-10, "advantages/std": 0.5133247375488281, "advantages/var": 0.2635022861795733, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 1.4587813620071683, "grad_norm": 0.09839599796375287, "learning_rate": 1.615032553387905e-06, "loss": -0.0, "num_tokens": 95401324.0, "reward": 0.759765625, "reward_std": 0.09633205831050873, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "step": 254 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.360621779654249e-09, "advantages/std": 0.3422415256500244, "advantages/var": 0.11712926187925632, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 1.4645161290322581, "grad_norm": 0.08417996652092524, "learning_rate": 1.6121975653635488e-06, "loss": -0.0, "num_tokens": 95673457.0, "reward": 0.853515625, "reward_std": 0.05193261057138443, "rewards/drgrpo_math_reward/mean": 0.853515625, "rewards/drgrpo_math_reward/std": 0.35393697023391724, "step": 255 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.1818541999072432e-09, "advantages/std": 0.6402736902236938, "advantages/var": 0.40995039839266667, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 1.4702508960573477, "grad_norm": 0.11978488193094501, "learning_rate": 1.6093546855628081e-06, "loss": -0.0, "num_tokens": 96000510.0, "reward": 0.71875, "reward_std": 0.18044257164001465, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45004892349243164, "step": 256 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.356762589421316e-09, "advantages/std": 0.5927554368972778, "advantages/var": 0.3513590079712827, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 1.4759856630824373, "grad_norm": 0.15554104851634354, "learning_rate": 1.6065039506329559e-06, "loss": -0.0, "num_tokens": 96288522.0, "reward": 0.841796875, "reward_std": 0.1409113109111786, "rewards/drgrpo_math_reward/mean": 0.841796875, "rewards/drgrpo_math_reward/std": 0.36528825759887695, "step": 257 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.0909627476506956e-09, "advantages/std": 0.6402527689933777, "advantages/var": 0.40992360820368745, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6015625, "epoch": 1.4817204301075269, "grad_norm": 0.1124054300875447, "learning_rate": 1.6036453973225256e-06, "loss": 0.0, "num_tokens": 96605712.0, "reward": 0.75, "reward_std": 0.16642636060714722, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.43343618512153625, "step": 258 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 1.936265683458883e-09, "advantages/std": 0.5411126613616943, "advantages/var": 0.2928029122859357, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "epoch": 1.4874551971326164, "grad_norm": 0.0927318028258141, "learning_rate": 1.6007790624808365e-06, "loss": 0.0, "num_tokens": 96916524.0, "reward": 0.75, "reward_std": 0.11971767991781235, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.43343618512153625, "step": 259 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.1818799957085062e-09, "advantages/std": 0.6402661204338074, "advantages/var": 0.4099407049753587, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.390625, "epoch": 1.4931899641577062, "grad_norm": 0.12873288090331472, "learning_rate": 1.5979049830575188e-06, "loss": 0.0, "num_tokens": 97289662.0, "reward": 0.689453125, "reward_std": 0.1769224852323532, "rewards/drgrpo_math_reward/mean": 0.689453125, "rewards/drgrpo_math_reward/std": 0.46317005157470703, "step": 260 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.264264540309556e-09, "advantages/std": 0.6169702410697937, "advantages/var": 0.38065227836571935, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.4989247311827958, "grad_norm": 0.13646443362459426, "learning_rate": 1.595023196102037e-06, "loss": -0.0, "num_tokens": 97586152.0, "reward": 0.900390625, "reward_std": 0.16071709990501404, "rewards/drgrpo_math_reward/mean": 0.900390625, "rewards/drgrpo_math_reward/std": 0.29977133870124817, "step": 261 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.1819039640075217e-09, "advantages/std": 0.6402590870857239, "advantages/var": 0.40993169859584455, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.265625, "epoch": 1.5046594982078854, "grad_norm": 0.12095215698111506, "learning_rate": 1.5921337387632133e-06, "loss": 0.0, "num_tokens": 97953428.0, "reward": 0.7265625, "reward_std": 0.17425240576267242, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.4461594223976135, "step": 262 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.0204684823417303e-09, "advantages/std": 0.6844816207885742, "advantages/var": 0.4685150891973535, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "epoch": 1.510394265232975, "grad_norm": 0.12969155515976394, "learning_rate": 1.589236648288747e-06, "loss": -0.0, "num_tokens": 98303480.0, "reward": 0.73828125, "reward_std": 0.2113167941570282, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "step": 263 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.070973091997636e-10, "advantages/std": 0.513353168964386, "advantages/var": 0.2635314760857774, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.546875, "epoch": 1.5161290322580645, "grad_norm": 0.11123808459796197, "learning_rate": 1.5863319620247364e-06, "loss": 0.0, "num_tokens": 98625084.0, "reward": 0.775390625, "reward_std": 0.11332826316356659, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "step": 264 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 2.0512885590636562e-10, "advantages/std": 0.5675228834152222, "advantages/var": 0.32208222319992785, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.484375, "epoch": 1.521863799283154, "grad_norm": 0.12638654065468802, "learning_rate": 1.5834197174151957e-06, "loss": 0.0, "num_tokens": 98961075.0, "reward": 0.6953125, "reward_std": 0.1302509605884552, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4607250988483429, "step": 265 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 3.6300257734247797e-09, "advantages/std": 0.7055423855781555, "advantages/var": 0.49779005784731467, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5390625, "epoch": 1.5275985663082436, "grad_norm": 0.15321337018896833, "learning_rate": 1.5804999520015733e-06, "loss": -0.0, "num_tokens": 99308149.0, "reward": 0.662109375, "reward_std": 0.21978893876075745, "rewards/drgrpo_math_reward/mean": 0.662109375, "rewards/drgrpo_math_reward/std": 0.4734536409378052, "step": 266 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.3673624827691494e-09, "advantages/std": 0.4840032756328583, "advantages/var": 0.23425917082333658, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8828125, "epoch": 1.5333333333333332, "grad_norm": 0.13292427647956506, "learning_rate": 1.5775727034222674e-06, "loss": 0.0, "num_tokens": 99587432.0, "reward": 0.82421875, "reward_std": 0.10298692435026169, "rewards/drgrpo_math_reward/mean": 0.82421875, "rewards/drgrpo_math_reward/std": 0.3810062110424042, "step": 267 }, { "advantages/mean": 2.9103830456733704e-09, "advantages/snr": 5.669514852808519e-09, "advantages/std": 0.5133389830589294, "advantages/var": 0.26351691152797585, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6171875, "epoch": 1.5390681003584228, "grad_norm": 0.10246467822109713, "learning_rate": 1.5746380094121409e-06, "loss": -0.0, "num_tokens": 99896539.0, "reward": 0.798828125, "reward_std": 0.10248804092407227, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "step": 268 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.605355109142449e-10, "advantages/std": 0.5411296486854553, "advantages/var": 0.2928212966864443, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 1.5448028673835126, "grad_norm": 0.13369139053348553, "learning_rate": 1.5716959078020354e-06, "loss": -0.0, "num_tokens": 100204602.0, "reward": 0.890625, "reward_std": 0.12988576292991638, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31241437792778015, "step": 269 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 2.3665224427940754e-09, "advantages/std": 0.5411182641983032, "advantages/var": 0.2928089758489847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.28125, "epoch": 1.5505376344086022, "grad_norm": 0.10862156057212859, "learning_rate": 1.5687464365182819e-06, "loss": 0.0, "num_tokens": 100573673.0, "reward": 0.68359375, "reward_std": 0.12546055018901825, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.46552830934524536, "step": 270 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.205449828633859e-10, "advantages/std": 0.5675024390220642, "advantages/var": 0.3220590182959917, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.5562724014336917, "grad_norm": 0.10918909185356437, "learning_rate": 1.5657896335822145e-06, "loss": -0.0, "num_tokens": 100888955.0, "reward": 0.794921875, "reward_std": 0.11804604530334473, "rewards/drgrpo_math_reward/mean": 0.794921875, "rewards/drgrpo_math_reward/std": 0.4041535556316376, "step": 271 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.581628144299183e-09, "advantages/std": 0.5411251187324524, "advantages/var": 0.2928163941232107, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7578125, "epoch": 1.5620071684587815, "grad_norm": 0.14669506533301582, "learning_rate": 1.5628255371096781e-06, "loss": -0.0, "num_tokens": 101202559.0, "reward": 0.783203125, "reward_std": 0.1296921968460083, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "step": 272 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.542863428346356e-09, "advantages/std": 0.4527243971824646, "advantages/var": 0.20495937980422596, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6640625, "epoch": 1.567741935483871, "grad_norm": 0.12734598943828992, "learning_rate": 1.5598541853105384e-06, "loss": 0.0, "num_tokens": 101512061.0, "reward": 0.798828125, "reward_std": 0.08093452453613281, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "step": 273 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.535084520450849e-09, "advantages/std": 0.5927653908729553, "advantages/var": 0.3513708086167675, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5703125, "epoch": 1.5734767025089607, "grad_norm": 0.10954190322824714, "learning_rate": 1.556875616488188e-06, "loss": 0.0, "num_tokens": 101860888.0, "reward": 0.693359375, "reward_std": 0.14845240116119385, "rewards/drgrpo_math_reward/mean": 0.693359375, "rewards/drgrpo_math_reward/std": 0.4615498185157776, "step": 274 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.44317703235199e-09, "advantages/std": 0.4839960038661957, "advantages/var": 0.2342521317584465, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7265625, "epoch": 1.5792114695340502, "grad_norm": 0.10920229025532756, "learning_rate": 1.553889869039054e-06, "loss": -0.0, "num_tokens": 102187891.0, "reward": 0.74609375, "reward_std": 0.10270209610462189, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "step": 275 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 3.864592097531673e-09, "advantages/std": 0.6627185940742493, "advantages/var": 0.4391959349317496, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 1.5849462365591398, "grad_norm": 0.12656252860542871, "learning_rate": 1.5508969814521024e-06, "loss": -0.0, "num_tokens": 102533110.0, "reward": 0.83203125, "reward_std": 0.1728019416332245, "rewards/drgrpo_math_reward/mean": 0.83203125, "rewards/drgrpo_math_reward/std": 0.374204158782959, "step": 276 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 2.0512187593391272e-10, "advantages/std": 0.5675421953201294, "advantages/var": 0.3221041434687919, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 1.5906810035842294, "grad_norm": 0.12892846319959114, "learning_rate": 1.5478969923083417e-06, "loss": 0.0, "num_tokens": 102833839.0, "reward": 0.7578125, "reward_std": 0.147240549325943, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "step": 277 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.332811143021943e-09, "advantages/std": 0.4191608130931854, "advantages/var": 0.17569578723294033, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 1.596415770609319, "grad_norm": 0.10514370212106776, "learning_rate": 1.5448899402803264e-06, "loss": 0.0, "num_tokens": 103125233.0, "reward": 0.8359375, "reward_std": 0.0820918083190918, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.37069445848464966, "step": 278 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.272841930556052e-09, "advantages/std": 0.6402618288993835, "advantages/var": 0.4099352095455835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 1.6021505376344085, "grad_norm": 0.17932125159917675, "learning_rate": 1.5418758641316572e-06, "loss": -0.0, "num_tokens": 103392415.0, "reward": 0.78515625, "reward_std": 0.17716015875339508, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4111155867576599, "step": 279 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.2908526047526143e-09, "advantages/std": 0.5411089658737183, "advantages/var": 0.2927989129489248, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 1.607885304659498, "grad_norm": 0.1229624013003741, "learning_rate": 1.5388548027164822e-06, "loss": -0.0, "num_tokens": 103673196.0, "reward": 0.849609375, "reward_std": 0.11839379370212555, "rewards/drgrpo_math_reward/mean": 0.849609375, "rewards/drgrpo_math_reward/std": 0.35780346393585205, "step": 280 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.5351069148987546e-09, "advantages/std": 0.5927616357803345, "advantages/var": 0.3513663568529779, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.6136200716845877, "grad_norm": 0.15384274136793785, "learning_rate": 1.5358267949789964e-06, "loss": 0.0, "num_tokens": 103948854.0, "reward": 0.8515625, "reward_std": 0.14453192055225372, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.35588082671165466, "step": 281 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.855572759201336e-10, "advantages/std": 0.5927782654762268, "advantages/var": 0.351386072021004, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.6193548387096774, "grad_norm": 0.12660588676294837, "learning_rate": 1.532791879952939e-06, "loss": -0.0, "num_tokens": 104257681.0, "reward": 0.76953125, "reward_std": 0.15902778506278992, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.42154473066329956, "step": 282 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.512648095506594e-09, "advantages/std": 0.5675463676452637, "advantages/var": 0.3221088794273328, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 1.625089605734767, "grad_norm": 0.1321244018164173, "learning_rate": 1.5297500967610891e-06, "loss": 0.0, "num_tokens": 104551086.0, "reward": 0.720703125, "reward_std": 0.15144036710262299, "rewards/drgrpo_math_reward/mean": 0.720703125, "rewards/drgrpo_math_reward/std": 0.44909247756004333, "step": 283 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 2.2676874538895522e-10, "advantages/std": 0.5133658051490784, "advantages/var": 0.2635444498963615, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.6308243727598566, "grad_norm": 0.1153286910805719, "learning_rate": 1.5267014846147645e-06, "loss": 0.0, "num_tokens": 104851390.0, "reward": 0.8203125, "reward_std": 0.12236613780260086, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38430243730545044, "step": 284 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.7737516300790627e-10, "advantages/std": 0.6169739365577698, "advantages/var": 0.3806568383915909, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 1.6365591397849464, "grad_norm": 0.17301525240106697, "learning_rate": 1.5236460828133134e-06, "loss": -0.0, "num_tokens": 105181988.0, "reward": 0.72265625, "reward_std": 0.16404348611831665, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4481254518032074, "step": 285 }, { "advantages/mean": 3.4924596548080444e-09, "advantages/snr": 5.102414391245492e-09, "advantages/std": 0.6844719648361206, "advantages/var": 0.4685018706466195, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6640625, "epoch": 1.642293906810036, "grad_norm": 0.167075688925522, "learning_rate": 1.5205839307436086e-06, "loss": -0.0, "num_tokens": 105524388.0, "reward": 0.68359375, "reward_std": 0.20257321000099182, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.46552830934524536, "step": 286 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.28217165803688e-09, "advantages/std": 0.5675038695335388, "advantages/var": 0.32206064193553985, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 1.6480286738351255, "grad_norm": 0.153258601415096, "learning_rate": 1.5175150678795402e-06, "loss": -0.0, "num_tokens": 105839505.0, "reward": 0.74609375, "reward_std": 0.1155911535024643, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "step": 287 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.4054086576411598e-09, "advantages/std": 0.4839731454849243, "advantages/var": 0.23423000555057172, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 1.653763440860215, "grad_norm": 0.09923316072742916, "learning_rate": 1.5144395337815063e-06, "loss": 0.0, "num_tokens": 106125337.0, "reward": 0.802734375, "reward_std": 0.08879336714744568, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "step": 288 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.163550907512668e-09, "advantages/std": 0.5410942435264587, "advantages/var": 0.29278298037747064, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 1.6594982078853047, "grad_norm": 0.46373383888926234, "learning_rate": 1.5113573680959038e-06, "loss": -0.0, "num_tokens": 106411559.0, "reward": 0.794921875, "reward_std": 0.1038198471069336, "rewards/drgrpo_math_reward/mean": 0.794921875, "rewards/drgrpo_math_reward/std": 0.4041535556316376, "step": 289 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 2.405278169797869e-10, "advantages/std": 0.4839994013309479, "advantages/var": 0.23425542048871595, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.359375, "epoch": 1.6652329749103942, "grad_norm": 0.10019241389098878, "learning_rate": 1.5082686105546159e-06, "loss": -0.0, "num_tokens": 106751002.0, "reward": 0.609375, "reward_std": 0.10332095623016357, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.48836761713027954, "step": 290 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 2.9091451620567432e-09, "advantages/std": 0.640272319316864, "advantages/var": 0.4099486428833963, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 1.6709677419354838, "grad_norm": 0.16276724216070787, "learning_rate": 1.5051733009745012e-06, "loss": 0.0, "num_tokens": 107095780.0, "reward": 0.658203125, "reward_std": 0.1826443374156952, "rewards/drgrpo_math_reward/mean": 0.658203125, "rewards/drgrpo_math_reward/std": 0.4747757613658905, "step": 291 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 5.2917462196001055e-09, "advantages/std": 0.48398712277412415, "advantages/var": 0.23424353501117512, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7421875, "epoch": 1.6767025089605734, "grad_norm": 0.11189440637542936, "learning_rate": 1.5020714792568794e-06, "loss": 0.0, "num_tokens": 107377861.0, "reward": 0.833984375, "reward_std": 0.09562712162733078, "rewards/drgrpo_math_reward/mean": 0.833984375, "rewards/drgrpo_math_reward/std": 0.3724585771560669, "step": 292 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.535464428556472e-09, "advantages/std": 0.5133556723594666, "advantages/var": 0.26353404634363997, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8515625, "epoch": 1.682437275985663, "grad_norm": 0.09490895807177079, "learning_rate": 1.4989631853870184e-06, "loss": -0.0, "num_tokens": 107677668.0, "reward": 0.84375, "reward_std": 0.11300258338451385, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.36344730854034424, "step": 293 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.00011704931642e-09, "advantages/std": 0.6402655243873596, "advantages/var": 0.4099399417190206, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 1.6881720430107527, "grad_norm": 0.1545511979282254, "learning_rate": 1.4958484594336178e-06, "loss": 0.0, "num_tokens": 107992767.0, "reward": 0.755859375, "reward_std": 0.17813889682292938, "rewards/drgrpo_math_reward/mean": 0.755859375, "rewards/drgrpo_math_reward/std": 0.42999663949012756, "step": 294 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.621223463227536e-10, "advantages/std": 0.4839938282966614, "advantages/var": 0.23425002582925813, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.6939068100358423, "grad_norm": 0.08907490583735535, "learning_rate": 1.4927273415482915e-06, "loss": 0.0, "num_tokens": 108284892.0, "reward": 0.8515625, "reward_std": 0.10144393146038055, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.35588082671165466, "step": 295 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.273054515352864e-10, "advantages/std": 0.6402554512023926, "advantages/var": 0.4099270427943793, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8515625, "epoch": 1.6996415770609319, "grad_norm": 0.1715134421660095, "learning_rate": 1.4895998719650523e-06, "loss": 0.0, "num_tokens": 108608678.0, "reward": 0.7734375, "reward_std": 0.16998592019081116, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4190165400505066, "step": 296 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.2820720355862404e-09, "advantages/std": 0.5675210952758789, "advantages/var": 0.3220801935831332, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 1.7053763440860215, "grad_norm": 0.1458993534721859, "learning_rate": 1.4864660909997916e-06, "loss": 0.0, "num_tokens": 108901817.0, "reward": 0.7578125, "reward_std": 0.12776978313922882, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "step": 297 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.0285330311090656e-09, "advantages/std": 0.45274314284324646, "advantages/var": 0.20497635339158027, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7421875, "epoch": 1.7111111111111112, "grad_norm": 0.0869036102962838, "learning_rate": 1.4833260390497604e-06, "loss": -0.0, "num_tokens": 109221423.0, "reward": 0.865234375, "reward_std": 0.0935453474521637, "rewards/drgrpo_math_reward/mean": 0.865234375, "rewards/drgrpo_math_reward/std": 0.3418070077896118, "step": 298 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.621108532245012e-10, "advantages/std": 0.4839996099472046, "advantages/var": 0.23425562242904618, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4765625, "epoch": 1.7168458781362008, "grad_norm": 0.10039839094757973, "learning_rate": 1.4801797565930479e-06, "loss": -0.0, "num_tokens": 109538545.0, "reward": 0.76171875, "reward_std": 0.10272903740406036, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.42644867300987244, "step": 299 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 1.936211724226011e-09, "advantages/std": 0.5411277413368225, "advantages/var": 0.2928192324442911, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.3046875, "epoch": 1.7225806451612904, "grad_norm": 0.12522351931122214, "learning_rate": 1.4770272841880607e-06, "loss": -0.0, "num_tokens": 109866583.0, "reward": 0.734375, "reward_std": 0.13149690628051758, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44209739565849304, "step": 300 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 5.7435349757151235e-09, "advantages/std": 0.56753009557724, "advantages/var": 0.32209040938591116, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 1.72831541218638, "grad_norm": 0.12452469003826121, "learning_rate": 1.4738686624729987e-06, "loss": -0.0, "num_tokens": 110190807.0, "reward": 0.7890625, "reward_std": 0.13688018918037415, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4083731174468994, "step": 301 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 6.803630854449872e-10, "advantages/std": 0.34221526980400085, "advantages/var": 0.1171112908870251, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 1.7340501792114695, "grad_norm": 0.07188470897204434, "learning_rate": 1.4707039321653328e-06, "loss": 0.0, "num_tokens": 110481331.0, "reward": 0.806640625, "reward_std": 0.04125870764255524, "rewards/drgrpo_math_reward/mean": 0.806640625, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "step": 302 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.7565853756305945e-09, "advantages/std": 0.6627364754676819, "advantages/var": 0.4392196359153253, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 1.739784946236559, "grad_norm": 0.12514427175557807, "learning_rate": 1.467533134061278e-06, "loss": -0.0, "num_tokens": 110853971.0, "reward": 0.751953125, "reward_std": 0.18881280720233917, "rewards/drgrpo_math_reward/mean": 0.751953125, "rewards/drgrpo_math_reward/std": 0.4323015511035919, "step": 303 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 5.333219744067853e-09, "advantages/std": 0.5675367712974548, "advantages/var": 0.32209798677473955, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 1.7455197132616487, "grad_norm": 0.13749030743894858, "learning_rate": 1.4643563090352697e-06, "loss": -0.0, "num_tokens": 111194806.0, "reward": 0.7421875, "reward_std": 0.14183580875396729, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "step": 304 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 5.106289676298233e-09, "advantages/std": 0.59275883436203, "advantages/var": 0.35136303571423255, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 1.7512544802867382, "grad_norm": 0.14109350282348268, "learning_rate": 1.4611734980394356e-06, "loss": -0.0, "num_tokens": 111520429.0, "reward": 0.783203125, "reward_std": 0.14179056882858276, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "step": 305 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 5.4990424830678515e-09, "advantages/std": 0.5927630066871643, "advantages/var": 0.3513679820968072, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 1.7569892473118278, "grad_norm": 0.14079476246916528, "learning_rate": 1.4579847421030676e-06, "loss": 0.0, "num_tokens": 111845685.0, "reward": 0.81640625, "reward_std": 0.14485347270965576, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.3875311613082886, "step": 306 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.302871876153281e-10, "advantages/std": 0.5411052107810974, "advantages/var": 0.29279484913445586, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.390625, "epoch": 1.7627240143369176, "grad_norm": 0.1059171157531617, "learning_rate": 1.4547900823320929e-06, "loss": -0.0, "num_tokens": 112181135.0, "reward": 0.78125, "reward_std": 0.11411448568105698, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41380295157432556, "step": 307 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.749421437631947e-09, "advantages/std": 0.5927845239639282, "advantages/var": 0.351393491851141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.34375, "epoch": 1.7684587813620072, "grad_norm": 0.12806035309036248, "learning_rate": 1.451589559908545e-06, "loss": 0.0, "num_tokens": 112558563.0, "reward": 0.693359375, "reward_std": 0.1634243279695511, "rewards/drgrpo_math_reward/mean": 0.693359375, "rewards/drgrpo_math_reward/std": 0.4615498185157776, "step": 308 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.605649905986263e-10, "advantages/std": 0.5411111116409302, "advantages/var": 0.2928012351412832, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 1.7741935483870968, "grad_norm": 0.13217280124081415, "learning_rate": 1.4483832160900325e-06, "loss": -0.0, "num_tokens": 112835127.0, "reward": 0.90234375, "reward_std": 0.11795367300510406, "rewards/drgrpo_math_reward/mean": 0.90234375, "rewards/drgrpo_math_reward/std": 0.29713961482048035, "step": 309 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 1.5059324995642482e-09, "advantages/std": 0.541131317615509, "advantages/var": 0.2928231029042969, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.34375, "epoch": 1.7799283154121865, "grad_norm": 0.1042103884325726, "learning_rate": 1.4451710922092056e-06, "loss": -0.0, "num_tokens": 113190919.0, "reward": 0.689453125, "reward_std": 0.1346571296453476, "rewards/drgrpo_math_reward/mean": 0.689453125, "rewards/drgrpo_math_reward/std": 0.46317005157470703, "step": 310 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 5.593544897509335e-09, "advantages/std": 0.5411234498023987, "advantages/var": 0.2928145879260491, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.390625, "epoch": 1.7856630824372761, "grad_norm": 0.12275443138058066, "learning_rate": 1.4419532296732268e-06, "loss": 0.0, "num_tokens": 113533657.0, "reward": 0.736328125, "reward_std": 0.12753017246723175, "rewards/drgrpo_math_reward/mean": 0.736328125, "rewards/drgrpo_math_reward/std": 0.4410543739795685, "step": 311 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.1819086358559415e-09, "advantages/std": 0.640257716178894, "advantages/var": 0.40992994312661324, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5234375, "epoch": 1.7913978494623657, "grad_norm": 0.15468750709139886, "learning_rate": 1.4387296699632332e-06, "loss": -0.0, "num_tokens": 113869361.0, "reward": 0.62890625, "reward_std": 0.16922108829021454, "rewards/drgrpo_math_reward/mean": 0.62890625, "rewards/drgrpo_math_reward/std": 0.4835699498653412, "step": 312 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.1783940920257324e-09, "advantages/std": 0.5927489995956421, "advantages/var": 0.3513513765216345, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 1.7971326164874553, "grad_norm": 0.16376837511765685, "learning_rate": 1.4355004546338045e-06, "loss": -0.0, "num_tokens": 114189463.0, "reward": 0.80078125, "reward_std": 0.1358291357755661, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "step": 313 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.01902755367384e-09, "advantages/std": 0.61696857213974, "advantages/var": 0.38065021900814955, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6640625, "epoch": 1.8028673835125448, "grad_norm": 0.1460085788876577, "learning_rate": 1.4322656253124264e-06, "loss": 0.0, "num_tokens": 114540298.0, "reward": 0.701171875, "reward_std": 0.16086292266845703, "rewards/drgrpo_math_reward/mean": 0.701171875, "rewards/drgrpo_math_reward/std": 0.45819199085235596, "step": 314 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 1.5059658413307318e-09, "advantages/std": 0.5411193370819092, "advantages/var": 0.29281013696396485, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 1.8086021505376344, "grad_norm": 0.14776849230159933, "learning_rate": 1.4290252236989535e-06, "loss": -0.0, "num_tokens": 114851091.0, "reward": 0.8046875, "reward_std": 0.12191344797611237, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "step": 315 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.871784981132148e-09, "advantages/std": 0.567526638507843, "advantages/var": 0.3220864854160119, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 1.814336917562724, "grad_norm": 0.22498874566797478, "learning_rate": 1.4257792915650725e-06, "loss": -0.0, "num_tokens": 115173045.0, "reward": 0.712890625, "reward_std": 0.1328601837158203, "rewards/drgrpo_math_reward/mean": 0.712890625, "rewards/drgrpo_math_reward/std": 0.45285552740097046, "step": 316 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.5355139297977354e-10, "advantages/std": 0.5133500695228577, "advantages/var": 0.2635282938791228, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8828125, "epoch": 1.8200716845878135, "grad_norm": 0.10952010356827811, "learning_rate": 1.4225278707537638e-06, "loss": -0.0, "num_tokens": 115440658.0, "reward": 0.837890625, "reward_std": 0.11053494364023209, "rewards/drgrpo_math_reward/mean": 0.837890625, "rewards/drgrpo_math_reward/std": 0.3689115643501282, "step": 317 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 1.8709387157304823e-09, "advantages/std": 0.6844524145126343, "advantages/var": 0.46847510773217493, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 1.8258064516129031, "grad_norm": 0.1573726381917088, "learning_rate": 1.4192710031787617e-06, "loss": -0.0, "num_tokens": 115773261.0, "reward": 0.783203125, "reward_std": 0.18314877152442932, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "step": 318 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.5711609986453247e-09, "advantages/std": 0.5927607417106628, "advantages/var": 0.35136529691337515, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5859375, "epoch": 1.8315412186379927, "grad_norm": 0.14969333227473883, "learning_rate": 1.4160087308240133e-06, "loss": -0.0, "num_tokens": 116107792.0, "reward": 0.7890625, "reward_std": 0.1466420441865921, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4083731174468994, "step": 319 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 6.454107328405196e-10, "advantages/std": 0.5411220192909241, "advantages/var": 0.2928130397614872, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.8372759856630825, "grad_norm": 0.14928764023996713, "learning_rate": 1.4127410957431396e-06, "loss": -0.0, "num_tokens": 116384558.0, "reward": 0.841796875, "reward_std": 0.12439830601215363, "rewards/drgrpo_math_reward/mean": 0.841796875, "rewards/drgrpo_math_reward/std": 0.36528825759887695, "step": 320 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.0857542794618893e-09, "advantages/std": 0.45272037386894226, "advantages/var": 0.20495573691603486, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.843010752688172, "grad_norm": 0.10427236014288133, "learning_rate": 1.4094681400588907e-06, "loss": -0.0, "num_tokens": 116666484.0, "reward": 0.935546875, "reward_std": 0.0799964889883995, "rewards/drgrpo_math_reward/mean": 0.935546875, "rewards/drgrpo_math_reward/std": 0.24579854309558868, "step": 321 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.814170394739418e-09, "advantages/std": 0.5133600234985352, "advantages/var": 0.26353851372641657, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 1.8487455197132616, "grad_norm": 0.13244598189177256, "learning_rate": 1.4061899059626043e-06, "loss": 0.0, "num_tokens": 116966454.0, "reward": 0.79296875, "reward_std": 0.1180657371878624, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40557438135147095, "step": 322 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.106181851753312e-09, "advantages/std": 0.5927713513374329, "advantages/var": 0.35137787496640627, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 1.8544802867383514, "grad_norm": 0.1387511075687616, "learning_rate": 1.4029064357136626e-06, "loss": 0.0, "num_tokens": 117279017.0, "reward": 0.8046875, "reward_std": 0.1513465940952301, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "step": 323 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 2.1513892524061168e-10, "advantages/std": 0.5411169528961182, "advantages/var": 0.29280755671157976, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5234375, "epoch": 1.860215053763441, "grad_norm": 0.13286635356924328, "learning_rate": 1.3996177716389452e-06, "loss": 0.0, "num_tokens": 117580643.0, "reward": 0.8203125, "reward_std": 0.12401571124792099, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38430243730545044, "step": 324 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.071093160288001e-10, "advantages/std": 0.5133463740348816, "advantages/var": 0.26352449973476055, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 1.8659498207885306, "grad_norm": 0.14356777438603108, "learning_rate": 1.3963239561322857e-06, "loss": 0.0, "num_tokens": 117867676.0, "reward": 0.822265625, "reward_std": 0.10562324523925781, "rewards/drgrpo_math_reward/mean": 0.822265625, "rewards/drgrpo_math_reward/std": 0.3826628625392914, "step": 325 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 1.5059875723314338e-09, "advantages/std": 0.5411115288734436, "advantages/var": 0.2928016866797556, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6484375, "epoch": 1.8716845878136201, "grad_norm": 0.12649283681319096, "learning_rate": 1.3930250316539235e-06, "loss": 0.0, "num_tokens": 118176042.0, "reward": 0.794921875, "reward_std": 0.11861886829137802, "rewards/drgrpo_math_reward/mean": 0.794921875, "rewards/drgrpo_math_reward/std": 0.4041535556316376, "step": 326 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5133352875709534, "advantages/var": 0.2635131174655534, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 1.8774193548387097, "grad_norm": 0.1315548895379891, "learning_rate": 1.3897210407299583e-06, "loss": -0.0, "num_tokens": 118455024.0, "reward": 0.828125, "reward_std": 0.1000194400548935, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3776407241821289, "step": 327 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5410828590393066, "advantages/var": 0.2927706603461502, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7421875, "epoch": 1.8831541218637993, "grad_norm": 0.15034233758443546, "learning_rate": 1.386412025951799e-06, "loss": -0.0, "num_tokens": 118755956.0, "reward": 0.765625, "reward_std": 0.09888088703155518, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42402184009552, "step": 328 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.461610645607385e-09, "advantages/std": 0.5675080418586731, "advantages/var": 0.32206537757426545, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6953125, "epoch": 1.8888888888888888, "grad_norm": 0.16823986917712755, "learning_rate": 1.3830980299756188e-06, "loss": -0.0, "num_tokens": 119024324.0, "reward": 0.818359375, "reward_std": 0.12063602358102798, "rewards/drgrpo_math_reward/mean": 0.818359375, "rewards/drgrpo_math_reward/std": 0.38592514395713806, "step": 329 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 4.082278688104936e-09, "advantages/std": 0.5133103132247925, "advantages/var": 0.26348747766293457, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 1.8946236559139784, "grad_norm": 0.19994316675178933, "learning_rate": 1.379779095521801e-06, "loss": -0.0, "num_tokens": 119277898.0, "reward": 0.8984375, "reward_std": 0.08510598540306091, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.30236753821372986, "step": 330 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.886870346434056e-09, "advantages/std": 0.616975724697113, "advantages/var": 0.3806590448655278, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "epoch": 1.900358422939068, "grad_norm": 0.13909441501898573, "learning_rate": 1.3764552653743919e-06, "loss": -0.0, "num_tokens": 119656041.0, "reward": 0.693359375, "reward_std": 0.16647270321846008, "rewards/drgrpo_math_reward/mean": 0.693359375, "rewards/drgrpo_math_reward/std": 0.4615498185157776, "step": 331 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.5355228822652577e-10, "advantages/std": 0.5133490562438965, "advantages/var": 0.2635272535464992, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 1.9060931899641576, "grad_norm": 0.14183808921448746, "learning_rate": 1.3731265823805468e-06, "loss": -0.0, "num_tokens": 119971731.0, "reward": 0.8515625, "reward_std": 0.10679991543292999, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.35588082671165466, "step": 332 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.535504450752997e-10, "advantages/std": 0.5133511424064636, "advantages/var": 0.2635293954100213, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.46875, "epoch": 1.9118279569892473, "grad_norm": 0.15402012078670543, "learning_rate": 1.3697930894499784e-06, "loss": -0.0, "num_tokens": 120286333.0, "reward": 0.71875, "reward_std": 0.11217661201953888, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45004892349243164, "step": 333 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.205041347992218e-10, "advantages/std": 0.5675306916236877, "advantages/var": 0.32209108593486135, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 1.917562724014337, "grad_norm": 0.1858916013552339, "learning_rate": 1.3664548295544046e-06, "loss": 0.0, "num_tokens": 120615124.0, "reward": 0.798828125, "reward_std": 0.13676020503044128, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "step": 334 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.290891708454305e-09, "advantages/std": 0.541092574596405, "advantages/var": 0.29278117428336614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7421875, "epoch": 1.9232974910394265, "grad_norm": 0.1610278243203886, "learning_rate": 1.3631118457269927e-06, "loss": -0.0, "num_tokens": 120896694.0, "reward": 0.841796875, "reward_std": 0.10595491528511047, "rewards/drgrpo_math_reward/mean": 0.841796875, "rewards/drgrpo_math_reward/std": 0.36528825759887695, "step": 335 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.7210522542596562e-09, "advantages/std": 0.5411355495452881, "advantages/var": 0.29282768298168094, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8046875, "epoch": 1.9290322580645163, "grad_norm": 0.1492702474503473, "learning_rate": 1.359764181061807e-06, "loss": -0.0, "num_tokens": 121180709.0, "reward": 0.802734375, "reward_std": 0.13567635416984558, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "step": 336 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.4431588153749422e-09, "advantages/std": 0.48400211334228516, "advantages/var": 0.23425804571979825, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6796875, "epoch": 1.9347670250896059, "grad_norm": 0.12815595128064342, "learning_rate": 1.3564118787132506e-06, "loss": -0.0, "num_tokens": 121493730.0, "reward": 0.740234375, "reward_std": 0.10562227666378021, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "step": 337 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.5428622095684087e-09, "advantages/std": 0.45272475481033325, "advantages/var": 0.20495970361807636, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 1.9405017921146954, "grad_norm": 0.12492529304061448, "learning_rate": 1.353054981895512e-06, "loss": 0.0, "num_tokens": 121755985.0, "reward": 0.87109375, "reward_std": 0.08171838521957397, "rewards/drgrpo_math_reward/mean": 0.87109375, "rewards/drgrpo_math_reward/std": 0.33542385697364807, "step": 338 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 2.4053692580034996e-10, "advantages/std": 0.48398107290267944, "advantages/var": 0.23423767892802871, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.546875, "epoch": 1.946236559139785, "grad_norm": 0.10316417751658882, "learning_rate": 1.349693533882005e-06, "loss": 0.0, "num_tokens": 122088766.0, "reward": 0.75390625, "reward_std": 0.093304343521595, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "step": 339 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.0909367479607857e-09, "advantages/std": 0.6402680277824402, "advantages/var": 0.4099431474004156, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6796875, "epoch": 1.9519713261648746, "grad_norm": 0.15159061175268415, "learning_rate": 1.3463275780048135e-06, "loss": 0.0, "num_tokens": 122413845.0, "reward": 0.7109375, "reward_std": 0.18123199045658112, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45377036929130554, "step": 340 }, { "advantages/mean": 1.7462298274040222e-09, "advantages/snr": 2.83038591020189e-09, "advantages/std": 0.6169582009315491, "advantages/var": 0.3806374216966937, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 1.9577060931899641, "grad_norm": 0.12964869232946996, "learning_rate": 1.3429571576541314e-06, "loss": -0.0, "num_tokens": 122713382.0, "reward": 0.779296875, "reward_std": 0.1495577096939087, "rewards/drgrpo_math_reward/mean": 0.779296875, "rewards/drgrpo_math_reward/std": 0.4151262938976288, "step": 341 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.855936124625931e-10, "advantages/std": 0.5927508473396301, "advantages/var": 0.3513535670218495, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6796875, "epoch": 1.9634408602150537, "grad_norm": 0.14325207691426217, "learning_rate": 1.3395823162777038e-06, "loss": -0.0, "num_tokens": 123027629.0, "reward": 0.845703125, "reward_std": 0.13680529594421387, "rewards/drgrpo_math_reward/mean": 0.845703125, "rewards/drgrpo_math_reward/std": 0.36158639192581177, "step": 342 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.221902531244979e-09, "advantages/std": 0.41915544867515564, "advantages/var": 0.17569129015407103, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8515625, "epoch": 1.9691756272401433, "grad_norm": 0.14809693510218058, "learning_rate": 1.3362030973802669e-06, "loss": 0.0, "num_tokens": 123314909.0, "reward": 0.787109375, "reward_std": 0.07834453880786896, "rewards/drgrpo_math_reward/mean": 0.787109375, "rewards/drgrpo_math_reward/std": 0.409751296043396, "step": 343 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 1.7675490140895606e-09, "advantages/std": 0.5927631258964539, "advantages/var": 0.3513681234225352, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7421875, "epoch": 1.9749103942652328, "grad_norm": 0.12393137874472906, "learning_rate": 1.3328195445229867e-06, "loss": -0.0, "num_tokens": 123619678.0, "reward": 0.833984375, "reward_std": 0.14865149557590485, "rewards/drgrpo_math_reward/mean": 0.833984375, "rewards/drgrpo_math_reward/std": 0.3724585771560669, "step": 344 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 2.164875657150087e-09, "advantages/std": 0.48397138714790344, "advantages/var": 0.23422830357786584, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 1.9806451612903224, "grad_norm": 0.12931458953963446, "learning_rate": 1.329431701322898e-06, "loss": -0.0, "num_tokens": 123887192.0, "reward": 0.865234375, "reward_std": 0.08804762363433838, "rewards/drgrpo_math_reward/mean": 0.865234375, "rewards/drgrpo_math_reward/std": 0.3418070077896118, "step": 345 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.5350671027430816e-09, "advantages/std": 0.5927683115005493, "advantages/var": 0.35137427111921227, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7578125, "epoch": 1.9863799283154122, "grad_norm": 0.1531820928335527, "learning_rate": 1.3260396114523417e-06, "loss": 0.0, "num_tokens": 124183616.0, "reward": 0.837890625, "reward_std": 0.1491573452949524, "rewards/drgrpo_math_reward/mean": 0.837890625, "rewards/drgrpo_math_reward/std": 0.3689115643501282, "step": 346 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.178329278855848e-09, "advantages/std": 0.5927816033363342, "advantages/var": 0.3513900292539951, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6953125, "epoch": 1.9921146953405018, "grad_norm": 0.13652493583752662, "learning_rate": 1.322643318638403e-06, "loss": 0.0, "num_tokens": 124469529.0, "reward": 0.8046875, "reward_std": 0.15838181972503662, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "step": 347 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.96391767664292e-09, "advantages/std": 0.5927708745002747, "advantages/var": 0.35137730965582037, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 1.9978494623655914, "grad_norm": 0.13064603491803345, "learning_rate": 1.3192428666623462e-06, "loss": 0.0, "num_tokens": 124764139.0, "reward": 0.80078125, "reward_std": 0.1513998806476593, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "step": 348 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 1.1339208765175476e-09, "advantages/std": 0.51333087682724, "advantages/var": 0.26350858910422303, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.0057347670250896, "grad_norm": 0.12614095237860806, "learning_rate": 1.3158382993590506e-06, "loss": 0.0, "num_tokens": 125045936.0, "reward": 0.896484375, "reward_std": 0.09809703379869461, "rewards/drgrpo_math_reward/mean": 0.896484375, "rewards/drgrpo_math_reward/std": 0.30492907762527466, "step": 349 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.810606103489544e-10, "advantages/std": 0.48399439454078674, "advantages/var": 0.23425057394690274, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 2.011469534050179, "grad_norm": 0.12165930395096783, "learning_rate": 1.3124296606164462e-06, "loss": -0.0, "num_tokens": 125319870.0, "reward": 0.86328125, "reward_std": 0.10221564769744873, "rewards/drgrpo_math_reward/mean": 0.86328125, "rewards/drgrpo_math_reward/std": 0.3438861668109894, "step": 350 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 2.2563674127838317e-09, "advantages/std": 0.5675354599952698, "advantages/var": 0.32209649835204246, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8828125, "epoch": 2.0172043010752687, "grad_norm": 0.126047165461204, "learning_rate": 1.3090169943749473e-06, "loss": 0.0, "num_tokens": 125605488.0, "reward": 0.826171875, "reward_std": 0.14395257830619812, "rewards/drgrpo_math_reward/mean": 0.826171875, "rewards/drgrpo_math_reward/std": 0.3793322443962097, "step": 351 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.026322544695164e-10, "advantages/std": 0.6627382636070251, "advantages/var": 0.43922200604885475, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 2.0229390681003583, "grad_norm": 0.20069165337518377, "learning_rate": 1.3056003446268868e-06, "loss": -0.0, "num_tokens": 125902191.0, "reward": 0.8359375, "reward_std": 0.18451344966888428, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.37069445848464966, "step": 352 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.8142426463513973e-09, "advantages/std": 0.5133395791053772, "advantages/var": 0.2635175234760858, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 2.028673835125448, "grad_norm": 0.11621297082223309, "learning_rate": 1.302179755415948e-06, "loss": 0.0, "num_tokens": 126217658.0, "reward": 0.7265625, "reward_std": 0.1069074496626854, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.4461594223976135, "step": 353 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.071243776760643e-10, "advantages/std": 0.5133378505706787, "advantages/var": 0.26351574882852447, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 2.0344086021505374, "grad_norm": 0.1421985379687097, "learning_rate": 1.2987552708365974e-06, "loss": -0.0, "num_tokens": 126485706.0, "reward": 0.904296875, "reward_std": 0.10009176284074783, "rewards/drgrpo_math_reward/mean": 0.904296875, "rewards/drgrpo_math_reward/std": 0.2944713830947876, "step": 354 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.773838400865838e-10, "advantages/std": 0.6169597506523132, "advantages/var": 0.3806393339249645, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 2.0401433691756274, "grad_norm": 0.15419475534184002, "learning_rate": 1.2953269350335168e-06, "loss": -0.0, "num_tokens": 126814782.0, "reward": 0.740234375, "reward_std": 0.151978999376297, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "step": 355 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4527159631252289, "advantages/var": 0.2049517432684036, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6171875, "epoch": 2.045878136200717, "grad_norm": 0.11545760066300567, "learning_rate": 1.2918947922010336e-06, "loss": 0.0, "num_tokens": 127125910.0, "reward": 0.77734375, "reward_std": 0.07682153582572937, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41643625497817993, "step": 356 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.0285834055995543e-09, "advantages/std": 0.45272096991539, "advantages/var": 0.20495627660113147, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 2.0516129032258066, "grad_norm": 0.11379847124725691, "learning_rate": 1.28845888658255e-06, "loss": -0.0, "num_tokens": 127416604.0, "reward": 0.779296875, "reward_std": 0.07874394953250885, "rewards/drgrpo_math_reward/mean": 0.779296875, "rewards/drgrpo_math_reward/std": 0.4151262938976288, "step": 357 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.872421315736629e-09, "advantages/std": 0.5411280393600464, "advantages/var": 0.2928195549816479, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 2.057347670250896, "grad_norm": 0.16818337256172777, "learning_rate": 1.285019262469976e-06, "loss": -0.0, "num_tokens": 127746209.0, "reward": 0.81640625, "reward_std": 0.1281217485666275, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.3875311613082886, "step": 358 }, { "advantages/mean": 4.0745362639427185e-09, "advantages/snr": 6.604163606996613e-09, "advantages/std": 0.6169647574424744, "advantages/var": 0.3806455119260512, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6015625, "epoch": 2.0630824372759857, "grad_norm": 0.1536513245415294, "learning_rate": 1.2815759642031551e-06, "loss": -0.0, "num_tokens": 128089839.0, "reward": 0.744140625, "reward_std": 0.1569366753101349, "rewards/drgrpo_math_reward/mean": 0.744140625, "rewards/drgrpo_math_reward/std": 0.43676990270614624, "step": 359 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5133576989173889, "advantages/var": 0.26353612703775653, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9140625, "epoch": 2.0688172043010753, "grad_norm": 0.14895805857891264, "learning_rate": 1.2781290361692937e-06, "loss": -0.0, "num_tokens": 128374855.0, "reward": 0.80859375, "reward_std": 0.11426074802875519, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.3937928080558777, "step": 360 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.924278462022981e-09, "advantages/std": 0.4839853346347809, "advantages/var": 0.23424180414154083, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 2.074551971326165, "grad_norm": 0.12509254067759573, "learning_rate": 1.2746785228023901e-06, "loss": -0.0, "num_tokens": 128686585.0, "reward": 0.748046875, "reward_std": 0.09365837275981903, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "step": 361 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.5355813374807817e-10, "advantages/std": 0.5133424401283264, "advantages/var": 0.2635204608369044, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 2.0802867383512544, "grad_norm": 0.14619750882891186, "learning_rate": 1.27122446858266e-06, "loss": 0.0, "num_tokens": 128968781.0, "reward": 0.796875, "reward_std": 0.10654062032699585, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.4027182459831238, "step": 362 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.302820687302047e-10, "advantages/std": 0.5411116480827332, "advantages/var": 0.29280181569081165, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9140625, "epoch": 2.086021505376344, "grad_norm": 0.16779458530270636, "learning_rate": 1.2677669180359642e-06, "loss": 0.0, "num_tokens": 129247092.0, "reward": 0.75390625, "reward_std": 0.11874544620513916, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "step": 363 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.60522809679252e-10, "advantages/std": 0.5411376357078552, "advantages/var": 0.29282994077948743, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 2.0917562724014336, "grad_norm": 0.24393615469273242, "learning_rate": 1.2643059157332337e-06, "loss": 0.0, "num_tokens": 129566678.0, "reward": 0.76953125, "reward_std": 0.13831031322479248, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.42154473066329956, "step": 364 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.102558159619941e-09, "advantages/std": 0.5675255060195923, "advantages/var": 0.3220851999827943, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6796875, "epoch": 2.097491039426523, "grad_norm": 0.20214402099750756, "learning_rate": 1.2608415062898969e-06, "loss": -0.0, "num_tokens": 129878980.0, "reward": 0.69921875, "reward_std": 0.13258638978004456, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "step": 365 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 4.7135517212832565e-09, "advantages/std": 0.5927520990371704, "advantages/var": 0.3513550509129715, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 2.1032258064516127, "grad_norm": 0.18338200742363903, "learning_rate": 1.2573737343653023e-06, "loss": 0.0, "num_tokens": 130166936.0, "reward": 0.8515625, "reward_std": 0.136513352394104, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.35588082671165466, "step": 366 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.605476437873595e-10, "advantages/std": 0.5411220192909241, "advantages/var": 0.2928130397614872, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.1089605734767023, "grad_norm": 0.14782766652448337, "learning_rate": 1.2539026446621445e-06, "loss": 0.0, "num_tokens": 130459080.0, "reward": 0.892578125, "reward_std": 0.12517425417900085, "rewards/drgrpo_math_reward/mean": 0.892578125, "rewards/drgrpo_math_reward/std": 0.30995169281959534, "step": 367 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.2642654152997877e-09, "advantages/std": 0.6169700026512146, "advantages/var": 0.38065198417143975, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 2.1146953405017923, "grad_norm": 0.14662700353906374, "learning_rate": 1.2504282819258865e-06, "loss": -0.0, "num_tokens": 130799341.0, "reward": 0.73046875, "reward_std": 0.16007742285728455, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44415023922920227, "step": 368 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.071096320022777e-10, "advantages/std": 0.5133461952209473, "advantages/var": 0.2635243161478229, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7421875, "epoch": 2.120430107526882, "grad_norm": 0.13301482332663542, "learning_rate": 1.2469506909441838e-06, "loss": -0.0, "num_tokens": 131082580.0, "reward": 0.798828125, "reward_std": 0.10763143002986908, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "step": 369 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 6.45391822907754e-10, "advantages/std": 0.5411378741264343, "advantages/var": 0.2928301988140767, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8359375, "epoch": 2.1261648745519715, "grad_norm": 0.14691022275789828, "learning_rate": 1.2434699165463078e-06, "loss": -0.0, "num_tokens": 131355511.0, "reward": 0.8515625, "reward_std": 0.1375398486852646, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.35588082671165466, "step": 370 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.5455733460206952e-09, "advantages/std": 0.6402543783187866, "advantages/var": 0.40992566895637594, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 2.131899641577061, "grad_norm": 0.19123399831874063, "learning_rate": 1.2399860036025658e-06, "loss": -0.0, "num_tokens": 131680408.0, "reward": 0.720703125, "reward_std": 0.1640568971633911, "rewards/drgrpo_math_reward/mean": 0.720703125, "rewards/drgrpo_math_reward/std": 0.44909247756004333, "step": 371 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.8864294237033408e-09, "advantages/std": 0.4839833676815033, "advantages/var": 0.2342399001923292, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 2.1376344086021506, "grad_norm": 0.1458787920393729, "learning_rate": 1.2364989970237248e-06, "loss": -0.0, "num_tokens": 131939900.0, "reward": 0.857421875, "reward_std": 0.09573355317115784, "rewards/drgrpo_math_reward/mean": 0.857421875, "rewards/drgrpo_math_reward/std": 0.3499840497970581, "step": 372 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.205336070445113e-10, "advantages/std": 0.5675103068351746, "advantages/var": 0.322067948364154, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6640625, "epoch": 2.14336917562724, "grad_norm": 0.16204768200760683, "learning_rate": 1.2330089417604304e-06, "loss": -0.0, "num_tokens": 132287329.0, "reward": 0.740234375, "reward_std": 0.11834049224853516, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "step": 373 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 5.091389845951324e-09, "advantages/std": 0.6402238011360168, "advantages/var": 0.40988651554105004, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8515625, "epoch": 2.1491039426523297, "grad_norm": 0.2020045498587441, "learning_rate": 1.2295158828026292e-06, "loss": -0.0, "num_tokens": 132598528.0, "reward": 0.83984375, "reward_std": 0.14493131637573242, "rewards/drgrpo_math_reward/mean": 0.83984375, "rewards/drgrpo_math_reward/std": 0.3671095669269562, "step": 374 }, { "advantages/mean": 1.5133991837501526e-09, "advantages/snr": 2.5532655072002156e-09, "advantages/std": 0.5927308201789856, "advantages/var": 0.35132982519005296, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6015625, "epoch": 2.1548387096774193, "grad_norm": 0.1941086954799567, "learning_rate": 1.2260198651789884e-06, "loss": 0.0, "num_tokens": 132951166.0, "reward": 0.6796875, "reward_std": 0.11971627920866013, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4670529365539551, "step": 375 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.3329346052015485e-09, "advantages/std": 0.41914528608322144, "advantages/var": 0.17568277084578554, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 2.160573476702509, "grad_norm": 0.11886098845232254, "learning_rate": 1.2225209339563143e-06, "loss": 0.0, "num_tokens": 133242802.0, "reward": 0.841796875, "reward_std": 0.06997986882925034, "rewards/drgrpo_math_reward/mean": 0.841796875, "rewards/drgrpo_math_reward/std": 0.36528825759887695, "step": 376 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.3030647932309745e-10, "advantages/std": 0.5410809516906738, "advantages/var": 0.2927685962824853, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 2.1663082437275984, "grad_norm": 0.1679927960279385, "learning_rate": 1.2190191342389726e-06, "loss": -0.0, "num_tokens": 133549746.0, "reward": 0.892578125, "reward_std": 0.0971580371260643, "rewards/drgrpo_math_reward/mean": 0.892578125, "rewards/drgrpo_math_reward/std": 0.30995169281959534, "step": 377 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 1.2856971797302575e-09, "advantages/std": 0.45273226499557495, "advantages/var": 0.2049665037680235, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.172043010752688, "grad_norm": 0.104614103323364, "learning_rate": 1.2155145111683066e-06, "loss": -0.0, "num_tokens": 133849189.0, "reward": 0.927734375, "reward_std": 0.08478030562400818, "rewards/drgrpo_math_reward/mean": 0.927734375, "rewards/drgrpo_math_reward/std": 0.2591804563999176, "step": 378 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.102644766956689e-10, "advantages/std": 0.5675135254859924, "advantages/var": 0.3220716016095402, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6796875, "epoch": 2.1777777777777776, "grad_norm": 0.1584090475985635, "learning_rate": 1.2120071099220547e-06, "loss": 0.0, "num_tokens": 134144451.0, "reward": 0.826171875, "reward_std": 0.12561991810798645, "rewards/drgrpo_math_reward/mean": 0.826171875, "rewards/drgrpo_math_reward/std": 0.3793322443962097, "step": 379 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.071204805585281e-10, "advantages/std": 0.5133400559425354, "advantages/var": 0.2635180130350854, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.183512544802867, "grad_norm": 0.13516593492688567, "learning_rate": 1.2084969757137685e-06, "loss": -0.0, "num_tokens": 134435009.0, "reward": 0.873046875, "reward_std": 0.10430533438920975, "rewards/drgrpo_math_reward/mean": 0.873046875, "rewards/drgrpo_math_reward/std": 0.33324605226516724, "step": 380 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7211879966090195e-09, "advantages/std": 0.5410928726196289, "advantages/var": 0.29278149679976195, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "epoch": 2.189247311827957, "grad_norm": 0.14185637015095218, "learning_rate": 1.2049841537922305e-06, "loss": -0.0, "num_tokens": 134750345.0, "reward": 0.82421875, "reward_std": 0.10634077340364456, "rewards/drgrpo_math_reward/mean": 0.82421875, "rewards/drgrpo_math_reward/std": 0.3810062110424042, "step": 381 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.8142824609234217e-09, "advantages/std": 0.5133283138275146, "advantages/var": 0.26350595777699937, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 2.1949820788530467, "grad_norm": 0.1441663681415576, "learning_rate": 1.2014686894408693e-06, "loss": -0.0, "num_tokens": 135028666.0, "reward": 0.810546875, "reward_std": 0.09644509106874466, "rewards/drgrpo_math_reward/mean": 0.810546875, "rewards/drgrpo_math_reward/std": 0.3922513723373413, "step": 382 }, { "advantages/mean": 3.14321368932724e-09, "advantages/snr": 4.7428265822945026e-09, "advantages/std": 0.6627300381660461, "advantages/var": 0.439211103487569, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 2.2007168458781363, "grad_norm": 0.17269693741112913, "learning_rate": 1.1979506279771778e-06, "loss": -0.0, "num_tokens": 135365399.0, "reward": 0.84375, "reward_std": 0.18173782527446747, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.36344730854034424, "step": 383 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 3.928060489968631e-09, "advantages/std": 0.5927368998527527, "advantages/var": 0.35133703244705217, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8828125, "epoch": 2.206451612903226, "grad_norm": 0.17169912935458742, "learning_rate": 1.1944300147521275e-06, "loss": -0.0, "num_tokens": 135652899.0, "reward": 0.833984375, "reward_std": 0.12923401594161987, "rewards/drgrpo_math_reward/mean": 0.833984375, "rewards/drgrpo_math_reward/std": 0.3724585771560669, "step": 384 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.721136995924802e-09, "advantages/std": 0.5411089062690735, "advantages/var": 0.29279884844371296, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 2.2121863799283155, "grad_norm": 0.16746276004490926, "learning_rate": 1.1909068951495848e-06, "loss": -0.0, "num_tokens": 135992079.0, "reward": 0.8046875, "reward_std": 0.11949022114276886, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "step": 385 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.302966673538413e-10, "advantages/std": 0.5410932898521423, "advantages/var": 0.2927819483230145, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 2.217921146953405, "grad_norm": 0.17287899442195614, "learning_rate": 1.1873813145857248e-06, "loss": 0.0, "num_tokens": 136245479.0, "reward": 0.822265625, "reward_std": 0.10700596868991852, "rewards/drgrpo_math_reward/mean": 0.822265625, "rewards/drgrpo_math_reward/std": 0.3826628625392914, "step": 386 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.7495508246919965e-09, "advantages/std": 0.5927566289901733, "advantages/var": 0.351360421211794, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 2.2236559139784946, "grad_norm": 0.17870312195752103, "learning_rate": 1.1838533185084466e-06, "loss": -0.0, "num_tokens": 136549475.0, "reward": 0.744140625, "reward_std": 0.14218294620513916, "rewards/drgrpo_math_reward/mean": 0.744140625, "rewards/drgrpo_math_reward/std": 0.43676990270614624, "step": 387 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 4.302840119963264e-09, "advantages/std": 0.5411092042922974, "advantages/var": 0.2927991709698432, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 2.229390681003584, "grad_norm": 0.15407496378488114, "learning_rate": 1.1803229523967888e-06, "loss": -0.0, "num_tokens": 136860723.0, "reward": 0.771484375, "reward_std": 0.11682207882404327, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "step": 388 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.163213634205941e-09, "advantages/std": 0.5411295890808105, "advantages/var": 0.2928212321787669, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 2.2351254480286737, "grad_norm": 0.15881709617262366, "learning_rate": 1.1767902617603402e-06, "loss": 0.0, "num_tokens": 137150436.0, "reward": 0.796875, "reward_std": 0.13071075081825256, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.4027182459831238, "step": 389 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.810646685522236e-10, "advantages/std": 0.48399031162261963, "advantages/var": 0.23424662174456046, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 2.2408602150537633, "grad_norm": 0.15577339024115086, "learning_rate": 1.173255292138656e-06, "loss": -0.0, "num_tokens": 137412786.0, "reward": 0.919921875, "reward_std": 0.09781768918037415, "rewards/drgrpo_math_reward/mean": 0.919921875, "rewards/drgrpo_math_reward/std": 0.271679550409317, "step": 390 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 4.082004245128135e-09, "advantages/std": 0.5133448243141174, "advantages/var": 0.2635229086500921, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 2.246594982078853, "grad_norm": 0.139720076526808, "learning_rate": 1.1697180891006689e-06, "loss": -0.0, "num_tokens": 137673677.0, "reward": 0.916015625, "reward_std": 0.10578860342502594, "rewards/drgrpo_math_reward/mean": 0.916015625, "rewards/drgrpo_math_reward/std": 0.2776356339454651, "step": 391 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.924238175884536e-09, "advantages/std": 0.4839954674243927, "advantages/var": 0.23425161248735638, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 2.252329749103943, "grad_norm": 0.17179731794317873, "learning_rate": 1.1661786982441026e-06, "loss": 0.0, "num_tokens": 137956518.0, "reward": 0.80078125, "reward_std": 0.10269483923912048, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "step": 392 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4527115523815155, "advantages/var": 0.20494774965968166, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6953125, "epoch": 2.258064516129032, "grad_norm": 0.12883029602062734, "learning_rate": 1.1626371651948836e-06, "loss": -0.0, "num_tokens": 138216853.0, "reward": 0.77734375, "reward_std": 0.07352001965045929, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41643625497817993, "step": 393 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 3.1748781824798187e-09, "advantages/std": 0.5133470892906189, "advantages/var": 0.26352523408315065, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 2.263799283154122, "grad_norm": 0.15520710771344395, "learning_rate": 1.1590935356065535e-06, "loss": 0.0, "num_tokens": 138507657.0, "reward": 0.826171875, "reward_std": 0.10712136328220367, "rewards/drgrpo_math_reward/mean": 0.826171875, "rewards/drgrpo_math_reward/std": 0.3793322443962097, "step": 394 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.3568123572681728e-09, "advantages/std": 0.592742919921875, "advantages/var": 0.3513441691175103, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.2695340501792116, "grad_norm": 0.19449315966234226, "learning_rate": 1.1555478551596793e-06, "loss": 0.0, "num_tokens": 138810705.0, "reward": 0.802734375, "reward_std": 0.13040372729301453, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "step": 395 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 1.0756715213485673e-09, "advantages/std": 0.5411285758018494, "advantages/var": 0.29282013554933783, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 2.275268817204301, "grad_norm": 0.1557934147606044, "learning_rate": 1.1520001695612673e-06, "loss": -0.0, "num_tokens": 139119707.0, "reward": 0.78125, "reward_std": 0.13216078281402588, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41380295157432556, "step": 396 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.2678675593785022e-09, "advantages/std": 0.513325035572052, "advantages/var": 0.26350259214504845, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 2.2810035842293908, "grad_norm": 0.2504986247687904, "learning_rate": 1.1484505245441695e-06, "loss": 0.0, "num_tokens": 139402371.0, "reward": 0.869140625, "reward_std": 0.09275216609239578, "rewards/drgrpo_math_reward/mean": 0.869140625, "rewards/drgrpo_math_reward/std": 0.33757632970809937, "step": 397 }, { "advantages/mean": 2.9103830456733704e-09, "advantages/snr": 4.252081604666365e-09, "advantages/std": 0.6844607591629028, "advantages/var": 0.4684865308338573, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 2.2867383512544803, "grad_norm": 0.22008920102285146, "learning_rate": 1.1448989658664984e-06, "loss": -0.0, "num_tokens": 139717373.0, "reward": 0.81640625, "reward_std": 0.19477425515651703, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.3875311613082886, "step": 398 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 6.085073888430315e-10, "advantages/std": 0.3826258182525635, "advantages/var": 0.14640251679344374, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 2.29247311827957, "grad_norm": 0.10699067107689644, "learning_rate": 1.1413455393110348e-06, "loss": 0.0, "num_tokens": 140000779.0, "reward": 0.75390625, "reward_std": 0.05864076316356659, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "step": 399 }, { "advantages/mean": 3.4924596548080444e-09, "advantages/snr": 6.454120835924054e-09, "advantages/std": 0.5411208868026733, "advantages/var": 0.2928118141341116, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.2982078853046595, "grad_norm": 0.143584974024751, "learning_rate": 1.137790290684638e-06, "loss": -0.0, "num_tokens": 140320442.0, "reward": 0.80859375, "reward_std": 0.1275765299797058, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.3937928080558777, "step": 400 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.6922476666615517e-09, "advantages/std": 0.5675339102745056, "advantages/var": 0.3220947393114706, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 2.303942652329749, "grad_norm": 0.16038727480298645, "learning_rate": 1.1342332658176555e-06, "loss": -0.0, "num_tokens": 140606448.0, "reward": 0.841796875, "reward_std": 0.14179058372974396, "rewards/drgrpo_math_reward/mean": 0.841796875, "rewards/drgrpo_math_reward/std": 0.36528825759887695, "step": 401 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.27312967264458e-10, "advantages/std": 0.6402488350868225, "advantages/var": 0.40991857083003325, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.3096774193548386, "grad_norm": 0.1810063862179228, "learning_rate": 1.1306745105633319e-06, "loss": 0.0, "num_tokens": 140898691.0, "reward": 0.76953125, "reward_std": 0.16423675417900085, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.42154473066329956, "step": 402 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.0285467752722323e-09, "advantages/std": 0.45273709297180176, "advantages/var": 0.20497087535255787, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.315412186379928, "grad_norm": 0.13843852993502573, "learning_rate": 1.1271140707972187e-06, "loss": 0.0, "num_tokens": 141204199.0, "reward": 0.76171875, "reward_std": 0.08886568248271942, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.42644867300987244, "step": 403 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.535649273813575e-09, "advantages/std": 0.5133347511291504, "advantages/var": 0.26351256671682677, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9765625, "epoch": 2.3211469534050178, "grad_norm": 0.20197605740152158, "learning_rate": 1.1235519924165812e-06, "loss": -0.0, "num_tokens": 141468050.0, "reward": 0.88671875, "reward_std": 0.1024949848651886, "rewards/drgrpo_math_reward/mean": 0.88671875, "rewards/drgrpo_math_reward/std": 0.3172462284564972, "step": 404 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.6035435084181911e-09, "advantages/std": 0.7259879112243652, "advantages/var": 0.5270584472439168, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 2.3268817204301078, "grad_norm": 0.2317255306536829, "learning_rate": 1.119988321339809e-06, "loss": 0.0, "num_tokens": 141822485.0, "reward": 0.64453125, "reward_std": 0.22093378007411957, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.47912323474884033, "step": 405 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.205044794916086e-10, "advantages/std": 0.5675304532051086, "advantages/var": 0.322090815315196, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.332616487455197, "grad_norm": 0.16110473985035828, "learning_rate": 1.1164231035058227e-06, "loss": -0.0, "num_tokens": 142110701.0, "reward": 0.84375, "reward_std": 0.13739262521266937, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.36344730854034424, "step": 406 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.9279234300024475e-10, "advantages/std": 0.5927575826644897, "advantages/var": 0.3513615518062494, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.338351254480287, "grad_norm": 0.16381732779278838, "learning_rate": 1.1128563848734815e-06, "loss": -0.0, "num_tokens": 142424876.0, "reward": 0.775390625, "reward_std": 0.14028584957122803, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "step": 407 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 2.2677938285268968e-10, "advantages/std": 0.5133417248725891, "advantages/var": 0.263519726495165, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 2.3440860215053765, "grad_norm": 0.1772144220212225, "learning_rate": 1.109288211420992e-06, "loss": -0.0, "num_tokens": 142713058.0, "reward": 0.8671875, "reward_std": 0.10551576316356659, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.33970388770103455, "step": 408 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.2678370131583125e-09, "advantages/std": 0.513331949710846, "advantages/var": 0.2635096905939385, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 2.349820788530466, "grad_norm": 0.17743829231928446, "learning_rate": 1.1057186291453136e-06, "loss": 0.0, "num_tokens": 143008424.0, "reward": 0.8203125, "reward_std": 0.09666463732719421, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38430243730545044, "step": 409 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4191282093524933, "advantages/var": 0.17566845587502744, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 2.3555555555555556, "grad_norm": 0.11065410353166621, "learning_rate": 1.102147684061568e-06, "loss": 0.0, "num_tokens": 143270353.0, "reward": 0.931640625, "reward_std": 0.06024399772286415, "rewards/drgrpo_math_reward/mean": 0.931640625, "rewards/drgrpo_math_reward/std": 0.25260838866233826, "step": 410 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.070815112247609e-10, "advantages/std": 0.5133621096611023, "advantages/var": 0.2635406556356976, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 2.361290322580645, "grad_norm": 0.12058743235103143, "learning_rate": 1.0985754222024436e-06, "loss": -0.0, "num_tokens": 143574750.0, "reward": 0.82421875, "reward_std": 0.11849889159202576, "rewards/drgrpo_math_reward/mean": 0.82421875, "rewards/drgrpo_math_reward/std": 0.3810062110424042, "step": 411 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.0539763451658798e-09, "advantages/std": 0.6627206802368164, "advantages/var": 0.43919870001354866, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6953125, "epoch": 2.3670250896057348, "grad_norm": 0.24582421628597118, "learning_rate": 1.0950018896176042e-06, "loss": 0.0, "num_tokens": 143887439.0, "reward": 0.7734375, "reward_std": 0.17542308568954468, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4190165400505066, "step": 412 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.964021160042995e-09, "advantages/std": 0.5927396416664124, "advantages/var": 0.3513402828028269, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 2.3727598566308243, "grad_norm": 0.16831243377221777, "learning_rate": 1.0914271323730934e-06, "loss": -0.0, "num_tokens": 144207020.0, "reward": 0.830078125, "reward_std": 0.12698355317115784, "rewards/drgrpo_math_reward/mean": 0.830078125, "rewards/drgrpo_math_reward/std": 0.3759314715862274, "step": 413 }, { "advantages/mean": 2.9103830456733704e-09, "advantages/snr": 4.7172533390438686e-09, "advantages/std": 0.6169655919075012, "advantages/var": 0.38064654159777334, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.378494623655914, "grad_norm": 0.20121535377215374, "learning_rate": 1.0878511965507434e-06, "loss": -0.0, "num_tokens": 144509950.0, "reward": 0.81640625, "reward_std": 0.15862742066383362, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.3875311613082886, "step": 414 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.0857489980017807e-09, "advantages/std": 0.45272114872932434, "advantages/var": 0.204956438506799, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8359375, "epoch": 2.3842293906810035, "grad_norm": 0.11696210357032727, "learning_rate": 1.0842741282475768e-06, "loss": 0.0, "num_tokens": 144806923.0, "reward": 0.8046875, "reward_std": 0.07889671623706818, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "step": 415 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.1783347290489429e-09, "advantages/std": 0.5927788615226746, "advantages/var": 0.3513867786681182, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7109375, "epoch": 2.389964157706093, "grad_norm": 0.15119799780961807, "learning_rate": 1.0806959735752173e-06, "loss": 0.0, "num_tokens": 145151492.0, "reward": 0.75390625, "reward_std": 0.15613135695457458, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "step": 416 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.621651812673634e-10, "advantages/std": 0.4839722812175751, "advantages/var": 0.23422916898694357, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8515625, "epoch": 2.3956989247311826, "grad_norm": 0.12106880193386726, "learning_rate": 1.0771167786592916e-06, "loss": -0.0, "num_tokens": 145443614.0, "reward": 0.87109375, "reward_std": 0.08493967354297638, "rewards/drgrpo_math_reward/mean": 0.87109375, "rewards/drgrpo_math_reward/std": 0.33542385697364807, "step": 417 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.1109736201329835e-09, "advantages/std": 0.4191470146179199, "advantages/var": 0.17568421986311478, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9140625, "epoch": 2.4014336917562726, "grad_norm": 0.14240905891000594, "learning_rate": 1.0735365896388359e-06, "loss": -0.0, "num_tokens": 145712142.0, "reward": 0.89453125, "reward_std": 0.07135801762342453, "rewards/drgrpo_math_reward/mean": 0.89453125, "rewards/drgrpo_math_reward/std": 0.3074568510055542, "step": 418 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7211007853779242e-09, "advantages/std": 0.5411202907562256, "advantages/var": 0.2928111690681021, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7109375, "epoch": 2.4071684587813618, "grad_norm": 0.12386530650000019, "learning_rate": 1.0699554526657028e-06, "loss": 0.0, "num_tokens": 146052479.0, "reward": 0.662109375, "reward_std": 0.12482717633247375, "rewards/drgrpo_math_reward/mean": 0.662109375, "rewards/drgrpo_math_reward/std": 0.4734536409378052, "step": 419 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.2677680238614377e-09, "advantages/std": 0.5133475661277771, "advantages/var": 0.2635257236493125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7578125, "epoch": 2.412903225806452, "grad_norm": 0.1497921993179201, "learning_rate": 1.0663734139039632e-06, "loss": -0.0, "num_tokens": 146359004.0, "reward": 0.80078125, "reward_std": 0.10881631821393967, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "step": 420 }, { "advantages/mean": 2.2118911147117615e-09, "advantages/snr": 4.308924847307796e-09, "advantages/std": 0.5133278369903564, "advantages/var": 0.26350546822919796, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6015625, "epoch": 2.4186379928315414, "grad_norm": 0.12392074925244675, "learning_rate": 1.0627905195293135e-06, "loss": -0.0, "num_tokens": 146704282.0, "reward": 0.72265625, "reward_std": 0.0957798957824707, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4481254518032074, "step": 421 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.810697932424957e-09, "advantages/std": 0.48398515582084656, "advantages/var": 0.23424163105492912, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.424372759856631, "grad_norm": 0.1231401875516177, "learning_rate": 1.0592068157284795e-06, "loss": -0.0, "num_tokens": 147018884.0, "reward": 0.775390625, "reward_std": 0.0946282371878624, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "step": 422 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.2679168037246144e-09, "advantages/std": 0.513313889503479, "advantages/var": 0.26349114915718985, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.4301075268817205, "grad_norm": 0.1690428147715938, "learning_rate": 1.0556223486986218e-06, "loss": -0.0, "num_tokens": 147296468.0, "reward": 0.86328125, "reward_std": 0.08604402095079422, "rewards/drgrpo_math_reward/mean": 0.86328125, "rewards/drgrpo_math_reward/std": 0.3438861668109894, "step": 423 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.7213889323129395e-09, "advantages/std": 0.5133348703384399, "advantages/var": 0.26351268910538295, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.43584229390681, "grad_norm": 0.15804140123735858, "learning_rate": 1.0520371646467393e-06, "loss": 0.0, "num_tokens": 147609759.0, "reward": 0.873046875, "reward_std": 0.10262156277894974, "rewards/drgrpo_math_reward/mean": 0.873046875, "rewards/drgrpo_math_reward/std": 0.33324605226516724, "step": 424 }, { "advantages/mean": -2.444721758365631e-09, "advantages/snr": 3.962651296068918e-09, "advantages/std": 0.6169409155845642, "advantages/var": 0.3806160933223204, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8046875, "epoch": 2.4415770609318996, "grad_norm": 0.1448992339542876, "learning_rate": 1.0484513097890737e-06, "loss": -0.0, "num_tokens": 147944430.0, "reward": 0.86328125, "reward_std": 0.1336374580860138, "rewards/drgrpo_math_reward/mean": 0.86328125, "rewards/drgrpo_math_reward/std": 0.3438861668109894, "step": 425 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.2307457322610485e-09, "advantages/std": 0.5675355195999146, "advantages/var": 0.322096566007545, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4296875, "epoch": 2.447311827956989, "grad_norm": 0.1420888591584983, "learning_rate": 1.044864830350515e-06, "loss": 0.0, "num_tokens": 148280602.0, "reward": 0.75, "reward_std": 0.14083804190158844, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.43343618512153625, "step": 426 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 3.6003334389054963e-09, "advantages/std": 0.45268431305885315, "advantages/var": 0.20492308728956576, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6796875, "epoch": 2.4530465949820788, "grad_norm": 0.11648532723589593, "learning_rate": 1.041277772564003e-06, "loss": 0.0, "num_tokens": 148572129.0, "reward": 0.849609375, "reward_std": 0.059472277760505676, "rewards/drgrpo_math_reward/mean": 0.849609375, "rewards/drgrpo_math_reward/std": 0.35780346393585205, "step": 427 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.3606505453076139e-09, "advantages/std": 0.5133514404296875, "advantages/var": 0.263529701391235, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 2.4587813620071683, "grad_norm": 0.16835348729744734, "learning_rate": 1.0376901826699347e-06, "loss": 0.0, "num_tokens": 148878255.0, "reward": 0.830078125, "reward_std": 0.11218451708555222, "rewards/drgrpo_math_reward/mean": 0.830078125, "rewards/drgrpo_math_reward/std": 0.3759314715862274, "step": 428 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5133668780326843, "advantages/var": 0.263545551461025, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 2.464516129032258, "grad_norm": 0.14722757618807716, "learning_rate": 1.0341021069155647e-06, "loss": 0.0, "num_tokens": 149221628.0, "reward": 0.802734375, "reward_std": 0.12297804653644562, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "step": 429 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.872502787133794e-09, "advantages/std": 0.5411166548728943, "advantages/var": 0.292807234180831, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9140625, "epoch": 2.4702508960573475, "grad_norm": 0.1650284735300854, "learning_rate": 1.0305135915544123e-06, "loss": -0.0, "num_tokens": 149527220.0, "reward": 0.796875, "reward_std": 0.12285147607326508, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.4027182459831238, "step": 430 }, { "advantages/mean": -2.444721758365631e-09, "advantages/snr": 3.9624089703378945e-09, "advantages/std": 0.616978645324707, "advantages/var": 0.38066264878671063, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5703125, "epoch": 2.4759856630824375, "grad_norm": 0.17051311764249838, "learning_rate": 1.026924682845663e-06, "loss": 0.0, "num_tokens": 149902409.0, "reward": 0.689453125, "reward_std": 0.16616153717041016, "rewards/drgrpo_math_reward/mean": 0.689453125, "rewards/drgrpo_math_reward/std": 0.46317005157470703, "step": 431 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.53557975759028e-10, "advantages/std": 0.5133426189422607, "advantages/var": 0.2635206444224991, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.481720430107527, "grad_norm": 0.19611428736682768, "learning_rate": 1.0233354270535726e-06, "loss": -0.0, "num_tokens": 150213360.0, "reward": 0.80078125, "reward_std": 0.10622870922088623, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "step": 432 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.8863473110235288e-09, "advantages/std": 0.4839971363544464, "advantages/var": 0.2342532279993046, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6484375, "epoch": 2.4874551971326166, "grad_norm": 0.1276679703082484, "learning_rate": 1.0197458704468718e-06, "loss": 0.0, "num_tokens": 150531399.0, "reward": 0.662109375, "reward_std": 0.10023994743824005, "rewards/drgrpo_math_reward/mean": 0.662109375, "rewards/drgrpo_math_reward/std": 0.4734536409378052, "step": 433 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.62170691433367e-10, "advantages/std": 0.483969509601593, "advantages/var": 0.23422648622400644, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 2.493189964157706, "grad_norm": 0.11033413745748184, "learning_rate": 1.0161560592981686e-06, "loss": 0.0, "num_tokens": 150856347.0, "reward": 0.75390625, "reward_std": 0.08615704625844955, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "step": 434 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.1748494292439067e-09, "advantages/std": 0.5133517384529114, "advantages/var": 0.26353000737262633, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.484375, "epoch": 2.498924731182796, "grad_norm": 0.15839313515939943, "learning_rate": 1.0125660398833527e-06, "loss": -0.0, "num_tokens": 151186455.0, "reward": 0.771484375, "reward_std": 0.10942963510751724, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "step": 435 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.6410556661506185e-09, "advantages/std": 0.5675143003463745, "advantages/var": 0.322072481097635, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 2.5046594982078854, "grad_norm": 0.15967604663138468, "learning_rate": 1.0089758584809977e-06, "loss": 0.0, "num_tokens": 151518670.0, "reward": 0.771484375, "reward_std": 0.12428855150938034, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "step": 436 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 6.454470629259003e-10, "advantages/std": 0.5410915613174438, "advantages/var": 0.2927800777289491, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5859375, "epoch": 2.510394265232975, "grad_norm": 0.11803648559507998, "learning_rate": 1.005385561371767e-06, "loss": 0.0, "num_tokens": 151844583.0, "reward": 0.771484375, "reward_std": 0.1010635495185852, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "step": 437 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.641700407020763e-09, "advantages/std": 0.6169565916061401, "advantages/var": 0.3806354359262656, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6328125, "epoch": 2.5161290322580645, "grad_norm": 0.14263409688069592, "learning_rate": 1.0017951948378134e-06, "loss": -0.0, "num_tokens": 152175089.0, "reward": 0.767578125, "reward_std": 0.148737370967865, "rewards/drgrpo_math_reward/mean": 0.767578125, "rewards/drgrpo_math_reward/std": 0.42278963327407837, "step": 438 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.8869295913484784e-09, "advantages/std": 0.616956353187561, "advantages/var": 0.38063514173849455, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6328125, "epoch": 2.521863799283154, "grad_norm": 0.17819790247561107, "learning_rate": 9.982048051621867e-07, "loss": 0.0, "num_tokens": 152517666.0, "reward": 0.740234375, "reward_std": 0.1465548574924469, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "step": 439 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.5715159003220472e-09, "advantages/std": 0.4527108669281006, "advantages/var": 0.2049471290347924, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.5275985663082436, "grad_norm": 0.10467144278715852, "learning_rate": 9.946144386282334e-07, "loss": 0.0, "num_tokens": 152809779.0, "reward": 0.86328125, "reward_std": 0.0759955644607544, "rewards/drgrpo_math_reward/mean": 0.86328125, "rewards/drgrpo_math_reward/std": 0.3438861668109894, "step": 440 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5927723050117493, "advantages/var": 0.3513790055889423, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 2.533333333333333, "grad_norm": 0.12932536682141396, "learning_rate": 9.91024141519002e-07, "loss": 0.0, "num_tokens": 153127788.0, "reward": 0.8046875, "reward_std": 0.15048781037330627, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "step": 441 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 1.0757504374073951e-09, "advantages/std": 0.541088879108429, "advantages/var": 0.29277717509481604, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7578125, "epoch": 2.539068100358423, "grad_norm": 0.14135910889621736, "learning_rate": 9.874339601166472e-07, "loss": -0.0, "num_tokens": 153475780.0, "reward": 0.802734375, "reward_std": 0.1028466522693634, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "step": 442 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.070459151379954e-10, "advantages/std": 0.5133822560310364, "advantages/var": 0.2635613408075166, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7421875, "epoch": 2.5448028673835124, "grad_norm": 0.12587258067331306, "learning_rate": 9.838439407018315e-07, "loss": -0.0, "num_tokens": 153788999.0, "reward": 0.74609375, "reward_std": 0.13438570499420166, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "step": 443 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.2307858033570434e-09, "advantages/std": 0.5675170421600342, "advantages/var": 0.322075593142074, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.5505376344086024, "grad_norm": 0.15067628290309287, "learning_rate": 9.80254129553128e-07, "loss": -0.0, "num_tokens": 154097505.0, "reward": 0.748046875, "reward_std": 0.12687712907791138, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "step": 444 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.5354686413860026e-10, "advantages/std": 0.5133551955223083, "advantages/var": 0.26353355676974743, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5703125, "epoch": 2.5562724014336915, "grad_norm": 0.14872955941305907, "learning_rate": 9.766645729464275e-07, "loss": -0.0, "num_tokens": 154455219.0, "reward": 0.751953125, "reward_std": 0.11151237040758133, "rewards/drgrpo_math_reward/mean": 0.751953125, "rewards/drgrpo_math_reward/std": 0.4323015511035919, "step": 445 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 2.9481275270528198e-09, "advantages/std": 0.5133424997329712, "advantages/var": 0.2635205220320955, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.5620071684587815, "grad_norm": 0.13447894875540042, "learning_rate": 9.730753171543374e-07, "loss": 0.0, "num_tokens": 154769177.0, "reward": 0.8515625, "reward_std": 0.10654062032699585, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.35588082671165466, "step": 446 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 1.3886888845541056e-09, "advantages/std": 0.4191555082798004, "advantages/var": 0.17569134012129783, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 2.567741935483871, "grad_norm": 0.08671878485475096, "learning_rate": 9.694864084455876e-07, "loss": -0.0, "num_tokens": 155049411.0, "reward": 0.84375, "reward_std": 0.07807311415672302, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.36344730854034424, "step": 447 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 2.1514257476305157e-10, "advantages/std": 0.5411077737808228, "advantages/var": 0.29279762284603805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7265625, "epoch": 2.5734767025089607, "grad_norm": 0.15589780184703084, "learning_rate": 9.658978930844352e-07, "loss": 0.0, "num_tokens": 155376856.0, "reward": 0.822265625, "reward_std": 0.11529115587472916, "rewards/drgrpo_math_reward/mean": 0.822265625, "rewards/drgrpo_math_reward/std": 0.3826628625392914, "step": 448 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.5711594187769079e-09, "advantages/std": 0.5927613377571106, "advantages/var": 0.35136600353959935, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8359375, "epoch": 2.5792114695340502, "grad_norm": 0.1712452940810284, "learning_rate": 9.623098173300653e-07, "loss": -0.0, "num_tokens": 155685757.0, "reward": 0.810546875, "reward_std": 0.14327478408813477, "rewards/drgrpo_math_reward/mean": 0.810546875, "rewards/drgrpo_math_reward/std": 0.3922513723373413, "step": 449 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 6.085087633301075e-10, "advantages/std": 0.38262495398521423, "advantages/var": 0.1464018554121873, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7109375, "epoch": 2.58494623655914, "grad_norm": 0.11120519198753674, "learning_rate": 9.58722227435997e-07, "loss": 0.0, "num_tokens": 155967407.0, "reward": 0.849609375, "reward_std": 0.058760739862918854, "rewards/drgrpo_math_reward/mean": 0.849609375, "rewards/drgrpo_math_reward/std": 0.35780346393585205, "step": 450 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 3.174811829644306e-09, "advantages/std": 0.5133578181266785, "advantages/var": 0.2635362494317839, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8046875, "epoch": 2.5906810035842294, "grad_norm": 0.11318451458826649, "learning_rate": 9.551351696494853e-07, "loss": 0.0, "num_tokens": 156272170.0, "reward": 0.802734375, "reward_std": 0.1154317855834961, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "step": 451 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7210016407356785e-09, "advantages/std": 0.5411514639854431, "advantages/var": 0.29284490697358834, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.515625, "epoch": 2.596415770609319, "grad_norm": 0.13939905619620457, "learning_rate": 9.515486902109261e-07, "loss": -0.0, "num_tokens": 156601137.0, "reward": 0.77734375, "reward_std": 0.1498623639345169, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41643625497817993, "step": 452 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.924258792697588e-09, "advantages/std": 0.48399028182029724, "advantages/var": 0.23424659289649075, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "epoch": 2.6021505376344085, "grad_norm": 0.10909931681343228, "learning_rate": 9.479628353532608e-07, "loss": 0.0, "num_tokens": 156916304.0, "reward": 0.78515625, "reward_std": 0.09808912873268127, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4111155867576599, "step": 453 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.0285969479707764e-09, "advantages/std": 0.4527150094509125, "advantages/var": 0.20495087978213977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 2.607885304659498, "grad_norm": 0.11896194744055381, "learning_rate": 9.443776513013783e-07, "loss": -0.0, "num_tokens": 157199903.0, "reward": 0.818359375, "reward_std": 0.07564391195774078, "rewards/drgrpo_math_reward/mean": 0.818359375, "rewards/drgrpo_math_reward/std": 0.38592514395713806, "step": 454 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7211540590142996e-09, "advantages/std": 0.5411035418510437, "advantages/var": 0.2927930430037442, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6953125, "epoch": 2.6136200716845877, "grad_norm": 0.13143017131543272, "learning_rate": 9.407931842715202e-07, "loss": -0.0, "num_tokens": 157514707.0, "reward": 0.857421875, "reward_std": 0.11222390830516815, "rewards/drgrpo_math_reward/mean": 0.857421875, "rewards/drgrpo_math_reward/std": 0.3499840497970581, "step": 455 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.5428178270499856e-09, "advantages/std": 0.4527377784252167, "advantages/var": 0.2049714960134006, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 2.6193548387096772, "grad_norm": 0.1249542880400599, "learning_rate": 9.372094804706866e-07, "loss": -0.0, "num_tokens": 157814353.0, "reward": 0.830078125, "reward_std": 0.08919276297092438, "rewards/drgrpo_math_reward/mean": 0.830078125, "rewards/drgrpo_math_reward/std": 0.3759314715862274, "step": 456 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.442553522727353e-09, "advantages/std": 0.5133560299873352, "advantages/var": 0.2635344135243578, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.640625, "epoch": 2.6250896057347672, "grad_norm": 0.12179968113062707, "learning_rate": 9.336265860960369e-07, "loss": -0.0, "num_tokens": 158152474.0, "reward": 0.8125, "reward_std": 0.11229199916124344, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39069411158561707, "step": 457 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 5.554817545737748e-10, "advantages/std": 0.41915082931518555, "advantages/var": 0.1756874177156078, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7265625, "epoch": 2.6308243727598564, "grad_norm": 0.08382445148867387, "learning_rate": 9.300445473342972e-07, "loss": 0.0, "num_tokens": 158483778.0, "reward": 0.791015625, "reward_std": 0.07797222584486008, "rewards/drgrpo_math_reward/mean": 0.791015625, "rewards/drgrpo_math_reward/std": 0.40698084235191345, "step": 458 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 2.3665255708920767e-09, "advantages/std": 0.5411175489425659, "advantages/var": 0.2928082017736102, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 2.6365591397849464, "grad_norm": 0.13832740252429176, "learning_rate": 9.264634103611637e-07, "loss": -0.0, "num_tokens": 158802072.0, "reward": 0.8515625, "reward_std": 0.1208883672952652, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.35588082671165466, "step": 459 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.855695983708911e-10, "advantages/std": 0.5927689671516418, "advantages/var": 0.35137504841802425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7265625, "epoch": 2.642293906810036, "grad_norm": 0.13116703682292238, "learning_rate": 9.228832213407084e-07, "loss": -0.0, "num_tokens": 159149427.0, "reward": 0.79296875, "reward_std": 0.15211141109466553, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40557438135147095, "step": 460 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 1.0756894127132805e-09, "advantages/std": 0.5411195755004883, "advantages/var": 0.29281039498982864, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 2.6480286738351255, "grad_norm": 0.126184820992536, "learning_rate": 9.193040264247828e-07, "loss": -0.0, "num_tokens": 159480903.0, "reward": 0.75390625, "reward_std": 0.1230177953839302, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "step": 461 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.2821220184127886e-09, "advantages/std": 0.5675124526023865, "advantages/var": 0.32207038385877595, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 2.653763440860215, "grad_norm": 0.15254977151598384, "learning_rate": 9.157258717524234e-07, "loss": 0.0, "num_tokens": 159775780.0, "reward": 0.857421875, "reward_std": 0.12180736660957336, "rewards/drgrpo_math_reward/mean": 0.857421875, "rewards/drgrpo_math_reward/std": 0.3499840497970581, "step": 462 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.230680073009183e-09, "advantages/std": 0.5675657987594604, "advantages/var": 0.32213093592146436, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 2.6594982078853047, "grad_norm": 0.1626832076070562, "learning_rate": 9.121488034492568e-07, "loss": -0.0, "num_tokens": 160094037.0, "reward": 0.806640625, "reward_std": 0.1666650027036667, "rewards/drgrpo_math_reward/mean": 0.806640625, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "step": 463 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.07113318375781e-10, "advantages/std": 0.5133441090583801, "advantages/var": 0.26352217430494207, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7109375, "epoch": 2.6652329749103942, "grad_norm": 0.1290596594861587, "learning_rate": 9.085728676269066e-07, "loss": 0.0, "num_tokens": 160386910.0, "reward": 0.876953125, "reward_std": 0.10735304653644562, "rewards/drgrpo_math_reward/mean": 0.876953125, "rewards/drgrpo_math_reward/std": 0.32881227135658264, "step": 464 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.442170479912332e-09, "advantages/std": 0.5411251783370972, "advantages/var": 0.2928164586303552, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5703125, "epoch": 2.670967741935484, "grad_norm": 0.12580500900257388, "learning_rate": 9.049981103823959e-07, "loss": -0.0, "num_tokens": 160707705.0, "reward": 0.7578125, "reward_std": 0.1284025013446808, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "step": 465 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.4432534598041818e-09, "advantages/std": 0.48397037386894226, "advantages/var": 0.23422732278284375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.609375, "epoch": 2.6767025089605734, "grad_norm": 0.1029098271629491, "learning_rate": 9.014245777975564e-07, "loss": 0.0, "num_tokens": 161044778.0, "reward": 0.751953125, "reward_std": 0.08406735956668854, "rewards/drgrpo_math_reward/mean": 0.751953125, "rewards/drgrpo_math_reward/std": 0.4323015511035919, "step": 466 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.320792675363603e-09, "advantages/std": 0.5927470326423645, "advantages/var": 0.35134904470632833, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "epoch": 2.682437275985663, "grad_norm": 0.12175324968600575, "learning_rate": 8.978523159384322e-07, "loss": -0.0, "num_tokens": 161388991.0, "reward": 0.72265625, "reward_std": 0.13346660137176514, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4481254518032074, "step": 467 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 5.657196813692878e-09, "advantages/std": 0.4527219235897064, "advantages/var": 0.20495714009876398, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 2.688172043010753, "grad_norm": 0.1245093137266545, "learning_rate": 8.942813708546866e-07, "loss": -0.0, "num_tokens": 161694389.0, "reward": 0.828125, "reward_std": 0.0813746303319931, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3776407241821289, "step": 468 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 5.143013517932645e-10, "advantages/std": 0.4527124762535095, "advantages/var": 0.20494858615558442, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9140625, "epoch": 2.693906810035842, "grad_norm": 0.11107370718492254, "learning_rate": 8.907117885790083e-07, "loss": -0.0, "num_tokens": 161957166.0, "reward": 0.845703125, "reward_std": 0.07328139245510101, "rewards/drgrpo_math_reward/mean": 0.845703125, "rewards/drgrpo_math_reward/std": 0.36158639192581177, "step": 469 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.4615219696445517e-09, "advantages/std": 0.567528486251831, "advantages/var": 0.3220885827072948, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 2.699641577060932, "grad_norm": 0.16486127363227157, "learning_rate": 8.871436151265182e-07, "loss": -0.0, "num_tokens": 162291859.0, "reward": 0.791015625, "reward_std": 0.1355608105659485, "rewards/drgrpo_math_reward/mean": 0.791015625, "rewards/drgrpo_math_reward/std": 0.40698084235191345, "step": 470 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.3566879415918826e-09, "advantages/std": 0.5927742123603821, "advantages/var": 0.35138126683947135, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 2.7053763440860212, "grad_norm": 0.1546921724255238, "learning_rate": 8.835768964941772e-07, "loss": -0.0, "num_tokens": 162612609.0, "reward": 0.818359375, "reward_std": 0.15497282147407532, "rewards/drgrpo_math_reward/mean": 0.818359375, "rewards/drgrpo_math_reward/std": 0.38592514395713806, "step": 471 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.5355413139273237e-10, "advantages/std": 0.5133469700813293, "advantages/var": 0.26352511169168125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 2.7111111111111112, "grad_norm": 0.14647386039658442, "learning_rate": 8.800116786601908e-07, "loss": 0.0, "num_tokens": 162917713.0, "reward": 0.76171875, "reward_std": 0.111458919942379, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.42644867300987244, "step": 472 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.461588152848798e-09, "advantages/std": 0.5675132274627686, "advantages/var": 0.3220712633452081, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5546875, "epoch": 2.716845878136201, "grad_norm": 0.15640819202200756, "learning_rate": 8.764480075834186e-07, "loss": -0.0, "num_tokens": 163243955.0, "reward": 0.68359375, "reward_std": 0.11971768736839294, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.46552830934524536, "step": 473 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 3.174916520937862e-09, "advantages/std": 0.5133408904075623, "advantages/var": 0.26351886976442884, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9609375, "epoch": 2.7225806451612904, "grad_norm": 0.11697442611204033, "learning_rate": 8.728859292027814e-07, "loss": -0.0, "num_tokens": 163529053.0, "reward": 0.90234375, "reward_std": 0.10221564769744873, "rewards/drgrpo_math_reward/mean": 0.90234375, "rewards/drgrpo_math_reward/std": 0.29713961482048035, "step": 474 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.2641919184776544e-09, "advantages/std": 0.6169900298118591, "advantages/var": 0.3806766968872388, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 2.72831541218638, "grad_norm": 0.15299233369215207, "learning_rate": 8.693254894366682e-07, "loss": -0.0, "num_tokens": 163872617.0, "reward": 0.6796875, "reward_std": 0.17652980983257294, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4670529365539551, "step": 475 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.5357156321964103e-10, "advantages/std": 0.5133272409439087, "advantages/var": 0.2635048562950857, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9140625, "epoch": 2.7340501792114695, "grad_norm": 0.13763967474400313, "learning_rate": 8.657667341823448e-07, "loss": -0.0, "num_tokens": 164155919.0, "reward": 0.8359375, "reward_std": 0.09518137574195862, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.37069445848464966, "step": 476 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.571467993568879e-09, "advantages/std": 0.4527193009853363, "advantages/var": 0.20495476548465152, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.739784946236559, "grad_norm": 0.11022428723086183, "learning_rate": 8.62209709315362e-07, "loss": 0.0, "num_tokens": 164424857.0, "reward": 0.826171875, "reward_std": 0.08010854572057724, "rewards/drgrpo_math_reward/mean": 0.826171875, "rewards/drgrpo_math_reward/std": 0.3793322443962097, "step": 477 }, { "advantages/mean": 2.2118911147117615e-09, "advantages/snr": 3.897278805602547e-09, "advantages/std": 0.5675475597381592, "advantages/var": 0.32211023256473936, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8359375, "epoch": 2.7455197132616487, "grad_norm": 0.13083577515346181, "learning_rate": 8.58654460688965e-07, "loss": -0.0, "num_tokens": 164718731.0, "reward": 0.875, "reward_std": 0.14918473362922668, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3310423493385315, "step": 478 }, { "advantages/mean": 4.307366907596588e-09, "advantages/snr": 7.589821076135159e-09, "advantages/std": 0.5675188899040222, "advantages/var": 0.3220776903978937, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 2.7512544802867382, "grad_norm": 0.1726247747294042, "learning_rate": 8.551010341335015e-07, "loss": -0.0, "num_tokens": 165049000.0, "reward": 0.734375, "reward_std": 0.126377135515213, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44209739565849304, "step": 479 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 2.4945674184430398e-09, "advantages/std": 0.5133429169654846, "advantages/var": 0.26352095039863244, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.756989247311828, "grad_norm": 0.1295598531682697, "learning_rate": 8.515494754558308e-07, "loss": -0.0, "num_tokens": 165347455.0, "reward": 0.759765625, "reward_std": 0.10680782049894333, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "step": 480 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.8255437793605207e-09, "advantages/std": 0.38262128829956055, "advantages/var": 0.14639905026001543, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 2.762724014336918, "grad_norm": 0.08643226466468112, "learning_rate": 8.479998304387328e-07, "loss": -0.0, "num_tokens": 165635839.0, "reward": 0.8671875, "reward_std": 0.05880707502365112, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.33970388770103455, "step": 481 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.7212852923504086e-09, "advantages/std": 0.5133544206619263, "advantages/var": 0.26353276121314195, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 2.768458781362007, "grad_norm": 0.13261685373576124, "learning_rate": 8.444521448403206e-07, "loss": -0.0, "num_tokens": 165960711.0, "reward": 0.806640625, "reward_std": 0.11455363780260086, "rewards/drgrpo_math_reward/mean": 0.806640625, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "step": 482 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.7566714802153193e-09, "advantages/std": 0.6627039909362793, "advantages/var": 0.43917657960287215, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 2.774193548387097, "grad_norm": 0.1860093827371551, "learning_rate": 8.409064643934467e-07, "loss": -0.0, "num_tokens": 166253803.0, "reward": 0.73046875, "reward_std": 0.16081658005714417, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44415023922920227, "step": 483 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.205136139453875e-10, "advantages/std": 0.5675241351127625, "advantages/var": 0.32208364393548905, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8359375, "epoch": 2.7799283154121865, "grad_norm": 0.15777509900469788, "learning_rate": 8.373628348051163e-07, "loss": 0.0, "num_tokens": 166540513.0, "reward": 0.8125, "reward_std": 0.133546844124794, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39069411158561707, "step": 484 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 5.656827408349434e-09, "advantages/std": 0.452751487493515, "advantages/var": 0.20498390942759048, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 2.785663082437276, "grad_norm": 0.1267454444984254, "learning_rate": 8.338213017558972e-07, "loss": 0.0, "num_tokens": 166825397.0, "reward": 0.767578125, "reward_std": 0.10100381821393967, "rewards/drgrpo_math_reward/mean": 0.767578125, "rewards/drgrpo_math_reward/std": 0.42278963327407837, "step": 485 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.989130202551909e-09, "advantages/std": 0.5133433938026428, "advantages/var": 0.26352143996081523, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9921875, "epoch": 2.7913978494623657, "grad_norm": 0.13588128630042334, "learning_rate": 8.302819108993311e-07, "loss": 0.0, "num_tokens": 167099224.0, "reward": 0.828125, "reward_std": 0.10420571267604828, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3776407241821289, "step": 486 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.071382811790197e-10, "advantages/std": 0.5133299827575684, "advantages/var": 0.26350767119788543, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 2.7971326164874553, "grad_norm": 0.16009909991494425, "learning_rate": 8.267447078613441e-07, "loss": -0.0, "num_tokens": 167379627.0, "reward": 0.826171875, "reward_std": 0.09820909798145294, "rewards/drgrpo_math_reward/mean": 0.826171875, "rewards/drgrpo_math_reward/std": 0.3793322443962097, "step": 487 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 1.5059791120963031e-09, "advantages/std": 0.5411145687103271, "advantages/var": 0.29280497647056336, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.802867383512545, "grad_norm": 0.15466765149038694, "learning_rate": 8.232097382396597e-07, "loss": 0.0, "num_tokens": 167663681.0, "reward": 0.818359375, "reward_std": 0.12086933106184006, "rewards/drgrpo_math_reward/mean": 0.818359375, "rewards/drgrpo_math_reward/std": 0.38592514395713806, "step": 488 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.356705240509174e-09, "advantages/std": 0.5927698612213135, "advantages/var": 0.35137610837233524, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.8086021505376344, "grad_norm": 0.1633543706234296, "learning_rate": 8.196770476032114e-07, "loss": 0.0, "num_tokens": 167963062.0, "reward": 0.783203125, "reward_std": 0.15272332727909088, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "step": 489 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 3.599866793707939e-09, "advantages/std": 0.4527429938316345, "advantages/var": 0.20497621846363145, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8515625, "epoch": 2.814336917562724, "grad_norm": 0.1464367433032741, "learning_rate": 8.161466814915533e-07, "loss": -0.0, "num_tokens": 168239377.0, "reward": 0.853515625, "reward_std": 0.0906161442399025, "rewards/drgrpo_math_reward/mean": 0.853515625, "rewards/drgrpo_math_reward/std": 0.35393697023391724, "step": 490 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 5.454851317084751e-10, "advantages/std": 0.6402483582496643, "advantages/var": 0.4099179602413905, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.8200716845878135, "grad_norm": 0.20566564554840297, "learning_rate": 8.126186854142751e-07, "loss": 0.0, "num_tokens": 168551193.0, "reward": 0.75, "reward_std": 0.16102272272109985, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.43343618512153625, "step": 491 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 2.4052515111402953e-10, "advantages/std": 0.48400476574897766, "advantages/var": 0.23426061326772274, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 2.825806451612903, "grad_norm": 0.14259835391841993, "learning_rate": 8.090931048504151e-07, "loss": -0.0, "num_tokens": 168864751.0, "reward": 0.75390625, "reward_std": 0.10799271613359451, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "step": 492 }, { "advantages/mean": -2.9103830456733704e-09, "advantages/snr": 4.125146382249182e-09, "advantages/std": 0.705522358417511, "advantages/var": 0.49776179822700684, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.8315412186379927, "grad_norm": 0.21263049834142375, "learning_rate": 8.055699852478724e-07, "loss": -0.0, "num_tokens": 169197302.0, "reward": 0.74609375, "reward_std": 0.20345906913280487, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "step": 493 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.571349165441145e-09, "advantages/std": 0.45274022221565247, "advantages/var": 0.20497370881187837, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9765625, "epoch": 2.8372759856630827, "grad_norm": 0.14523948304919773, "learning_rate": 8.020493720228223e-07, "loss": -0.0, "num_tokens": 169469589.0, "reward": 0.759765625, "reward_std": 0.0906703919172287, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "step": 494 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 6.284503073336768e-09, "advantages/std": 0.5927740335464478, "advantages/var": 0.35138105484692517, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.843010752688172, "grad_norm": 0.1454002648227422, "learning_rate": 7.985313105591307e-07, "loss": -0.0, "num_tokens": 169782795.0, "reward": 0.744140625, "reward_std": 0.15181267261505127, "rewards/drgrpo_math_reward/mean": 0.744140625, "rewards/drgrpo_math_reward/std": 0.43676990270614624, "step": 495 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.721409787649705e-09, "advantages/std": 0.5133309364318848, "advantages/var": 0.2635086502980357, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 2.848745519713262, "grad_norm": 0.14495323639719965, "learning_rate": 7.950158462077697e-07, "loss": -0.0, "num_tokens": 170077534.0, "reward": 0.787109375, "reward_std": 0.0976499691605568, "rewards/drgrpo_math_reward/mean": 0.787109375, "rewards/drgrpo_math_reward/std": 0.409751296043396, "step": 496 }, { "advantages/mean": -2.9103830456733704e-09, "advantages/snr": 4.391739135531545e-09, "advantages/std": 0.6626948714256287, "advantages/var": 0.4391644926138305, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 2.8544802867383514, "grad_norm": 0.15780742167614628, "learning_rate": 7.915030242862316e-07, "loss": 0.0, "num_tokens": 170392808.0, "reward": 0.8359375, "reward_std": 0.15261822938919067, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.37069445848464966, "step": 497 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.40544094853215e-09, "advantages/std": 0.4839666485786438, "advantages/var": 0.2342237169364445, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9921875, "epoch": 2.860215053763441, "grad_norm": 0.13608430329020385, "learning_rate": 7.879928900779455e-07, "loss": -0.0, "num_tokens": 170662037.0, "reward": 0.876953125, "reward_std": 0.08312932401895523, "rewards/drgrpo_math_reward/mean": 0.876953125, "rewards/drgrpo_math_reward/std": 0.32881227135658264, "step": 498 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5411210656166077, "advantages/var": 0.292812007654053, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 2.8659498207885306, "grad_norm": 0.16652949874975723, "learning_rate": 7.844854888316932e-07, "loss": 0.0, "num_tokens": 170947178.0, "reward": 0.814453125, "reward_std": 0.12505322694778442, "rewards/drgrpo_math_reward/mean": 0.814453125, "rewards/drgrpo_math_reward/std": 0.38912075757980347, "step": 499 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.855850019779855e-10, "advantages/std": 0.5927573442459106, "advantages/var": 0.351361269157465, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.87168458781362, "grad_norm": 0.15719641140679624, "learning_rate": 7.809808657610273e-07, "loss": -0.0, "num_tokens": 171245100.0, "reward": 0.90625, "reward_std": 0.14334052801132202, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29176566004753113, "step": 500 }, { "advantages/mean": 2.9103830456733704e-09, "advantages/snr": 5.1281917750255954e-09, "advantages/std": 0.5675261616706848, "advantages/var": 0.3220859441806603, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 2.8774193548387097, "grad_norm": 0.18309257039811708, "learning_rate": 7.774790660436857e-07, "loss": -0.0, "num_tokens": 171534809.0, "reward": 0.80859375, "reward_std": 0.13590936362743378, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.3937928080558777, "step": 501 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.773974398403802e-10, "advantages/std": 0.616937518119812, "advantages/var": 0.3806119012638334, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 2.8831541218637993, "grad_norm": 0.18922570489766816, "learning_rate": 7.739801348210115e-07, "loss": -0.0, "num_tokens": 171833364.0, "reward": 0.740234375, "reward_std": 0.13248127698898315, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "step": 502 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.040980300623946e-09, "advantages/std": 0.3422335386276245, "advantages/var": 0.11712379496158576, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8359375, "epoch": 2.888888888888889, "grad_norm": 0.07872022511056052, "learning_rate": 7.704841171973706e-07, "loss": -0.0, "num_tokens": 172134387.0, "reward": 0.8671875, "reward_std": 0.05012226477265358, "rewards/drgrpo_math_reward/mean": 0.8671875, "rewards/drgrpo_math_reward/std": 0.33970388770103455, "step": 503 }, { "advantages/mean": 2.9103830456733704e-09, "advantages/snr": 4.717222805279834e-09, "advantages/std": 0.6169695854187012, "advantages/var": 0.380651469331724, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 2.8946236559139784, "grad_norm": 0.1548837411433228, "learning_rate": 7.669910582395698e-07, "loss": -0.0, "num_tokens": 172444965.0, "reward": 0.791015625, "reward_std": 0.1620882898569107, "rewards/drgrpo_math_reward/mean": 0.791015625, "rewards/drgrpo_math_reward/std": 0.40698084235191345, "step": 504 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.1783833090844127e-09, "advantages/std": 0.5927544236183167, "advantages/var": 0.3513578067190828, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 2.900358422939068, "grad_norm": 0.17166260096329386, "learning_rate": 7.635010029762755e-07, "loss": -0.0, "num_tokens": 172742982.0, "reward": 0.84375, "reward_std": 0.13941451907157898, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.36344730854034424, "step": 505 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 1.963958950203137e-10, "advantages/std": 0.5927584171295166, "advantages/var": 0.35136254107789, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8046875, "epoch": 2.9060931899641576, "grad_norm": 0.18868306308434354, "learning_rate": 7.60013996397434e-07, "loss": -0.0, "num_tokens": 173051883.0, "reward": 0.833984375, "reward_std": 0.1434411108493805, "rewards/drgrpo_math_reward/mean": 0.833984375, "rewards/drgrpo_math_reward/std": 0.3724585771560669, "step": 506 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 5.593719267138317e-09, "advantages/std": 0.5411065816879272, "advantages/var": 0.2927963327459935, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8046875, "epoch": 2.9118279569892476, "grad_norm": 0.2497121393286097, "learning_rate": 7.565300834536923e-07, "loss": 0.0, "num_tokens": 173380971.0, "reward": 0.75, "reward_std": 0.11647041887044907, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.43343618512153625, "step": 507 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.273355153839819e-10, "advantages/std": 0.6402289867401123, "advantages/var": 0.4098931554622709, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 2.9175627240143367, "grad_norm": 0.13554234758745362, "learning_rate": 7.530493090558162e-07, "loss": -0.0, "num_tokens": 173679565.0, "reward": 0.833984375, "reward_std": 0.14728689193725586, "rewards/drgrpo_math_reward/mean": 0.833984375, "rewards/drgrpo_math_reward/std": 0.3724585771560669, "step": 508 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6169537305831909, "advantages/var": 0.3806319056805165, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8046875, "epoch": 2.9232974910394267, "grad_norm": 0.17805531696103077, "learning_rate": 7.495717180741139e-07, "loss": -0.0, "num_tokens": 173979223.0, "reward": 0.82421875, "reward_std": 0.14755409955978394, "rewards/drgrpo_math_reward/mean": 0.82421875, "rewards/drgrpo_math_reward/std": 0.3810062110424042, "step": 509 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 3.864537528264306e-09, "advantages/std": 0.662727952003479, "advantages/var": 0.43920833836672557, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 2.9290322580645163, "grad_norm": 0.2160512362885447, "learning_rate": 7.460973553378556e-07, "loss": 0.0, "num_tokens": 174294533.0, "reward": 0.8203125, "reward_std": 0.1815127432346344, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38430243730545044, "step": 510 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.641705000928957e-09, "advantages/std": 0.6169555187225342, "advantages/var": 0.3806341120821912, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 2.934767025089606, "grad_norm": 0.1488616475052658, "learning_rate": 7.426262656346978e-07, "loss": 0.0, "num_tokens": 174607078.0, "reward": 0.892578125, "reward_std": 0.14711399376392365, "rewards/drgrpo_math_reward/mean": 0.892578125, "rewards/drgrpo_math_reward/std": 0.30995169281959534, "step": 511 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.071514477679013e-10, "advantages/std": 0.5133225321769714, "advantages/var": 0.2635000220405779, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 2.9405017921146954, "grad_norm": 0.12786992918889556, "learning_rate": 7.391584937101033e-07, "loss": -0.0, "num_tokens": 174933629.0, "reward": 0.71484375, "reward_std": 0.09369811415672302, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.45193037390708923, "step": 512 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.0570802802262377e-09, "advantages/std": 0.45274001359939575, "advantages/var": 0.20497351991398105, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9609375, "epoch": 2.946236559139785, "grad_norm": 0.15283426411891252, "learning_rate": 7.356940842667663e-07, "loss": 0.0, "num_tokens": 175196936.0, "reward": 0.880859375, "reward_std": 0.09078904986381531, "rewards/drgrpo_math_reward/mean": 0.880859375, "rewards/drgrpo_math_reward/std": 0.32427072525024414, "step": 513 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.733141337114521e-09, "advantages/std": 0.5411072373390198, "advantages/var": 0.2927970423006663, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 2.9519713261648746, "grad_norm": 0.15065505485347655, "learning_rate": 7.322330819640359e-07, "loss": 0.0, "num_tokens": 175501731.0, "reward": 0.779296875, "reward_std": 0.11654369533061981, "rewards/drgrpo_math_reward/mean": 0.779296875, "rewards/drgrpo_math_reward/std": 0.4151262938976288, "step": 514 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 1.700843602297829e-10, "advantages/std": 0.6844563484191895, "advantages/var": 0.46848049289133087, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 2.957706093189964, "grad_norm": 0.17188195005538665, "learning_rate": 7.287755314173401e-07, "loss": -0.0, "num_tokens": 175829937.0, "reward": 0.728515625, "reward_std": 0.1868506520986557, "rewards/drgrpo_math_reward/mean": 0.728515625, "rewards/drgrpo_math_reward/std": 0.44516023993492126, "step": 515 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.1748232568793934e-09, "advantages/std": 0.5133559703826904, "advantages/var": 0.26353435232755373, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8515625, "epoch": 2.9634408602150537, "grad_norm": 0.16238056899978312, "learning_rate": 7.2532147719761e-07, "loss": 0.0, "num_tokens": 176114652.0, "reward": 0.876953125, "reward_std": 0.11563748866319656, "rewards/drgrpo_math_reward/mean": 0.876953125, "rewards/drgrpo_math_reward/std": 0.32881227135658264, "step": 516 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.442140527069891e-09, "advantages/std": 0.5411298871040344, "advantages/var": 0.29282155471722504, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 2.9691756272401433, "grad_norm": 0.15854929276227409, "learning_rate": 7.21870963830706e-07, "loss": 0.0, "num_tokens": 176431855.0, "reward": 0.794921875, "reward_std": 0.13046488165855408, "rewards/drgrpo_math_reward/mean": 0.794921875, "rewards/drgrpo_math_reward/std": 0.4041535556316376, "step": 517 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.547389514548822e-10, "advantages/std": 0.6169832348823547, "advantages/var": 0.3806683121258949, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 2.974910394265233, "grad_norm": 0.15350811205670697, "learning_rate": 7.18424035796845e-07, "loss": 0.0, "num_tokens": 176769742.0, "reward": 0.712890625, "reward_std": 0.1691429167985916, "rewards/drgrpo_math_reward/mean": 0.712890625, "rewards/drgrpo_math_reward/std": 0.45285552740097046, "step": 518 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 2.828607530889913e-09, "advantages/std": 0.4527204632759094, "advantages/var": 0.20495581786875405, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 2.9806451612903224, "grad_norm": 0.09225290131687847, "learning_rate": 7.149807375300238e-07, "loss": -0.0, "num_tokens": 177046990.0, "reward": 0.734375, "reward_std": 0.08012305945158005, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44209739565849304, "step": 519 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.605388284627523e-10, "advantages/std": 0.5411275625228882, "advantages/var": 0.29281903892196226, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6328125, "epoch": 2.9863799283154124, "grad_norm": 0.1542908539426812, "learning_rate": 7.115411134174499e-07, "loss": -0.0, "num_tokens": 177383622.0, "reward": 0.814453125, "reward_std": 0.13154886662960052, "rewards/drgrpo_math_reward/mean": 0.814453125, "rewards/drgrpo_math_reward/std": 0.38912075757980347, "step": 520 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.5356924592071656e-10, "advantages/std": 0.5133298635482788, "advantages/var": 0.26350754881049454, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8515625, "epoch": 2.9921146953405016, "grad_norm": 0.13218068251826057, "learning_rate": 7.081052077989667e-07, "loss": 0.0, "num_tokens": 177682396.0, "reward": 0.87109375, "reward_std": 0.09723131358623505, "rewards/drgrpo_math_reward/mean": 0.87109375, "rewards/drgrpo_math_reward/std": 0.33542385697364807, "step": 521 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 3.3675068004428033e-09, "advantages/std": 0.48398253321647644, "advantages/var": 0.23423909245863772, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 2.9978494623655916, "grad_norm": 0.11106244173842911, "learning_rate": 7.046730649664831e-07, "loss": -0.0, "num_tokens": 177980979.0, "reward": 0.845703125, "reward_std": 0.0946824923157692, "rewards/drgrpo_math_reward/mean": 0.845703125, "rewards/drgrpo_math_reward/std": 0.36158639192581177, "step": 522 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 4.114156498140379e-09, "advantages/std": 0.45274046063423157, "advantages/var": 0.20497392469529618, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9609375, "epoch": 3.0057347670250896, "grad_norm": 0.10825865243172832, "learning_rate": 7.012447291634027e-07, "loss": -0.0, "num_tokens": 178253988.0, "reward": 0.861328125, "reward_std": 0.09542001038789749, "rewards/drgrpo_math_reward/mean": 0.861328125, "rewards/drgrpo_math_reward/std": 0.34594178199768066, "step": 523 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 2.256439454551957e-09, "advantages/std": 0.5675173401832581, "advantages/var": 0.32207593140867985, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 3.011469534050179, "grad_norm": 0.13094895582062505, "learning_rate": 6.97820244584052e-07, "loss": 0.0, "num_tokens": 178589824.0, "reward": 0.744140625, "reward_std": 0.12597328424453735, "rewards/drgrpo_math_reward/mean": 0.744140625, "rewards/drgrpo_math_reward/std": 0.43676990270614624, "step": 524 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.1024181306172613e-10, "advantages/std": 0.5675448775291443, "advantages/var": 0.3221071880095714, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 3.0172043010752687, "grad_norm": 0.13681558983591305, "learning_rate": 6.943996553731131e-07, "loss": 0.0, "num_tokens": 178901535.0, "reward": 0.826171875, "reward_std": 0.14785149693489075, "rewards/drgrpo_math_reward/mean": 0.826171875, "rewards/drgrpo_math_reward/std": 0.3793322443962097, "step": 525 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 7.856741835320363e-10, "advantages/std": 0.2963450253009796, "advantages/var": 0.08782037402063825, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 3.0229390681003583, "grad_norm": 0.05437563667381563, "learning_rate": 6.909830056250526e-07, "loss": 0.0, "num_tokens": 179193585.0, "reward": 0.783203125, "reward_std": 0.0234375, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "step": 526 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.8863881890463823e-09, "advantages/std": 0.48399028182029724, "advantages/var": 0.23424659289649075, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.028673835125448, "grad_norm": 0.13581596678974395, "learning_rate": 6.875703393835541e-07, "loss": 0.0, "num_tokens": 179472526.0, "reward": 0.89453125, "reward_std": 0.10218144953250885, "rewards/drgrpo_math_reward/mean": 0.89453125, "rewards/drgrpo_math_reward/std": 0.3074568510055542, "step": 527 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 3.367500164876164e-09, "advantages/std": 0.48398348689079285, "advantages/var": 0.23424001558297025, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6171875, "epoch": 3.0344086021505374, "grad_norm": 0.10171657675323968, "learning_rate": 6.841617006409493e-07, "loss": 0.0, "num_tokens": 179800323.0, "reward": 0.8515625, "reward_std": 0.09586012363433838, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.35588082671165466, "step": 528 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.38261184096336365, "advantages/var": 0.14639182084537428, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.0401433691756274, "grad_norm": 0.08612559073431363, "learning_rate": 6.807571333376538e-07, "loss": 0.0, "num_tokens": 180078522.0, "reward": 0.908203125, "reward_std": 0.05298367142677307, "rewards/drgrpo_math_reward/mean": 0.908203125, "rewards/drgrpo_math_reward/std": 0.289021372795105, "step": 529 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.48398223519325256, "advantages/var": 0.23423880398265684, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8515625, "epoch": 3.045878136200717, "grad_norm": 0.0951669361314145, "learning_rate": 6.77356681361597e-07, "loss": -0.0, "num_tokens": 180394824.0, "reward": 0.859375, "reward_std": 0.09122256934642792, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3479743003845215, "step": 530 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.8863181639230688e-09, "advantages/std": 0.484002023935318, "advantages/var": 0.23425795917348413, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.0516129032258066, "grad_norm": 0.1146195865811895, "learning_rate": 6.739603885476582e-07, "loss": -0.0, "num_tokens": 180713122.0, "reward": 0.86328125, "reward_std": 0.10634076595306396, "rewards/drgrpo_math_reward/mean": 0.86328125, "rewards/drgrpo_math_reward/std": 0.3438861668109894, "step": 531 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.1749522797573105e-09, "advantages/std": 0.513335108757019, "advantages/var": 0.26351293388258057, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 3.057347670250896, "grad_norm": 0.12532098456111387, "learning_rate": 6.70568298677102e-07, "loss": -0.0, "num_tokens": 181037016.0, "reward": 0.833984375, "reward_std": 0.10205584019422531, "rewards/drgrpo_math_reward/mean": 0.833984375, "rewards/drgrpo_math_reward/std": 0.3724585771560669, "step": 532 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 6.084967723244806e-10, "advantages/std": 0.3826324939727783, "advantages/var": 0.14640762544382824, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 3.0630824372759857, "grad_norm": 0.11411629624885912, "learning_rate": 6.671804554770134e-07, "loss": 0.0, "num_tokens": 181320459.0, "reward": 0.806640625, "reward_std": 0.06403100490570068, "rewards/drgrpo_math_reward/mean": 0.806640625, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "step": 533 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.928134751236398e-10, "advantages/std": 0.5927256941795349, "advantages/var": 0.35132374854061155, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 3.0688172043010753, "grad_norm": 0.15689120044322116, "learning_rate": 6.637969026197332e-07, "loss": -0.0, "num_tokens": 181678667.0, "reward": 0.759765625, "reward_std": 0.11795922368764877, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "step": 534 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.4422732334977564e-09, "advantages/std": 0.541109025478363, "advantages/var": 0.29279897745414374, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 3.074551971326165, "grad_norm": 0.11602013779135575, "learning_rate": 6.604176837222959e-07, "loss": -0.0, "num_tokens": 182004562.0, "reward": 0.724609375, "reward_std": 0.11552447080612183, "rewards/drgrpo_math_reward/mean": 0.724609375, "rewards/drgrpo_math_reward/std": 0.44714778661727905, "step": 535 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.7213266841935173e-09, "advantages/std": 0.5133466124534607, "advantages/var": 0.26352474451744357, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7578125, "epoch": 3.0802867383512544, "grad_norm": 0.12682437007431677, "learning_rate": 6.570428423458686e-07, "loss": 0.0, "num_tokens": 182304972.0, "reward": 0.83203125, "reward_std": 0.10787242650985718, "rewards/drgrpo_math_reward/mean": 0.83203125, "rewards/drgrpo_math_reward/std": 0.374204158782959, "step": 536 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 1.8462004219468555e-09, "advantages/std": 0.5675103664398193, "advantages/var": 0.322068016016658, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 3.086021505376344, "grad_norm": 0.15478342619225094, "learning_rate": 6.536724219951865e-07, "loss": -0.0, "num_tokens": 182613353.0, "reward": 0.85546875, "reward_std": 0.1205412894487381, "rewards/drgrpo_math_reward/mean": 0.85546875, "rewards/drgrpo_math_reward/std": 0.35197147727012634, "step": 537 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.8484839348983715e-09, "advantages/std": 0.4839945137500763, "advantages/var": 0.2342506893401728, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 3.0917562724014336, "grad_norm": 0.12677997665527743, "learning_rate": 6.50306466117995e-07, "loss": -0.0, "num_tokens": 182902954.0, "reward": 0.716796875, "reward_std": 0.0982498973608017, "rewards/drgrpo_math_reward/mean": 0.716796875, "rewards/drgrpo_math_reward/std": 0.4509948492050171, "step": 538 }, { "advantages/mean": 2.2118911147117615e-09, "advantages/snr": 3.73148598441503e-09, "advantages/std": 0.592764139175415, "advantages/var": 0.3513693246923708, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 3.097491039426523, "grad_norm": 0.15564109546794921, "learning_rate": 6.46945018104488e-07, "loss": -0.0, "num_tokens": 183200990.0, "reward": 0.791015625, "reward_std": 0.15004178881645203, "rewards/drgrpo_math_reward/mean": 0.791015625, "rewards/drgrpo_math_reward/std": 0.40698084235191345, "step": 539 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.535216402950783e-09, "advantages/std": 0.5927432775497437, "advantages/var": 0.35134459308041244, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5859375, "epoch": 3.1032258064516127, "grad_norm": 0.13588765843136735, "learning_rate": 6.435881212867493e-07, "loss": 0.0, "num_tokens": 183538292.0, "reward": 0.744140625, "reward_std": 0.13094235956668854, "rewards/drgrpo_math_reward/mean": 0.744140625, "rewards/drgrpo_math_reward/std": 0.43676990270614624, "step": 540 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5133616328239441, "advantages/var": 0.263540166055666, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7109375, "epoch": 3.1089605734767023, "grad_norm": 0.09528215150270615, "learning_rate": 6.402358189381933e-07, "loss": 0.0, "num_tokens": 183892140.0, "reward": 0.775390625, "reward_std": 0.12154807895421982, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "step": 541 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 6.085247836719041e-10, "advantages/std": 0.3826148808002472, "advantages/var": 0.14639414700978737, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.1146953405017923, "grad_norm": 0.08764646106401491, "learning_rate": 6.368881542730071e-07, "loss": -0.0, "num_tokens": 184150068.0, "reward": 0.943359375, "reward_std": 0.05452118441462517, "rewards/drgrpo_math_reward/mean": 0.943359375, "rewards/drgrpo_math_reward/std": 0.23138070106506348, "step": 542 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 5.143046359192473e-10, "advantages/std": 0.4527095854282379, "advantages/var": 0.20494596873860704, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 3.120430107526882, "grad_norm": 0.1652856414195539, "learning_rate": 6.335451704455957e-07, "loss": -0.0, "num_tokens": 184433471.0, "reward": 0.84375, "reward_std": 0.07135801017284393, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.36344730854034424, "step": 543 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.749554418950833e-09, "advantages/std": 0.5927558541297913, "advantages/var": 0.3513595026051384, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 3.1261648745519715, "grad_norm": 0.14374979201615984, "learning_rate": 6.302069105500216e-07, "loss": -0.0, "num_tokens": 184783837.0, "reward": 0.7578125, "reward_std": 0.13830919563770294, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "step": 544 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.4432546151649846e-09, "advantages/std": 0.4839699864387512, "advantages/var": 0.23422694777352504, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7578125, "epoch": 3.131899641577061, "grad_norm": 0.14662032841317657, "learning_rate": 6.268734176194534e-07, "loss": -0.0, "num_tokens": 185060371.0, "reward": 0.775390625, "reward_std": 0.08537977933883667, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "step": 545 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.810670975825565e-09, "advantages/std": 0.48398786783218384, "advantages/var": 0.23424425620874345, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7265625, "epoch": 3.1376344086021506, "grad_norm": 0.13763071735502963, "learning_rate": 6.23544734625608e-07, "loss": -0.0, "num_tokens": 185377453.0, "reward": 0.73828125, "reward_std": 0.09655161201953888, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "step": 546 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.848482276086284e-09, "advantages/std": 0.483994722366333, "advantages/var": 0.23425089127846377, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 3.14336917562724, "grad_norm": 0.15433429201418694, "learning_rate": 6.202209044781989e-07, "loss": -0.0, "num_tokens": 185698382.0, "reward": 0.857421875, "reward_std": 0.10177649557590485, "rewards/drgrpo_math_reward/mean": 0.857421875, "rewards/drgrpo_math_reward/std": 0.3499840497970581, "step": 547 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 1.8182829259860732e-10, "advantages/std": 0.6402486562728882, "advantages/var": 0.4099183418592389, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 3.1491039426523297, "grad_norm": 0.15283745464871998, "learning_rate": 6.169019700243815e-07, "loss": -0.0, "num_tokens": 186079380.0, "reward": 0.740234375, "reward_std": 0.16467590630054474, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "step": 548 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 5.291760556958914e-09, "advantages/std": 0.4839858114719391, "advantages/var": 0.23424226570615136, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8359375, "epoch": 3.1548387096774193, "grad_norm": 0.1032344649503809, "learning_rate": 6.13587974048201e-07, "loss": 0.0, "num_tokens": 186368836.0, "reward": 0.791015625, "reward_std": 0.09727106243371964, "rewards/drgrpo_math_reward/mean": 0.791015625, "rewards/drgrpo_math_reward/std": 0.40698084235191345, "step": 549 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 8.332069918659889e-10, "advantages/std": 0.4191586971282959, "advantages/var": 0.1756940133782905, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 3.160573476702509, "grad_norm": 0.12583463745668524, "learning_rate": 6.10278959270042e-07, "loss": 0.0, "num_tokens": 186673772.0, "reward": 0.87109375, "reward_std": 0.07685574144124985, "rewards/drgrpo_math_reward/mean": 0.87109375, "rewards/drgrpo_math_reward/std": 0.33542385697364807, "step": 550 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 3.1749021439075908e-09, "advantages/std": 0.5133432149887085, "advantages/var": 0.2635212563749434, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 3.1663082437275984, "grad_norm": 0.1259366748999031, "learning_rate": 6.069749683460764e-07, "loss": 0.0, "num_tokens": 186963333.0, "reward": 0.703125, "reward_std": 0.10617542266845703, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45732781291007996, "step": 551 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.45271867513656616, "advantages/var": 0.20495419881740773, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 3.172043010752688, "grad_norm": 0.1257763502692383, "learning_rate": 6.036760438677144e-07, "loss": -0.0, "num_tokens": 187269568.0, "reward": 0.794921875, "reward_std": 0.07933682948350906, "rewards/drgrpo_math_reward/mean": 0.794921875, "rewards/drgrpo_math_reward/std": 0.4041535556316376, "step": 552 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.567516565322876, "advantages/var": 0.32207505191587416, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 3.1777777777777776, "grad_norm": 0.15291075721661262, "learning_rate": 6.003822283610546e-07, "loss": 0.0, "num_tokens": 187567109.0, "reward": 0.87109375, "reward_std": 0.12498891353607178, "rewards/drgrpo_math_reward/mean": 0.87109375, "rewards/drgrpo_math_reward/std": 0.33542385697364807, "step": 553 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.356772542822538e-09, "advantages/std": 0.5927529335021973, "advantages/var": 0.3513560401754603, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.183512544802867, "grad_norm": 0.1486295626729303, "learning_rate": 5.970935642863374e-07, "loss": -0.0, "num_tokens": 187869412.0, "reward": 0.875, "reward_std": 0.13847552239894867, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3310423493385315, "step": 554 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.7494844708323167e-09, "advantages/std": 0.5927709341049194, "advantages/var": 0.35137738031961874, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 3.189247311827957, "grad_norm": 0.14971386219234584, "learning_rate": 5.938100940373956e-07, "loss": -0.0, "num_tokens": 188178184.0, "reward": 0.716796875, "reward_std": 0.15152645111083984, "rewards/drgrpo_math_reward/mean": 0.716796875, "rewards/drgrpo_math_reward/std": 0.4509948492050171, "step": 555 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.2307325481483419e-09, "advantages/std": 0.5675415992736816, "advantages/var": 0.32210346690612823, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 3.1949820788530467, "grad_norm": 0.14664748085683182, "learning_rate": 5.905318599411097e-07, "loss": -0.0, "num_tokens": 188513199.0, "reward": 0.74609375, "reward_std": 0.1464487761259079, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "step": 556 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 2.4946418597261635e-09, "advantages/std": 0.5133275985717773, "advantages/var": 0.2635052234554678, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 3.2007168458781363, "grad_norm": 0.161297132735312, "learning_rate": 5.872589042568604e-07, "loss": 0.0, "num_tokens": 188816644.0, "reward": 0.88671875, "reward_std": 0.09467554092407227, "rewards/drgrpo_math_reward/mean": 0.88671875, "rewards/drgrpo_math_reward/std": 0.3172462284564972, "step": 557 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.1634053001405585e-09, "advantages/std": 0.5411095023155212, "advantages/var": 0.2927994934961511, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8828125, "epoch": 3.206451612903226, "grad_norm": 0.1507883428637121, "learning_rate": 5.839912691759866e-07, "loss": -0.0, "num_tokens": 189121996.0, "reward": 0.83984375, "reward_std": 0.11311560869216919, "rewards/drgrpo_math_reward/mean": 0.83984375, "rewards/drgrpo_math_reward/std": 0.3671095669269562, "step": 558 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 3.928023755379517e-09, "advantages/std": 0.5927424430847168, "advantages/var": 0.35134360383403873, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.2121863799283155, "grad_norm": 0.14583269053890843, "learning_rate": 5.807289968212383e-07, "loss": -0.0, "num_tokens": 189407142.0, "reward": 0.892578125, "reward_std": 0.12980522215366364, "rewards/drgrpo_math_reward/mean": 0.892578125, "rewards/drgrpo_math_reward/std": 0.30995169281959534, "step": 559 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.60579020225265e-10, "advantages/std": 0.5411022901535034, "advantages/var": 0.2927916884093662, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.217921146953405, "grad_norm": 0.14668445436426233, "learning_rate": 5.774721292462356e-07, "loss": -0.0, "num_tokens": 189687211.0, "reward": 0.841796875, "reward_std": 0.11284181475639343, "rewards/drgrpo_math_reward/mean": 0.841796875, "rewards/drgrpo_math_reward/std": 0.36528825759887695, "step": 560 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.5817288134223e-09, "advantages/std": 0.5411040186882019, "advantages/var": 0.29279355904052196, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8359375, "epoch": 3.2236559139784946, "grad_norm": 0.1448439199256027, "learning_rate": 5.742207084349273e-07, "loss": 0.0, "num_tokens": 190004815.0, "reward": 0.701171875, "reward_std": 0.11344269663095474, "rewards/drgrpo_math_reward/mean": 0.701171875, "rewards/drgrpo_math_reward/std": 0.45819199085235596, "step": 561 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.2170459650526033e-09, "advantages/std": 0.3826160132884979, "advantages/var": 0.14639501362478402, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 3.229390681003584, "grad_norm": 0.07755816168862036, "learning_rate": 5.709747763010466e-07, "loss": -0.0, "num_tokens": 190270210.0, "reward": 0.8984375, "reward_std": 0.05243149772286415, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.30236753821372986, "step": 562 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.1109571108846604e-09, "advantages/std": 0.41915324330329895, "advantages/var": 0.17568944137167453, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9609375, "epoch": 3.2351254480286737, "grad_norm": 0.13211010106295823, "learning_rate": 5.677343746875738e-07, "loss": -0.0, "num_tokens": 190549433.0, "reward": 0.87890625, "reward_std": 0.0752047598361969, "rewards/drgrpo_math_reward/mean": 0.87890625, "rewards/drgrpo_math_reward/std": 0.3265552520751953, "step": 563 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.4433025198297015e-09, "advantages/std": 0.48395392298698425, "advantages/var": 0.23421139957449189, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 3.2408602150537633, "grad_norm": 0.10739945087524533, "learning_rate": 5.644995453661954e-07, "loss": -0.0, "num_tokens": 190832125.0, "reward": 0.890625, "reward_std": 0.07537011057138443, "rewards/drgrpo_math_reward/mean": 0.890625, "rewards/drgrpo_math_reward/std": 0.31241437792778015, "step": 564 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.567499577999115, "advantages/var": 0.3220557710291736, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9765625, "epoch": 3.246594982078853, "grad_norm": 0.14994849521435435, "learning_rate": 5.612703300367668e-07, "loss": 0.0, "num_tokens": 191104740.0, "reward": 0.869140625, "reward_std": 0.1184716448187828, "rewards/drgrpo_math_reward/mean": 0.869140625, "rewards/drgrpo_math_reward/std": 0.33757632970809937, "step": 565 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.405252844059139e-09, "advantages/std": 0.48400449752807617, "advantages/var": 0.2342603536274055, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.252329749103943, "grad_norm": 0.13795679865002983, "learning_rate": 5.580467703267735e-07, "loss": 0.0, "num_tokens": 191420943.0, "reward": 0.794921875, "reward_std": 0.10408572852611542, "rewards/drgrpo_math_reward/mean": 0.794921875, "rewards/drgrpo_math_reward/std": 0.4041535556316376, "step": 566 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 2.051220267307662e-10, "advantages/std": 0.567541778087616, "advantages/var": 0.3221036698748527, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.258064516129032, "grad_norm": 0.17681736099133125, "learning_rate": 5.548289077907943e-07, "loss": -0.0, "num_tokens": 191730536.0, "reward": 0.8046875, "reward_std": 0.14528566598892212, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "step": 567 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.773717724931474e-10, "advantages/std": 0.6169794797897339, "advantages/var": 0.38066367848161065, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 3.263799283154122, "grad_norm": 0.14192774831571833, "learning_rate": 5.51616783909968e-07, "loss": -0.0, "num_tokens": 192055565.0, "reward": 0.818359375, "reward_std": 0.16729867458343506, "rewards/drgrpo_math_reward/mean": 0.818359375, "rewards/drgrpo_math_reward/std": 0.38592514395713806, "step": 568 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.5714410785437144e-09, "advantages/std": 0.45272403955459595, "advantages/var": 0.20495905599063136, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.2695340501792116, "grad_norm": 0.12318138190411515, "learning_rate": 5.484104400914552e-07, "loss": -0.0, "num_tokens": 192355782.0, "reward": 0.888671875, "reward_std": 0.08082009106874466, "rewards/drgrpo_math_reward/mean": 0.888671875, "rewards/drgrpo_math_reward/std": 0.31484565138816833, "step": 569 }, { "advantages/mean": 3.3760443329811096e-09, "advantages/snr": 6.57624093213123e-09, "advantages/std": 0.5133699178695679, "advantages/var": 0.26354867257340686, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.275268817204301, "grad_norm": 0.1661914263831332, "learning_rate": 5.452099176679071e-07, "loss": -0.0, "num_tokens": 192636424.0, "reward": 0.92578125, "reward_std": 0.1228528842329979, "rewards/drgrpo_math_reward/mean": 0.92578125, "rewards/drgrpo_math_reward/std": 0.2623828947544098, "step": 570 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.5355802842203247e-10, "advantages/std": 0.513342559337616, "advantages/var": 0.26352058322729377, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8359375, "epoch": 3.2810035842293908, "grad_norm": 0.16457025693843225, "learning_rate": 5.420152578969325e-07, "loss": -0.0, "num_tokens": 192945316.0, "reward": 0.818359375, "reward_std": 0.10444433987140656, "rewards/drgrpo_math_reward/mean": 0.818359375, "rewards/drgrpo_math_reward/std": 0.38592514395713806, "step": 571 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.1783843755203652e-09, "advantages/std": 0.5927538871765137, "advantages/var": 0.3513571707628671, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.2867383512544803, "grad_norm": 0.16012121413236188, "learning_rate": 5.388265019605641e-07, "loss": 0.0, "num_tokens": 193252806.0, "reward": 0.80859375, "reward_std": 0.136393740773201, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.3937928080558777, "step": 572 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 4.082008510798809e-09, "advantages/std": 0.5133442878723145, "advantages/var": 0.26352235789113365, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 3.29247311827957, "grad_norm": 0.1275652092115823, "learning_rate": 5.356436909647302e-07, "loss": -0.0, "num_tokens": 193574046.0, "reward": 0.791015625, "reward_std": 0.10845740139484406, "rewards/drgrpo_math_reward/mean": 0.791015625, "rewards/drgrpo_math_reward/std": 0.40698084235191345, "step": 573 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 3.3674812951264533e-09, "advantages/std": 0.4839861989021301, "advantages/var": 0.23424264072773227, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9140625, "epoch": 3.2982078853046595, "grad_norm": 0.14278291680222485, "learning_rate": 5.324668659387221e-07, "loss": -0.0, "num_tokens": 193875932.0, "reward": 0.705078125, "reward_std": 0.09369116276502609, "rewards/drgrpo_math_reward/mean": 0.705078125, "rewards/drgrpo_math_reward/std": 0.4564536213874817, "step": 574 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 7.27283853142189e-10, "advantages/std": 0.6402744650840759, "advantages/var": 0.40995139063869956, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 3.303942652329749, "grad_norm": 0.15219813751427388, "learning_rate": 5.292960678346674e-07, "loss": -0.0, "num_tokens": 194224123.0, "reward": 0.775390625, "reward_std": 0.18594348430633545, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "step": 575 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.5713115896008884e-09, "advantages/std": 0.45274683833122253, "advantages/var": 0.20497969961891815, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 3.3096774193548386, "grad_norm": 0.11006288633014727, "learning_rate": 5.261313375270013e-07, "loss": -0.0, "num_tokens": 194500000.0, "reward": 0.865234375, "reward_std": 0.09352388232946396, "rewards/drgrpo_math_reward/mean": 0.865234375, "rewards/drgrpo_math_reward/std": 0.3418070077896118, "step": 576 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.057113591757749e-09, "advantages/std": 0.4527326822280884, "advantages/var": 0.20496688155743925, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 3.315412186379928, "grad_norm": 0.12957980699410093, "learning_rate": 5.229727158119396e-07, "loss": 0.0, "num_tokens": 194769005.0, "reward": 0.796875, "reward_std": 0.08741521090269089, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.4027182459831238, "step": 577 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.5356792928414916e-10, "advantages/std": 0.5133313536643982, "advantages/var": 0.26350907865492346, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 3.3211469534050178, "grad_norm": 0.14741947055491092, "learning_rate": 5.198202434069519e-07, "loss": -0.0, "num_tokens": 195050261.0, "reward": 0.8046875, "reward_std": 0.1032325029373169, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "step": 578 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.528610893652193e-09, "advantages/std": 0.6169590950012207, "advantages/var": 0.3806385249047253, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.3268817204301078, "grad_norm": 0.19575346343968025, "learning_rate": 5.166739609502396e-07, "loss": -0.0, "num_tokens": 195356541.0, "reward": 0.759765625, "reward_std": 0.15360400080680847, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "step": 579 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.886475814100697e-09, "advantages/std": 0.4839755892753601, "advantages/var": 0.23423237101443206, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 3.332616487455197, "grad_norm": 0.10387602560798019, "learning_rate": 5.135339090002084e-07, "loss": 0.0, "num_tokens": 195678633.0, "reward": 0.873046875, "reward_std": 0.08725681900978088, "rewards/drgrpo_math_reward/mean": 0.873046875, "rewards/drgrpo_math_reward/std": 0.33324605226516724, "step": 580 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.0571255083399635e-09, "advantages/std": 0.45273005962371826, "advantages/var": 0.2049645068868955, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 3.338351254480287, "grad_norm": 0.12775512024384822, "learning_rate": 5.104001280349479e-07, "loss": -0.0, "num_tokens": 195981976.0, "reward": 0.84375, "reward_std": 0.08587770164012909, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.36344730854034424, "step": 581 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.571482043774397e-09, "advantages/std": 0.4527168273925781, "advantages/var": 0.20495252580440138, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9140625, "epoch": 3.3440860215053765, "grad_norm": 0.12806825382700077, "learning_rate": 5.072726584517085e-07, "loss": 0.0, "num_tokens": 196300076.0, "reward": 0.80078125, "reward_std": 0.077679343521595, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "step": 582 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.53506745820479e-09, "advantages/std": 0.5927682518959045, "advantages/var": 0.35137420045572654, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 3.349820788530466, "grad_norm": 0.1469857474295955, "learning_rate": 5.041515405663821e-07, "loss": -0.0, "num_tokens": 196619739.0, "reward": 0.7734375, "reward_std": 0.1481795608997345, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4190165400505066, "step": 583 }, { "advantages/mean": -3.4924596548080444e-09, "advantages/snr": 7.21590604496012e-09, "advantages/std": 0.48399460315704346, "advantages/var": 0.23425077588514398, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.3555555555555556, "grad_norm": 0.1549935571686468, "learning_rate": 5.010368146129814e-07, "loss": 0.0, "num_tokens": 196892655.0, "reward": 0.861328125, "reward_std": 0.10293962806463242, "rewards/drgrpo_math_reward/mean": 0.861328125, "rewards/drgrpo_math_reward/std": 0.34594178199768066, "step": 584 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.085633013570688e-09, "advantages/std": 0.4527381658554077, "advantages/var": 0.20497184682211866, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 3.361290322580645, "grad_norm": 0.1161635420885722, "learning_rate": 4.979285207431203e-07, "loss": -0.0, "num_tokens": 197215170.0, "reward": 0.90625, "reward_std": 0.08875361829996109, "rewards/drgrpo_math_reward/mean": 0.90625, "rewards/drgrpo_math_reward/std": 0.29176566004753113, "step": 585 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.085752857528543e-09, "advantages/std": 0.452720582485199, "advantages/var": 0.20495592580573785, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 3.3670250896057348, "grad_norm": 0.15313380562401754, "learning_rate": 4.948266990254988e-07, "loss": -0.0, "num_tokens": 197512892.0, "reward": 0.849609375, "reward_std": 0.08149883896112442, "rewards/drgrpo_math_reward/mean": 0.849609375, "rewards/drgrpo_math_reward/std": 0.35780346393585205, "step": 586 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.151400627408177e-09, "advantages/std": 0.541114091873169, "advantages/var": 0.2928044604237243, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9609375, "epoch": 3.3727598566308243, "grad_norm": 0.1330792981659149, "learning_rate": 4.917313894453841e-07, "loss": 0.0, "num_tokens": 197804400.0, "reward": 0.88671875, "reward_std": 0.11713007092475891, "rewards/drgrpo_math_reward/mean": 0.88671875, "rewards/drgrpo_math_reward/std": 0.3172462284564972, "step": 587 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.2908317030182552e-09, "advantages/std": 0.5411177277565002, "advantages/var": 0.2928083952923579, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9140625, "epoch": 3.378494623655914, "grad_norm": 0.14798485845923126, "learning_rate": 4.886426319040964e-07, "loss": -0.0, "num_tokens": 198081580.0, "reward": 0.912109375, "reward_std": 0.12070301175117493, "rewards/drgrpo_math_reward/mean": 0.912109375, "rewards/drgrpo_math_reward/std": 0.2834126651287079, "step": 588 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.0513161355725303e-09, "advantages/std": 0.5675152540206909, "advantages/var": 0.32207356354616934, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 3.3842293906810035, "grad_norm": 0.14043681657091434, "learning_rate": 4.855604662184934e-07, "loss": 0.0, "num_tokens": 198391286.0, "reward": 0.798828125, "reward_std": 0.12394887208938599, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "step": 589 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 4.082123687277044e-09, "advantages/std": 0.513329803943634, "advantages/var": 0.26350748761680975, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9765625, "epoch": 3.389964157706093, "grad_norm": 0.1304889536023605, "learning_rate": 4.8248493212046e-07, "loss": -0.0, "num_tokens": 198696458.0, "reward": 0.873046875, "reward_std": 0.0971047431230545, "rewards/drgrpo_math_reward/mean": 0.873046875, "rewards/drgrpo_math_reward/std": 0.33324605226516724, "step": 590 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.0513191517981716e-09, "advantages/std": 0.5675144195556641, "advantages/var": 0.3220726164036023, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.3956989247311826, "grad_norm": 0.12163622819941129, "learning_rate": 4.794160692563917e-07, "loss": -0.0, "num_tokens": 199007710.0, "reward": 0.90234375, "reward_std": 0.12463457137346268, "rewards/drgrpo_math_reward/mean": 0.90234375, "rewards/drgrpo_math_reward/std": 0.29713961482048035, "step": 591 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.6283239385402842e-09, "advantages/std": 0.5133624076843262, "advantages/var": 0.2635409616234483, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 3.4014336917562726, "grad_norm": 0.14186071274028908, "learning_rate": 4.7635391718668693e-07, "loss": -0.0, "num_tokens": 199353261.0, "reward": 0.78125, "reward_std": 0.12253245711326599, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41380295157432556, "step": 592 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 5.142713226070768e-10, "advantages/std": 0.4527389109134674, "advantages/var": 0.20497252145511258, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8515625, "epoch": 3.4071684587813618, "grad_norm": 0.1303146274754573, "learning_rate": 4.7329851538523545e-07, "loss": -0.0, "num_tokens": 199658344.0, "reward": 0.798828125, "reward_std": 0.09032991528511047, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "step": 593 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.8864786580049818e-09, "advantages/std": 0.4839751124382019, "advantages/var": 0.23423190945957018, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 3.412903225806452, "grad_norm": 0.11191105541155916, "learning_rate": 4.7024990323891103e-07, "loss": -0.0, "num_tokens": 199974429.0, "reward": 0.8515625, "reward_std": 0.09110813587903976, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.35588082671165466, "step": 594 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 5.554970398215216e-09, "advantages/std": 0.4191392958164215, "advantages/var": 0.1756777492974857, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8359375, "epoch": 3.4186379928315414, "grad_norm": 0.10172340783277224, "learning_rate": 4.672081200470611e-07, "loss": -0.0, "num_tokens": 200272994.0, "reward": 0.861328125, "reward_std": 0.0673922598361969, "rewards/drgrpo_math_reward/mean": 0.861328125, "rewards/drgrpo_math_reward/std": 0.34594178199768066, "step": 595 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.7212099106479572e-09, "advantages/std": 0.34224575757980347, "advantages/var": 0.1171321585813736, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9609375, "epoch": 3.424372759856631, "grad_norm": 0.08759449081516499, "learning_rate": 4.641732050210031e-07, "loss": -0.0, "num_tokens": 200572872.0, "reward": 0.810546875, "reward_std": 0.054241843521595, "rewards/drgrpo_math_reward/mean": 0.810546875, "rewards/drgrpo_math_reward/std": 0.3922513723373413, "step": 596 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.886441509946751e-09, "advantages/std": 0.48398134112358093, "advantages/var": 0.23423793855578, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9765625, "epoch": 3.4301075268817205, "grad_norm": 0.11896124211723928, "learning_rate": 4.611451972835175e-07, "loss": 0.0, "num_tokens": 200854418.0, "reward": 0.828125, "reward_std": 0.09358368813991547, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3776407241821289, "step": 597 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.5429161423351942e-09, "advantages/std": 0.4527089297771454, "advantages/var": 0.20494537509996835, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.43584229390681, "grad_norm": 0.09743096188706662, "learning_rate": 4.5812413586834275e-07, "loss": -0.0, "num_tokens": 201152250.0, "reward": 0.75390625, "reward_std": 0.07383356243371964, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "step": 598 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.9243743261171734e-09, "advantages/std": 0.48396122455596924, "advantages/var": 0.23421846687371328, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.4415770609318996, "grad_norm": 0.1301120702960006, "learning_rate": 4.5511005971967366e-07, "loss": -0.0, "num_tokens": 201437219.0, "reward": 0.916015625, "reward_std": 0.07928258180618286, "rewards/drgrpo_math_reward/mean": 0.916015625, "rewards/drgrpo_math_reward/std": 0.2776356339454651, "step": 599 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.205189568333778e-10, "advantages/std": 0.5675204396247864, "advantages/var": 0.3220794493919108, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9765625, "epoch": 3.447311827956989, "grad_norm": 0.14509488175828564, "learning_rate": 4.5210300769165797e-07, "loss": -0.0, "num_tokens": 201752082.0, "reward": 0.84375, "reward_std": 0.1279679238796234, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.36344730854034424, "step": 600 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 6.023925738017299e-09, "advantages/std": 0.5411137342453003, "advantages/var": 0.29280407338889347, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.4530465949820788, "grad_norm": 0.13441716873493229, "learning_rate": 4.4910301854789755e-07, "loss": -0.0, "num_tokens": 202051619.0, "reward": 0.892578125, "reward_std": 0.12424871325492859, "rewards/drgrpo_math_reward/mean": 0.892578125, "rewards/drgrpo_math_reward/std": 0.30995169281959534, "step": 601 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.272734990491316e-09, "advantages/std": 0.6402827501296997, "advantages/var": 0.40996200011365147, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.4587813620071683, "grad_norm": 0.1467972704488675, "learning_rate": 4.461101309609461e-07, "loss": -0.0, "num_tokens": 202381312.0, "reward": 0.806640625, "reward_std": 0.19325673580169678, "rewards/drgrpo_math_reward/mean": 0.806640625, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "step": 602 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 1.8868732630196982e-10, "advantages/std": 0.6169747710227966, "advantages/var": 0.38065786807863233, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 3.464516129032258, "grad_norm": 0.16322798633491956, "learning_rate": 4.431243835118124e-07, "loss": 0.0, "num_tokens": 202729221.0, "reward": 0.697265625, "reward_std": 0.16305816173553467, "rewards/drgrpo_math_reward/mean": 0.697265625, "rewards/drgrpo_math_reward/std": 0.45989060401916504, "step": 603 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 2.2563903992956313e-09, "advantages/std": 0.5675296783447266, "advantages/var": 0.3220899358020688, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 3.4702508960573475, "grad_norm": 0.16674418898730192, "learning_rate": 4.401458146894618e-07, "loss": -0.0, "num_tokens": 203075387.0, "reward": 0.697265625, "reward_std": 0.13353893160820007, "rewards/drgrpo_math_reward/mean": 0.697265625, "rewards/drgrpo_math_reward/std": 0.45989060401916504, "step": 604 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5675270557403564, "advantages/var": 0.32208695899731765, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 3.4759856630824375, "grad_norm": 0.17764678978053158, "learning_rate": 4.37174462890322e-07, "loss": 0.0, "num_tokens": 203376124.0, "reward": 0.818359375, "reward_std": 0.1303846389055252, "rewards/drgrpo_math_reward/mean": 0.818359375, "rewards/drgrpo_math_reward/std": 0.38592514395713806, "step": 605 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.989116878889126e-09, "advantages/std": 0.5133447647094727, "advantages/var": 0.26352284745462384, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9140625, "epoch": 3.481720430107527, "grad_norm": 0.14556271615337782, "learning_rate": 4.3421036641778553e-07, "loss": -0.0, "num_tokens": 203684832.0, "reward": 0.7421875, "reward_std": 0.108929343521595, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "step": 606 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.6664536763430788e-09, "advantages/std": 0.419148713350296, "advantages/var": 0.17568564390320862, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8828125, "epoch": 3.4874551971326166, "grad_norm": 0.1327028816891556, "learning_rate": 4.3125356348171813e-07, "loss": -0.0, "num_tokens": 203964077.0, "reward": 0.775390625, "reward_std": 0.07581022381782532, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "step": 607 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 3.0427135045567887e-09, "advantages/std": 0.38260361552238464, "advantages/var": 0.14638552661080073, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.493189964157706, "grad_norm": 0.10448744852830436, "learning_rate": 4.283040921979646e-07, "loss": 0.0, "num_tokens": 204232777.0, "reward": 0.94140625, "reward_std": 0.05012226849794388, "rewards/drgrpo_math_reward/mean": 0.94140625, "rewards/drgrpo_math_reward/std": 0.23509246110916138, "step": 608 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.302788457909435e-10, "advantages/std": 0.5411157011985779, "advantages/var": 0.2928062020836286, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 3.498924731182796, "grad_norm": 0.17561878793889285, "learning_rate": 4.253619905878588e-07, "loss": -0.0, "num_tokens": 204520944.0, "reward": 0.8046875, "reward_std": 0.11806906759738922, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "step": 609 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.132115098738562e-09, "advantages/std": 0.6169795989990234, "advantages/var": 0.38066382558099576, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 3.5046594982078854, "grad_norm": 0.15992391939659845, "learning_rate": 4.224272965777326e-07, "loss": -0.0, "num_tokens": 204821191.0, "reward": 0.720703125, "reward_std": 0.1684030294418335, "rewards/drgrpo_math_reward/mean": 0.720703125, "rewards/drgrpo_math_reward/std": 0.44909247756004333, "step": 610 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.636556372595551e-10, "advantages/std": 0.6402503252029419, "advantages/var": 0.40992047892247285, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 3.510394265232975, "grad_norm": 0.17234081555738018, "learning_rate": 4.195000479984264e-07, "loss": -0.0, "num_tokens": 205119740.0, "reward": 0.796875, "reward_std": 0.1630127876996994, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.4027182459831238, "step": 611 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.102520673996109e-10, "advantages/std": 0.5675306916236877, "advantages/var": 0.32209108593486135, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 3.5161290322580645, "grad_norm": 0.16516853551310717, "learning_rate": 4.1658028258480426e-07, "loss": 0.0, "num_tokens": 205402755.0, "reward": 0.880859375, "reward_std": 0.13408415019512177, "rewards/drgrpo_math_reward/mean": 0.880859375, "rewards/drgrpo_math_reward/std": 0.32427072525024414, "step": 612 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.1423008271752013e-09, "advantages/std": 0.5927647352218628, "advantages/var": 0.3513700313226451, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 3.521863799283154, "grad_norm": 0.14821893537551872, "learning_rate": 4.1366803797526373e-07, "loss": -0.0, "num_tokens": 205716989.0, "reward": 0.814453125, "reward_std": 0.14475145936012268, "rewards/drgrpo_math_reward/mean": 0.814453125, "rewards/drgrpo_math_reward/std": 0.38912075757980347, "step": 613 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 1.8869217525770157e-10, "advantages/std": 0.6169589161872864, "advantages/var": 0.38063830426299106, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9140625, "epoch": 3.5275985663082436, "grad_norm": 0.17426120738428458, "learning_rate": 4.1076335171125286e-07, "loss": 0.0, "num_tokens": 206037324.0, "reward": 0.818359375, "reward_std": 0.15007738769054413, "rewards/drgrpo_math_reward/mean": 0.818359375, "rewards/drgrpo_math_reward/std": 0.38592514395713806, "step": 614 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.692337632508361e-09, "advantages/std": 0.5675200819969177, "advantages/var": 0.3220790434697882, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.533333333333333, "grad_norm": 0.14943839439129744, "learning_rate": 4.078662612367868e-07, "loss": 0.0, "num_tokens": 206353117.0, "reward": 0.81640625, "reward_std": 0.1298515498638153, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.3875311613082886, "step": 615 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 5.55477686548795e-10, "advantages/std": 0.4191538989543915, "advantages/var": 0.17568999100866822, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.539068100358423, "grad_norm": 0.12842834619702884, "learning_rate": 4.049768038979631e-07, "loss": -0.0, "num_tokens": 206641487.0, "reward": 0.8515625, "reward_std": 0.07662828266620636, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.35588082671165466, "step": 616 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.547490864636814e-10, "advantages/std": 0.616974949836731, "advantages/var": 0.3806580887260367, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.5448028673835124, "grad_norm": 0.18417062384449173, "learning_rate": 4.020950169424815e-07, "loss": -0.0, "num_tokens": 206948183.0, "reward": 0.794921875, "reward_std": 0.16533556580543518, "rewards/drgrpo_math_reward/mean": 0.794921875, "rewards/drgrpo_math_reward/std": 0.4041535556316376, "step": 617 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 5.74369664126654e-09, "advantages/std": 0.5675141215324402, "advantages/var": 0.3220722781387373, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8828125, "epoch": 3.5505376344086024, "grad_norm": 0.15590417345650115, "learning_rate": 3.992209375191634e-07, "loss": -0.0, "num_tokens": 207250926.0, "reward": 0.87890625, "reward_std": 0.12412214279174805, "rewards/drgrpo_math_reward/mean": 0.87890625, "rewards/drgrpo_math_reward/std": 0.3265552520751953, "step": 618 }, { "advantages/mean": -2.2118911147117615e-09, "advantages/snr": 3.897435982260616e-09, "advantages/std": 0.5675246715545654, "advantages/var": 0.32208425282311737, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.5562724014336915, "grad_norm": 0.15084853015770833, "learning_rate": 3.963546026774741e-07, "loss": -0.0, "num_tokens": 207548989.0, "reward": 0.75390625, "reward_std": 0.13471657037734985, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "step": 619 }, { "advantages/mean": 3.259629011154175e-09, "advantages/snr": 5.283368656935815e-09, "advantages/std": 0.616960346698761, "advantages/var": 0.38064006939865536, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 3.5620071684587815, "grad_norm": 0.15672548417982696, "learning_rate": 3.934960493670441e-07, "loss": -0.0, "num_tokens": 207853494.0, "reward": 0.75, "reward_std": 0.1559775471687317, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.43343618512153625, "step": 620 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.9229627649737e-09, "advantages/std": 0.5675378441810608, "advantages/var": 0.32209920457768604, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.567741935483871, "grad_norm": 0.16132355096223355, "learning_rate": 3.9064531443719194e-07, "loss": -0.0, "num_tokens": 208151603.0, "reward": 0.810546875, "reward_std": 0.14276140928268433, "rewards/drgrpo_math_reward/mean": 0.810546875, "rewards/drgrpo_math_reward/std": 0.3922513723373413, "step": 621 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.7212761294408613e-09, "advantages/std": 0.5133561491966248, "advantages/var": 0.26353453591798726, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9765625, "epoch": 3.5734767025089607, "grad_norm": 0.14325395003548821, "learning_rate": 3.8780243463645093e-07, "loss": -0.0, "num_tokens": 208448097.0, "reward": 0.771484375, "reward_std": 0.11366778612136841, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "step": 622 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.102433641051396e-09, "advantages/std": 0.5675427317619324, "advantages/var": 0.3221047523757967, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 3.5792114695340502, "grad_norm": 0.14079319054062883, "learning_rate": 3.849674466120951e-07, "loss": -0.0, "num_tokens": 208775686.0, "reward": 0.8203125, "reward_std": 0.14661605656147003, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38430243730545044, "step": 623 }, { "advantages/mean": 3.958120942115784e-09, "advantages/snr": 5.782838032532784e-09, "advantages/std": 0.684459924697876, "advantages/var": 0.46848538851742205, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.58494623655914, "grad_norm": 0.18965297206023404, "learning_rate": 3.8214038690966577e-07, "loss": -0.0, "num_tokens": 209096183.0, "reward": 0.767578125, "reward_std": 0.1928303837776184, "rewards/drgrpo_math_reward/mean": 0.767578125, "rewards/drgrpo_math_reward/std": 0.42278963327407837, "step": 624 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.535403343407754e-09, "advantages/std": 0.5133625864982605, "advantages/var": 0.263541145216184, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8828125, "epoch": 3.5906810035842294, "grad_norm": 0.18044768978972742, "learning_rate": 3.79321291972501e-07, "loss": 0.0, "num_tokens": 209372558.0, "reward": 0.802734375, "reward_std": 0.12325643002986908, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "step": 625 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.6923395714757357e-09, "advantages/std": 0.5675197839736938, "advantages/var": 0.32207870520154813, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9609375, "epoch": 3.596415770609319, "grad_norm": 0.13969163915186816, "learning_rate": 3.765101981412665e-07, "loss": -0.0, "num_tokens": 209660493.0, "reward": 0.869140625, "reward_std": 0.12639163434505463, "rewards/drgrpo_math_reward/mean": 0.869140625, "rewards/drgrpo_math_reward/std": 0.33757632970809937, "step": 626 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 7.714526878971394e-10, "advantages/std": 0.4527120888233185, "advantages/var": 0.2049482353667722, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 3.6021505376344085, "grad_norm": 0.12858763725946423, "learning_rate": 3.7370714165348616e-07, "loss": -0.0, "num_tokens": 209931089.0, "reward": 0.78125, "reward_std": 0.0737205296754837, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41380295157432556, "step": 627 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 7.215750978974355e-10, "advantages/std": 0.48400500416755676, "advantages/var": 0.23426084405923664, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.607885304659498, "grad_norm": 0.11020447234286741, "learning_rate": 3.709121586430752e-07, "loss": -0.0, "num_tokens": 210207396.0, "reward": 0.81640625, "reward_std": 0.10557594150304794, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.3875311613082886, "step": 628 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 2.7967401024144414e-09, "advantages/std": 0.5411297082901001, "advantages/var": 0.2928213611941288, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 3.6136200716845877, "grad_norm": 0.17058722020067563, "learning_rate": 3.681252851398743e-07, "loss": 0.0, "num_tokens": 210522167.0, "reward": 0.662109375, "reward_std": 0.13268837332725525, "rewards/drgrpo_math_reward/mean": 0.662109375, "rewards/drgrpo_math_reward/std": 0.4734536409378052, "step": 629 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.535094473503778e-09, "advantages/std": 0.5927637219429016, "advantages/var": 0.3513688300516016, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.6193548387096772, "grad_norm": 0.13651801006199424, "learning_rate": 3.6534655706918605e-07, "loss": -0.0, "num_tokens": 210835185.0, "reward": 0.796875, "reward_std": 0.14507952332496643, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.4027182459831238, "step": 630 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 4.114141332246097e-09, "advantages/std": 0.4527421295642853, "advantages/var": 0.20497543588240408, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.6250896057347672, "grad_norm": 0.12239317038704738, "learning_rate": 3.625760102513102e-07, "loss": 0.0, "num_tokens": 211146050.0, "reward": 0.787109375, "reward_std": 0.09283240139484406, "rewards/drgrpo_math_reward/mean": 0.787109375, "rewards/drgrpo_math_reward/std": 0.409751296043396, "step": 631 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.442517440062621e-09, "advantages/std": 0.5410706400871277, "advantages/var": 0.29275743756429407, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9609375, "epoch": 3.6308243727598564, "grad_norm": 0.13458599842161728, "learning_rate": 3.598136804010836e-07, "loss": -0.0, "num_tokens": 211425217.0, "reward": 0.880859375, "reward_std": 0.09001073241233826, "rewards/drgrpo_math_reward/mean": 0.880859375, "rewards/drgrpo_math_reward/std": 0.32427072525024414, "step": 632 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.8862899060157375e-09, "advantages/std": 0.48400676250457764, "advantages/var": 0.23426254615016262, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 3.6365591397849464, "grad_norm": 0.14436920149189938, "learning_rate": 3.570596031274189e-07, "loss": 0.0, "num_tokens": 211714759.0, "reward": 0.818359375, "reward_std": 0.10678636282682419, "rewards/drgrpo_math_reward/mean": 0.818359375, "rewards/drgrpo_math_reward/std": 0.38592514395713806, "step": 633 }, { "advantages/mean": 3.4924596548080444e-09, "advantages/snr": 6.154011748014873e-09, "advantages/std": 0.5675094127655029, "advantages/var": 0.322066933577446, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.642293906810036, "grad_norm": 0.15288688769044656, "learning_rate": 3.5431381393284497e-07, "loss": -0.0, "num_tokens": 211973020.0, "reward": 0.888671875, "reward_std": 0.12152662128210068, "rewards/drgrpo_math_reward/mean": 0.888671875, "rewards/drgrpo_math_reward/std": 0.31484565138816833, "step": 634 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.2219728342255805e-09, "advantages/std": 0.4191421866416931, "advantages/var": 0.1756801726227799, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.6480286738351255, "grad_norm": 0.08792338573392945, "learning_rate": 3.515763482130505e-07, "loss": -0.0, "num_tokens": 212258378.0, "reward": 0.8203125, "reward_std": 0.06805649399757385, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38430243730545044, "step": 635 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5411182641983032, "advantages/var": 0.2928089758489847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9609375, "epoch": 3.653763440860215, "grad_norm": 0.16740119805520542, "learning_rate": 3.488472412564264e-07, "loss": -0.0, "num_tokens": 212545143.0, "reward": 0.84765625, "reward_std": 0.12259122729301453, "rewards/drgrpo_math_reward/mean": 0.84765625, "rewards/drgrpo_math_reward/std": 0.35970520973205566, "step": 636 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.3206771056182065e-09, "advantages/std": 0.5927628874778748, "advantages/var": 0.3513678407711076, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9140625, "epoch": 3.6594982078853047, "grad_norm": 0.14579423869361835, "learning_rate": 3.4612652824361297e-07, "loss": -0.0, "num_tokens": 212854307.0, "reward": 0.80078125, "reward_std": 0.14928391575813293, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "step": 637 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 1.0757235382908334e-09, "advantages/std": 0.541102409362793, "advantages/var": 0.2927918174182196, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.6652329749103942, "grad_norm": 0.14705958522246057, "learning_rate": 3.434142442470437e-07, "loss": 0.0, "num_tokens": 213174432.0, "reward": 0.763671875, "reward_std": 0.11167868226766586, "rewards/drgrpo_math_reward/mean": 0.763671875, "rewards/drgrpo_math_reward/std": 0.42524150013923645, "step": 638 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.896012708835768e-09, "advantages/std": 0.5133635997772217, "advantages/var": 0.26354218557622744, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 3.670967741935484, "grad_norm": 0.16110250008348792, "learning_rate": 3.407104242304951e-07, "loss": 0.0, "num_tokens": 213463894.0, "reward": 0.689453125, "reward_std": 0.11953892558813095, "rewards/drgrpo_math_reward/mean": 0.689453125, "rewards/drgrpo_math_reward/std": 0.46317005157470703, "step": 639 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.8142342202109997e-09, "advantages/std": 0.5133419632911682, "advantages/var": 0.2635199712756311, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.6767025089605734, "grad_norm": 0.13721684792861433, "learning_rate": 3.38015103048635e-07, "loss": -0.0, "num_tokens": 213744224.0, "reward": 0.828125, "reward_std": 0.10679332166910172, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3776407241821289, "step": 640 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.4615020636738457e-09, "advantages/std": 0.5675330758094788, "advantages/var": 0.32209379213776757, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9921875, "epoch": 3.682437275985663, "grad_norm": 0.1942724582218833, "learning_rate": 3.3532831544657456e-07, "loss": 0.0, "num_tokens": 214021898.0, "reward": 0.849609375, "reward_std": 0.13800355792045593, "rewards/drgrpo_math_reward/mean": 0.849609375, "rewards/drgrpo_math_reward/std": 0.35780346393585205, "step": 641 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.773684549516566e-10, "advantages/std": 0.6169849038124084, "advantages/var": 0.3806703715324069, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 3.688172043010753, "grad_norm": 0.1845917765945994, "learning_rate": 3.3265009605941797e-07, "loss": 0.0, "num_tokens": 214343628.0, "reward": 0.8125, "reward_std": 0.17393982410430908, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39069411158561707, "step": 642 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.9242331994786085e-09, "advantages/std": 0.483996719121933, "advantages/var": 0.2342528241207953, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 3.693906810035842, "grad_norm": 0.12310687903439475, "learning_rate": 3.2998047941181893e-07, "loss": 0.0, "num_tokens": 214649435.0, "reward": 0.78515625, "reward_std": 0.10409127175807953, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4111155867576599, "step": 643 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.4431732111918586e-09, "advantages/std": 0.48399728536605835, "advantages/var": 0.23425337224171372, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 3.699641577060932, "grad_norm": 0.16059981959104727, "learning_rate": 3.273194999175328e-07, "loss": -0.0, "num_tokens": 214930862.0, "reward": 0.8515625, "reward_std": 0.10161572694778442, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.35588082671165466, "step": 644 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 6.023828198309366e-09, "advantages/std": 0.5411224961280823, "advantages/var": 0.2928135558158864, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9140625, "epoch": 3.7053763440860212, "grad_norm": 0.14492305476902217, "learning_rate": 3.246671918789755e-07, "loss": 0.0, "num_tokens": 215249701.0, "reward": 0.7421875, "reward_std": 0.12647120654582977, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "step": 645 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.205426559656824e-10, "advantages/std": 0.5675040483474731, "advantages/var": 0.32206084489077114, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 3.7111111111111112, "grad_norm": 0.17178524432408973, "learning_rate": 3.220235894867793e-07, "loss": 0.0, "num_tokens": 215536431.0, "reward": 0.8359375, "reward_std": 0.11669550091028214, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.37069445848464966, "step": 646 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 2.909248618982831e-09, "advantages/std": 0.6402495503425598, "advantages/var": 0.40991948671385003, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9609375, "epoch": 3.716845878136201, "grad_norm": 0.15967655157343486, "learning_rate": 3.193887268193525e-07, "loss": 0.0, "num_tokens": 215835892.0, "reward": 0.8203125, "reward_std": 0.1649497002363205, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38430243730545044, "step": 647 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.4431761437082932e-09, "advantages/std": 0.48399630188941956, "advantages/var": 0.23425242024263415, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 3.7225806451612904, "grad_norm": 0.15360926405809952, "learning_rate": 3.1676263784244173e-07, "loss": 0.0, "num_tokens": 216122057.0, "reward": 0.837890625, "reward_std": 0.10450422763824463, "rewards/drgrpo_math_reward/mean": 0.837890625, "rewards/drgrpo_math_reward/std": 0.3689115643501282, "step": 648 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 4.0820246256351454e-09, "advantages/std": 0.5133422613143921, "advantages/var": 0.2635202772513736, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 3.72831541218638, "grad_norm": 0.12057187918539579, "learning_rate": 3.141453564086921e-07, "loss": 0.0, "num_tokens": 216398129.0, "reward": 0.8125, "reward_std": 0.10628747940063477, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39069411158561707, "step": 649 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 6.154014333401642e-10, "advantages/std": 0.5675091743469238, "advantages/var": 0.3220666629679272, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 3.7340501792114695, "grad_norm": 0.15926247993307774, "learning_rate": 3.1153691625721133e-07, "loss": 0.0, "num_tokens": 216693054.0, "reward": 0.80859375, "reward_std": 0.1180666983127594, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.3937928080558777, "step": 650 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.5428215847386409e-09, "advantages/std": 0.45273667573928833, "advantages/var": 0.2049704975594615, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9609375, "epoch": 3.739784946236559, "grad_norm": 0.12787875168333235, "learning_rate": 3.0893735101313535e-07, "loss": -0.0, "num_tokens": 216961618.0, "reward": 0.88671875, "reward_std": 0.08835325390100479, "rewards/drgrpo_math_reward/mean": 0.88671875, "rewards/drgrpo_math_reward/std": 0.3172462284564972, "step": 651 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 3.872578716443472e-09, "advantages/std": 0.5411060452461243, "advantages/var": 0.2927957522019007, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 3.7455197132616487, "grad_norm": 0.17488225994533893, "learning_rate": 3.0634669418719514e-07, "loss": 0.0, "num_tokens": 217269577.0, "reward": 0.8046875, "reward_std": 0.11300259083509445, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "step": 652 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.000226906034969e-09, "advantages/std": 0.6402479410171509, "advantages/var": 0.4099174259767011, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.7512544802867382, "grad_norm": 0.20686220495279142, "learning_rate": 3.0376497917528343e-07, "loss": -0.0, "num_tokens": 217561156.0, "reward": 0.76171875, "reward_std": 0.1649145483970642, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.42644867300987244, "step": 653 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 3.174964813967151e-09, "advantages/std": 0.5133330821990967, "advantages/var": 0.26351085328002455, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.756989247311828, "grad_norm": 0.14804567188959603, "learning_rate": 3.0119223925802485e-07, "loss": -0.0, "num_tokens": 217826169.0, "reward": 0.77734375, "reward_std": 0.09765692055225372, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41643625497817993, "step": 654 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.1513778775243404e-09, "advantages/std": 0.5411198139190674, "advantages/var": 0.2928106530158061, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 3.762724014336918, "grad_norm": 0.17053802212581443, "learning_rate": 2.986285076003474e-07, "loss": 0.0, "num_tokens": 218100909.0, "reward": 0.7734375, "reward_std": 0.12313222885131836, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4190165400505066, "step": 655 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 1.818208278765465e-10, "advantages/std": 0.6402749419212341, "advantages/var": 0.40995200125223974, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.768458781362007, "grad_norm": 0.18010642754653938, "learning_rate": 2.9607381725105507e-07, "loss": -0.0, "num_tokens": 218402490.0, "reward": 0.787109375, "reward_std": 0.1855122447013855, "rewards/drgrpo_math_reward/mean": 0.787109375, "rewards/drgrpo_math_reward/std": 0.409751296043396, "step": 656 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.721107231122964e-09, "advantages/std": 0.5411182641983032, "advantages/var": 0.2928089758489847, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 3.774193548387097, "grad_norm": 0.16522868422754847, "learning_rate": 2.9352820114240005e-07, "loss": 0.0, "num_tokens": 218692068.0, "reward": 0.8046875, "reward_std": 0.12501347064971924, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "step": 657 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.927921455144884e-10, "advantages/std": 0.5927578806877136, "advantages/var": 0.35136190511738974, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9609375, "epoch": 3.7799283154121865, "grad_norm": 0.15727661632282722, "learning_rate": 2.909916920896599e-07, "loss": 0.0, "num_tokens": 218994008.0, "reward": 0.892578125, "reward_std": 0.1368992179632187, "rewards/drgrpo_math_reward/mean": 0.892578125, "rewards/drgrpo_math_reward/std": 0.30995169281959534, "step": 658 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.45273256301879883, "advantages/var": 0.20496677361757065, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 3.785663082437276, "grad_norm": 0.12326793176115014, "learning_rate": 2.8846432279071466e-07, "loss": -0.0, "num_tokens": 219267167.0, "reward": 0.853515625, "reward_std": 0.08505964279174805, "rewards/drgrpo_math_reward/mean": 0.853515625, "rewards/drgrpo_math_reward/std": 0.35393697023391724, "step": 659 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.0571139980025972e-09, "advantages/std": 0.4527325928211212, "advantages/var": 0.20496680060253514, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 3.7913978494623657, "grad_norm": 0.13262123440598064, "learning_rate": 2.8594612582562394e-07, "loss": -0.0, "num_tokens": 219554730.0, "reward": 0.93359375, "reward_std": 0.08824022859334946, "rewards/drgrpo_math_reward/mean": 0.93359375, "rewards/drgrpo_math_reward/std": 0.2492343932390213, "step": 660 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.705532968044281, "advantages/var": 0.49777676899737244, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9140625, "epoch": 3.7971326164874553, "grad_norm": 0.21695940704000402, "learning_rate": 2.834371336562077e-07, "loss": -0.0, "num_tokens": 219862851.0, "reward": 0.740234375, "reward_std": 0.21411943435668945, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "step": 661 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.5356608600579656e-10, "advantages/std": 0.5133334398269653, "advantages/var": 0.26351122044458464, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9765625, "epoch": 3.802867383512545, "grad_norm": 0.2001023726702437, "learning_rate": 2.8093737862562885e-07, "loss": -0.0, "num_tokens": 220171188.0, "reward": 0.78515625, "reward_std": 0.10084541141986847, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4111155867576599, "step": 662 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.4053879208288676e-09, "advantages/std": 0.4839773178100586, "advantages/var": 0.23423404415461846, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9765625, "epoch": 3.8086021505376344, "grad_norm": 0.1440464194435048, "learning_rate": 2.784468929579741e-07, "loss": 0.0, "num_tokens": 220475273.0, "reward": 0.796875, "reward_std": 0.0892922431230545, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.4027182459831238, "step": 663 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.7210878940326842e-09, "advantages/std": 0.5411243438720703, "advantages/var": 0.2928155555309786, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 3.814336917562724, "grad_norm": 0.16219604948673835, "learning_rate": 2.759657087578403e-07, "loss": -0.0, "num_tokens": 220734835.0, "reward": 0.810546875, "reward_std": 0.12454880028963089, "rewards/drgrpo_math_reward/mean": 0.810546875, "rewards/drgrpo_math_reward/std": 0.3922513723373413, "step": 664 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6169808506965637, "advantages/var": 0.38066537012625545, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9921875, "epoch": 3.8200716845878135, "grad_norm": 0.1734558785267858, "learning_rate": 2.734938580099196e-07, "loss": 0.0, "num_tokens": 221030232.0, "reward": 0.771484375, "reward_std": 0.16906268894672394, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "step": 665 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.4052441612206881e-09, "advantages/std": 0.662747859954834, "advantages/var": 0.43923472587471224, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 3.825806451612903, "grad_norm": 0.2062990076108302, "learning_rate": 2.7103137257858863e-07, "loss": -0.0, "num_tokens": 221345590.0, "reward": 0.71484375, "reward_std": 0.19725781679153442, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.45193037390708923, "step": 666 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.4839722216129303, "advantages/var": 0.23422911129295532, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9765625, "epoch": 3.8315412186379927, "grad_norm": 0.1902413560147162, "learning_rate": 2.685782842074953e-07, "loss": -0.0, "num_tokens": 221624847.0, "reward": 0.802734375, "reward_std": 0.08808041363954544, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "step": 667 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.230812820504071e-09, "advantages/std": 0.5675045847892761, "advantages/var": 0.3220614537568487, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.8372759856630827, "grad_norm": 0.17531042989722373, "learning_rate": 2.6613462451915227e-07, "loss": -0.0, "num_tokens": 221907786.0, "reward": 0.83203125, "reward_std": 0.11729401350021362, "rewards/drgrpo_math_reward/mean": 0.83203125, "rewards/drgrpo_math_reward/std": 0.374204158782959, "step": 668 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.057193895941553e-09, "advantages/std": 0.4527150094509125, "advantages/var": 0.20495087978213977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9140625, "epoch": 3.843010752688172, "grad_norm": 0.11476219949620473, "learning_rate": 2.6370042501452674e-07, "loss": 0.0, "num_tokens": 222181807.0, "reward": 0.802734375, "reward_std": 0.07564390450716019, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "step": 669 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.3330026199448945e-09, "advantages/std": 0.41913673281669617, "advantages/var": 0.17567560079625455, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9609375, "epoch": 3.848745519713262, "grad_norm": 0.13074881430862978, "learning_rate": 2.6127571707263694e-07, "loss": -0.0, "num_tokens": 222449409.0, "reward": 0.912109375, "reward_std": 0.06695909798145294, "rewards/drgrpo_math_reward/mean": 0.912109375, "rewards/drgrpo_math_reward/std": 0.2834126651287079, "step": 670 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 2.909206368668616e-09, "advantages/std": 0.6402588486671448, "advantages/var": 0.4099313932965778, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 3.8544802867383514, "grad_norm": 0.18274125254758827, "learning_rate": 2.5886053195014534e-07, "loss": -0.0, "num_tokens": 222763702.0, "reward": 0.86328125, "reward_std": 0.17413797974586487, "rewards/drgrpo_math_reward/mean": 0.86328125, "rewards/drgrpo_math_reward/std": 0.3438861668109894, "step": 671 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.5355602723645945e-10, "advantages/std": 0.5133448243141174, "advantages/var": 0.2635229086500921, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9609375, "epoch": 3.860215053763441, "grad_norm": 0.1257136284058581, "learning_rate": 2.564549007809568e-07, "loss": -0.0, "num_tokens": 223047258.0, "reward": 0.75390625, "reward_std": 0.10551716387271881, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "step": 672 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 2.5531923341803613e-09, "advantages/std": 0.5927478075027466, "advantages/var": 0.3513499632993131, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.8659498207885306, "grad_norm": 0.23182874353303698, "learning_rate": 2.540588545758179e-07, "loss": 0.0, "num_tokens": 223321089.0, "reward": 0.748046875, "reward_std": 0.1344509720802307, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "step": 673 }, { "advantages/mean": 1.979060471057892e-09, "advantages/snr": 2.89134524289262e-09, "advantages/std": 0.6844773888587952, "advantages/var": 0.4685092958589543, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9765625, "epoch": 3.87168458781362, "grad_norm": 0.21365272382634962, "learning_rate": 2.516724242219157e-07, "loss": -0.0, "num_tokens": 223630066.0, "reward": 0.732421875, "reward_std": 0.20882296562194824, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "step": 674 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 5.106105863857109e-09, "advantages/std": 0.5927801728248596, "advantages/var": 0.35138833329427044, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.8774193548387097, "grad_norm": 0.26570139747654664, "learning_rate": 2.4929564048248066e-07, "loss": 0.0, "num_tokens": 223946827.0, "reward": 0.734375, "reward_std": 0.15763607621192932, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44209739565849304, "step": 675 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.692219359349283e-09, "advantages/std": 0.5675382614135742, "advantages/var": 0.3220996781683425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9609375, "epoch": 3.8831541218637993, "grad_norm": 0.15828955555006316, "learning_rate": 2.4692853399638913e-07, "loss": 0.0, "num_tokens": 224249913.0, "reward": 0.7890625, "reward_std": 0.13930808007717133, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4083731174468994, "step": 676 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.1109761479080987e-09, "advantages/std": 0.4191460609436035, "advantages/var": 0.175683420404539, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.888888888888889, "grad_norm": 0.15276629719138582, "learning_rate": 2.4457113527777007e-07, "loss": -0.0, "num_tokens": 224522630.0, "reward": 0.84375, "reward_std": 0.07317627221345901, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.36344730854034424, "step": 677 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.848576356688285e-09, "advantages/std": 0.4839828908443451, "advantages/var": 0.23423943863004926, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 3.8946236559139784, "grad_norm": 0.16934751142429033, "learning_rate": 2.4222347471560934e-07, "loss": -0.0, "num_tokens": 224790651.0, "reward": 0.853515625, "reward_std": 0.09500166773796082, "rewards/drgrpo_math_reward/mean": 0.853515625, "rewards/drgrpo_math_reward/std": 0.35393697023391724, "step": 678 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.0512780026275952e-09, "advantages/std": 0.5675258040428162, "advantages/var": 0.32208553825444497, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 3.900358422939068, "grad_norm": 0.13594979556848966, "learning_rate": 2.3988558257336044e-07, "loss": -0.0, "num_tokens": 225066029.0, "reward": 0.78125, "reward_std": 0.13252761960029602, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41380295157432556, "step": 679 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.2677727634498434e-09, "advantages/std": 0.5133464932441711, "advantages/var": 0.26352462212608785, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 3.9060931899641576, "grad_norm": 0.1928003018565501, "learning_rate": 2.37557488988552e-07, "loss": 0.0, "num_tokens": 225354419.0, "reward": 0.779296875, "reward_std": 0.10755260288715363, "rewards/drgrpo_math_reward/mean": 0.779296875, "rewards/drgrpo_math_reward/std": 0.4151262938976288, "step": 680 }, { "advantages/mean": 3.14321368932724e-09, "advantages/snr": 5.538666999830783e-09, "advantages/std": 0.5675036311149597, "advantages/var": 0.3220603713286643, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 3.9118279569892476, "grad_norm": 0.14320976482649841, "learning_rate": 2.352392239724016e-07, "loss": -0.0, "num_tokens": 225662406.0, "reward": 0.724609375, "reward_std": 0.11929762363433838, "rewards/drgrpo_math_reward/mean": 0.724609375, "rewards/drgrpo_math_reward/std": 0.44714778661727905, "step": 681 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 3.1749902513442827e-09, "advantages/std": 0.5133289694786072, "advantages/var": 0.2635066309059688, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9765625, "epoch": 3.9175627240143367, "grad_norm": 0.19268822370195293, "learning_rate": 2.3293081740942688e-07, "loss": 0.0, "num_tokens": 225945574.0, "reward": 0.837890625, "reward_std": 0.09605368971824646, "rewards/drgrpo_math_reward/mean": 0.837890625, "rewards/drgrpo_math_reward/std": 0.3689115643501282, "step": 682 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.7213168890634796e-09, "advantages/std": 0.5133484601974487, "advantages/var": 0.2635266415870916, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.9232974910394267, "grad_norm": 0.17349606783479535, "learning_rate": 2.3063229905706106e-07, "loss": -0.0, "num_tokens": 226239280.0, "reward": 0.904296875, "reward_std": 0.10999394953250885, "rewards/drgrpo_math_reward/mean": 0.904296875, "rewards/drgrpo_math_reward/std": 0.2944713830947876, "step": 683 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.1513894893840995e-09, "advantages/std": 0.5411168932914734, "advantages/var": 0.2928074922054158, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.9290322580645163, "grad_norm": 0.1589412689510367, "learning_rate": 2.2834369854527046e-07, "loss": 0.0, "num_tokens": 226505261.0, "reward": 0.869140625, "reward_std": 0.11813901364803314, "rewards/drgrpo_math_reward/mean": 0.869140625, "rewards/drgrpo_math_reward/std": 0.33757632970809937, "step": 684 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 2.6459188922616563e-09, "advantages/std": 0.4839787483215332, "advantages/var": 0.23423542882687798, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8828125, "epoch": 3.934767025089606, "grad_norm": 0.15078959433342837, "learning_rate": 2.2606504537617065e-07, "loss": 0.0, "num_tokens": 226807215.0, "reward": 0.755859375, "reward_std": 0.0906703993678093, "rewards/drgrpo_math_reward/mean": 0.755859375, "rewards/drgrpo_math_reward/std": 0.42999663949012756, "step": 685 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.302995587572159e-10, "advantages/std": 0.541089653968811, "advantages/var": 0.29277801363208766, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9609375, "epoch": 3.9405017921146954, "grad_norm": 0.19890985911157777, "learning_rate": 2.2379636892364717e-07, "loss": -0.0, "num_tokens": 227110502.0, "reward": 0.796875, "reward_std": 0.10260801762342453, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.4027182459831238, "step": 686 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 9.070663456864474e-10, "advantages/std": 0.51337069272995, "advantages/var": 0.2635494681540287, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.946236559139785, "grad_norm": 0.30265659424025637, "learning_rate": 2.2153769843297664e-07, "loss": -0.0, "num_tokens": 227384882.0, "reward": 0.869140625, "reward_std": 0.12724943459033966, "rewards/drgrpo_math_reward/mean": 0.869140625, "rewards/drgrpo_math_reward/std": 0.33757632970809937, "step": 687 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.3606917802677714e-09, "advantages/std": 0.5133358836174011, "advantages/var": 0.263513729409258, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 3.9519713261648746, "grad_norm": 0.1759014553216669, "learning_rate": 2.1928906302045046e-07, "loss": -0.0, "num_tokens": 227642986.0, "reward": 0.796875, "reward_std": 0.10405848175287247, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.4027182459831238, "step": 688 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.8869133669866228e-09, "advantages/std": 0.616961658000946, "advantages/var": 0.3806416874432763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.957706093189964, "grad_norm": 0.19463623879007233, "learning_rate": 2.1705049167299815e-07, "loss": -0.0, "num_tokens": 227942174.0, "reward": 0.759765625, "reward_std": 0.1565905660390854, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "step": 689 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 2.7213108856315355e-09, "advantages/std": 0.5133495926856995, "advantages/var": 0.26352780431057354, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0, "epoch": 3.9634408602150537, "grad_norm": 0.4066671072422913, "learning_rate": 2.1482201324781456e-07, "loss": -0.0, "num_tokens": 228209124.0, "reward": 0.875, "reward_std": 0.10986974835395813, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3310423493385315, "step": 690 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 1.886967510046216e-10, "advantages/std": 0.6169439554214478, "advantages/var": 0.3806198441310613, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 3.9691756272401433, "grad_norm": 0.6601911378597407, "learning_rate": 2.1260365647198797e-07, "loss": -0.0, "num_tokens": 228453151.0, "reward": 0.837890625, "reward_std": 0.13737812638282776, "rewards/drgrpo_math_reward/mean": 0.837890625, "rewards/drgrpo_math_reward/std": 0.3689115643501282, "step": 691 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.1783985948508463e-09, "advantages/std": 0.5927467346191406, "advantages/var": 0.3513486914016539, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 3.974910394265233, "grad_norm": 1.0232483324456232, "learning_rate": 2.1039544994212967e-07, "loss": -0.0, "num_tokens": 228744219.0, "reward": 0.86328125, "reward_std": 0.13417719304561615, "rewards/drgrpo_math_reward/mean": 0.86328125, "rewards/drgrpo_math_reward/std": 0.3438861668109894, "step": 692 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 4.567153604918983e-09, "advantages/std": 0.6627318859100342, "advantages/var": 0.43921355260187056, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 3.9806451612903224, "grad_norm": 1.772551770468125, "learning_rate": 2.0819742212400437e-07, "loss": -0.0, "num_tokens": 228978542.0, "reward": 0.900390625, "reward_std": 0.18489480018615723, "rewards/drgrpo_math_reward/mean": 0.900390625, "rewards/drgrpo_math_reward/std": 0.29977133870124817, "step": 693 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.134857604573951e-09, "advantages/std": 0.8206514716148376, "advantages/var": 0.6734688378635987, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9921875, "epoch": 3.9863799283154124, "grad_norm": 1.8704075364901005, "learning_rate": 2.060096013521646e-07, "loss": 0.0, "num_tokens": 229215502.0, "reward": 0.71484375, "reward_std": 0.2834837734699249, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.45193037390708923, "step": 694 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 2.8804302763244926e-09, "advantages/std": 0.889150857925415, "advantages/var": 0.7905892481495016, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 3.9921146953405016, "grad_norm": 3.3320960737008414, "learning_rate": 2.038320158295851e-07, "loss": 0.0, "num_tokens": 229445960.0, "reward": 0.69140625, "reward_std": 0.33010637760162354, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.4623647928237915, "step": 695 }, { "advantages/mean": 3.026798367500305e-09, "advantages/snr": 3.284602118143365e-09, "advantages/std": 0.9215114116668701, "advantages/var": 0.8491832818322678, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9609375, "epoch": 3.9978494623655916, "grad_norm": 6.583372987311205, "learning_rate": 2.0166469362729865e-07, "loss": -0.0, "num_tokens": 229672705.0, "reward": 0.6640625, "reward_std": 0.38278907537460327, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4727790653705597, "step": 696 }, { "advantages/mean": 3.4924596548080444e-09, "advantages/snr": 4.002608758483306e-09, "advantages/std": 0.8725458383560181, "advantages/var": 0.7613362400324064, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.00573476702509, "grad_norm": 7.68639110623062, "learning_rate": 1.9950766268403462e-07, "loss": -0.0, "num_tokens": 229913110.0, "reward": 0.560546875, "reward_std": 0.33965355157852173, "rewards/drgrpo_math_reward/mean": 0.560546875, "rewards/drgrpo_math_reward/std": 0.49680593609809875, "step": 697 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 1.5427528491259788e-09, "advantages/std": 0.9055137038230896, "advantages/var": 0.81995506781141, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.011469534050179, "grad_norm": 8.409813224286058, "learning_rate": 1.973609508058588e-07, "loss": -0.0, "num_tokens": 230150222.0, "reward": 0.486328125, "reward_std": 0.40682053565979004, "rewards/drgrpo_math_reward/mean": 0.486328125, "rewards/drgrpo_math_reward/std": 0.5003018379211426, "step": 698 }, { "advantages/mean": -6.05359673500061e-09, "advantages/snr": 6.458557691029199e-09, "advantages/std": 0.9372985363006592, "advantages/var": 0.8785285461513581, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 4.017204301075269, "grad_norm": 8.977476511321376, "learning_rate": 1.9522458566581557e-07, "loss": -0.0, "num_tokens": 230399709.0, "reward": 0.458984375, "reward_std": 0.45079731941223145, "rewards/drgrpo_math_reward/mean": 0.458984375, "rewards/drgrpo_math_reward/std": 0.49880221486091614, "step": 699 }, { "advantages/mean": 4.6566128730773926e-09, "advantages/snr": 4.9680995015424616e-09, "advantages/std": 0.9373026490211487, "advantages/var": 0.8785362558620626, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "epoch": 4.022939068100358, "grad_norm": 6.901022579097523, "learning_rate": 1.9309859480356982e-07, "loss": 0.0, "num_tokens": 230693702.0, "reward": 0.392578125, "reward_std": 0.44991397857666016, "rewards/drgrpo_math_reward/mean": 0.392578125, "rewards/drgrpo_math_reward/std": 0.4888018071651459, "step": 700 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 2.0947748303755292e-09, "advantages/std": 0.8891863226890564, "advantages/var": 0.7906523164572867, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.125, "epoch": 4.028673835125448, "grad_norm": 4.60936661197636, "learning_rate": 1.9098300562505264e-07, "loss": 0.0, "num_tokens": 231046363.0, "reward": 0.326171875, "reward_std": 0.38328152894973755, "rewards/drgrpo_math_reward/mean": 0.326171875, "rewards/drgrpo_math_reward/std": 0.4692695140838623, "step": 701 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 4.165976360970246e-10, "advantages/std": 0.8383291959762573, "advantages/var": 0.7027958408261981, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.7421875, "epoch": 4.034408602150537, "grad_norm": 2.8515970143436213, "learning_rate": 1.8887784540210893e-07, "loss": 0.0, "num_tokens": 231444263.0, "reward": 0.25, "reward_std": 0.32979846000671387, "rewards/drgrpo_math_reward/mean": 0.25, "rewards/drgrpo_math_reward/std": 0.43343618512153625, "step": 702 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 5.336751890159132e-10, "advantages/std": 0.8725556135177612, "advantages/var": 0.7613532986813567, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.546875, "epoch": 4.040143369175627, "grad_norm": 4.570706811979059, "learning_rate": 1.86783141272144e-07, "loss": -0.0, "num_tokens": 231872716.0, "reward": 0.23046875, "reward_std": 0.3486773669719696, "rewards/drgrpo_math_reward/mean": 0.23046875, "rewards/drgrpo_math_reward/std": 0.42154473066329956, "step": 703 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 5.131342597262635e-09, "advantages/std": 0.725987434387207, "advantages/var": 0.5270577548881192, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "epoch": 4.045878136200717, "grad_norm": 2.906913376568377, "learning_rate": 1.8469892023777568e-07, "loss": 0.0, "num_tokens": 232334214.0, "reward": 0.15234375, "reward_std": 0.22879299521446228, "rewards/drgrpo_math_reward/mean": 0.15234375, "rewards/drgrpo_math_reward/std": 0.35970520973205566, "step": 704 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 4.354052861790876e-09, "advantages/std": 0.8555914163589478, "advantages/var": 0.7320366717471103, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.046875, "epoch": 4.051612903225807, "grad_norm": 2.5298945866180382, "learning_rate": 1.8262520916648427e-07, "loss": 0.0, "num_tokens": 232805525.0, "reward": 0.171875, "reward_std": 0.31024086475372314, "rewards/drgrpo_math_reward/mean": 0.171875, "rewards/drgrpo_math_reward/std": 0.3776407241821289, "step": 705 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.521292545837355e-09, "advantages/std": 0.7652395367622375, "advantages/var": 0.5855915486240839, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.671875, "epoch": 4.057347670250896, "grad_norm": 2.5502107989966234, "learning_rate": 1.805620347902681e-07, "loss": 0.0, "num_tokens": 233288161.0, "reward": 0.08984375, "reward_std": 0.21923676133155823, "rewards/drgrpo_math_reward/mean": 0.08984375, "rewards/drgrpo_math_reward/std": 0.2862374484539032, "step": 706 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 3.2661660314189134e-09, "advantages/std": 0.7841417193412781, "advantages/var": 0.6148782360114957, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1015625, "epoch": 4.063082437275986, "grad_norm": 2.569589449802741, "learning_rate": 1.7850942370529755e-07, "loss": 0.0, "num_tokens": 233762151.0, "reward": 0.13671875, "reward_std": 0.2433396279811859, "rewards/drgrpo_math_reward/mean": 0.13671875, "rewards/drgrpo_math_reward/std": 0.3438861668109894, "step": 707 }, { "advantages/mean": -1.0011717677116394e-08, "advantages/snr": 1.1056974759994152e-08, "advantages/std": 0.9054662585258484, "advantages/var": 0.8198691453287985, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.21875, "epoch": 4.068817204301075, "grad_norm": 4.8433455514292465, "learning_rate": 1.7646740237157254e-07, "loss": 0.0, "num_tokens": 234209179.0, "reward": 0.20703125, "reward_std": 0.34983617067337036, "rewards/drgrpo_math_reward/mean": 0.20703125, "rewards/drgrpo_math_reward/std": 0.40557438135147095, "step": 708 }, { "advantages/mean": -4.423782229423523e-09, "advantages/snr": 4.800450610578066e-09, "advantages/std": 0.9215347766876221, "advantages/var": 0.8492263446447055, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.328125, "epoch": 4.074551971326165, "grad_norm": 25.357307990413574, "learning_rate": 1.7443599711258217e-07, "loss": 0.0, "num_tokens": 234527612.0, "reward": 0.455078125, "reward_std": 0.41415148973464966, "rewards/drgrpo_math_reward/mean": 0.455078125, "rewards/drgrpo_math_reward/std": 0.4984649419784546, "step": 709 }, { "advantages/mean": 3.14321368932724e-09, "advantages/snr": 3.3535206907954663e-09, "advantages/std": 0.9372876882553101, "advantages/var": 0.8785082105549833, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6484375, "epoch": 4.080286738351255, "grad_norm": 25.115885713174688, "learning_rate": 1.724152341149645e-07, "loss": -0.0, "num_tokens": 234804684.0, "reward": 0.591796875, "reward_std": 0.4300433397293091, "rewards/drgrpo_math_reward/mean": 0.591796875, "rewards/drgrpo_math_reward/std": 0.49198177456855774, "step": 710 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 4.713197557218414e-09, "advantages/std": 0.8891949653625488, "advantages/var": 0.7906676864261044, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.3515625, "epoch": 4.086021505376344, "grad_norm": 10.656088132384975, "learning_rate": 1.7040513942816904e-07, "loss": -0.0, "num_tokens": 235114476.0, "reward": 0.5, "reward_std": 0.3932029604911804, "rewards/drgrpo_math_reward/mean": 0.5, "rewards/drgrpo_math_reward/std": 0.5004889965057373, "step": 711 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 8.163548267034734e-10, "advantages/std": 0.8556229472160339, "advantages/var": 0.732090627802652, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4375, "epoch": 4.091756272401434, "grad_norm": 4.620097004010371, "learning_rate": 1.6840573896412126e-07, "loss": 0.0, "num_tokens": 235535678.0, "reward": 0.265625, "reward_std": 0.35676854848861694, "rewards/drgrpo_math_reward/mean": 0.265625, "rewards/drgrpo_math_reward/std": 0.44209739565849304, "step": 712 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 6.943662721205975e-10, "advantages/std": 0.8382846713066101, "advantages/var": 0.7027211901476313, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6953125, "epoch": 4.097491039426523, "grad_norm": 4.916310404331925, "learning_rate": 1.6641705849688914e-07, "loss": 0.0, "num_tokens": 235997332.0, "reward": 0.14453125, "reward_std": 0.2790958881378174, "rewards/drgrpo_math_reward/mean": 0.14453125, "rewards/drgrpo_math_reward/std": 0.35197147727012634, "step": 713 }, { "advantages/mean": -6.868503987789154e-09, "advantages/snr": 8.758907917484618e-09, "advantages/std": 0.7841735482215881, "advantages/var": 0.6149281537304354, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.103225806451613, "grad_norm": 4.2545525228625065, "learning_rate": 1.6443912366234925e-07, "loss": 0.0, "num_tokens": 236451339.0, "reward": 0.1953125, "reward_std": 0.2787782549858093, "rewards/drgrpo_math_reward/mean": 0.1953125, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "step": 714 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 5.336994330769659e-10, "advantages/std": 0.8725159764289856, "advantages/var": 0.7612841291238261, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.2578125, "epoch": 4.108960573476702, "grad_norm": 6.259241103726965, "learning_rate": 1.6247195995785835e-07, "loss": 0.0, "num_tokens": 236882756.0, "reward": 0.205078125, "reward_std": 0.3086491525173187, "rewards/drgrpo_math_reward/mean": 0.205078125, "rewards/drgrpo_math_reward/std": 0.4041535556316376, "step": 715 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 7.579651995306932e-10, "advantages/std": 0.9215356111526489, "advantages/var": 0.8492278826224862, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4453125, "epoch": 4.114695340501792, "grad_norm": 9.147428006740263, "learning_rate": 1.6051559274192273e-07, "loss": 0.0, "num_tokens": 237300546.0, "reward": 0.392578125, "reward_std": 0.4161492586135864, "rewards/drgrpo_math_reward/mean": 0.392578125, "rewards/drgrpo_math_reward/std": 0.4888018071651459, "step": 716 }, { "advantages/mean": 5.587935447692871e-09, "advantages/snr": 5.9618130449032785e-09, "advantages/std": 0.9372879266738892, "advantages/var": 0.8785086574886378, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.6484375, "epoch": 4.120430107526881, "grad_norm": 14.281627364541597, "learning_rate": 1.58570047233873e-07, "loss": -0.0, "num_tokens": 237703533.0, "reward": 0.44921875, "reward_std": 0.42843863368034363, "rewards/drgrpo_math_reward/mean": 0.44921875, "rewards/drgrpo_math_reward/std": 0.497901052236557, "step": 717 }, { "advantages/mean": -6.170012056827545e-09, "advantages/snr": 7.211123753337807e-09, "advantages/std": 0.8556241989135742, "advantages/var": 0.7320927697664956, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.40625, "epoch": 4.1261648745519715, "grad_norm": 5.5343064551109356, "learning_rate": 1.5663534851353778e-07, "loss": 0.0, "num_tokens": 238127487.0, "reward": 0.3046875, "reward_std": 0.3614029884338379, "rewards/drgrpo_math_reward/mean": 0.3046875, "rewards/drgrpo_math_reward/std": 0.4607250988483429, "step": 718 }, { "advantages/mean": -3.3760443329811096e-09, "advantages/snr": 3.79692012860199e-09, "advantages/std": 0.8891533613204956, "advantages/var": 0.7905936999475358, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "epoch": 4.131899641577061, "grad_norm": 7.105772260226459, "learning_rate": 1.547115215209207e-07, "loss": 0.0, "num_tokens": 238622481.0, "reward": 0.263671875, "reward_std": 0.3387971520423889, "rewards/drgrpo_math_reward/mean": 0.263671875, "rewards/drgrpo_math_reward/std": 0.4410543739795685, "step": 719 }, { "advantages/mean": -3.3760443329811096e-09, "advantages/snr": 4.027309774233489e-09, "advantages/std": 0.8382877111434937, "advantages/var": 0.7027262866541975, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.359375, "epoch": 4.137634408602151, "grad_norm": 6.4721651959237665, "learning_rate": 1.527985910558799e-07, "loss": 0.0, "num_tokens": 239116135.0, "reward": 0.138671875, "reward_std": 0.2789635956287384, "rewards/drgrpo_math_reward/mean": 0.138671875, "rewards/drgrpo_math_reward/std": 0.34594178199768066, "step": 720 }, { "advantages/mean": -4.307366907596588e-09, "advantages/snr": 5.13819898441036e-09, "advantages/std": 0.8383028507232666, "advantages/var": 0.7027516695307554, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7421875, "epoch": 4.14336917562724, "grad_norm": 5.065619532206326, "learning_rate": 1.508965817778065e-07, "loss": 0.0, "num_tokens": 239579191.0, "reward": 0.19921875, "reward_std": 0.30044057965278625, "rewards/drgrpo_math_reward/mean": 0.19921875, "rewards/drgrpo_math_reward/std": 0.39980348944664, "step": 721 }, { "advantages/mean": -5.820766091346741e-09, "advantages/snr": 6.4282963939934014e-09, "advantages/std": 0.9054912328720093, "advantages/var": 0.8199143728080713, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "epoch": 4.14910394265233, "grad_norm": 6.110061059033174, "learning_rate": 1.4900551820530827e-07, "loss": 0.0, "num_tokens": 240022978.0, "reward": 0.26171875, "reward_std": 0.37789881229400635, "rewards/drgrpo_math_reward/mean": 0.26171875, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "step": 722 }, { "advantages/mean": -5.3551048040390015e-09, "advantages/snr": 6.137319594503546e-09, "advantages/std": 0.8725478053092957, "advantages/var": 0.7613396725500685, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6484375, "epoch": 4.15483870967742, "grad_norm": 6.520809119722505, "learning_rate": 1.4712542471589273e-07, "loss": 0.0, "num_tokens": 240480865.0, "reward": 0.232421875, "reward_std": 0.3416900336742401, "rewards/drgrpo_math_reward/mean": 0.232421875, "rewards/drgrpo_math_reward/std": 0.42278963327407837, "step": 723 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 3.665910260859461e-09, "advantages/std": 0.889173150062561, "advantages/var": 0.7906288907921777, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "epoch": 4.160573476702509, "grad_norm": 8.597991583479846, "learning_rate": 1.452563255456536e-07, "loss": 0.0, "num_tokens": 240928400.0, "reward": 0.30078125, "reward_std": 0.3663269877433777, "rewards/drgrpo_math_reward/mean": 0.30078125, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "step": 724 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 2.571314156871267e-09, "advantages/std": 0.9054927825927734, "advantages/var": 0.8199171793276037, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "epoch": 4.166308243727599, "grad_norm": 5.9645043309416, "learning_rate": 1.4339824478895757e-07, "loss": 0.0, "num_tokens": 241389352.0, "reward": 0.28125, "reward_std": 0.3812732398509979, "rewards/drgrpo_math_reward/mean": 0.28125, "rewards/drgrpo_math_reward/std": 0.45004892349243164, "step": 725 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 2.969162176864489e-09, "advantages/std": 0.7841627597808838, "advantages/var": 0.614911233827172, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "epoch": 4.172043010752688, "grad_norm": 3.708280765865338, "learning_rate": 1.415512063981339e-07, "loss": 0.0, "num_tokens": 241878881.0, "reward": 0.20703125, "reward_std": 0.2667984962463379, "rewards/drgrpo_math_reward/mean": 0.20703125, "rewards/drgrpo_math_reward/std": 0.40557438135147095, "step": 726 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.2420846679714254e-09, "advantages/std": 0.9372575283050537, "advantages/var": 0.8784516743644986, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5859375, "epoch": 4.177777777777778, "grad_norm": 5.237068324179484, "learning_rate": 1.3971523418316643e-07, "loss": 0.0, "num_tokens": 242349629.0, "reward": 0.25390625, "reward_std": 0.38156604766845703, "rewards/drgrpo_math_reward/mean": 0.25390625, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "step": 727 }, { "advantages/mean": -7.916241884231567e-09, "advantages/snr": 8.903244216957507e-09, "advantages/std": 0.8891412615776062, "advantages/var": 0.7905721830398171, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "epoch": 4.183512544802867, "grad_norm": 4.039773045877283, "learning_rate": 1.3789035181138596e-07, "loss": 0.0, "num_tokens": 242838101.0, "reward": 0.193359375, "reward_std": 0.32620543241500854, "rewards/drgrpo_math_reward/mean": 0.193359375, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "step": 728 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 2.31417466758633e-09, "advantages/std": 0.9054959416389465, "advantages/var": 0.8199229003246025, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6953125, "epoch": 4.189247311827957, "grad_norm": 2.203421433030281, "learning_rate": 1.3607658280716472e-07, "loss": 0.0, "num_tokens": 243310997.0, "reward": 0.2734375, "reward_std": 0.3824250102043152, "rewards/drgrpo_math_reward/mean": 0.2734375, "rewards/drgrpo_math_reward/std": 0.4461594223976135, "step": 729 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 1.360610144913298e-10, "advantages/std": 0.8556111454963684, "advantages/var": 0.7320704322976077, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.765625, "epoch": 4.194982078853046, "grad_norm": 2.726930300656471, "learning_rate": 1.3427395055161393e-07, "loss": 0.0, "num_tokens": 243765501.0, "reward": 0.287109375, "reward_std": 0.3410155773162842, "rewards/drgrpo_math_reward/mean": 0.287109375, "rewards/drgrpo_math_reward/std": 0.45285552740097046, "step": 730 }, { "advantages/mean": 3.4924596548080444e-09, "advantages/snr": 3.927664104455861e-09, "advantages/std": 0.8891950845718384, "advantages/var": 0.7906678984267188, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.234375, "epoch": 4.200716845878136, "grad_norm": 4.417042183284378, "learning_rate": 1.3248247828228243e-07, "loss": 0.0, "num_tokens": 244219983.0, "reward": 0.423828125, "reward_std": 0.38947027921676636, "rewards/drgrpo_math_reward/mean": 0.423828125, "rewards/drgrpo_math_reward/std": 0.4946470856666565, "step": 731 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 2.3566510671331464e-09, "advantages/std": 0.8891752362251282, "advantages/var": 0.7906326007160125, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3515625, "epoch": 4.2064516129032254, "grad_norm": 4.888428222658115, "learning_rate": 1.3070218909285657e-07, "loss": 0.0, "num_tokens": 244624263.0, "reward": 0.46875, "reward_std": 0.3708840608596802, "rewards/drgrpo_math_reward/mean": 0.46875, "rewards/drgrpo_math_reward/std": 0.4995105266571045, "step": 732 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 2.164740111532761e-09, "advantages/std": 0.9680033922195435, "advantages/var": 0.9370305673485433, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.53125, "epoch": 4.2121863799283155, "grad_norm": 3.1822627216070356, "learning_rate": 1.2893310593286244e-07, "loss": 0.0, "num_tokens": 245038181.0, "reward": 0.443359375, "reward_std": 0.4213073253631592, "rewards/drgrpo_math_reward/mean": 0.443359375, "rewards/drgrpo_math_reward/std": 0.49726733565330505, "step": 733 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 7.579636307352374e-10, "advantages/std": 0.9215375185012817, "advantages/var": 0.8492313980055002, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.640625, "epoch": 4.217921146953405, "grad_norm": 4.989127661422375, "learning_rate": 1.2717525160737065e-07, "loss": 0.0, "num_tokens": 245434121.0, "reward": 0.537109375, "reward_std": 0.4180387258529663, "rewards/drgrpo_math_reward/mean": 0.537109375, "rewards/drgrpo_math_reward/std": 0.4991086423397064, "step": 734 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 1.5159751111431517e-09, "advantages/std": 0.9215084314346313, "advantages/var": 0.8491777892051147, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4765625, "epoch": 4.223655913978495, "grad_norm": 2.5806507245387267, "learning_rate": 1.2542864877670245e-07, "loss": -0.0, "num_tokens": 245849687.0, "reward": 0.513671875, "reward_std": 0.38430657982826233, "rewards/drgrpo_math_reward/mean": 0.513671875, "rewards/drgrpo_math_reward/std": 0.5003018379211426, "step": 735 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 3.974653243817178e-09, "advantages/std": 0.937261700630188, "advantages/var": 0.8784594954681921, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.640625, "epoch": 4.229390681003585, "grad_norm": 2.759360712120659, "learning_rate": 1.2369331995613663e-07, "loss": -0.0, "num_tokens": 246216900.0, "reward": 0.587890625, "reward_std": 0.3935227692127228, "rewards/drgrpo_math_reward/mean": 0.587890625, "rewards/drgrpo_math_reward/std": 0.49269601702690125, "step": 736 }, { "advantages/mean": -4.0745362639427185e-09, "advantages/snr": 4.421427916851783e-09, "advantages/std": 0.9215430617332458, "advantages/var": 0.849241614628685, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4140625, "epoch": 4.235125448028674, "grad_norm": 2.8305873157792174, "learning_rate": 1.2196928751561964e-07, "loss": 0.0, "num_tokens": 246614929.0, "reward": 0.466796875, "reward_std": 0.42713767290115356, "rewards/drgrpo_math_reward/mean": 0.466796875, "rewards/drgrpo_math_reward/std": 0.4993842542171478, "step": 737 }, { "advantages/mean": 6.05359673500061e-09, "advantages/snr": 7.376265840767068e-09, "advantages/std": 0.8206858038902283, "advantages/var": 0.6735251887069502, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.71875, "epoch": 4.240860215053764, "grad_norm": 4.531865839489409, "learning_rate": 1.202565736794785e-07, "loss": -0.0, "num_tokens": 247009016.0, "reward": 0.529296875, "reward_std": 0.3286014795303345, "rewards/drgrpo_math_reward/mean": 0.529296875, "rewards/drgrpo_math_reward/std": 0.49962911009788513, "step": 738 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 2.5265413503943537e-09, "advantages/std": 0.9215390086174011, "advantages/var": 0.8492341444035425, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.421875, "epoch": 4.246594982078853, "grad_norm": 3.501569194120093, "learning_rate": 1.1855520052613211e-07, "loss": -0.0, "num_tokens": 247426131.0, "reward": 0.498046875, "reward_std": 0.41952577233314514, "rewards/drgrpo_math_reward/mean": 0.498046875, "rewards/drgrpo_math_reward/std": 0.5004851818084717, "step": 739 }, { "advantages/mean": 7.2177499532699585e-09, "advantages/snr": 8.117224173453709e-09, "advantages/std": 0.8891894221305847, "advantages/var": 0.7906578284289232, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.65625, "epoch": 4.252329749103943, "grad_norm": 3.226518678571578, "learning_rate": 1.1686518998780881e-07, "loss": -0.0, "num_tokens": 247834061.0, "reward": 0.552734375, "reward_std": 0.3843798041343689, "rewards/drgrpo_math_reward/mean": 0.552734375, "rewards/drgrpo_math_reward/std": 0.4976975917816162, "step": 740 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 2.1347325458015452e-09, "advantages/std": 0.8725426197052002, "advantages/var": 0.7613306232020136, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.734375, "epoch": 4.258064516129032, "grad_norm": 2.2190177247717124, "learning_rate": 1.1518656385026148e-07, "loss": 0.0, "num_tokens": 248242051.0, "reward": 0.548828125, "reward_std": 0.3403865694999695, "rewards/drgrpo_math_reward/mean": 0.548828125, "rewards/drgrpo_math_reward/std": 0.498096764087677, "step": 741 }, { "advantages/mean": 1.7462298274040222e-09, "advantages/snr": 2.127780891812341e-09, "advantages/std": 0.8206812143325806, "advantages/var": 0.673517655558399, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.984375, "epoch": 4.263799283154122, "grad_norm": 1.9052592957222863, "learning_rate": 1.1351934375248906e-07, "loss": -0.0, "num_tokens": 248609560.0, "reward": 0.609375, "reward_std": 0.32576605677604675, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.48836761713027954, "step": 742 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 3.848453550075227e-09, "advantages/std": 0.7259975075721741, "advantages/var": 0.527072381001009, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "epoch": 4.269534050179211, "grad_norm": 2.81859144543926, "learning_rate": 1.1186355118645552e-07, "loss": -0.0, "num_tokens": 248926941.0, "reward": 0.73828125, "reward_std": 0.23164395987987518, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "step": 743 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.405259706318843e-09, "advantages/std": 0.6627405285835266, "advantages/var": 0.43922500822717225, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 4.275268817204301, "grad_norm": 2.6150185864987927, "learning_rate": 1.1021920749681402e-07, "loss": -0.0, "num_tokens": 249215007.0, "reward": 0.806640625, "reward_std": 0.1928926408290863, "rewards/drgrpo_math_reward/mean": 0.806640625, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "step": 744 }, { "advantages/mean": 4.0745362639427185e-09, "advantages/snr": 5.952942758418465e-09, "advantages/std": 0.6844574809074402, "advantages/var": 0.46848204317015885, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6953125, "epoch": 4.28100358422939, "grad_norm": 3.98670834198594, "learning_rate": 1.085863338806312e-07, "loss": -0.0, "num_tokens": 249525069.0, "reward": 0.798828125, "reward_std": 0.18608346581459045, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "step": 745 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.2070717451365926e-10, "advantages/std": 0.7259913682937622, "advantages/var": 0.5270634668370491, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 4.28673835125448, "grad_norm": 2.6058524293124483, "learning_rate": 1.069649513871147e-07, "loss": -0.0, "num_tokens": 249873832.0, "reward": 0.720703125, "reward_std": 0.22323155403137207, "rewards/drgrpo_math_reward/mean": 0.720703125, "rewards/drgrpo_math_reward/std": 0.44909247756004333, "step": 746 }, { "advantages/mean": 3.958120942115784e-09, "advantages/snr": 5.452174160643454e-09, "advantages/std": 0.7259711027145386, "advantages/var": 0.5270340419765631, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "epoch": 4.29247311827957, "grad_norm": 0.7938164933781715, "learning_rate": 1.0535508091734068e-07, "loss": -0.0, "num_tokens": 250181966.0, "reward": 0.763671875, "reward_std": 0.20304685831069946, "rewards/drgrpo_math_reward/mean": 0.763671875, "rewards/drgrpo_math_reward/std": 0.42524150013923645, "step": 747 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.3200847607238852e-09, "advantages/std": 0.7055020928382874, "advantages/var": 0.49773320299920343, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6953125, "epoch": 4.2982078853046595, "grad_norm": 0.36819538898333676, "learning_rate": 1.0375674322398497e-07, "loss": -0.0, "num_tokens": 250500962.0, "reward": 0.763671875, "reward_std": 0.18035967648029327, "rewards/drgrpo_math_reward/mean": 0.763671875, "rewards/drgrpo_math_reward/std": 0.42524150013923645, "step": 748 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.545665771887131e-09, "advantages/std": 0.6402311325073242, "advantages/var": 0.40989590303161094, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 4.3039426523297495, "grad_norm": 0.6738704133342835, "learning_rate": 1.0216995891105629e-07, "loss": -0.0, "num_tokens": 250775680.0, "reward": 0.837890625, "reward_std": 0.14936304092407227, "rewards/drgrpo_math_reward/mean": 0.837890625, "rewards/drgrpo_math_reward/std": 0.3689115643501282, "step": 749 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 2.886455808313393e-09, "advantages/std": 0.7259684205055237, "advantages/var": 0.5270301475712849, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6484375, "epoch": 4.309677419354839, "grad_norm": 4.319784062342443, "learning_rate": 1.0059474843362892e-07, "loss": -0.0, "num_tokens": 251110001.0, "reward": 0.705078125, "reward_std": 0.20250748097896576, "rewards/drgrpo_math_reward/mean": 0.705078125, "rewards/drgrpo_math_reward/std": 0.4564536213874817, "step": 750 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5675346255302429, "advantages/var": 0.32209555117575306, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 4.315412186379929, "grad_norm": 0.35397583843148533, "learning_rate": 9.903113209758096e-08, "loss": -0.0, "num_tokens": 251399321.0, "reward": 0.783203125, "reward_std": 0.13874930143356323, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "step": 751 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 6.564349521126482e-09, "advantages/std": 0.5675033330917358, "advantages/var": 0.3220600330702297, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.765625, "epoch": 4.321146953405018, "grad_norm": 1.2699286031393517, "learning_rate": 9.747913005933061e-08, "loss": -0.0, "num_tokens": 251675814.0, "reward": 0.83203125, "reward_std": 0.11564444750547409, "rewards/drgrpo_math_reward/mean": 0.83203125, "rewards/drgrpo_math_reward/std": 0.374204158782959, "step": 752 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 2.0000339475218994e-09, "advantages/std": 0.64027339220047, "advantages/var": 0.40995001675989684, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.671875, "epoch": 4.326881720430108, "grad_norm": 0.38079951771812853, "learning_rate": 9.59387623255784e-08, "loss": -0.0, "num_tokens": 251971724.0, "reward": 0.69921875, "reward_std": 0.18402248620986938, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "step": 753 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.5607988331870454e-09, "advantages/std": 0.7458701133728027, "advantages/var": 0.5563222260227576, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6328125, "epoch": 4.332616487455197, "grad_norm": 0.4657732380231506, "learning_rate": 9.441004875304736e-08, "loss": -0.0, "num_tokens": 252296083.0, "reward": 0.740234375, "reward_std": 0.22108560800552368, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "step": 754 }, { "advantages/mean": 4.889443516731262e-09, "advantages/snr": 7.377541135461571e-09, "advantages/std": 0.6627470254898071, "advantages/var": 0.43923361979558706, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5703125, "epoch": 4.338351254480287, "grad_norm": 0.49361383972388956, "learning_rate": 9.289300904822827e-08, "loss": -0.0, "num_tokens": 252594940.0, "reward": 0.724609375, "reward_std": 0.19735825061798096, "rewards/drgrpo_math_reward/mean": 0.724609375, "rewards/drgrpo_math_reward/std": 0.44714778661727905, "step": 755 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.102621498954752e-09, "advantages/std": 0.5675167441368103, "advantages/var": 0.3220752548756458, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 4.344086021505376, "grad_norm": 0.1984093872675469, "learning_rate": 9.13876627671255e-08, "loss": -0.0, "num_tokens": 252885032.0, "reward": 0.888671875, "reward_std": 0.12329062819480896, "rewards/drgrpo_math_reward/mean": 0.888671875, "rewards/drgrpo_math_reward/std": 0.31484565138816833, "step": 756 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.3606552124651114e-09, "advantages/std": 0.6844662427902222, "advantages/var": 0.46849403751936336, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.703125, "epoch": 4.349820788530466, "grad_norm": 0.40842434623251006, "learning_rate": 8.989402931500434e-08, "loss": 0.0, "num_tokens": 253192135.0, "reward": 0.833984375, "reward_std": 0.19146154820919037, "rewards/drgrpo_math_reward/mean": 0.833984375, "rewards/drgrpo_math_reward/std": 0.3724585771560669, "step": 757 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 9.434349909307424e-10, "advantages/std": 0.6169758439064026, "advantages/var": 0.38065919196401765, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7578125, "epoch": 4.355555555555555, "grad_norm": 0.3462832210880558, "learning_rate": 8.841212794614128e-08, "loss": -0.0, "num_tokens": 253501027.0, "reward": 0.76953125, "reward_std": 0.16659927368164062, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.42154473066329956, "step": 758 }, { "advantages/mean": 4.0745362639427185e-09, "advantages/snr": 5.462764921432089e-09, "advantages/std": 0.7458743453025818, "advantages/var": 0.556328538980555, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 4.361290322580645, "grad_norm": 0.3492173028249447, "learning_rate": 8.694197776357559e-08, "loss": -0.0, "num_tokens": 253820039.0, "reward": 0.755859375, "reward_std": 0.22633320093154907, "rewards/drgrpo_math_reward/mean": 0.755859375, "rewards/drgrpo_math_reward/std": 0.42999663949012756, "step": 759 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.0190307619908837e-09, "advantages/std": 0.6169679164886475, "advantages/var": 0.38064940997634267, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "epoch": 4.367025089605734, "grad_norm": 0.3740114589802323, "learning_rate": 8.54835977188636e-08, "loss": -0.0, "num_tokens": 254115745.0, "reward": 0.720703125, "reward_std": 0.15711575746536255, "rewards/drgrpo_math_reward/mean": 0.720703125, "rewards/drgrpo_math_reward/std": 0.44909247756004333, "step": 760 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.5095075060681394e-09, "advantages/std": 0.6169711351394653, "advantages/var": 0.3806533815952804, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 4.372759856630824, "grad_norm": 0.7628594982715093, "learning_rate": 8.403700661183355e-08, "loss": -0.0, "num_tokens": 254387413.0, "reward": 0.828125, "reward_std": 0.1611357480287552, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3776407241821289, "step": 761 }, { "advantages/mean": 2.444721758365631e-09, "advantages/snr": 3.6888734138139515e-09, "advantages/std": 0.6627285480499268, "advantages/var": 0.4392091284003641, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "epoch": 4.378494623655914, "grad_norm": 2.673884749795482, "learning_rate": 8.260222309034393e-08, "loss": 0.0, "num_tokens": 254704315.0, "reward": 0.72265625, "reward_std": 0.18154552578926086, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4481254518032074, "step": 762 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.401648398936979e-10, "advantages/std": 0.684464156627655, "advantages/var": 0.4684911817080071, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 4.3842293906810035, "grad_norm": 0.29911755582979926, "learning_rate": 8.117926565004285e-08, "loss": -0.0, "num_tokens": 254994769.0, "reward": 0.755859375, "reward_std": 0.1912364661693573, "rewards/drgrpo_math_reward/mean": 0.755859375, "rewards/drgrpo_math_reward/std": 0.42999663949012756, "step": 763 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 1.8182902048412135e-09, "advantages/std": 0.6402460932731628, "advantages/var": 0.40991505995154753, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 4.3899641577060935, "grad_norm": 0.36558018144192533, "learning_rate": 7.976815263412961e-08, "loss": 0.0, "num_tokens": 255280143.0, "reward": 0.8359375, "reward_std": 0.158061683177948, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.37069445848464966, "step": 764 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.5675068497657776, "advantages/var": 0.32206402453107685, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 4.395698924731183, "grad_norm": 0.16878776422135772, "learning_rate": 7.83689022331182e-08, "loss": -0.0, "num_tokens": 255593242.0, "reward": 0.806640625, "reward_std": 0.11614333093166351, "rewards/drgrpo_math_reward/mean": 0.806640625, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "step": 765 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 2.871762058953609e-09, "advantages/std": 0.567531168460846, "advantages/var": 0.3220916271745331, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7578125, "epoch": 4.401433691756273, "grad_norm": 2.9782304836323417, "learning_rate": 7.698153248460271e-08, "loss": -0.0, "num_tokens": 255882590.0, "reward": 0.845703125, "reward_std": 0.13428467512130737, "rewards/drgrpo_math_reward/mean": 0.845703125, "rewards/drgrpo_math_reward/std": 0.36158639192581177, "step": 766 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 5.102662336306835e-10, "advantages/std": 0.6844387054443359, "advantages/var": 0.46845634151031845, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.407168458781362, "grad_norm": 0.36694475933989806, "learning_rate": 7.560606127302527e-08, "loss": -0.0, "num_tokens": 256195437.0, "reward": 0.783203125, "reward_std": 0.1714237928390503, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "step": 767 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.0513094568187474e-09, "advantages/std": 0.567517101764679, "advantages/var": 0.32207566079538097, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.412903225806452, "grad_norm": 0.21972989548609242, "learning_rate": 7.424250632944484e-08, "loss": 0.0, "num_tokens": 256493828.0, "reward": 0.755859375, "reward_std": 0.13014443218708038, "rewards/drgrpo_math_reward/mean": 0.755859375, "rewards/drgrpo_math_reward/std": 0.42999663949012756, "step": 768 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 3.3329428995338245e-09, "advantages/std": 0.41914424300193787, "advantages/var": 0.17568189644166754, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9765625, "epoch": 4.418637992831541, "grad_norm": 0.1260518239411874, "learning_rate": 7.289088523130926e-08, "loss": -0.0, "num_tokens": 256754073.0, "reward": 0.91015625, "reward_std": 0.06982050836086273, "rewards/drgrpo_math_reward/mean": 0.91015625, "rewards/drgrpo_math_reward/std": 0.2862374484539032, "step": 769 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 2.72148689197311e-09, "advantages/std": 0.5133163928985596, "advantages/var": 0.2634937192183884, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.424372759856631, "grad_norm": 0.16380096652552203, "learning_rate": 7.155121540222808e-08, "loss": -0.0, "num_tokens": 257023441.0, "reward": 0.873046875, "reward_std": 0.0879673883318901, "rewards/drgrpo_math_reward/mean": 0.873046875, "rewards/drgrpo_math_reward/std": 0.33324605226516724, "step": 770 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 5.4429086866074076e-09, "advantages/std": 0.5133225321769714, "advantages/var": 0.2635000220405779, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9921875, "epoch": 4.43010752688172, "grad_norm": 0.8837725904958764, "learning_rate": 7.022351411174865e-08, "loss": -0.0, "num_tokens": 257268805.0, "reward": 0.86328125, "reward_std": 0.09369811415672302, "rewards/drgrpo_math_reward/mean": 0.86328125, "rewards/drgrpo_math_reward/std": 0.3438861668109894, "step": 771 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.5712023923299962e-09, "advantages/std": 0.5927451252937317, "advantages/var": 0.3513467835594817, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.43584229390681, "grad_norm": 0.196671560402542, "learning_rate": 6.890779847513295e-08, "loss": -0.0, "num_tokens": 257574273.0, "reward": 0.806640625, "reward_std": 0.13083267211914062, "rewards/drgrpo_math_reward/mean": 0.806640625, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "step": 772 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 2.160414803424439e-09, "advantages/std": 0.5927419662475586, "advantages/var": 0.3513430385510219, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.4415770609319, "grad_norm": 0.30283625149113114, "learning_rate": 6.760408545313678e-08, "loss": -0.0, "num_tokens": 257891574.0, "reward": 0.818359375, "reward_std": 0.13228076696395874, "rewards/drgrpo_math_reward/mean": 0.818359375, "rewards/drgrpo_math_reward/std": 0.38592514395713806, "step": 773 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 2.0513409119745486e-09, "advantages/std": 0.5675083994865417, "advantages/var": 0.32206578348777626, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 4.447311827956989, "grad_norm": 0.21910369116727957, "learning_rate": 6.631239185179205e-08, "loss": -0.0, "num_tokens": 258168992.0, "reward": 0.82421875, "reward_std": 0.11784065514802933, "rewards/drgrpo_math_reward/mean": 0.82421875, "rewards/drgrpo_math_reward/std": 0.3810062110424042, "step": 774 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.855767866456956e-10, "advantages/std": 0.5927635431289673, "advantages/var": 0.35136861806280706, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 4.453046594982079, "grad_norm": 0.264191588835815, "learning_rate": 6.503273432218914e-08, "loss": 0.0, "num_tokens": 258483644.0, "reward": 0.7578125, "reward_std": 0.1468944400548935, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "step": 775 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.6410327431126133e-09, "advantages/std": 0.5675222277641296, "advantages/var": 0.32208147900636064, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 4.458781362007168, "grad_norm": 0.2440824545648418, "learning_rate": 6.376512936026279e-08, "loss": 0.0, "num_tokens": 258799205.0, "reward": 0.806640625, "reward_std": 0.1292927861213684, "rewards/drgrpo_math_reward/mean": 0.806640625, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "step": 776 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.733007870100216e-09, "advantages/std": 0.5411224961280823, "advantages/var": 0.2928135558158864, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.464516129032258, "grad_norm": 0.220097227701071, "learning_rate": 6.250959330657924e-08, "loss": -0.0, "num_tokens": 259059084.0, "reward": 0.90234375, "reward_std": 0.1291673183441162, "rewards/drgrpo_math_reward/mean": 0.90234375, "rewards/drgrpo_math_reward/std": 0.29713961482048035, "step": 777 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 5.442693813241055e-09, "advantages/std": 0.5133427977561951, "advantages/var": 0.2635208280081578, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 4.4702508960573475, "grad_norm": 0.15713390372924083, "learning_rate": 6.126614234612593e-08, "loss": 0.0, "num_tokens": 259343486.0, "reward": 0.833984375, "reward_std": 0.10695268213748932, "rewards/drgrpo_math_reward/mean": 0.833984375, "rewards/drgrpo_math_reward/std": 0.3724585771560669, "step": 778 }, { "advantages/mean": -3.725290298461914e-09, "advantages/snr": 6.2844777966519496e-09, "advantages/std": 0.5927764177322388, "advantages/var": 0.35138388141946564, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 4.4759856630824375, "grad_norm": 0.25751454552901154, "learning_rate": 6.003479250810217e-08, "loss": 0.0, "num_tokens": 259651891.0, "reward": 0.80859375, "reward_std": 0.15706941485404968, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.3937928080558777, "step": 779 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 3.927890252658936e-09, "advantages/std": 0.5927625894546509, "advantages/var": 0.351367487456983, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 4.481720430107527, "grad_norm": 0.27410417467988246, "learning_rate": 5.881555966571328e-08, "loss": 0.0, "num_tokens": 259959878.0, "reward": 0.724609375, "reward_std": 0.14745557308197021, "rewards/drgrpo_math_reward/mean": 0.724609375, "rewards/drgrpo_math_reward/std": 0.44714778661727905, "step": 780 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 4.994482029219315e-09, "advantages/std": 0.745881199836731, "advantages/var": 0.5563387642698814, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 4.487455197132617, "grad_norm": 1.219282388988274, "learning_rate": 5.760845953596527e-08, "loss": -0.0, "num_tokens": 260280154.0, "reward": 0.775390625, "reward_std": 0.23378583788871765, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "step": 781 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.142412684446116e-09, "advantages/std": 0.5927436351776123, "advantages/var": 0.35134501704357035, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 4.493189964157706, "grad_norm": 0.2302409383548915, "learning_rate": 5.6413507679463066e-08, "loss": 0.0, "num_tokens": 260559086.0, "reward": 0.78515625, "reward_std": 0.13406416773796082, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4111155867576599, "step": 782 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 1.5307899024645274e-09, "advantages/std": 0.6844426393508911, "advantages/var": 0.468461726561614, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 4.498924731182796, "grad_norm": 0.8205527010311658, "learning_rate": 5.523071950020908e-08, "loss": -0.0, "num_tokens": 260893852.0, "reward": 0.787109375, "reward_std": 0.1733008325099945, "rewards/drgrpo_math_reward/mean": 0.787109375, "rewards/drgrpo_math_reward/std": 0.409751296043396, "step": 783 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.1026632953692145e-10, "advantages/std": 0.5675109624862671, "advantages/var": 0.32206869254208925, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 4.504659498207886, "grad_norm": 2.052771171513005, "learning_rate": 5.4060110245405136e-08, "loss": -0.0, "num_tokens": 261191366.0, "reward": 0.830078125, "reward_std": 0.12061456590890884, "rewards/drgrpo_math_reward/mean": 0.830078125, "rewards/drgrpo_math_reward/std": 0.3759314715862274, "step": 784 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.5456247718924694e-09, "advantages/std": 0.6402414441108704, "advantages/var": 0.40990910675717274, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 4.510394265232975, "grad_norm": 0.23770498822754227, "learning_rate": 5.2901695005255765e-08, "loss": -0.0, "num_tokens": 261479764.0, "reward": 0.849609375, "reward_std": 0.15957194566726685, "rewards/drgrpo_math_reward/mean": 0.849609375, "rewards/drgrpo_math_reward/std": 0.35780346393585205, "step": 785 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.8142801436253046e-09, "advantages/std": 0.5133289694786072, "advantages/var": 0.2635066309059688, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8828125, "epoch": 4.516129032258064, "grad_norm": 11.409456462029091, "learning_rate": 5.175548871277358e-08, "loss": -0.0, "num_tokens": 261742265.0, "reward": 0.88671875, "reward_std": 0.09734338521957397, "rewards/drgrpo_math_reward/mean": 0.88671875, "rewards/drgrpo_math_reward/std": 0.3172462284564972, "step": 786 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 1.3208493102526126e-09, "advantages/std": 0.6169570088386536, "advantages/var": 0.38063595075513845, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8515625, "epoch": 4.521863799283154, "grad_norm": 0.47828630744156886, "learning_rate": 5.06215061435874e-08, "loss": -0.0, "num_tokens": 262075041.0, "reward": 0.69140625, "reward_std": 0.1479862928390503, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.4623647928237915, "step": 787 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 9.620976424250035e-10, "advantages/std": 0.48400625586509705, "advantages/var": 0.2342620557165498, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.796875, "epoch": 4.527598566308244, "grad_norm": 0.2614049311644065, "learning_rate": 4.9499761915750335e-08, "loss": -0.0, "num_tokens": 262394300.0, "reward": 0.71875, "reward_std": 0.10612116754055023, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45004892349243164, "step": 788 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 8.605806317657228e-10, "advantages/std": 0.5411012768745422, "advantages/var": 0.29279059183526, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8515625, "epoch": 4.533333333333333, "grad_norm": 0.20317511478239234, "learning_rate": 4.8390270489553245e-08, "loss": -0.0, "num_tokens": 262680922.0, "reward": 0.88671875, "reward_std": 0.10833083093166351, "rewards/drgrpo_math_reward/mean": 0.88671875, "rewards/drgrpo_math_reward/std": 0.3172462284564972, "step": 789 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.401859322807059e-10, "advantages/std": 0.684421718120575, "advantages/var": 0.46843308823511975, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7578125, "epoch": 4.539068100358423, "grad_norm": 0.31980400098181055, "learning_rate": 4.729304616733687e-08, "loss": -0.0, "num_tokens": 262992027.0, "reward": 0.6875, "reward_std": 0.15736782550811768, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4639657139778137, "step": 790 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 5.280282833421398e-09, "advantages/std": 0.705509603023529, "advantages/var": 0.49774379995841755, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.544802867383512, "grad_norm": 0.33136049222113123, "learning_rate": 4.620810309330803e-08, "loss": -0.0, "num_tokens": 263308007.0, "reward": 0.83203125, "reward_std": 0.19274532794952393, "rewards/drgrpo_math_reward/mean": 0.83203125, "rewards/drgrpo_math_reward/std": 0.374204158782959, "step": 791 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 3.692424888056325e-09, "advantages/std": 0.5675066709518433, "advantages/var": 0.3220638215748437, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.550537634408602, "grad_norm": 0.24336329844797921, "learning_rate": 4.513545525335705e-08, "loss": 0.0, "num_tokens": 263595505.0, "reward": 0.857421875, "reward_std": 0.11893805116415024, "rewards/drgrpo_math_reward/mean": 0.857421875, "rewards/drgrpo_math_reward/std": 0.3499840497970581, "step": 792 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 1.9639804763367311e-10, "advantages/std": 0.5927519202232361, "advantages/var": 0.35135483892833363, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.96875, "epoch": 4.5562724014336915, "grad_norm": 0.24081194446519236, "learning_rate": 4.407511647487816e-08, "loss": -0.0, "num_tokens": 263885381.0, "reward": 0.875, "reward_std": 0.13893014192581177, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3310423493385315, "step": 793 }, { "advantages/mean": 3.841705620288849e-09, "advantages/snr": 7.099462538784409e-09, "advantages/std": 0.5411262512207031, "advantages/var": 0.2928176197601715, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.5620071684587815, "grad_norm": 0.20626935780844513, "learning_rate": 4.30271004265903e-08, "loss": -0.0, "num_tokens": 264185308.0, "reward": 0.8515625, "reward_std": 0.1258736550807953, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.35588082671165466, "step": 794 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.85595271384983e-10, "advantages/std": 0.5927495956420898, "advantages/var": 0.351352083133861, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.567741935483871, "grad_norm": 0.3313189979526605, "learning_rate": 4.199142061836136e-08, "loss": -0.0, "num_tokens": 264450040.0, "reward": 0.865234375, "reward_std": 0.1340775489807129, "rewards/drgrpo_math_reward/mean": 0.865234375, "rewards/drgrpo_math_reward/std": 0.3418070077896118, "step": 795 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.513231101544016e-10, "advantages/std": 0.6627250909805298, "advantages/var": 0.4392045462151515, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 4.573476702508961, "grad_norm": 0.28315487447958143, "learning_rate": 4.096809040103444e-08, "loss": -0.0, "num_tokens": 264768923.0, "reward": 0.646484375, "reward_std": 0.17737406492233276, "rewards/drgrpo_math_reward/mean": 0.646484375, "rewards/drgrpo_math_reward/std": 0.47852855920791626, "step": 796 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6169514060020447, "advantages/var": 0.38062903736789977, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.57921146953405, "grad_norm": 0.34426939547375307, "learning_rate": 3.995712296625475e-08, "loss": -0.0, "num_tokens": 265041887.0, "reward": 0.83203125, "reward_std": 0.14172375202178955, "rewards/drgrpo_math_reward/mean": 0.83203125, "rewards/drgrpo_math_reward/std": 0.374204158782959, "step": 797 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.027036693504973e-10, "advantages/std": 0.662670910358429, "advantages/var": 0.439132735435269, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.58494623655914, "grad_norm": 0.33297519077024823, "learning_rate": 3.895853134630034e-08, "loss": 0.0, "num_tokens": 265329939.0, "reward": 0.771484375, "reward_std": 0.1340966522693634, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "step": 798 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 2.256450119020363e-09, "advantages/std": 0.5675146579742432, "advantages/var": 0.3220728870156222, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 4.59068100358423, "grad_norm": 1.18450339461088, "learning_rate": 3.797232841391407e-08, "loss": 0.0, "num_tokens": 265634169.0, "reward": 0.84375, "reward_std": 0.1279679238796234, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.36344730854034424, "step": 799 }, { "advantages/mean": 4.423782229423523e-09, "advantages/snr": 8.175375515516081e-09, "advantages/std": 0.5411105751991272, "advantages/var": 0.2928006545923303, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.596415770609319, "grad_norm": 0.1944220550404474, "learning_rate": 3.699852688213745e-08, "loss": -0.0, "num_tokens": 265943640.0, "reward": 0.85546875, "reward_std": 0.11735516041517258, "rewards/drgrpo_math_reward/mean": 0.85546875, "rewards/drgrpo_math_reward/std": 0.35197147727012634, "step": 800 }, { "advantages/mean": 1.979060471057892e-09, "advantages/snr": 3.855420513431888e-09, "advantages/std": 0.5133189558982849, "advantages/var": 0.26349635048450537, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 4.602150537634409, "grad_norm": 0.21129304414749323, "learning_rate": 3.6037139304146756e-08, "loss": 0.0, "num_tokens": 266210004.0, "reward": 0.859375, "reward_std": 0.08989076316356659, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3479743003845215, "step": 801 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 4.169210725800228e-09, "advantages/std": 0.7259883284568787, "advantages/var": 0.5270590530556127, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8359375, "epoch": 4.607885304659498, "grad_norm": 0.307389646914457, "learning_rate": 3.508817807309094e-08, "loss": -0.0, "num_tokens": 266533857.0, "reward": 0.81640625, "reward_std": 0.2256123572587967, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.3875311613082886, "step": 802 }, { "advantages/mean": -4.889443516731262e-09, "advantages/snr": 8.61557029833357e-09, "advantages/std": 0.5675124526023865, "advantages/var": 0.32207038385877595, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.953125, "epoch": 4.613620071684588, "grad_norm": 1.1998236390811272, "learning_rate": 3.4151655421932654e-08, "loss": 0.0, "num_tokens": 266821378.0, "reward": 0.748046875, "reward_std": 0.12180736660957336, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "step": 803 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 1.698208084863039e-09, "advantages/std": 0.616966724395752, "advantages/var": 0.38064793901162375, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.619354838709677, "grad_norm": 0.28728040677523475, "learning_rate": 3.322758342329002e-08, "loss": 0.0, "num_tokens": 267116505.0, "reward": 0.75, "reward_std": 0.15760092437267303, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.43343618512153625, "step": 804 }, { "advantages/mean": -2.444721758365631e-09, "advantages/snr": 3.5717952041184526e-09, "advantages/std": 0.6844518184661865, "advantages/var": 0.46847429180166955, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 4.625089605734767, "grad_norm": 0.37903712947514134, "learning_rate": 3.2315973989280654e-08, "loss": 0.0, "num_tokens": 267409911.0, "reward": 0.765625, "reward_std": 0.1830492913722992, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42402184009552, "step": 805 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.856015911535099e-10, "advantages/std": 0.5927448272705078, "advantages/var": 0.35134643025594414, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8515625, "epoch": 4.630824372759856, "grad_norm": 4.71934259473162, "learning_rate": 3.141683887136892e-08, "loss": 0.0, "num_tokens": 267702607.0, "reward": 0.705078125, "reward_std": 0.13154324889183044, "rewards/drgrpo_math_reward/mean": 0.705078125, "rewards/drgrpo_math_reward/std": 0.4564536213874817, "step": 806 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.0909316700410061e-09, "advantages/std": 0.640271008014679, "advantages/var": 0.4099469637041331, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8046875, "epoch": 4.636559139784946, "grad_norm": 0.5264073248252015, "learning_rate": 3.053018966021392e-08, "loss": -0.0, "num_tokens": 267997432.0, "reward": 0.736328125, "reward_std": 0.1805422157049179, "rewards/drgrpo_math_reward/mean": 0.736328125, "rewards/drgrpo_math_reward/std": 0.4410543739795685, "step": 807 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 5.102563244260762e-10, "advantages/std": 0.6844519972801208, "advantages/var": 0.46847453658074656, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 4.6422939068100355, "grad_norm": 0.30127677781763657, "learning_rate": 2.9656037785520395e-08, "loss": -0.0, "num_tokens": 268320378.0, "reward": 0.6953125, "reward_std": 0.1853528767824173, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4607250988483429, "step": 808 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.512948110388532e-09, "advantages/std": 0.5675086379051208, "advantages/var": 0.32206605409692557, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.6480286738351255, "grad_norm": 0.3487141362642593, "learning_rate": 2.8794394515890607e-08, "loss": -0.0, "num_tokens": 268615586.0, "reward": 0.859375, "reward_std": 0.1205422505736351, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3479743003845215, "step": 809 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.545642309300802e-09, "advantages/std": 0.640237033367157, "advantages/var": 0.4099034588947781, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 4.6537634408602155, "grad_norm": 0.38237931858812224, "learning_rate": 2.79452709586806e-08, "loss": -0.0, "num_tokens": 268928671.0, "reward": 0.80078125, "reward_std": 0.15222682058811188, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "step": 810 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 1.3747864716784658e-09, "advantages/std": 0.5927518606185913, "advantages/var": 0.3513547682668019, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.659498207885305, "grad_norm": 3.547206669974566, "learning_rate": 2.7108678059855062e-08, "loss": -0.0, "num_tokens": 269220010.0, "reward": 0.798828125, "reward_std": 0.13695251941680908, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "step": 811 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 4.53557501792538e-09, "advantages/std": 0.5133431553840637, "advantages/var": 0.263521195179667, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7578125, "epoch": 4.665232974910394, "grad_norm": 0.3232108818879062, "learning_rate": 2.6284626603848114e-08, "loss": -0.0, "num_tokens": 269511927.0, "reward": 0.763671875, "reward_std": 0.1038198471069336, "rewards/drgrpo_math_reward/mean": 0.763671875, "rewards/drgrpo_math_reward/std": 0.42524150013923645, "step": 812 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 2.453040936415248e-09, "advantages/std": 0.6169481873512268, "advantages/var": 0.38062506587596445, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8828125, "epoch": 4.670967741935484, "grad_norm": 0.5810087019735833, "learning_rate": 2.5473127213422762e-08, "loss": 0.0, "num_tokens": 269785006.0, "reward": 0.876953125, "reward_std": 0.13854879140853882, "rewards/drgrpo_math_reward/mean": 0.876953125, "rewards/drgrpo_math_reward/std": 0.32881227135658264, "step": 813 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.9279139507042534e-10, "advantages/std": 0.5927590131759644, "advantages/var": 0.3513632477013431, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 4.676702508960574, "grad_norm": 0.2724491989152861, "learning_rate": 2.4674190349535217e-08, "loss": -0.0, "num_tokens": 270076307.0, "reward": 0.8203125, "reward_std": 0.14251455664634705, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38430243730545044, "step": 814 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 6.323740339150568e-09, "advantages/std": 0.6627330183982849, "advantages/var": 0.43921505367530145, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 4.682437275985663, "grad_norm": 0.4035517118970694, "learning_rate": 2.3887826311198898e-08, "loss": -0.0, "num_tokens": 270378396.0, "reward": 0.822265625, "reward_std": 0.184912770986557, "rewards/drgrpo_math_reward/mean": 0.822265625, "rewards/drgrpo_math_reward/std": 0.3826628625392914, "step": 815 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 1.0256693787405242e-09, "advantages/std": 0.5675089955329895, "advantages/var": 0.3220664600108627, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 4.688172043010753, "grad_norm": 0.2894317682385916, "learning_rate": 2.311404523535243e-08, "loss": -0.0, "num_tokens": 270649266.0, "reward": 0.83984375, "reward_std": 0.12105467915534973, "rewards/drgrpo_math_reward/mean": 0.83984375, "rewards/drgrpo_math_reward/std": 0.3671095669269562, "step": 816 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 6.792847433233427e-09, "advantages/std": 0.6169653534889221, "advantages/var": 0.3806462474057106, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.693906810035842, "grad_norm": 0.3168126944629125, "learning_rate": 2.2352857096728627e-08, "loss": -0.0, "num_tokens": 270937142.0, "reward": 0.833984375, "reward_std": 0.15824154019355774, "rewards/drgrpo_math_reward/mean": 0.833984375, "rewards/drgrpo_math_reward/std": 0.3724585771560669, "step": 817 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 2.4998531045350503e-09, "advantages/std": 0.41911977529525757, "advantages/var": 0.1756613860435472, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.984375, "epoch": 4.699641577060932, "grad_norm": 0.17760023001620784, "learning_rate": 2.1604271707726497e-08, "loss": -0.0, "num_tokens": 271211741.0, "reward": 0.9296875, "reward_std": 0.05920084938406944, "rewards/drgrpo_math_reward/mean": 0.9296875, "rewards/drgrpo_math_reward/std": 0.25592297315597534, "step": 818 }, { "advantages/mean": 4.423782229423523e-09, "advantages/snr": 7.462716831471666e-09, "advantages/std": 0.5927844047546387, "advantages/var": 0.3513933505203113, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7890625, "epoch": 4.705376344086021, "grad_norm": 0.4351535555067835, "learning_rate": 2.086829871828377e-08, "loss": -0.0, "num_tokens": 271515811.0, "reward": 0.83203125, "reward_std": 0.16309301555156708, "rewards/drgrpo_math_reward/mean": 0.83203125, "rewards/drgrpo_math_reward/std": 0.374204158782959, "step": 819 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 2.909241848016568e-09, "advantages/std": 0.6402510404586792, "advantages/var": 0.40992139480842127, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 4.711111111111111, "grad_norm": 0.3057089468555479, "learning_rate": 2.0144947615753138e-08, "loss": 0.0, "num_tokens": 271806476.0, "reward": 0.82421875, "reward_std": 0.16036951541900635, "rewards/drgrpo_math_reward/mean": 0.82421875, "rewards/drgrpo_math_reward/std": 0.3810062110424042, "step": 820 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 6.037951276437399e-09, "advantages/std": 0.61697918176651, "advantages/var": 0.3806633107332722, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 4.7168458781362, "grad_norm": 0.3036903299927811, "learning_rate": 1.9434227724779984e-08, "loss": -0.0, "num_tokens": 272118725.0, "reward": 0.740234375, "reward_std": 0.16697950661182404, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "step": 821 }, { "advantages/mean": 3.3760443329811096e-09, "advantages/snr": 5.272968941618861e-09, "advantages/std": 0.6402549147605896, "advantages/var": 0.40992635587508985, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 4.72258064516129, "grad_norm": 0.2531324439537048, "learning_rate": 1.8736148207181947e-08, "loss": -0.0, "num_tokens": 272445086.0, "reward": 0.6875, "reward_std": 0.1725948452949524, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4639657139778137, "step": 822 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 6.600095927817857e-10, "advantages/std": 0.7055371403694153, "advantages/var": 0.497782656440652, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 4.7283154121863795, "grad_norm": 5.1920208729012, "learning_rate": 1.8050718061830894e-08, "loss": -0.0, "num_tokens": 272772655.0, "reward": 0.736328125, "reward_std": 0.21224334836006165, "rewards/drgrpo_math_reward/mean": 0.736328125, "rewards/drgrpo_math_reward/std": 0.4410543739795685, "step": 823 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.547570342509261e-10, "advantages/std": 0.6169684529304504, "advantages/var": 0.38065007191139344, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7578125, "epoch": 4.7340501792114695, "grad_norm": 0.7358602448947559, "learning_rate": 1.7377946124536804e-08, "loss": -0.0, "num_tokens": 273072271.0, "reward": 0.703125, "reward_std": 0.1547667682170868, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45732781291007996, "step": 824 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.1321125832245161e-09, "advantages/std": 0.6169809699058533, "advantages/var": 0.3806655172259674, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "epoch": 4.7397849462365595, "grad_norm": 0.3632386066971972, "learning_rate": 1.6717841067934392e-08, "loss": -0.0, "num_tokens": 273376262.0, "reward": 0.8828125, "reward_std": 0.16324588656425476, "rewards/drgrpo_math_reward/mean": 0.8828125, "rewards/drgrpo_math_reward/std": 0.32195815443992615, "step": 825 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.000379598275315e-09, "advantages/std": 0.640223503112793, "advantages/var": 0.40988613393801643, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.745519713261649, "grad_norm": 0.2594695070167408, "learning_rate": 1.607041140137033e-08, "loss": 0.0, "num_tokens": 273688243.0, "reward": 0.80078125, "reward_std": 0.14359387755393982, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "step": 826 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 4.3029557716265667e-10, "advantages/std": 0.5410946607589722, "advantages/var": 0.2927834319018672, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.751254480286739, "grad_norm": 0.18913249076844055, "learning_rate": 1.543566547079467e-08, "loss": -0.0, "num_tokens": 273979439.0, "reward": 0.77734375, "reward_std": 0.10755911469459534, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41643625497817993, "step": 827 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 1.7676206437724108e-09, "advantages/std": 0.5927391052246094, "advantages/var": 0.35133964686247054, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.921875, "epoch": 4.756989247311828, "grad_norm": 0.29131511796766, "learning_rate": 1.481361145865223e-08, "loss": -0.0, "num_tokens": 274279751.0, "reward": 0.81640625, "reward_std": 0.12848131358623505, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.3875311613082886, "step": 828 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 3.442198537478098e-09, "advantages/std": 0.5411207675933838, "advantages/var": 0.29281168512085287, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.762724014336918, "grad_norm": 0.2943951427496866, "learning_rate": 1.4204257383778062e-08, "loss": 0.0, "num_tokens": 274568466.0, "reward": 0.7421875, "reward_std": 0.1254923790693283, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "step": 829 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.5095384229213223e-09, "advantages/std": 0.616958498954773, "advantages/var": 0.3806377894325266, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 4.768458781362007, "grad_norm": 0.28338499681761764, "learning_rate": 1.3607611101293382e-08, "loss": -0.0, "num_tokens": 274893028.0, "reward": 0.81640625, "reward_std": 0.15344181656837463, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.3875311613082886, "step": 830 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 9.434362669362212e-10, "advantages/std": 0.6169750094413757, "advantages/var": 0.3806581622751857, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.774193548387097, "grad_norm": 4.83122243782873, "learning_rate": 1.3023680302504336e-08, "loss": -0.0, "num_tokens": 275190442.0, "reward": 0.84765625, "reward_std": 0.16136980056762695, "rewards/drgrpo_math_reward/mean": 0.84765625, "rewards/drgrpo_math_reward/std": 0.35970520973205566, "step": 831 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 2.381210696187349e-09, "advantages/std": 0.6844478249549866, "advantages/var": 0.46846882508561194, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "epoch": 4.779928315412186, "grad_norm": 0.40657116175716995, "learning_rate": 1.2452472514803636e-08, "loss": -0.0, "num_tokens": 275511006.0, "reward": 0.73828125, "reward_std": 0.1753622442483902, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "step": 832 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.6169554591178894, "advantages/var": 0.3806340385353657, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.785663082437276, "grad_norm": 0.4469064604118644, "learning_rate": 1.1893995101572406e-08, "loss": -0.0, "num_tokens": 275776753.0, "reward": 0.8515625, "reward_std": 0.14840367436408997, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.35588082671165466, "step": 833 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.454662227979153e-09, "advantages/std": 0.6402328610420227, "advantages/var": 0.40989811635805395, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 4.791397849462365, "grad_norm": 11.740726794736275, "learning_rate": 1.1348255262086048e-08, "loss": -0.0, "num_tokens": 276080727.0, "reward": 0.84375, "reward_std": 0.15176604688167572, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.36344730854034424, "step": 834 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 4.810742959506476e-10, "advantages/std": 0.48398062586784363, "advantages/var": 0.23423724621542963, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 4.797132616487455, "grad_norm": 0.3828803615944811, "learning_rate": 1.0815260031421191e-08, "loss": -0.0, "num_tokens": 276369278.0, "reward": 0.7421875, "reward_std": 0.09154270589351654, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "step": 835 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 7.856072790321206e-10, "advantages/std": 0.592740535736084, "advantages/var": 0.35134134270469985, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "epoch": 4.802867383512545, "grad_norm": 0.2028323419937937, "learning_rate": 1.029501628036511e-08, "loss": 0.0, "num_tokens": 276672004.0, "reward": 0.7734375, "reward_std": 0.13146832585334778, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4190165400505066, "step": 836 }, { "advantages/mean": 4.190951585769653e-09, "advantages/snr": 6.123092422989506e-09, "advantages/std": 0.6844501495361328, "advantages/var": 0.46847200720003457, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.808602150537634, "grad_norm": 0.5052491640955836, "learning_rate": 9.787530715326786e-09, "loss": -0.0, "num_tokens": 276967508.0, "reward": 0.814453125, "reward_std": 0.18260450661182404, "rewards/drgrpo_math_reward/mean": 0.814453125, "rewards/drgrpo_math_reward/std": 0.38912075757980347, "step": 837 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.7259534597396851, "advantages/var": 0.5270084257080185, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8046875, "epoch": 4.8143369175627235, "grad_norm": 0.7766056495451963, "learning_rate": 9.292809878251096e-09, "loss": -0.0, "num_tokens": 277277044.0, "reward": 0.810546875, "reward_std": 0.1890445053577423, "rewards/drgrpo_math_reward/mean": 0.810546875, "rewards/drgrpo_math_reward/std": 0.3922513723373413, "step": 838 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 1.122517320002539e-09, "advantages/std": 0.7259640693664551, "advantages/var": 0.5270238300111032, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.828125, "epoch": 4.8200716845878135, "grad_norm": 2.7660565766296066, "learning_rate": 8.81086014653365e-09, "loss": 0.0, "num_tokens": 277569553.0, "reward": 0.8203125, "reward_std": 0.2025153934955597, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38430243730545044, "step": 839 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 3.927880773520875e-09, "advantages/std": 0.5927640199661255, "advantages/var": 0.3513691833664012, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8515625, "epoch": 4.825806451612904, "grad_norm": 0.6919707065546522, "learning_rate": 8.341687732939418e-09, "loss": -0.0, "num_tokens": 277892024.0, "reward": 0.75390625, "reward_std": 0.14766615629196167, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "step": 840 }, { "advantages/mean": 4.423782229423523e-09, "advantages/snr": 6.463358221045911e-09, "advantages/std": 0.6844401955604553, "advantages/var": 0.4684583812988343, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.831541218637993, "grad_norm": 0.29981412722211415, "learning_rate": 7.885298685522235e-09, "loss": -0.0, "num_tokens": 278169756.0, "reward": 0.880859375, "reward_std": 0.17268195748329163, "rewards/drgrpo_math_reward/mean": 0.880859375, "rewards/drgrpo_math_reward/std": 0.32427072525024414, "step": 841 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 8.204906920220266e-10, "advantages/std": 0.5675399899482727, "advantages/var": 0.3221016401904855, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 4.837275985663083, "grad_norm": 0.3425340435216381, "learning_rate": 7.4416988875465325e-09, "loss": -0.0, "num_tokens": 278510353.0, "reward": 0.708984375, "reward_std": 0.14107999205589294, "rewards/drgrpo_math_reward/mean": 0.708984375, "rewards/drgrpo_math_reward/std": 0.45467492938041687, "step": 842 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.300139131445486e-10, "advantages/std": 0.7055176496505737, "advantages/var": 0.4977551539684697, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 4.843010752688172, "grad_norm": 0.43184702792387214, "learning_rate": 7.010894057412287e-09, "loss": -0.0, "num_tokens": 278820789.0, "reward": 0.779296875, "reward_std": 0.19433510303497314, "rewards/drgrpo_math_reward/mean": 0.779296875, "rewards/drgrpo_math_reward/std": 0.4151262938976288, "step": 843 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 1.7008713002334431e-09, "advantages/std": 0.6844452023506165, "advantages/var": 0.4684652350207763, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.848745519713262, "grad_norm": 1.6935836881535076, "learning_rate": 6.592889748580521e-09, "loss": -0.0, "num_tokens": 279149115.0, "reward": 0.89453125, "reward_std": 0.17613041400909424, "rewards/drgrpo_math_reward/mean": 0.89453125, "rewards/drgrpo_math_reward/std": 0.3074568510055542, "step": 844 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.0003445897561706e-09, "advantages/std": 0.6402291059494019, "advantages/var": 0.4098933081047704, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.854480286738351, "grad_norm": 6.70456186968645, "learning_rate": 6.1876913495021446e-09, "loss": -0.0, "num_tokens": 279428993.0, "reward": 0.8203125, "reward_std": 0.14519062638282776, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38430243730545044, "step": 845 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 2.0513441437232844e-10, "advantages/std": 0.5675075054168701, "advantages/var": 0.32206476870447887, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 4.860215053763441, "grad_norm": 0.2684052876878027, "learning_rate": 5.795304083548558e-09, "loss": -0.0, "num_tokens": 279697835.0, "reward": 0.765625, "reward_std": 0.12338300049304962, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42402184009552, "step": 846 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 1.22963608335927e-09, "advantages/std": 0.6627222895622253, "advantages/var": 0.43920083308259805, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 4.86594982078853, "grad_norm": 0.2849542271704683, "learning_rate": 5.415733008943713e-09, "loss": 0.0, "num_tokens": 280039843.0, "reward": 0.6953125, "reward_std": 0.17735926806926727, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4607250988483429, "step": 847 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.6410649734315498e-09, "advantages/std": 0.5675110816955566, "advantages/var": 0.32206882784726076, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.87168458781362, "grad_norm": 33.87135686339874, "learning_rate": 5.048983018699826e-09, "loss": 0.0, "num_tokens": 280324651.0, "reward": 0.84375, "reward_std": 0.12259218841791153, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.36344730854034424, "step": 848 }, { "advantages/mean": 1.7462298274040222e-09, "advantages/snr": 2.6349200084194924e-09, "advantages/std": 0.6627259254455566, "advantages/var": 0.4392056522576695, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8046875, "epoch": 4.877419354838709, "grad_norm": 0.3441351557536405, "learning_rate": 4.695058840553545e-09, "loss": -0.0, "num_tokens": 280626875.0, "reward": 0.796875, "reward_std": 0.17946280539035797, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.4027182459831238, "step": 849 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 2.8864785593892077e-09, "advantages/std": 0.7259626984596252, "advantages/var": 0.5270218395547808, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8359375, "epoch": 4.883154121863799, "grad_norm": 0.4745591809976997, "learning_rate": 4.353965036905549e-09, "loss": -0.0, "num_tokens": 280941082.0, "reward": 0.80859375, "reward_std": 0.1988142430782318, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.3937928080558777, "step": 850 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.7737600153170586e-10, "advantages/std": 0.6169725656509399, "advantages/var": 0.3806551467659034, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.859375, "epoch": 4.888888888888889, "grad_norm": 0.29792565291262124, "learning_rate": 4.025706004760931e-09, "loss": -0.0, "num_tokens": 281210664.0, "reward": 0.869140625, "reward_std": 0.1603502631187439, "rewards/drgrpo_math_reward/mean": 0.869140625, "rewards/drgrpo_math_reward/std": 0.33757632970809937, "step": 851 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.924406914974663e-09, "advantages/std": 0.4839530289173126, "advantages/var": 0.23421053419824123, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9453125, "epoch": 4.894623655913978, "grad_norm": 0.41463643485807317, "learning_rate": 3.710285975673688e-09, "loss": -0.0, "num_tokens": 281480061.0, "reward": 0.896484375, "reward_std": 0.07438573986291885, "rewards/drgrpo_math_reward/mean": 0.896484375, "rewards/drgrpo_math_reward/std": 0.30492907762527466, "step": 852 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 1.9362867987600033e-09, "advantages/std": 0.5411067605018616, "advantages/var": 0.292796526260819, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7734375, "epoch": 4.900358422939068, "grad_norm": 0.6492459682695432, "learning_rate": 3.407709015691096e-09, "loss": -0.0, "num_tokens": 281778926.0, "reward": 0.81640625, "reward_std": 0.1140536367893219, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.3875311613082886, "step": 853 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.4423001551929787e-09, "advantages/std": 0.541104793548584, "advantages/var": 0.2927943976012557, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8359375, "epoch": 4.9060931899641576, "grad_norm": 0.25960354338775543, "learning_rate": 3.1179790253019756e-09, "loss": -0.0, "num_tokens": 282069736.0, "reward": 0.7578125, "reward_std": 0.11584333330392838, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "step": 854 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 4.000283140113865e-09, "advantages/std": 0.6402389407157898, "advantages/var": 0.4099059012088766, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 4.911827956989248, "grad_norm": 0.337882378273501, "learning_rate": 2.841099739386066e-09, "loss": -0.0, "num_tokens": 282354240.0, "reward": 0.828125, "reward_std": 0.15640974044799805, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3776407241821289, "step": 855 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 1.509469736725578e-09, "advantages/std": 0.6169865727424622, "advantages/var": 0.38067243094448955, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.917562724014337, "grad_norm": 0.3740112685181258, "learning_rate": 2.577074727165951e-09, "loss": -0.0, "num_tokens": 282656092.0, "reward": 0.748046875, "reward_std": 0.17139552533626556, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "step": 856 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.8142415930795671e-09, "advantages/std": 0.5133398771286011, "advantages/var": 0.26351782945040725, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8671875, "epoch": 4.923297491039427, "grad_norm": 0.3748467582055791, "learning_rate": 2.3259073921612083e-09, "loss": -0.0, "num_tokens": 282950399.0, "reward": 0.849609375, "reward_std": 0.1032009869813919, "rewards/drgrpo_math_reward/mean": 0.849609375, "rewards/drgrpo_math_reward/std": 0.35780346393585205, "step": 857 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 3.5133862523635003e-10, "advantages/std": 0.6626958250999451, "advantages/var": 0.439165756604897, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.890625, "epoch": 4.929032258064516, "grad_norm": 1.06498664685161, "learning_rate": 2.0876009721443322e-09, "loss": -0.0, "num_tokens": 283262790.0, "reward": 0.845703125, "reward_std": 0.15411502122879028, "rewards/drgrpo_math_reward/mean": 0.845703125, "rewards/drgrpo_math_reward/std": 0.36158639192581177, "step": 858 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.3606777256501085e-09, "advantages/std": 0.6844549179077148, "advantages/var": 0.46847853464805667, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "epoch": 4.934767025089606, "grad_norm": 0.3046245215840264, "learning_rate": 1.8621585390989902e-09, "loss": -0.0, "num_tokens": 283573246.0, "reward": 0.720703125, "reward_std": 0.18707478046417236, "rewards/drgrpo_math_reward/mean": 0.720703125, "rewards/drgrpo_math_reward/std": 0.44909247756004333, "step": 859 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 5.66079588366732e-10, "advantages/std": 0.616955578327179, "advantages/var": 0.38063418562902385, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.7578125, "epoch": 4.940501792114695, "grad_norm": 6.835268787580969, "learning_rate": 1.649582999180721e-09, "loss": -0.0, "num_tokens": 283909299.0, "reward": 0.76171875, "reward_std": 0.14601755142211914, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.42644867300987244, "step": 860 }, { "advantages/mean": 2.7939677238464355e-09, "advantages/snr": 4.528616143787336e-09, "advantages/std": 0.6169583797454834, "advantages/var": 0.3806376423381721, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 4.946236559139785, "grad_norm": 0.758890173336466, "learning_rate": 1.4498770926790749e-09, "loss": 0.0, "num_tokens": 284172804.0, "reward": 0.798828125, "reward_std": 0.15289105474948883, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "step": 861 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.773842775969158e-10, "advantages/std": 0.6169590353965759, "advantages/var": 0.38063845135747343, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.875, "epoch": 4.951971326164875, "grad_norm": 0.33922171928713213, "learning_rate": 1.2630433939825324e-09, "loss": -0.0, "num_tokens": 284418558.0, "reward": 0.908203125, "reward_std": 0.15073469281196594, "rewards/drgrpo_math_reward/mean": 0.908203125, "rewards/drgrpo_math_reward/std": 0.289021372795105, "step": 862 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 3.928048640026104e-10, "advantages/std": 0.592738687992096, "advantages/var": 0.35133915224259127, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9375, "epoch": 4.957706093189964, "grad_norm": 0.621828748495713, "learning_rate": 1.0890843115451964e-09, "loss": -0.0, "num_tokens": 284717745.0, "reward": 0.736328125, "reward_std": 0.12599125504493713, "rewards/drgrpo_math_reward/mean": 0.736328125, "rewards/drgrpo_math_reward/std": 0.4410543739795685, "step": 863 }, { "advantages/mean": 1.5133991837501526e-09, "advantages/snr": 2.5531969555101674e-09, "advantages/std": 0.5927467346191406, "advantages/var": 0.3513486914016539, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.734375, "epoch": 4.963440860215054, "grad_norm": 0.3745349666216179, "learning_rate": 9.28002087855928e-10, "loss": -0.0, "num_tokens": 285043174.0, "reward": 0.7578125, "reward_std": 0.13417719304561615, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "step": 864 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 5.269724372636866e-10, "advantages/std": 0.6627404689788818, "advantages/var": 0.43922492922234824, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.9296875, "epoch": 4.969175627240143, "grad_norm": 0.6960286629793649, "learning_rate": 7.797987994092592e-10, "loss": -0.0, "num_tokens": 285339840.0, "reward": 0.822265625, "reward_std": 0.19137969613075256, "rewards/drgrpo_math_reward/mean": 0.822265625, "rewards/drgrpo_math_reward/std": 0.3826628625392914, "step": 865 }, { "advantages/mean": 2.2118911147117615e-09, "advantages/snr": 3.135203955509604e-09, "advantages/std": 0.7055014967918396, "advantages/var": 0.49773236197552606, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "epoch": 4.974910394265233, "grad_norm": 0.5262217487946637, "learning_rate": 6.44476356678636e-10, "loss": -0.0, "num_tokens": 285641510.0, "reward": 0.837890625, "reward_std": 0.18257594108581543, "rewards/drgrpo_math_reward/mean": 0.837890625, "rewards/drgrpo_math_reward/std": 0.3689115643501282, "step": 866 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3826264441013336, "advantages/var": 0.14640299572563098, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8984375, "epoch": 4.980645161290322, "grad_norm": 0.1459227486216982, "learning_rate": 5.220365040918828e-10, "loss": -0.0, "num_tokens": 285928027.0, "reward": 0.791015625, "reward_std": 0.060130972415208817, "rewards/drgrpo_math_reward/mean": 0.791015625, "rewards/drgrpo_math_reward/std": 0.40698084235191345, "step": 867 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 4.512765158179138e-09, "advantages/std": 0.5675316452980042, "advantages/var": 0.3220921684146596, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "epoch": 4.986379928315412, "grad_norm": 0.3068036354910241, "learning_rate": 4.124808200086649e-10, "loss": 0.0, "num_tokens": 286215900.0, "reward": 0.71875, "reward_std": 0.13494986295700073, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45004892349243164, "step": 868 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 3.4422011915958375e-09, "advantages/std": 0.5411203503608704, "advantages/var": 0.2928112335746711, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8828125, "epoch": 4.992114695340502, "grad_norm": 0.5593070273586229, "learning_rate": 3.158107167000601e-10, "loss": -0.0, "num_tokens": 286500340.0, "reward": 0.8359375, "reward_std": 0.12065667659044266, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.37069445848464966, "step": 869 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 1.5096247634818422e-09, "advantages/std": 0.6169232130050659, "advantages/var": 0.38059425074449393, "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8203125, "epoch": 4.997849462365592, "grad_norm": 0.2930251040737949, "learning_rate": 2.3202744033057332e-10, "loss": -0.0, "num_tokens": 286807654.0, "reward": 0.7109375, "reward_std": 0.12585854530334473, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45377036929130554, "step": 870 }, { "epoch": 4.997849462365592, "step": 870, "total_flos": 0.0, "train_loss": -3.883855995433084e-10, "train_runtime": 68601.8379, "train_samples_per_second": 0.407, "train_steps_per_second": 0.013 } ], "logging_steps": 1, "max_steps": 875, "num_input_tokens_seen": 286807654, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }