{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.997849462365592, "eval_steps": 500, "global_step": 870, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 2.726633008541522e-09, "advantages/std": 0.17078252136707306, "advantages/var": 0.029166669604494766, "completions/clipped_ratio": 0.8515625, "epoch": 0.005734767025089606, "grad_norm": 29.436094176262426, "learning_rate": 2e-06, "loss": -4.8315, "num_tokens": 566999.0, "residual_var": 0.021875012665987015, "reward": 0.06640625, "reward_std": 0.0801548957824707, "rewards/drgrpo_math_reward/mean": 0.06640625, "rewards/drgrpo_math_reward/std": 0.2492343932390213, "rho2": 0.24999991059303284, "step": 1 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 6.457559566095379e-10, "advantages/std": 0.18027757108211517, "advantages/var": 0.03250000263526709, "completions/clipped_ratio": 0.96875, "epoch": 0.011469534050179211, "grad_norm": 28.798940440905398, "learning_rate": 1.9999935545509886e-06, "loss": -2.7224, "num_tokens": 1137183.0, "residual_var": 0.025390639901161194, "reward": 0.0390625, "reward_std": 0.07861834019422531, "rewards/drgrpo_math_reward/mean": 0.0390625, "rewards/drgrpo_math_reward/std": 0.1939331740140915, "rho2": 0.21874985098838806, "step": 2 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 3.899613938865147e-09, "advantages/std": 0.17911821603775024, "advantages/var": 0.03208333531654617, "completions/clipped_ratio": 0.9140625, "epoch": 0.017204301075268817, "grad_norm": 27.19540958422738, "learning_rate": 1.999974218287042e-06, "loss": -4.9833, "num_tokens": 1715529.0, "residual_var": 0.02606772445142269, "reward": 0.041015625, "reward_std": 0.07245541363954544, "rewards/drgrpo_math_reward/mean": 0.041015625, "rewards/drgrpo_math_reward/std": 0.19852031767368317, "rho2": 0.18749986588954926, "step": 3 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 5.302057099022084e-09, "advantages/std": 0.24152295291423798, "advantages/var": 0.058333336784413214, "completions/clipped_ratio": 0.890625, "epoch": 0.022939068100358423, "grad_norm": 42.384155632670115, "learning_rate": 1.999941991457422e-06, "loss": -6.1923, "num_tokens": 2291546.0, "residual_var": 0.03645836189389229, "reward": 0.08203125, "reward_std": 0.13787700235843658, "rewards/drgrpo_math_reward/mean": 0.08203125, "rewards/drgrpo_math_reward/std": 0.2746807038784027, "rho2": 0.3749997019767761, "step": 4 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 8.038737674265128e-10, "advantages/std": 0.14481790363788605, "advantages/var": 0.02097222521407205, "completions/clipped_ratio": 0.984375, "epoch": 0.02867383512544803, "grad_norm": 22.302366064436793, "learning_rate": 1.999896874477561e-06, "loss": -2.6598, "num_tokens": 2877760.0, "residual_var": 0.017039941623806953, "reward": 0.029296875, "reward_std": 0.05793476849794388, "rewards/drgrpo_math_reward/mean": 0.029296875, "rewards/drgrpo_math_reward/std": 0.16880230605602264, "rho2": 0.18749986588954926, "step": 5 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 5.14935038391775e-10, "advantages/std": 0.22607767581939697, "advantages/var": 0.05111111550390035, "completions/clipped_ratio": 0.90625, "epoch": 0.034408602150537634, "grad_norm": 33.874029343041286, "learning_rate": 1.999838867929058e-06, "loss": -6.9258, "num_tokens": 3443248.0, "residual_var": 0.03513891249895096, "reward": 0.0625, "reward_std": 0.11889171600341797, "rewards/drgrpo_math_reward/mean": 0.0625, "rewards/drgrpo_math_reward/std": 0.2422981858253479, "rho2": 0.3124998211860657, "step": 6 }, { "advantages/mean": -5.238689482212067e-10, "advantages/snr": 2.5494827782458826e-09, "advantages/std": 0.20548047125339508, "advantages/var": 0.04222222406651732, "completions/clipped_ratio": 0.859375, "epoch": 0.04014336917562724, "grad_norm": 30.360328177388602, "learning_rate": 1.9997679725596696e-06, "loss": -3.3703, "num_tokens": 4012376.0, "residual_var": 0.029027793556451797, "reward": 0.0703125, "reward_std": 0.10771197080612183, "rewards/drgrpo_math_reward/mean": 0.0703125, "rewards/drgrpo_math_reward/std": 0.25592297315597534, "rho2": 0.31249988079071045, "step": 7 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 1.2570654448336628e-09, "advantages/std": 0.18521758913993835, "advantages/var": 0.03430555532681101, "completions/clipped_ratio": 0.921875, "epoch": 0.045878136200716846, "grad_norm": 42.120177530417756, "learning_rate": 1.9996841892832997e-06, "loss": -4.5594, "num_tokens": 4589170.0, "residual_var": 0.025729183107614517, "reward": 0.037109375, "reward_std": 0.08725681155920029, "rewards/drgrpo_math_reward/mean": 0.037109375, "rewards/drgrpo_math_reward/std": 0.18921469151973724, "rho2": 0.24999991059303284, "step": 8 }, { "advantages/mean": -1.7462298274040222e-10, "advantages/snr": 8.918936482526003e-10, "advantages/std": 0.19578900933265686, "advantages/var": 0.038333336175463195, "completions/clipped_ratio": 0.9140625, "epoch": 0.05161290322580645, "grad_norm": 33.73912379907987, "learning_rate": 1.9995875191799916e-06, "loss": -6.2692, "num_tokens": 5157654.0, "residual_var": 0.02635417878627777, "reward": 0.0546875, "reward_std": 0.10144393146038055, "rewards/drgrpo_math_reward/mean": 0.0546875, "rewards/drgrpo_math_reward/std": 0.2275916188955307, "rho2": 0.31249988079071045, "step": 9 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 4.490305058908481e-09, "advantages/std": 0.23333333432674408, "advantages/var": 0.05444444490803613, "completions/clipped_ratio": 0.875, "epoch": 0.05734767025089606, "grad_norm": 37.14264674954341, "learning_rate": 1.999477963495908e-06, "loss": -7.3835, "num_tokens": 5743493.0, "residual_var": 0.03913196176290512, "reward": 0.07421875, "reward_std": 0.12270566076040268, "rewards/drgrpo_math_reward/mean": 0.07421875, "rewards/drgrpo_math_reward/std": 0.2623828947544098, "rho2": 0.28124985098838806, "step": 10 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 3.916947836500089e-10, "advantages/std": 0.2972092628479004, "advantages/var": 0.08833334592259234, "completions/clipped_ratio": 0.71875, "epoch": 0.06308243727598567, "grad_norm": 47.63575724747301, "learning_rate": 1.999355523643321e-06, "loss": -15.511, "num_tokens": 6297441.0, "residual_var": 0.05244794487953186, "reward": 0.125, "reward_std": 0.18755567073822021, "rewards/drgrpo_math_reward/mean": 0.125, "rewards/drgrpo_math_reward/std": 0.3310423493385315, "rho2": 0.4062498211860657, "step": 11 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.909916870530274e-09, "advantages/std": 0.24381232261657715, "advantages/var": 0.0594444486596899, "completions/clipped_ratio": 0.4453125, "epoch": 0.06881720430107527, "grad_norm": 38.10125152218821, "learning_rate": 1.9992202012005907e-06, "loss": -12.1075, "num_tokens": 6834270.0, "residual_var": 0.03529515862464905, "reward": 0.17578125, "reward_std": 0.14519158005714417, "rewards/drgrpo_math_reward/mean": 0.17578125, "rewards/drgrpo_math_reward/std": 0.3810062110424042, "rho2": 0.4062498211860657, "step": 12 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 7.947162817853429e-10, "advantages/std": 0.29297327995300293, "advantages/var": 0.08583334276642063, "completions/clipped_ratio": 0.5546875, "epoch": 0.07455197132616488, "grad_norm": 47.734524060569484, "learning_rate": 1.999071997912144e-06, "loss": -13.4026, "num_tokens": 7383217.0, "residual_var": 0.05096358060836792, "reward": 0.16796875, "reward_std": 0.18288393318653107, "rewards/drgrpo_math_reward/mean": 0.16796875, "rewards/drgrpo_math_reward/std": 0.374204158782959, "rho2": 0.4062497019767761, "step": 13 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 3.913872134654483e-10, "advantages/std": 0.29744282364845276, "advantages/var": 0.08847223333996457, "completions/clipped_ratio": 0.0703125, "epoch": 0.08028673835125448, "grad_norm": 51.60177742523651, "learning_rate": 1.9989109156884548e-06, "loss": -22.348, "num_tokens": 7903796.0, "residual_var": 0.04976566135883331, "reward": 0.279296875, "reward_std": 0.18770846724510193, "rewards/drgrpo_math_reward/mean": 0.279296875, "rewards/drgrpo_math_reward/std": 0.44909247756004333, "rho2": 0.4374997913837433, "step": 14 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.841731182885672e-09, "advantages/std": 0.2457980215549469, "advantages/var": 0.06041666740032614, "completions/clipped_ratio": 0.2890625, "epoch": 0.08602150537634409, "grad_norm": 32.098586621535844, "learning_rate": 1.9987369566060176e-06, "loss": -6.5785, "num_tokens": 8461442.0, "residual_var": 0.04342450574040413, "reward": 0.193359375, "reward_std": 0.12809449434280396, "rewards/drgrpo_math_reward/mean": 0.193359375, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "rho2": 0.2812497913837433, "step": 15 }, { "advantages/mean": 1.979060471057892e-09, "advantages/snr": 6.8329272590397935e-09, "advantages/std": 0.2896358072757721, "advantages/var": 0.0838889008562882, "completions/clipped_ratio": -0.046875, "epoch": 0.09175627240143369, "grad_norm": 39.76147228336156, "learning_rate": 1.998550122907321e-06, "loss": -11.131, "num_tokens": 8988969.0, "residual_var": 0.04456600546836853, "reward": 0.2421875, "reward_std": 0.18617744743824005, "rewards/drgrpo_math_reward/mean": 0.2421875, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "rho2": 0.4687497615814209, "step": 16 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 5.18015433883761e-09, "advantages/std": 0.24720662832260132, "advantages/var": 0.06111111708662875, "completions/clipped_ratio": -0.84375, "epoch": 0.0974910394265233, "grad_norm": 32.332261514543724, "learning_rate": 1.9983504170008193e-06, "loss": -7.4037, "num_tokens": 9465160.0, "residual_var": 0.04010418802499771, "reward": 0.41796875, "reward_std": 0.13648879528045654, "rewards/drgrpo_math_reward/mean": 0.41796875, "rewards/drgrpo_math_reward/std": 0.4937073290348053, "rho2": 0.3437498211860657, "step": 17 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 3.128097928239396e-09, "advantages/std": 0.22329604625701904, "advantages/var": 0.04986112427401679, "completions/clipped_ratio": -0.9765625, "epoch": 0.1032258064516129, "grad_norm": 24.53036305993529, "learning_rate": 1.998137841460901e-06, "loss": -4.028, "num_tokens": 9929492.0, "residual_var": 0.031163211911916733, "reward": 0.443359375, "reward_std": 0.12912550568580627, "rewards/drgrpo_math_reward/mean": 0.443359375, "rewards/drgrpo_math_reward/std": 0.49726733565330505, "rho2": 0.3749998211860657, "step": 18 }, { "advantages/mean": 1.5133991837501526e-09, "advantages/snr": 5.273420137067528e-09, "advantages/std": 0.28698626160621643, "advantages/var": 0.0823611143507117, "completions/clipped_ratio": -1.1640625, "epoch": 0.10896057347670252, "grad_norm": 36.80155958866352, "learning_rate": 1.9979123990278553e-06, "loss": -3.3392, "num_tokens": 10404359.0, "residual_var": 0.04118059203028679, "reward": 0.455078125, "reward_std": 0.1889258325099945, "rewards/drgrpo_math_reward/mean": 0.455078125, "rewards/drgrpo_math_reward/std": 0.4984649419784546, "rho2": 0.4999997019767761, "step": 19 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.298516459201491e-09, "advantages/std": 0.2979094088077545, "advantages/var": 0.0887500158561858, "completions/clipped_ratio": -1.546875, "epoch": 0.11469534050179211, "grad_norm": 33.935212696188884, "learning_rate": 1.9976740926078385e-06, "loss": -5.4336, "num_tokens": 10888205.0, "residual_var": 0.04437503218650818, "reward": 0.505859375, "reward_std": 0.20330065488815308, "rewards/drgrpo_math_reward/mean": 0.505859375, "rewards/drgrpo_math_reward/std": 0.5004546642303467, "rho2": 0.4999997615814209, "step": 20 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.0561672872039563e-09, "advantages/std": 0.28308814764022827, "advantages/var": 0.08013889933437568, "completions/clipped_ratio": -1.6171875, "epoch": 0.12043010752688173, "grad_norm": 32.98600181285323, "learning_rate": 1.997422925272834e-06, "loss": -4.7775, "num_tokens": 11326566.0, "residual_var": 0.037565141916275024, "reward": 0.490234375, "reward_std": 0.1916278600692749, "rewards/drgrpo_math_reward/mean": 0.490234375, "rewards/drgrpo_math_reward/std": 0.5003935098648071, "rho2": 0.5312496423721313, "step": 21 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 4.413239245893062e-10, "advantages/std": 0.26378655433654785, "advantages/var": 0.06958334624874851, "completions/clipped_ratio": -0.84375, "epoch": 0.12616487455197134, "grad_norm": 30.43295029753108, "learning_rate": 1.997158900260614e-06, "loss": -1.5344, "num_tokens": 11837440.0, "residual_var": 0.030442731454968452, "reward": 0.353515625, "reward_std": 0.18370884656906128, "rewards/drgrpo_math_reward/mean": 0.353515625, "rewards/drgrpo_math_reward/std": 0.47852855920791626, "rho2": 0.5624997615814209, "step": 22 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.891385763964462e-09, "advantages/std": 0.32210248708724976, "advantages/var": 0.1037500121877919, "completions/clipped_ratio": -1.796875, "epoch": 0.13189964157706094, "grad_norm": 33.857258232407155, "learning_rate": 1.996882020974698e-06, "loss": -1.7386, "num_tokens": 12283452.0, "residual_var": 0.03566410392522812, "reward": 0.548828125, "reward_std": 0.2492963969707489, "rewards/drgrpo_math_reward/mean": 0.548828125, "rewards/drgrpo_math_reward/std": 0.498096764087677, "rho2": 0.6562497019767761, "step": 23 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.0536815079074696e-09, "advantages/std": 0.3314530551433563, "advantages/var": 0.10986112776386481, "completions/clipped_ratio": -1.4140625, "epoch": 0.13763440860215054, "grad_norm": 33.519015758205086, "learning_rate": 1.996592290984309e-06, "loss": -2.2481, "num_tokens": 12787388.0, "residual_var": 0.04806428402662277, "reward": 0.427734375, "reward_std": 0.2445918470621109, "rewards/drgrpo_math_reward/mean": 0.427734375, "rewards/drgrpo_math_reward/std": 0.4952339828014374, "rho2": 0.5624996423721313, "step": 24 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.33768001198768616, "advantages/var": 0.11402779049600387, "completions/clipped_ratio": -1.5390625, "epoch": 0.14336917562724014, "grad_norm": 35.29558431065458, "learning_rate": 1.9962897140243264e-06, "loss": -2.5098, "num_tokens": 13261289.0, "residual_var": 0.04276047274470329, "reward": 0.494140625, "reward_std": 0.2509792149066925, "rewards/drgrpo_math_reward/mean": 0.494140625, "rewards/drgrpo_math_reward/std": 0.5004546642303467, "rho2": 0.6249995827674866, "step": 25 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 1.8445344294446445e-09, "advantages/std": 0.31556829810142517, "advantages/var": 0.09958335076662994, "completions/clipped_ratio": -1.78125, "epoch": 0.14910394265232976, "grad_norm": 33.67891304915034, "learning_rate": 1.995974293995239e-06, "loss": -1.3061, "num_tokens": 13703398.0, "residual_var": 0.052903689444065094, "reward": 0.458984375, "reward_std": 0.21658912301063538, "rewards/drgrpo_math_reward/mean": 0.458984375, "rewards/drgrpo_math_reward/std": 0.49880221486091614, "rho2": 0.4687497019767761, "step": 26 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 1.972348770869203e-09, "advantages/std": 0.3541421890258789, "advantages/var": 0.12541669004804135, "completions/clipped_ratio": -1.8984375, "epoch": 0.15483870967741936, "grad_norm": 37.48492000481343, "learning_rate": 1.995646034963094e-06, "loss": -2.42, "num_tokens": 14140628.0, "residual_var": 0.03527350723743439, "reward": 0.529296875, "reward_std": 0.2901952266693115, "rewards/drgrpo_math_reward/mean": 0.529296875, "rewards/drgrpo_math_reward/std": 0.49962911009788513, "rho2": 0.7187495231628418, "step": 27 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 3.5437726181975533e-10, "advantages/std": 0.328506737947464, "advantages/var": 0.10791667687688378, "completions/clipped_ratio": -2.0078125, "epoch": 0.16057347670250896, "grad_norm": 35.69078367577697, "learning_rate": 1.995304941159446e-06, "loss": -1.4049, "num_tokens": 14559652.0, "residual_var": 0.05058597773313522, "reward": 0.498046875, "reward_std": 0.23819924890995026, "rewards/drgrpo_math_reward/mean": 0.498046875, "rewards/drgrpo_math_reward/std": 0.5004851818084717, "rho2": 0.5312497615814209, "step": 28 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.1184822013219018e-09, "advantages/std": 0.31224989891052246, "advantages/var": 0.0974999993696315, "completions/clipped_ratio": -1.703125, "epoch": 0.16630824372759856, "grad_norm": 34.32610965768352, "learning_rate": 1.9949510169813e-06, "loss": -1.8244, "num_tokens": 15031895.0, "residual_var": 0.05179690569639206, "reward": 0.4453125, "reward_std": 0.21496553719043732, "rewards/drgrpo_math_reward/mean": 0.4453125, "rewards/drgrpo_math_reward/std": 0.49748632311820984, "rho2": 0.4687498211860657, "step": 29 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 3.213368049234649e-10, "advantages/std": 0.36228442192077637, "advantages/var": 0.1312500023664711, "completions/clipped_ratio": -1.7890625, "epoch": 0.17204301075268819, "grad_norm": 38.19737943291566, "learning_rate": 1.9945842669910563e-06, "loss": -1.4182, "num_tokens": 15486959.0, "residual_var": 0.036914125084877014, "reward": 0.494140625, "reward_std": 0.2947877049446106, "rewards/drgrpo_math_reward/mean": 0.494140625, "rewards/drgrpo_math_reward/std": 0.5004546642303467, "rho2": 0.7187495827674866, "step": 30 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.31024184823036194, "advantages/var": 0.09625000439339093, "completions/clipped_ratio": -1.875, "epoch": 0.17777777777777778, "grad_norm": 33.43177028110988, "learning_rate": 1.994204695916451e-06, "loss": -0.7224, "num_tokens": 15942489.0, "residual_var": 0.027070339769124985, "reward": 0.462890625, "reward_std": 0.2466275990009308, "rewards/drgrpo_math_reward/mean": 0.462890625, "rewards/drgrpo_math_reward/std": 0.4991086423397064, "rho2": 0.7187497615814209, "step": 31 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 6.073842373190156e-10, "advantages/std": 0.38333335518836975, "advantages/var": 0.14694446119997284, "completions/clipped_ratio": -2.5234375, "epoch": 0.18351254480286738, "grad_norm": 39.405043160591404, "learning_rate": 1.9938123086504976e-06, "loss": -2.5768, "num_tokens": 16329206.0, "residual_var": 0.06888026744127274, "reward": 0.6796875, "reward_std": 0.30288636684417725, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4670529365539551, "rho2": 0.5312496423721313, "step": 32 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.3192895811008385e-09, "advantages/std": 0.3529636561870575, "advantages/var": 0.12458334258893533, "completions/clipped_ratio": -2.1875, "epoch": 0.18924731182795698, "grad_norm": 39.8765737788672, "learning_rate": 1.9934071102514193e-06, "loss": -2.8275, "num_tokens": 16780029.0, "residual_var": 0.038932349532842636, "reward": 0.431640625, "reward_std": 0.2764374911785126, "rewards/drgrpo_math_reward/mean": 0.431640625, "rewards/drgrpo_math_reward/std": 0.4957893490791321, "rho2": 0.6874996423721313, "step": 33 }, { "advantages/mean": 2.2118911147117615e-09, "advantages/snr": 7.1658531579274366e-09, "advantages/std": 0.3086709976196289, "advantages/var": 0.09527778477149695, "completions/clipped_ratio": -2.625, "epoch": 0.1949820788530466, "grad_norm": 32.54472515325468, "learning_rate": 1.9929891059425876e-06, "loss": -0.8608, "num_tokens": 17171725.0, "residual_var": 0.03275177255272865, "reward": 0.69921875, "reward_std": 0.23320434987545013, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "rho2": 0.6562497019767761, "step": 34 }, { "advantages/mean": 2.9103830456733704e-10, "advantages/snr": 9.627275908518855e-10, "advantages/std": 0.3023059666156769, "advantages/var": 0.09138889745143874, "completions/clipped_ratio": -2.2734375, "epoch": 0.2007168458781362, "grad_norm": 33.022970974460435, "learning_rate": 1.9925583011124534e-06, "loss": -0.4326, "num_tokens": 17588176.0, "residual_var": 0.03998268395662308, "reward": 0.48828125, "reward_std": 0.212461918592453, "rewards/drgrpo_math_reward/mean": 0.48828125, "rewards/drgrpo_math_reward/std": 0.5003514885902405, "rho2": 0.5624996423721313, "step": 35 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 5.813987920407096e-09, "advantages/std": 0.3203730583190918, "advantages/var": 0.1026388964967282, "completions/clipped_ratio": -2.359375, "epoch": 0.2064516129032258, "grad_norm": 32.74047300778524, "learning_rate": 1.9921147013144777e-06, "loss": -1.236, "num_tokens": 18006078.0, "residual_var": 0.04490455985069275, "reward": 0.583984375, "reward_std": 0.23606085777282715, "rewards/drgrpo_math_reward/mean": 0.583984375, "rewards/drgrpo_math_reward/std": 0.493378221988678, "rho2": 0.5624996423721313, "step": 36 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.311185524404915e-09, "advantages/std": 0.3240370452404022, "advantages/var": 0.10500000668813048, "completions/clipped_ratio": -2.3125, "epoch": 0.2121863799283154, "grad_norm": 34.29940947967677, "learning_rate": 1.9916583122670605e-06, "loss": -1.3775, "num_tokens": 18427380.0, "residual_var": 0.0426563024520874, "reward": 0.48828125, "reward_std": 0.24060603976249695, "rewards/drgrpo_math_reward/mean": 0.48828125, "rewards/drgrpo_math_reward/std": 0.5003514885902405, "rho2": 0.5937496423721313, "step": 37 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 1.5389614812742682e-09, "advantages/std": 0.3782268762588501, "advantages/var": 0.1430555699245275, "completions/clipped_ratio": -2.5078125, "epoch": 0.21792114695340503, "grad_norm": 42.00872536494267, "learning_rate": 1.9911891398534664e-06, "loss": -1.1315, "num_tokens": 18838583.0, "residual_var": 0.03129347041249275, "reward": 0.5703125, "reward_std": 0.31695032119750977, "rewards/drgrpo_math_reward/mean": 0.5703125, "rewards/drgrpo_math_reward/std": 0.4955156147480011, "rho2": 0.7812495827674866, "step": 38 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 5.04643209573345e-09, "advantages/std": 0.3229637145996094, "advantages/var": 0.10430556094797794, "completions/clipped_ratio": -2.40625, "epoch": 0.22365591397849463, "grad_norm": 37.606776965104736, "learning_rate": 1.990707190121749e-06, "loss": -2.1019, "num_tokens": 19239452.0, "residual_var": 0.04563374072313309, "reward": 0.521484375, "reward_std": 0.22826901078224182, "rewards/drgrpo_math_reward/mean": 0.521484375, "rewards/drgrpo_math_reward/std": 0.5000267624855042, "rho2": 0.5624995231628418, "step": 39 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.3403754775914196e-09, "advantages/std": 0.34741106629371643, "advantages/var": 0.12069444898333703, "completions/clipped_ratio": -2.703125, "epoch": 0.22939068100358423, "grad_norm": 40.0768501058647, "learning_rate": 1.990212469284673e-06, "loss": -1.9656, "num_tokens": 19613064.0, "residual_var": 0.060347266495227814, "reward": 0.611328125, "reward_std": 0.2643835246562958, "rewards/drgrpo_math_reward/mean": 0.611328125, "rewards/drgrpo_math_reward/std": 0.4879252314567566, "rho2": 0.4999997913837433, "step": 40 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.129390090922875e-09, "advantages/std": 0.2733536660671234, "advantages/var": 0.07472222675233642, "completions/clipped_ratio": -2.484375, "epoch": 0.23512544802867383, "grad_norm": 28.414525230607456, "learning_rate": 1.9897049837196347e-06, "loss": -1.2123, "num_tokens": 20015749.0, "residual_var": 0.03736114129424095, "reward": 0.5859375, "reward_std": 0.17966848611831665, "rewards/drgrpo_math_reward/mean": 0.5859375, "rewards/drgrpo_math_reward/std": 0.49304109811782837, "rho2": 0.4999997019767761, "step": 41 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 2.638128848150659e-09, "advantages/std": 0.30889591574668884, "advantages/var": 0.09541668676498549, "completions/clipped_ratio": -2.59375, "epoch": 0.24086021505376345, "grad_norm": 33.072759234697685, "learning_rate": 1.9891847399685785e-06, "loss": -1.0083, "num_tokens": 20409768.0, "residual_var": 0.03876306116580963, "reward": 0.529296875, "reward_std": 0.2219194918870926, "rewards/drgrpo_math_reward/mean": 0.529296875, "rewards/drgrpo_math_reward/std": 0.49962911009788513, "rho2": 0.5937497019767761, "step": 42 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 3.8586239393648264e-09, "advantages/std": 0.33187180757522583, "advantages/var": 0.11013889666324772, "completions/clipped_ratio": -2.625, "epoch": 0.24659498207885305, "grad_norm": 31.44924704380032, "learning_rate": 1.988651744737914e-06, "loss": -0.3868, "num_tokens": 20785216.0, "residual_var": 0.03441844508051872, "reward": 0.615234375, "reward_std": 0.26327085494995117, "rewards/drgrpo_math_reward/mean": 0.615234375, "rewards/drgrpo_math_reward/std": 0.4870156943798065, "rho2": 0.6874997019767761, "step": 43 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.291472050601333e-09, "advantages/std": 0.3048223555088043, "advantages/var": 0.09291666841793589, "completions/clipped_ratio": -2.5625, "epoch": 0.2523297491039427, "grad_norm": 29.722054809107682, "learning_rate": 1.9881060048984273e-06, "loss": -1.3503, "num_tokens": 21160754.0, "residual_var": 0.04065108671784401, "reward": 0.576171875, "reward_std": 0.21357515454292297, "rewards/drgrpo_math_reward/mean": 0.576171875, "rewards/drgrpo_math_reward/std": 0.4946470856666565, "rho2": 0.5624996423721313, "step": 44 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.464399520101665e-09, "advantages/std": 0.3129163980484009, "advantages/var": 0.09791667216758526, "completions/clipped_ratio": -2.640625, "epoch": 0.25806451612903225, "grad_norm": 32.109626992793025, "learning_rate": 1.9875475274851963e-06, "loss": -0.1157, "num_tokens": 21543821.0, "residual_var": 0.052018266171216965, "reward": 0.494140625, "reward_std": 0.21901246905326843, "rewards/drgrpo_math_reward/mean": 0.494140625, "rewards/drgrpo_math_reward/std": 0.5004546642303467, "rho2": 0.4687497913837433, "step": 45 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3109126389026642, "advantages/var": 0.09666666902941845, "completions/clipped_ratio": -2.703125, "epoch": 0.2637992831541219, "grad_norm": 35.79287294918412, "learning_rate": 1.9869763196974956e-06, "loss": -2.8167, "num_tokens": 21944208.0, "residual_var": 0.042291708290576935, "reward": 0.55078125, "reward_std": 0.22741690278053284, "rewards/drgrpo_math_reward/mean": 0.55078125, "rewards/drgrpo_math_reward/std": 0.497901052236557, "rho2": 0.5624997019767761, "step": 46 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 3.3684269641578203e-09, "advantages/std": 0.34560737013816833, "advantages/var": 0.11944445429382089, "completions/clipped_ratio": -2.796875, "epoch": 0.26953405017921145, "grad_norm": 34.29969161574922, "learning_rate": 1.9863923888987067e-06, "loss": -0.7204, "num_tokens": 22280788.0, "residual_var": 0.04479171335697174, "reward": 0.73046875, "reward_std": 0.2649552524089813, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44415023922920227, "rho2": 0.6249997019767761, "step": 47 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 3.5855474046225945e-09, "advantages/std": 0.32467934489250183, "advantages/var": 0.10541667699982415, "completions/clipped_ratio": -2.7890625, "epoch": 0.2752688172043011, "grad_norm": 32.38128413606818, "learning_rate": 1.9857957426162217e-06, "loss": -0.7119, "num_tokens": 22610643.0, "residual_var": 0.03953128680586815, "reward": 0.681640625, "reward_std": 0.24216532707214355, "rewards/drgrpo_math_reward/mean": 0.681640625, "rewards/drgrpo_math_reward/std": 0.46629536151885986, "rho2": 0.6249997615814209, "step": 48 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.2857738137245178, "advantages/var": 0.08166667261065541, "completions/clipped_ratio": -2.703125, "epoch": 0.2810035842293907, "grad_norm": 28.47475443541043, "learning_rate": 1.9851863885413475e-06, "loss": -0.5351, "num_tokens": 22950911.0, "residual_var": 0.04593753442168236, "reward": 0.59765625, "reward_std": 0.18342819809913635, "rewards/drgrpo_math_reward/mean": 0.59765625, "rewards/drgrpo_math_reward/std": 0.4908501207828522, "rho2": 0.4374997019767761, "step": 49 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.028085371557379e-09, "advantages/std": 0.34681087732315063, "advantages/var": 0.12027778462965344, "completions/clipped_ratio": -2.8515625, "epoch": 0.2867383512544803, "grad_norm": 36.80295976447085, "learning_rate": 1.9845643345292055e-06, "loss": -1.8412, "num_tokens": 23293975.0, "residual_var": 0.045104220509529114, "reward": 0.625, "reward_std": 0.2658538818359375, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4845963716506958, "rho2": 0.6249996423721313, "step": 50 }, { "advantages/mean": 2.9103830456733704e-09, "advantages/snr": 7.966263326278952e-09, "advantages/std": 0.365338534116745, "advantages/var": 0.13347224451057205, "completions/clipped_ratio": -2.78125, "epoch": 0.2924731182795699, "grad_norm": 38.62206236189509, "learning_rate": 1.9839295885986295e-06, "loss": -0.3355, "num_tokens": 23658416.0, "residual_var": 0.041710127145051956, "reward": 0.572265625, "reward_std": 0.2920263707637787, "rewards/drgrpo_math_reward/mean": 0.572265625, "rewards/drgrpo_math_reward/std": 0.4952339828014374, "rho2": 0.6874996423721313, "step": 51 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.7957152519569e-09, "advantages/std": 0.33312496542930603, "advantages/var": 0.11097224259227634, "completions/clipped_ratio": -2.859375, "epoch": 0.2982078853046595, "grad_norm": 31.568044516980944, "learning_rate": 1.9832821589320657e-06, "loss": -0.6364, "num_tokens": 23990154.0, "residual_var": 0.04508252441883087, "reward": 0.693359375, "reward_std": 0.24725444614887238, "rewards/drgrpo_math_reward/mean": 0.693359375, "rewards/drgrpo_math_reward/std": 0.4615498185157776, "rho2": 0.5937495827674866, "step": 52 }, { "advantages/mean": -2.2118911147117615e-09, "advantages/snr": 6.238866009104758e-09, "advantages/std": 0.3545341491699219, "advantages/var": 0.12569446292764042, "completions/clipped_ratio": -2.8828125, "epoch": 0.3039426523297491, "grad_norm": 34.539429066421185, "learning_rate": 1.9826220538754633e-06, "loss": -1.525, "num_tokens": 24333751.0, "residual_var": 0.039279572665691376, "reward": 0.615234375, "reward_std": 0.28511297702789307, "rewards/drgrpo_math_reward/mean": 0.615234375, "rewards/drgrpo_math_reward/std": 0.4870156943798065, "rho2": 0.6874996423721313, "step": 53 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.0960716353833015e-09, "advantages/std": 0.3186342418193817, "advantages/var": 0.10152778005981222, "completions/clipped_ratio": -2.78125, "epoch": 0.3096774193548387, "grad_norm": 33.0914602530221, "learning_rate": 1.981949281938169e-06, "loss": -0.9074, "num_tokens": 24666975.0, "residual_var": 0.0475911907851696, "reward": 0.669921875, "reward_std": 0.22530576586723328, "rewards/drgrpo_math_reward/mean": 0.669921875, "rewards/drgrpo_math_reward/std": 0.47070086002349854, "rho2": 0.5312497019767761, "step": 54 }, { "advantages/mean": 5.820766091346741e-11, "advantages/snr": 1.7206111824871743e-10, "advantages/std": 0.33829641342163086, "advantages/var": 0.11444446333393898, "completions/clipped_ratio": -2.8359375, "epoch": 0.3154121863799283, "grad_norm": 35.14927646244548, "learning_rate": 1.981263851792818e-06, "loss": 0.2656, "num_tokens": 25032186.0, "residual_var": 0.046493109315633774, "reward": 0.5234375, "reward_std": 0.25199148058891296, "rewards/drgrpo_math_reward/mean": 0.5234375, "rewards/drgrpo_math_reward/std": 0.49993884563446045, "rho2": 0.5937496423721313, "step": 55 }, { "advantages/mean": -1.1059455573558807e-09, "advantages/snr": 3.060807498763691e-09, "advantages/std": 0.3613247573375702, "advantages/var": 0.13055558026505398, "completions/clipped_ratio": -2.8359375, "epoch": 0.3211469534050179, "grad_norm": 34.93777277042175, "learning_rate": 1.98056577227522e-06, "loss": -0.1509, "num_tokens": 25400736.0, "residual_var": 0.04079866781830788, "reward": 0.609375, "reward_std": 0.28988340497016907, "rewards/drgrpo_math_reward/mean": 0.609375, "rewards/drgrpo_math_reward/std": 0.48836761713027954, "rho2": 0.6874996423721313, "step": 56 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 6.804442682099992e-09, "advantages/std": 0.3421744406223297, "advantages/var": 0.11708334781520424, "completions/clipped_ratio": -2.8671875, "epoch": 0.32688172043010755, "grad_norm": 35.06148683865433, "learning_rate": 1.9798550523842466e-06, "loss": -1.0673, "num_tokens": 25754713.0, "residual_var": 0.04024743288755417, "reward": 0.548828125, "reward_std": 0.26655441522598267, "rewards/drgrpo_math_reward/mean": 0.548828125, "rewards/drgrpo_math_reward/std": 0.498096764087677, "rho2": 0.6562497615814209, "step": 57 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 1.932812620751568e-09, "advantages/std": 0.3011552095413208, "advantages/var": 0.09069446023387684, "completions/clipped_ratio": -2.890625, "epoch": 0.3326164874551971, "grad_norm": 30.133970214467123, "learning_rate": 1.9791317012817163e-06, "loss": -0.8993, "num_tokens": 26065226.0, "residual_var": 0.036844652146101, "reward": 0.740234375, "reward_std": 0.21664923429489136, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "rho2": 0.5937497615814209, "step": 58 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.30345237255096436, "advantages/var": 0.09208334240680927, "completions/clipped_ratio": -2.78125, "epoch": 0.33835125448028674, "grad_norm": 31.433599583915893, "learning_rate": 1.9783957282922735e-06, "loss": -0.6743, "num_tokens": 26408851.0, "residual_var": 0.051796916872262955, "reward": 0.716796875, "reward_std": 0.19450139999389648, "rewards/drgrpo_math_reward/mean": 0.716796875, "rewards/drgrpo_math_reward/std": 0.4509948492050171, "rho2": 0.43749961256980896, "step": 59 }, { "advantages/mean": -1.6880221664905548e-09, "advantages/snr": 4.6717588139024755e-09, "advantages/std": 0.3613247573375702, "advantages/var": 0.13055558026505398, "completions/clipped_ratio": -2.7734375, "epoch": 0.34408602150537637, "grad_norm": 36.36877636312811, "learning_rate": 1.9776471429032713e-06, "loss": -1.4261, "num_tokens": 26784375.0, "residual_var": 0.036718837916851044, "reward": 0.5859375, "reward_std": 0.28580355644226074, "rewards/drgrpo_math_reward/mean": 0.5859375, "rewards/drgrpo_math_reward/std": 0.49304109811782837, "rho2": 0.7187494039535522, "step": 60 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3120274543762207, "advantages/var": 0.09736113228450449, "completions/clipped_ratio": -2.6640625, "epoch": 0.34982078853046594, "grad_norm": 31.787886615863176, "learning_rate": 1.9768859547646473e-06, "loss": -1.0275, "num_tokens": 27167796.0, "residual_var": 0.0456380657851696, "reward": 0.501953125, "reward_std": 0.21528351306915283, "rewards/drgrpo_math_reward/mean": 0.501953125, "rewards/drgrpo_math_reward/std": 0.5004851818084717, "rho2": 0.5312496423721313, "step": 61 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 4.119480343577251e-10, "advantages/std": 0.2825970947742462, "advantages/var": 0.0798611179748443, "completions/clipped_ratio": -2.671875, "epoch": 0.35555555555555557, "grad_norm": 29.446574026034092, "learning_rate": 1.9761121736888013e-06, "loss": -0.6952, "num_tokens": 27512802.0, "residual_var": 0.03743492439389229, "reward": 0.603515625, "reward_std": 0.193976491689682, "rewards/drgrpo_math_reward/mean": 0.603515625, "rewards/drgrpo_math_reward/std": 0.4896455705165863, "rho2": 0.5312497615814209, "step": 62 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 3.8745360177794165e-10, "advantages/std": 0.30046260356903076, "advantages/var": 0.09027777614348054, "completions/clipped_ratio": -2.7109375, "epoch": 0.36129032258064514, "grad_norm": 32.810116930758326, "learning_rate": 1.9753258096504644e-06, "loss": -0.7128, "num_tokens": 27880614.0, "residual_var": 0.04513894021511078, "reward": 0.65234375, "reward_std": 0.20285914838314056, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.47669193148612976, "rho2": 0.4999995827674866, "step": 63 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.162360805333825e-09, "advantages/std": 0.30046260356903076, "advantages/var": 0.09027777614348054, "completions/clipped_ratio": -2.6015625, "epoch": 0.36702508960573477, "grad_norm": 32.54419136106036, "learning_rate": 1.974526872786577e-06, "loss": -1.329, "num_tokens": 28264628.0, "residual_var": 0.05360245332121849, "reward": 0.65625, "reward_std": 0.19380438327789307, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.4754233956336975, "rho2": 0.4062498211860657, "step": 64 }, { "advantages/mean": 5.820766091346741e-11, "advantages/snr": 1.6921021167445229e-10, "advantages/std": 0.3439961373806, "advantages/var": 0.11833334253277261, "completions/clipped_ratio": -2.3984375, "epoch": 0.3727598566308244, "grad_norm": 40.154017589377275, "learning_rate": 1.973715373396152e-06, "loss": -1.6288, "num_tokens": 28690098.0, "residual_var": 0.055468786507844925, "reward": 0.54296875, "reward_std": 0.25088727474212646, "rewards/drgrpo_math_reward/mean": 0.54296875, "rewards/drgrpo_math_reward/std": 0.49863746762275696, "rho2": 0.5312497615814209, "step": 65 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.5655488538617933e-09, "advantages/std": 0.29744282364845276, "advantages/var": 0.08847223333996457, "completions/clipped_ratio": -2.7578125, "epoch": 0.37849462365591396, "grad_norm": 31.71768374129029, "learning_rate": 1.9728913219401447e-06, "loss": -2.1175, "num_tokens": 29034604.0, "residual_var": 0.041471388190984726, "reward": 0.728515625, "reward_std": 0.20333345234394073, "rewards/drgrpo_math_reward/mean": 0.728515625, "rewards/drgrpo_math_reward/std": 0.44516023993492126, "rho2": 0.5312497615814209, "step": 66 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 4.144786466879422e-09, "advantages/std": 0.2808716893196106, "advantages/var": 0.07888890586125186, "completions/clipped_ratio": -2.46875, "epoch": 0.3842293906810036, "grad_norm": 29.75251535823391, "learning_rate": 1.9720547290413193e-06, "loss": -0.7041, "num_tokens": 29422092.0, "residual_var": 0.051770858466625214, "reward": 0.58203125, "reward_std": 0.173513263463974, "rewards/drgrpo_math_reward/mean": 0.58203125, "rewards/drgrpo_math_reward/std": 0.4937073290348053, "rho2": 0.3437498211860657, "step": 67 }, { "advantages/mean": 1.3387762010097504e-09, "advantages/snr": 4.062289118106036e-09, "advantages/std": 0.3295620083808899, "advantages/var": 0.10861111736804574, "completions/clipped_ratio": -2.5546875, "epoch": 0.3899641577060932, "grad_norm": 36.113379266550254, "learning_rate": 1.971205605484109e-06, "loss": -2.1523, "num_tokens": 29828955.0, "residual_var": 0.04751741141080856, "reward": 0.6484375, "reward_std": 0.2383894920349121, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4779251217842102, "rho2": 0.5624996423721313, "step": 68 }, { "advantages/mean": -1.979060471057892e-09, "advantages/snr": 6.690444501441469e-09, "advantages/std": 0.2958040237426758, "advantages/var": 0.0875000204623575, "completions/clipped_ratio": -2.609375, "epoch": 0.3956989247311828, "grad_norm": 32.10912423364059, "learning_rate": 1.9703439622144798e-06, "loss": -1.4137, "num_tokens": 30165437.0, "residual_var": 0.04101566597819328, "reward": 0.7578125, "reward_std": 0.20152214169502258, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "rho2": 0.5312496423721313, "step": 69 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.5552681091074503e-09, "advantages/std": 0.2733536660671234, "advantages/var": 0.07472222675233642, "completions/clipped_ratio": -2.796875, "epoch": 0.4014336917562724, "grad_norm": 31.614840432380678, "learning_rate": 1.969469810339786e-06, "loss": -0.4546, "num_tokens": 30516058.0, "residual_var": 0.03502606973052025, "reward": 0.7265625, "reward_std": 0.18578603863716125, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.4461594223976135, "rho2": 0.5312497615814209, "step": 70 }, { "advantages/mean": 2.9103830456733704e-10, "advantages/snr": 9.66406310375784e-10, "advantages/std": 0.3011552095413208, "advantages/var": 0.09069446023387684, "completions/clipped_ratio": -2.5, "epoch": 0.407168458781362, "grad_norm": 30.885956972771066, "learning_rate": 1.968583161128631e-06, "loss": -1.9752, "num_tokens": 30880221.0, "residual_var": 0.05384986475110054, "reward": 0.681640625, "reward_std": 0.19717584550380707, "rewards/drgrpo_math_reward/mean": 0.681640625, "rewards/drgrpo_math_reward/std": 0.46629536151885986, "rho2": 0.40624964237213135, "step": 71 }, { "advantages/mean": 1.7462298274040222e-09, "advantages/snr": 6.2117144142305535e-09, "advantages/std": 0.28111881017684937, "advantages/var": 0.07902778543524747, "completions/clipped_ratio": -2.7421875, "epoch": 0.4129032258064516, "grad_norm": 28.911886529162505, "learning_rate": 1.9676840260107193e-06, "loss": 0.1067, "num_tokens": 31257949.0, "residual_var": 0.046922776848077774, "reward": 0.701171875, "reward_std": 0.17475129663944244, "rewards/drgrpo_math_reward/mean": 0.701171875, "rewards/drgrpo_math_reward/std": 0.45819199085235596, "rho2": 0.4062497317790985, "step": 72 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 3.923121251303124e-10, "advantages/std": 0.2967415750026703, "advantages/var": 0.0880555623350654, "completions/clipped_ratio": -2.484375, "epoch": 0.41863799283154124, "grad_norm": 33.05813113726881, "learning_rate": 1.9667724165767103e-06, "loss": -1.5584, "num_tokens": 31637949.0, "residual_var": 0.04402781277894974, "reward": 0.62890625, "reward_std": 0.2027869075536728, "rewards/drgrpo_math_reward/mean": 0.62890625, "rewards/drgrpo_math_reward/std": 0.4835699498653412, "rho2": 0.4999997317790985, "step": 73 }, { "advantages/mean": 1.5133991837501526e-09, "advantages/snr": 4.5601915188318476e-09, "advantages/std": 0.3318718373775482, "advantages/var": 0.11013891644434981, "completions/clipped_ratio": -2.4609375, "epoch": 0.4243727598566308, "grad_norm": 33.36001000324979, "learning_rate": 1.9658483445780673e-06, "loss": -0.6869, "num_tokens": 32033074.0, "residual_var": 0.0413021557033062, "reward": 0.595703125, "reward_std": 0.24531753361225128, "rewards/drgrpo_math_reward/mean": 0.595703125, "rewards/drgrpo_math_reward/std": 0.4912354052066803, "rho2": 0.624999463558197, "step": 74 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.429973110515777e-09, "advantages/std": 0.31534814834594727, "advantages/var": 0.09944445466521756, "completions/clipped_ratio": -2.625, "epoch": 0.43010752688172044, "grad_norm": 31.808077461108994, "learning_rate": 1.964911821926909e-06, "loss": -2.8939, "num_tokens": 32390577.0, "residual_var": 0.0466146394610405, "reward": 0.69140625, "reward_std": 0.22101986408233643, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.4623647928237915, "rho2": 0.5312495231628418, "step": 75 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 4.133883827822451e-10, "advantages/std": 0.28161245584487915, "advantages/var": 0.07930557528698401, "completions/clipped_ratio": -2.390625, "epoch": 0.43584229390681006, "grad_norm": 30.280333887350324, "learning_rate": 1.9639628606958534e-06, "loss": -1.5517, "num_tokens": 32753728.0, "residual_var": 0.04213111475110054, "reward": 0.576171875, "reward_std": 0.18690845370292664, "rewards/drgrpo_math_reward/mean": 0.576171875, "rewards/drgrpo_math_reward/std": 0.4946470856666565, "rho2": 0.4687497019767761, "step": 76 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.2261956480186747e-09, "advantages/std": 0.2886751592159271, "advantages/var": 0.08333334754834087, "completions/clipped_ratio": -2.453125, "epoch": 0.44157706093189963, "grad_norm": 33.44201675062511, "learning_rate": 1.9630014731178623e-06, "loss": -0.4674, "num_tokens": 33133178.0, "residual_var": 0.04166669398546219, "reward": 0.57421875, "reward_std": 0.19587956368923187, "rewards/drgrpo_math_reward/mean": 0.57421875, "rewards/drgrpo_math_reward/std": 0.4949444830417633, "rho2": 0.4999997317790985, "step": 77 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 4.473220604335547e-09, "advantages/std": 0.23422449827194214, "advantages/var": 0.054861115590743026, "completions/clipped_ratio": -2.5390625, "epoch": 0.44731182795698926, "grad_norm": 23.682189457717946, "learning_rate": 1.962027671586086e-06, "loss": -1.2852, "num_tokens": 33481909.0, "residual_var": 0.03600262105464935, "reward": 0.646484375, "reward_std": 0.12895467877388, "rewards/drgrpo_math_reward/mean": 0.646484375, "rewards/drgrpo_math_reward/std": 0.47852855920791626, "rho2": 0.34374988079071045, "step": 78 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 5.587935224175462e-09, "advantages/std": 0.25, "advantages/var": 0.0625, "completions/clipped_ratio": -2.8515625, "epoch": 0.45304659498207883, "grad_norm": 26.301262831419308, "learning_rate": 1.9610414686536994e-06, "loss": -0.7137, "num_tokens": 33825311.0, "residual_var": 0.035156264901161194, "reward": 0.79296875, "reward_std": 0.15520820021629333, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40557438135147095, "rho2": 0.43749988079071045, "step": 79 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 3.224869082847296e-09, "advantages/std": 0.32489314675331116, "advantages/var": 0.10555555680726858, "completions/clipped_ratio": -2.359375, "epoch": 0.45878136200716846, "grad_norm": 38.3914580765648, "learning_rate": 1.9600428770337452e-06, "loss": -1.6889, "num_tokens": 34225099.0, "residual_var": 0.056076426059007645, "reward": 0.55859375, "reward_std": 0.22781488299369812, "rewards/drgrpo_math_reward/mean": 0.55859375, "rewards/drgrpo_math_reward/std": 0.4970405399799347, "rho2": 0.4687497615814209, "step": 80 }, { "advantages/mean": 2.6775524020195007e-09, "advantages/snr": 7.548142814857893e-09, "advantages/std": 0.35472995042800903, "advantages/var": 0.12583333773065775, "completions/clipped_ratio": -2.34375, "epoch": 0.4645161290322581, "grad_norm": 38.6424896405289, "learning_rate": 1.9590319095989656e-06, "loss": -0.8039, "num_tokens": 34615306.0, "residual_var": 0.04718755558133125, "reward": 0.59375, "reward_std": 0.2702009677886963, "rewards/drgrpo_math_reward/mean": 0.59375, "rewards/drgrpo_math_reward/std": 0.49161264300346375, "rho2": 0.6249996423721313, "step": 81 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.145400670526419e-09, "advantages/std": 0.27131369709968567, "advantages/var": 0.07361112223389998, "completions/clipped_ratio": -2.46875, "epoch": 0.47025089605734766, "grad_norm": 28.441987315968927, "learning_rate": 1.9580085793816383e-06, "loss": -1.4966, "num_tokens": 34980858.0, "residual_var": 0.04830732196569443, "reward": 0.6484375, "reward_std": 0.15774814784526825, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4779251217842102, "rho2": 0.3437497615814209, "step": 82 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.9847457993149985e-09, "advantages/std": 0.3120274245738983, "advantages/var": 0.0973611136862198, "completions/clipped_ratio": -2.515625, "epoch": 0.4759856630824373, "grad_norm": 32.53679814900421, "learning_rate": 1.9569728995734097e-06, "loss": -2.0527, "num_tokens": 35354981.0, "residual_var": 0.05172312632203102, "reward": 0.673828125, "reward_std": 0.2139427363872528, "rewards/drgrpo_math_reward/mean": 0.673828125, "rewards/drgrpo_math_reward/std": 0.4692695140838623, "rho2": 0.4687497615814209, "step": 83 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.5805066593368211e-09, "advantages/std": 0.294627845287323, "advantages/var": 0.08680556721865074, "completions/clipped_ratio": -2.515625, "epoch": 0.4817204301075269, "grad_norm": 32.77682131750612, "learning_rate": 1.955924883525122e-06, "loss": -1.2668, "num_tokens": 35725815.0, "residual_var": 0.05154084041714668, "reward": 0.638671875, "reward_std": 0.18537190556526184, "rewards/drgrpo_math_reward/mean": 0.638671875, "rewards/drgrpo_math_reward/std": 0.48085519671440125, "rho2": 0.4062497019767761, "step": 84 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.790868203086861e-09, "advantages/std": 0.25027763843536377, "advantages/var": 0.06263889630078268, "completions/clipped_ratio": -2.8203125, "epoch": 0.4874551971326165, "grad_norm": 25.60047375069588, "learning_rate": 1.954864544746643e-06, "loss": -0.6133, "num_tokens": 36048976.0, "residual_var": 0.03719186782836914, "reward": 0.724609375, "reward_std": 0.1485384702682495, "rewards/drgrpo_math_reward/mean": 0.724609375, "rewards/drgrpo_math_reward/std": 0.44714778661727905, "rho2": 0.4062497615814209, "step": 85 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 5.95515752303676e-09, "advantages/std": 0.2541325092315674, "advantages/var": 0.06458333224833268, "completions/clipped_ratio": -2.7265625, "epoch": 0.4931899641577061, "grad_norm": 26.76727004658746, "learning_rate": 1.953791896906692e-06, "loss": -0.8705, "num_tokens": 36379119.0, "residual_var": 0.04440106078982353, "reward": 0.728515625, "reward_std": 0.14256325364112854, "rewards/drgrpo_math_reward/mean": 0.728515625, "rewards/drgrpo_math_reward/std": 0.44516023993492126, "rho2": 0.3124999403953552, "step": 86 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.891385763964462e-09, "advantages/std": 0.32210248708724976, "advantages/var": 0.1037500121877919, "completions/clipped_ratio": -2.625, "epoch": 0.4989247311827957, "grad_norm": 35.655584888427214, "learning_rate": 1.952706953832663e-06, "loss": -1.2552, "num_tokens": 36731877.0, "residual_var": 0.04863286763429642, "reward": 0.662109375, "reward_std": 0.22177350521087646, "rewards/drgrpo_math_reward/mean": 0.662109375, "rewards/drgrpo_math_reward/std": 0.4734536409378052, "rho2": 0.5312495827674866, "step": 87 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 8.168238777016627e-10, "advantages/std": 0.285043865442276, "advantages/var": 0.08125000522627435, "completions/clipped_ratio": -2.5, "epoch": 0.5046594982078854, "grad_norm": 30.737015015982255, "learning_rate": 1.9516097295104467e-06, "loss": -0.3468, "num_tokens": 37113531.0, "residual_var": 0.04062502458691597, "reward": 0.619140625, "reward_std": 0.18963877856731415, "rewards/drgrpo_math_reward/mean": 0.619140625, "rewards/drgrpo_math_reward/std": 0.48607301712036133, "rho2": 0.49999985098838806, "step": 88 }, { "advantages/mean": -4.0745362639427185e-10, "advantages/snr": 1.4138188749862316e-09, "advantages/std": 0.28819364309310913, "advantages/var": 0.08305557591927837, "completions/clipped_ratio": -2.4453125, "epoch": 0.5103942652329749, "grad_norm": 31.00173858062215, "learning_rate": 1.9505002380842493e-06, "loss": -0.3095, "num_tokens": 37456324.0, "residual_var": 0.057100724428892136, "reward": 0.7578125, "reward_std": 0.172368124127388, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "rho2": 0.3124997615814209, "step": 89 }, { "advantages/mean": -1.979060471057892e-09, "advantages/snr": 6.127810401962046e-09, "advantages/std": 0.3229637145996094, "advantages/var": 0.10430556094797794, "completions/clipped_ratio": -2.71875, "epoch": 0.5161290322580645, "grad_norm": 33.05465801943676, "learning_rate": 1.9493784938564127e-06, "loss": -1.1773, "num_tokens": 37816567.0, "residual_var": 0.042374178767204285, "reward": 0.666015625, "reward_std": 0.23562106490135193, "rewards/drgrpo_math_reward/mean": 0.666015625, "rewards/drgrpo_math_reward/std": 0.47209542989730835, "rho2": 0.5937496423721313, "step": 90 }, { "advantages/mean": -2.444721758365631e-09, "advantages/snr": 7.891446530473376e-09, "advantages/std": 0.3097938597202301, "advantages/var": 0.09597223552035761, "completions/clipped_ratio": -2.46875, "epoch": 0.5218637992831541, "grad_norm": 30.982969420039755, "learning_rate": 1.948244511287226e-06, "loss": -1.5703, "num_tokens": 38214715.0, "residual_var": 0.04198788106441498, "reward": 0.638671875, "reward_std": 0.22393552958965302, "rewards/drgrpo_math_reward/mean": 0.638671875, "rewards/drgrpo_math_reward/std": 0.48085519671440125, "rho2": 0.5624997615814209, "step": 91 }, { "advantages/mean": 1.5133991837501526e-09, "advantages/snr": 5.286810017044903e-09, "advantages/std": 0.28625941276550293, "advantages/var": 0.08194445139685058, "completions/clipped_ratio": -2.6171875, "epoch": 0.5275985663082438, "grad_norm": 32.022601552867776, "learning_rate": 1.9470983049947442e-06, "loss": -0.1695, "num_tokens": 38562270.0, "residual_var": 0.04097225144505501, "reward": 0.59765625, "reward_std": 0.1907959133386612, "rewards/drgrpo_math_reward/mean": 0.59765625, "rewards/drgrpo_math_reward/std": 0.4908501207828522, "rho2": 0.4999997615814209, "step": 92 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 9.46156823761702e-10, "advantages/std": 0.24608038365840912, "advantages/var": 0.060555555221469826, "completions/clipped_ratio": -2.7265625, "epoch": 0.5333333333333333, "grad_norm": 23.016382128405997, "learning_rate": 1.945939889754595e-06, "loss": -0.2369, "num_tokens": 38894439.0, "residual_var": 0.035954881459474564, "reward": 0.74609375, "reward_std": 0.14727336168289185, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "rho2": 0.40624985098838806, "step": 93 }, { "advantages/mean": -4.0745362639427185e-10, "advantages/snr": 1.234770876704364e-09, "advantages/std": 0.3299831748008728, "advantages/var": 0.10888889565166338, "completions/clipped_ratio": -2.7109375, "epoch": 0.5390681003584229, "grad_norm": 33.28785783206862, "learning_rate": 1.944769280499791e-06, "loss": -1.0677, "num_tokens": 39249650.0, "residual_var": 0.047638922929763794, "reward": 0.703125, "reward_std": 0.24272273480892181, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45732781291007996, "rho2": 0.5624997615814209, "step": 94 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 5.450382814108627e-09, "advantages/std": 0.25630930066108704, "advantages/var": 0.06569445760537551, "completions/clipped_ratio": -2.4765625, "epoch": 0.5448028673835126, "grad_norm": 28.505523230392296, "learning_rate": 1.9435864923205368e-06, "loss": -1.1094, "num_tokens": 39633209.0, "residual_var": 0.04516495391726494, "reward": 0.521484375, "reward_std": 0.1514061689376831, "rewards/drgrpo_math_reward/mean": 0.521484375, "rewards/drgrpo_math_reward/std": 0.5000267624855042, "rho2": 0.3124998211860657, "step": 95 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 3.830588846684818e-10, "advantages/std": 0.3039097189903259, "advantages/var": 0.09236111729677887, "completions/clipped_ratio": -2.8515625, "epoch": 0.5505376344086022, "grad_norm": 32.373698865565096, "learning_rate": 1.9423915404640348e-06, "loss": -1.7002, "num_tokens": 39962331.0, "residual_var": 0.04040802642703056, "reward": 0.783203125, "reward_std": 0.21759384870529175, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "rho2": 0.5624997019767761, "step": 96 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.2627435241664555e-10, "advantages/std": 0.27309951186180115, "advantages/var": 0.07458334337915407, "completions/clipped_ratio": -2.65625, "epoch": 0.5562724014336917, "grad_norm": 28.16041254581349, "learning_rate": 1.9411844403342867e-06, "loss": -1.6583, "num_tokens": 40323661.0, "residual_var": 0.048945337533950806, "reward": 0.689453125, "reward_std": 0.1569630354642868, "rewards/drgrpo_math_reward/mean": 0.689453125, "rewards/drgrpo_math_reward/std": 0.46317005157470703, "rho2": 0.3437498211860657, "step": 97 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23243877291679382, "advantages/var": 0.054027783155064846, "completions/clipped_ratio": -2.796875, "epoch": 0.5620071684587814, "grad_norm": 24.960814501818952, "learning_rate": 1.9399652074918976e-06, "loss": -0.7598, "num_tokens": 40653638.0, "residual_var": 0.03376738354563713, "reward": 0.642578125, "reward_std": 0.1325943022966385, "rewards/drgrpo_math_reward/mean": 0.642578125, "rewards/drgrpo_math_reward/std": 0.4797092080116272, "rho2": 0.3749998211860657, "step": 98 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.0984154373211944e-09, "advantages/std": 0.27738863229751587, "advantages/var": 0.07694445332788646, "completions/clipped_ratio": -2.71875, "epoch": 0.567741935483871, "grad_norm": 30.202527221168374, "learning_rate": 1.938733857653874e-06, "loss": -1.6763, "num_tokens": 40991682.0, "residual_var": 0.0384722538292408, "reward": 0.75390625, "reward_std": 0.1837950199842453, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "rho2": 0.4999997913837433, "step": 99 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 4.108775516096273e-09, "advantages/std": 0.28333336114883423, "advantages/var": 0.08027779353989573, "completions/clipped_ratio": -2.6875, "epoch": 0.5734767025089605, "grad_norm": 29.816640120446095, "learning_rate": 1.9374904066934204e-06, "loss": -1.428, "num_tokens": 41345514.0, "residual_var": 0.047664958983659744, "reward": 0.7578125, "reward_std": 0.17804491519927979, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "rho2": 0.4062498211860657, "step": 100 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 5.2400423632308246e-09, "advantages/std": 0.24438132345676422, "advantages/var": 0.05972223125447962, "completions/clipped_ratio": -2.609375, "epoch": 0.5792114695340502, "grad_norm": 26.15861600594887, "learning_rate": 1.936234870639737e-06, "loss": -0.774, "num_tokens": 41695463.0, "residual_var": 0.03919273242354393, "reward": 0.6875, "reward_std": 0.1342916190624237, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4639657139778137, "rho2": 0.3437498211860657, "step": 101 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 7.45124264315808e-09, "advantages/std": 0.31247222423553467, "advantages/var": 0.09763889091870226, "completions/clipped_ratio": -2.890625, "epoch": 0.5849462365591398, "grad_norm": 32.92922304686191, "learning_rate": 1.934967265677811e-06, "loss": -1.8569, "num_tokens": 42011865.0, "residual_var": 0.04576826095581055, "reward": 0.810546875, "reward_std": 0.21477454900741577, "rewards/drgrpo_math_reward/mean": 0.810546875, "rewards/drgrpo_math_reward/std": 0.3922513723373413, "rho2": 0.5312498211860657, "step": 102 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 8.318917128636031e-10, "advantages/std": 0.27988094091415405, "advantages/var": 0.07833334108699219, "completions/clipped_ratio": -2.5625, "epoch": 0.5906810035842294, "grad_norm": 33.857150701891065, "learning_rate": 1.933687608148208e-06, "loss": -1.6523, "num_tokens": 42362398.0, "residual_var": 0.0440625324845314, "reward": 0.62890625, "reward_std": 0.17926228046417236, "rewards/drgrpo_math_reward/mean": 0.62890625, "rewards/drgrpo_math_reward/std": 0.4835699498653412, "rho2": 0.4374997317790985, "step": 103 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.2762124247677023e-09, "advantages/std": 0.3068658709526062, "advantages/var": 0.09416666275550156, "completions/clipped_ratio": -2.6171875, "epoch": 0.596415770609319, "grad_norm": 31.238896855509584, "learning_rate": 1.9323959145468632e-06, "loss": -2.1058, "num_tokens": 42706851.0, "residual_var": 0.04708337038755417, "reward": 0.62890625, "reward_std": 0.20591552555561066, "rewards/drgrpo_math_reward/mean": 0.62890625, "rewards/drgrpo_math_reward/std": 0.4835699498653412, "rho2": 0.4999997317790985, "step": 104 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.265271752359298e-09, "advantages/std": 0.300231397151947, "advantages/var": 0.09013889183581014, "completions/clipped_ratio": -2.859375, "epoch": 0.6021505376344086, "grad_norm": 31.03100344073479, "learning_rate": 1.9310922015248674e-06, "loss": -0.5022, "num_tokens": 43053105.0, "residual_var": 0.03661896288394928, "reward": 0.630859375, "reward_std": 0.2152237743139267, "rewards/drgrpo_math_reward/mean": 0.630859375, "rewards/drgrpo_math_reward/std": 0.4830440282821655, "rho2": 0.5937496423721313, "step": 105 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 6.29612066425447e-09, "advantages/std": 0.24037009477615356, "advantages/var": 0.05777778246269705, "completions/clipped_ratio": -2.8203125, "epoch": 0.6078853046594982, "grad_norm": 22.980274560279945, "learning_rate": 1.929776485888251e-06, "loss": -1.4719, "num_tokens": 43374582.0, "residual_var": 0.0325000174343586, "reward": 0.8125, "reward_std": 0.14931906759738922, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39069411158561707, "rho2": 0.43749988079071045, "step": 106 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 3.0089276720991367e-09, "advantages/std": 0.23213981091976166, "advantages/var": 0.053888891813862694, "completions/clipped_ratio": -2.859375, "epoch": 0.6136200716845878, "grad_norm": 24.577276561697886, "learning_rate": 1.928448784597772e-06, "loss": -0.0314, "num_tokens": 43698499.0, "residual_var": 0.03368057683110237, "reward": 0.76171875, "reward_std": 0.13252761960029602, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.42644867300987244, "rho2": 0.3749998211860657, "step": 107 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.726419543135627e-09, "advantages/std": 0.29556915163993835, "advantages/var": 0.08736112340115287, "completions/clipped_ratio": -2.578125, "epoch": 0.6193548387096774, "grad_norm": 30.019917221827324, "learning_rate": 1.927109114768691e-06, "loss": -0.8054, "num_tokens": 44057268.0, "residual_var": 0.051870688796043396, "reward": 0.623046875, "reward_std": 0.1867973506450653, "rewards/drgrpo_math_reward/mean": 0.623046875, "rewards/drgrpo_math_reward/std": 0.4850969910621643, "rho2": 0.4062498211860657, "step": 108 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.2835783362388611, "advantages/var": 0.08041667278400055, "completions/clipped_ratio": -2.7109375, "epoch": 0.625089605734767, "grad_norm": 27.98698546480038, "learning_rate": 1.925757493670555e-06, "loss": -1.0549, "num_tokens": 44371964.0, "residual_var": 0.0402083657681942, "reward": 0.705078125, "reward_std": 0.19510778784751892, "rewards/drgrpo_math_reward/mean": 0.705078125, "rewards/drgrpo_math_reward/std": 0.4564536213874817, "rho2": 0.4999997615814209, "step": 109 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 7.488619214291302e-10, "advantages/std": 0.3109126389026642, "advantages/var": 0.09666666902941845, "completions/clipped_ratio": -2.734375, "epoch": 0.6308243727598566, "grad_norm": 34.848586459018364, "learning_rate": 1.9243939387269745e-06, "loss": -2.4801, "num_tokens": 44703161.0, "residual_var": 0.04833338037133217, "reward": 0.765625, "reward_std": 0.21757444739341736, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42402184009552, "rho2": 0.49999967217445374, "step": 110 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 2.986702584959865e-09, "advantages/std": 0.2728451192378998, "advantages/var": 0.07444445909194375, "completions/clipped_ratio": -2.6171875, "epoch": 0.6365591397849463, "grad_norm": 31.07988005662117, "learning_rate": 1.9230184675153973e-06, "loss": -3.9704, "num_tokens": 45046109.0, "residual_var": 0.037222251296043396, "reward": 0.80078125, "reward_std": 0.18137921392917633, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "rho2": 0.4999997615814209, "step": 111 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 6.0184887961217405e-09, "advantages/std": 0.27080127596855164, "advantages/var": 0.07333333106619566, "completions/clipped_ratio": -2.90625, "epoch": 0.6422939068100358, "grad_norm": 32.82506439112962, "learning_rate": 1.9216310977668816e-06, "loss": -3.1148, "num_tokens": 45326331.0, "residual_var": 0.03895837441086769, "reward": 0.84765625, "reward_std": 0.1734059453010559, "rewards/drgrpo_math_reward/mean": 0.84765625, "rewards/drgrpo_math_reward/std": 0.35970520973205566, "rho2": 0.46874967217445374, "step": 112 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 1.0284736351218284e-09, "advantages/std": 0.22638463973999023, "advantages/var": 0.051250005110205166, "completions/clipped_ratio": -2.6484375, "epoch": 0.6480286738351254, "grad_norm": 23.56485422205502, "learning_rate": 1.9202318473658702e-06, "loss": -0.4371, "num_tokens": 45648257.0, "residual_var": 0.030429702252149582, "reward": 0.830078125, "reward_std": 0.13561411201953888, "rewards/drgrpo_math_reward/mean": 0.830078125, "rewards/drgrpo_math_reward/std": 0.3759314715862274, "rho2": 0.40624985098838806, "step": 113 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.54572504726175e-09, "advantages/std": 0.30731815099716187, "advantages/var": 0.09444444593231438, "completions/clipped_ratio": -2.34375, "epoch": 0.6537634408602151, "grad_norm": 32.618804536359384, "learning_rate": 1.918820734349957e-06, "loss": -1.9718, "num_tokens": 45990679.0, "residual_var": 0.04427086561918259, "reward": 0.625, "reward_std": 0.21223780512809753, "rewards/drgrpo_math_reward/mean": 0.625, "rewards/drgrpo_math_reward/std": 0.4845963716506958, "rho2": 0.5312497615814209, "step": 114 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.9490161485841037e-09, "advantages/std": 0.23892119526863098, "advantages/var": 0.057083337548591295, "completions/clipped_ratio": -2.640625, "epoch": 0.6594982078853047, "grad_norm": 23.441600063054505, "learning_rate": 1.917397776909656e-06, "loss": -1.1864, "num_tokens": 46299792.0, "residual_var": 0.03746095299720764, "reward": 0.767578125, "reward_std": 0.13512971997261047, "rewards/drgrpo_math_reward/mean": 0.767578125, "rewards/drgrpo_math_reward/std": 0.42278963327407837, "rho2": 0.34374985098838806, "step": 115 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.4599870418689282e-09, "advantages/std": 0.23921167850494385, "advantages/var": 0.057222227133152614, "completions/clipped_ratio": -2.515625, "epoch": 0.6652329749103942, "grad_norm": 26.187522976046864, "learning_rate": 1.9159629933881667e-06, "loss": -2.4797, "num_tokens": 46611215.0, "residual_var": 0.0393403023481369, "reward": 0.7109375, "reward_std": 0.1254923790693283, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45377036929130554, "rho2": 0.3124997913837433, "step": 116 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.298516889217388e-09, "advantages/std": 0.29790937900543213, "advantages/var": 0.0887499980994022, "completions/clipped_ratio": -2.5859375, "epoch": 0.6709677419354839, "grad_norm": 32.51008457335095, "learning_rate": 1.9145164022811366e-06, "loss": -2.4398, "num_tokens": 46955441.0, "residual_var": 0.047148458659648895, "reward": 0.748046875, "reward_std": 0.20099927484989166, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "rho2": 0.46874985098838806, "step": 117 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 5.00200410821974e-10, "advantages/std": 0.23273734748363495, "advantages/var": 0.05416667291371824, "completions/clipped_ratio": -2.53125, "epoch": 0.6767025089605735, "grad_norm": 26.59728367862591, "learning_rate": 1.9130580222364246e-06, "loss": -0.622, "num_tokens": 47296402.0, "residual_var": 0.03216147795319557, "reward": 0.65234375, "reward_std": 0.1397346556186676, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.47669193148612976, "rho2": 0.40624985098838806, "step": 118 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 5.24278659516381e-09, "advantages/std": 0.2220485508441925, "advantages/var": 0.049305558932005944, "completions/clipped_ratio": -2.59375, "epoch": 0.6824372759856631, "grad_norm": 23.857909361808463, "learning_rate": 1.9115878720538587e-06, "loss": 0.0416, "num_tokens": 47597946.0, "residual_var": 0.030815988779067993, "reward": 0.814453125, "reward_std": 0.12807637453079224, "rewards/drgrpo_math_reward/mean": 0.814453125, "rewards/drgrpo_math_reward/std": 0.38912075757980347, "rho2": 0.37499988079071045, "step": 119 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 3.1142321116265092e-09, "advantages/std": 0.2616719603538513, "advantages/var": 0.06847221483542754, "completions/clipped_ratio": -2.4609375, "epoch": 0.6881720430107527, "grad_norm": 29.153983730350756, "learning_rate": 1.9101059706849955e-06, "loss": -2.5864, "num_tokens": 47925691.0, "residual_var": 0.03423614799976349, "reward": 0.767578125, "reward_std": 0.17110365629196167, "rewards/drgrpo_math_reward/mean": 0.767578125, "rewards/drgrpo_math_reward/std": 0.42278963327407837, "rho2": 0.49999967217445374, "step": 120 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.645304879921266e-09, "advantages/std": 0.2640496790409088, "advantages/var": 0.06972223300160696, "completions/clipped_ratio": -2.328125, "epoch": 0.6939068100358423, "grad_norm": 35.400483650292266, "learning_rate": 1.9086123372328743e-06, "loss": -1.8255, "num_tokens": 48263203.0, "residual_var": 0.045755233615636826, "reward": 0.71875, "reward_std": 0.14787563681602478, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45004892349243164, "rho2": 0.3437498211860657, "step": 121 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.419235235642553e-09, "advantages/std": 0.24608038365840912, "advantages/var": 0.060555555221469826, "completions/clipped_ratio": -2.0703125, "epoch": 0.6996415770609319, "grad_norm": 28.02709562123867, "learning_rate": 1.9071069909517714e-06, "loss": -2.261, "num_tokens": 48626409.0, "residual_var": 0.03784724697470665, "reward": 0.61328125, "reward_std": 0.1407259702682495, "rewards/drgrpo_math_reward/mean": 0.61328125, "rewards/drgrpo_math_reward/std": 0.48747459053993225, "rho2": 0.3749997913837433, "step": 122 }, { "advantages/mean": -1.6880221664905548e-09, "advantages/snr": 6.281200961773778e-09, "advantages/std": 0.2687419354915619, "advantages/var": 0.07222222789175081, "completions/clipped_ratio": -2.3125, "epoch": 0.7053763440860215, "grad_norm": 31.75494076388148, "learning_rate": 1.9055899512469525e-06, "loss": -4.2113, "num_tokens": 48982817.0, "residual_var": 0.03611113131046295, "reward": 0.70703125, "reward_std": 0.18038788437843323, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.455569326877594, "rho2": 0.49999988079071045, "step": 123 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.4157847881709487e-09, "advantages/std": 0.2409472018480301, "advantages/var": 0.058055554078395355, "completions/clipped_ratio": -2.15625, "epoch": 0.7111111111111111, "grad_norm": 25.229028421351064, "learning_rate": 1.9040612376744214e-06, "loss": -2.7044, "num_tokens": 49334840.0, "residual_var": 0.029027797281742096, "reward": 0.5703125, "reward_std": 0.15800055861473083, "rewards/drgrpo_math_reward/mean": 0.5703125, "rewards/drgrpo_math_reward/std": 0.4955156147480011, "rho2": 0.4999997317790985, "step": 124 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.1638538857381264e-09, "advantages/std": 0.3075440526008606, "advantages/var": 0.09458334429016091, "completions/clipped_ratio": -2.453125, "epoch": 0.7168458781362007, "grad_norm": 37.728745532806784, "learning_rate": 1.9025208699406693e-06, "loss": -5.4887, "num_tokens": 49672536.0, "residual_var": 0.04433596879243851, "reward": 0.654296875, "reward_std": 0.2129577100276947, "rewards/drgrpo_math_reward/mean": 0.654296875, "rewards/drgrpo_math_reward/std": 0.4760620892047882, "rho2": 0.5312497615814209, "step": 125 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 5.099658476573414e-09, "advantages/std": 0.25110867619514465, "advantages/var": 0.06305556726047801, "completions/clipped_ratio": -2.5546875, "epoch": 0.7225806451612903, "grad_norm": 31.189033052065547, "learning_rate": 1.9009688679024189e-06, "loss": -2.9939, "num_tokens": 49984803.0, "residual_var": 0.03940974175930023, "reward": 0.703125, "reward_std": 0.14921027421951294, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45732781291007996, "rho2": 0.37499985098838806, "step": 126 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 4.49938174780678e-09, "advantages/std": 0.258736252784729, "advantages/var": 0.06694444850508319, "completions/clipped_ratio": -2.1015625, "epoch": 0.72831541218638, "grad_norm": 36.916885531108164, "learning_rate": 1.899405251566371e-06, "loss": -6.6347, "num_tokens": 50337648.0, "residual_var": 0.039748284965753555, "reward": 0.703125, "reward_std": 0.16253292560577393, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45732781291007996, "rho2": 0.40624988079071045, "step": 127 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 3.952242982513429e-09, "advantages/std": 0.26509955525398254, "advantages/var": 0.07027777419585934, "completions/clipped_ratio": -2.2421875, "epoch": 0.7340501792114695, "grad_norm": 34.12099182084098, "learning_rate": 1.8978300410889436e-06, "loss": -3.872, "num_tokens": 50654394.0, "residual_var": 0.037335094064474106, "reward": 0.6953125, "reward_std": 0.16867445409297943, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4607250988483429, "rho2": 0.4687497615814209, "step": 128 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 9.142140892585025e-10, "advantages/std": 0.2546784579753876, "advantages/var": 0.06486111695672125, "completions/clipped_ratio": -2.578125, "epoch": 0.7397849462365591, "grad_norm": 33.50250388737256, "learning_rate": 1.896243256776015e-06, "loss": -4.1713, "num_tokens": 50964294.0, "residual_var": 0.04053822532296181, "reward": 0.791015625, "reward_std": 0.14649513363838196, "rewards/drgrpo_math_reward/mean": 0.791015625, "rewards/drgrpo_math_reward/std": 0.40698084235191345, "rho2": 0.3749997615814209, "step": 129 }, { "advantages/mean": -7.566995918750763e-10, "advantages/snr": 2.6546776025304036e-09, "advantages/std": 0.285043865442276, "advantages/var": 0.08125000522627435, "completions/clipped_ratio": -2.5, "epoch": 0.7455197132616488, "grad_norm": 35.64887005278684, "learning_rate": 1.8946449190826594e-06, "loss": -4.7494, "num_tokens": 51266933.0, "residual_var": 0.05078127607703209, "reward": 0.705078125, "reward_std": 0.17685548961162567, "rewards/drgrpo_math_reward/mean": 0.705078125, "rewards/drgrpo_math_reward/std": 0.4564536213874817, "rho2": 0.37499985098838806, "step": 130 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 4.998592723313688e-09, "advantages/std": 0.30276504158973694, "advantages/var": 0.09166667040883514, "completions/clipped_ratio": -2.4609375, "epoch": 0.7512544802867384, "grad_norm": 32.403998403564955, "learning_rate": 1.8930350486128855e-06, "loss": -3.2787, "num_tokens": 51601695.0, "residual_var": 0.04583336412906647, "reward": 0.68359375, "reward_std": 0.20655442774295807, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.46552830934524536, "rho2": 0.4999997615814209, "step": 131 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.224326080532804e-09, "advantages/std": 0.31402406096458435, "advantages/var": 0.09861111086468899, "completions/clipped_ratio": -2.453125, "epoch": 0.7569892473118279, "grad_norm": 37.208491305272936, "learning_rate": 1.8914136661193688e-06, "loss": -4.1807, "num_tokens": 51933158.0, "residual_var": 0.049305595457553864, "reward": 0.75, "reward_std": 0.21507197618484497, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.43343618512153625, "rho2": 0.4999997019767761, "step": 132 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 4.327703234254224e-09, "advantages/std": 0.26900023221969604, "advantages/var": 0.0723611249342504, "completions/clipped_ratio": -2.421875, "epoch": 0.7627240143369176, "grad_norm": 32.20776292854696, "learning_rate": 1.8897807925031862e-06, "loss": -2.5995, "num_tokens": 52256893.0, "residual_var": 0.04296444356441498, "reward": 0.783203125, "reward_std": 0.1603575348854065, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "rho2": 0.4062497019767761, "step": 133 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.3070853035884116e-10, "advantages/std": 0.2702879309654236, "advantages/var": 0.07305556562556959, "completions/clipped_ratio": -2.328125, "epoch": 0.7684587813620072, "grad_norm": 29.549175385450976, "learning_rate": 1.8881364488135445e-06, "loss": -2.8085, "num_tokens": 52576424.0, "residual_var": 0.04565975069999695, "reward": 0.73828125, "reward_std": 0.15783660113811493, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "rho2": 0.3749998211860657, "step": 134 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.1711773090970947e-09, "advantages/std": 0.2936835289001465, "advantages/var": 0.08625001514724318, "completions/clipped_ratio": -2.4375, "epoch": 0.7741935483870968, "grad_norm": 39.59347710044669, "learning_rate": 1.8864806562475108e-06, "loss": -6.0586, "num_tokens": 52917277.0, "residual_var": 0.04312502592802048, "reward": 0.740234375, "reward_std": 0.20168951153755188, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "rho2": 0.4999998211860657, "step": 135 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 4.336034084095076e-10, "advantages/std": 0.26848340034484863, "advantages/var": 0.07208333626073227, "completions/clipped_ratio": -2.3046875, "epoch": 0.7799283154121864, "grad_norm": 34.70622499354557, "learning_rate": 1.8848134361497382e-06, "loss": -7.257, "num_tokens": 53251349.0, "residual_var": 0.047304727137088776, "reward": 0.708984375, "reward_std": 0.14985541999340057, "rewards/drgrpo_math_reward/mean": 0.708984375, "rewards/drgrpo_math_reward/std": 0.45467492938041687, "rho2": 0.34374967217445374, "step": 136 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.7548191728888788e-09, "advantages/std": 0.26536139845848083, "advantages/var": 0.07041667179184063, "completions/clipped_ratio": -2.53125, "epoch": 0.785663082437276, "grad_norm": 32.42150899639404, "learning_rate": 1.883134810012191e-06, "loss": -4.42, "num_tokens": 53536845.0, "residual_var": 0.04621097072958946, "reward": 0.787109375, "reward_std": 0.15378239750862122, "rewards/drgrpo_math_reward/mean": 0.787109375, "rewards/drgrpo_math_reward/std": 0.409751296043396, "rho2": 0.3437497615814209, "step": 137 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.8423436125458726e-09, "advantages/std": 0.32766008377075195, "advantages/var": 0.10736113049665619, "completions/clipped_ratio": -2.234375, "epoch": 0.7913978494623656, "grad_norm": 42.37282530518774, "learning_rate": 1.8814447994738676e-06, "loss": -8.1968, "num_tokens": 53895812.0, "residual_var": 0.04026047885417938, "reward": 0.658203125, "reward_std": 0.24190157651901245, "rewards/drgrpo_math_reward/mean": 0.658203125, "rewards/drgrpo_math_reward/std": 0.4747757613658905, "rho2": 0.6249995231628418, "step": 138 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 3.2381128953452067e-09, "advantages/std": 0.25166115164756775, "advantages/var": 0.06333333524858009, "completions/clipped_ratio": -2.453125, "epoch": 0.7971326164874551, "grad_norm": 30.11770160973668, "learning_rate": 1.8797434263205215e-06, "loss": -2.488, "num_tokens": 54201228.0, "residual_var": 0.041562534868717194, "reward": 0.70703125, "reward_std": 0.1403549164533615, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.455569326877594, "rho2": 0.3437497913837433, "step": 139 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 1.7539199724385576e-09, "advantages/std": 0.33187180757522583, "advantages/var": 0.11013889666324772, "completions/clipped_ratio": -2.2265625, "epoch": 0.8028673835125448, "grad_norm": 42.25628141371556, "learning_rate": 1.8780307124843801e-06, "loss": -8.849, "num_tokens": 54519286.0, "residual_var": 0.061953168362379074, "reward": 0.650390625, "reward_std": 0.2232649326324463, "rewards/drgrpo_math_reward/mean": 0.650390625, "rewards/drgrpo_math_reward/std": 0.47731292247772217, "rho2": 0.4374997019767761, "step": 140 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 5.864986533940788e-09, "advantages/std": 0.27788886427879333, "advantages/var": 0.07722222089015762, "completions/clipped_ratio": -2.6484375, "epoch": 0.8086021505376344, "grad_norm": 35.572825878828176, "learning_rate": 1.8763066800438634e-06, "loss": -0.5565, "num_tokens": 54796939.0, "residual_var": 0.04343752562999725, "reward": 0.7421875, "reward_std": 0.17755252122879028, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "rho2": 0.4374998211860657, "step": 141 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.577308712098342e-09, "advantages/std": 0.26034167408943176, "advantages/var": 0.0677777872676879, "completions/clipped_ratio": -2.6796875, "epoch": 0.814336917562724, "grad_norm": 27.220764829896947, "learning_rate": 1.8745713512232975e-06, "loss": -2.0889, "num_tokens": 55090469.0, "residual_var": 0.046597253531217575, "reward": 0.73828125, "reward_std": 0.143000990152359, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "rho2": 0.3124997317790985, "step": 142 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.403118581123502e-09, "advantages/std": 0.29083216190338135, "advantages/var": 0.08458334639739462, "completions/clipped_ratio": -2.453125, "epoch": 0.8200716845878137, "grad_norm": 34.05651806766556, "learning_rate": 1.872824748392629e-06, "loss": -2.5689, "num_tokens": 55395012.0, "residual_var": 0.044934939593076706, "reward": 0.701171875, "reward_std": 0.18776723742485046, "rewards/drgrpo_math_reward/mean": 0.701171875, "rewards/drgrpo_math_reward/std": 0.45819199085235596, "rho2": 0.46874967217445374, "step": 143 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.4671263536797923e-09, "advantages/std": 0.23804762959480286, "advantages/var": 0.05666667395570446, "completions/clipped_ratio": -2.2421875, "epoch": 0.8258064516129032, "grad_norm": 27.74705302742698, "learning_rate": 1.8710668940671375e-06, "loss": -0.6783, "num_tokens": 55708770.0, "residual_var": 0.0389583483338356, "reward": 0.62890625, "reward_std": 0.12483605742454529, "rewards/drgrpo_math_reward/mean": 0.62890625, "rewards/drgrpo_math_reward/std": 0.4835699498653412, "rho2": 0.31249988079071045, "step": 144 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 7.858647284893909e-10, "advantages/std": 0.2962731719017029, "advantages/var": 0.08777779238869599, "completions/clipped_ratio": -2.4375, "epoch": 0.8315412186379928, "grad_norm": 36.09380396601779, "learning_rate": 1.8692978109071436e-06, "loss": -6.2432, "num_tokens": 56012230.0, "residual_var": 0.04388892650604248, "reward": 0.671875, "reward_std": 0.1975100338459015, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4699897766113281, "rho2": 0.4999997019767761, "step": 145 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.28012895584106445, "advantages/var": 0.07847223190060504, "completions/clipped_ratio": -2.640625, "epoch": 0.8372759856630825, "grad_norm": 35.900498057580585, "learning_rate": 1.8675175217177175e-06, "loss": -6.586, "num_tokens": 56297418.0, "residual_var": 0.044140659272670746, "reward": 0.794921875, "reward_std": 0.17694061994552612, "rewards/drgrpo_math_reward/mean": 0.794921875, "rewards/drgrpo_math_reward/std": 0.4041535556316376, "rho2": 0.4374997019767761, "step": 146 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 7.672724191406409e-10, "advantages/std": 0.30345237255096436, "advantages/var": 0.09208334240680927, "completions/clipped_ratio": -2.28125, "epoch": 0.843010752688172, "grad_norm": 37.39134805084733, "learning_rate": 1.8657260494483857e-06, "loss": -4.8001, "num_tokens": 56630433.0, "residual_var": 0.048919305205345154, "reward": 0.708984375, "reward_std": 0.20562899112701416, "rewards/drgrpo_math_reward/mean": 0.708984375, "rewards/drgrpo_math_reward/std": 0.45467492938041687, "rho2": 0.4687497615814209, "step": 147 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.9561684715730563e-09, "advantages/std": 0.23804762959480286, "advantages/var": 0.05666667395570446, "completions/clipped_ratio": -2.4765625, "epoch": 0.8487455197132616, "grad_norm": 30.393977496000083, "learning_rate": 1.863923417192835e-06, "loss": -3.6028, "num_tokens": 56937811.0, "residual_var": 0.03541668877005577, "reward": 0.7578125, "reward_std": 0.14091923832893372, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "rho2": 0.3749998211860657, "step": 148 }, { "advantages/mean": -5.238689482212067e-10, "advantages/snr": 1.7738179546489343e-09, "advantages/std": 0.2953341007232666, "advantages/var": 0.08722223105002058, "completions/clipped_ratio": -2.65625, "epoch": 0.8544802867383513, "grad_norm": 40.15410412837706, "learning_rate": 1.862109648188614e-06, "loss": -6.0191, "num_tokens": 57227147.0, "residual_var": 0.04633684828877449, "reward": 0.84765625, "reward_std": 0.1902497410774231, "rewards/drgrpo_math_reward/mean": 0.84765625, "rewards/drgrpo_math_reward/std": 0.35970520973205566, "rho2": 0.46874964237213135, "step": 149 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 2.2953868413520204e-09, "advantages/std": 0.2535853981971741, "advantages/var": 0.06430555417881934, "completions/clipped_ratio": -2.59375, "epoch": 0.8602150537634409, "grad_norm": 32.18043136600018, "learning_rate": 1.8602847658168334e-06, "loss": -3.3899, "num_tokens": 57503455.0, "residual_var": 0.03416234999895096, "reward": 0.810546875, "reward_std": 0.16125573217868805, "rewards/drgrpo_math_reward/mean": 0.810546875, "rewards/drgrpo_math_reward/std": 0.3922513723373413, "rho2": 0.4687497615814209, "step": 150 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 2.153542651794206e-09, "advantages/std": 0.2702879309654236, "advantages/var": 0.07305556562556959, "completions/clipped_ratio": -2.1796875, "epoch": 0.8659498207885304, "grad_norm": 38.13655309488044, "learning_rate": 1.858448793601866e-06, "loss": -5.6129, "num_tokens": 57852349.0, "residual_var": 0.04109378159046173, "reward": 0.69921875, "reward_std": 0.1738053411245346, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "rho2": 0.4374997913837433, "step": 151 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 6.620013517399945e-09, "advantages/std": 0.2813657522201538, "advantages/var": 0.07916668652241299, "completions/clipped_ratio": -2.4296875, "epoch": 0.8716845878136201, "grad_norm": 36.50026866407634, "learning_rate": 1.8566017552110425e-06, "loss": -1.8084, "num_tokens": 58160996.0, "residual_var": 0.04947919398546219, "reward": 0.7109375, "reward_std": 0.17487786710262299, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45377036929130554, "rho2": 0.3749998211860657, "step": 152 }, { "advantages/mean": -1.7462298274040222e-09, "advantages/snr": 6.969448276172948e-09, "advantages/std": 0.2505549490451813, "advantages/var": 0.06277778249103338, "completions/clipped_ratio": -2.625, "epoch": 0.8774193548387097, "grad_norm": 33.18420394466814, "learning_rate": 1.8547436744543466e-06, "loss": -3.5112, "num_tokens": 58446763.0, "residual_var": 0.037274330854415894, "reward": 0.82421875, "reward_std": 0.14997635781764984, "rewards/drgrpo_math_reward/mean": 0.82421875, "rewards/drgrpo_math_reward/std": 0.3810062110424042, "rho2": 0.4062498211860657, "step": 153 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.2896358072757721, "advantages/var": 0.0838889008562882, "completions/clipped_ratio": -2.2734375, "epoch": 0.8831541218637993, "grad_norm": 35.52841655642974, "learning_rate": 1.8528745752841072e-06, "loss": -4.0018, "num_tokens": 58798203.0, "residual_var": 0.04980906471610069, "reward": 0.68359375, "reward_std": 0.18068912625312805, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.46552830934524536, "rho2": 0.4062497317790985, "step": 154 }, { "advantages/mean": -1.4551915228366852e-09, "advantages/snr": 5.203938525071466e-09, "advantages/std": 0.27963271737098694, "advantages/var": 0.07819445662428226, "completions/clipped_ratio": -2.625, "epoch": 0.8888888888888888, "grad_norm": 35.129188292834954, "learning_rate": 1.850994481794692e-06, "loss": -5.2733, "num_tokens": 59092112.0, "residual_var": 0.046427976340055466, "reward": 0.798828125, "reward_std": 0.17982199788093567, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "rho2": 0.40624985098838806, "step": 155 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.0416865816239633e-09, "advantages/std": 0.3061862289905548, "advantages/var": 0.09375000682345647, "completions/clipped_ratio": -2.359375, "epoch": 0.8946236559139785, "grad_norm": 41.01386187545441, "learning_rate": 1.8491034182221936e-06, "loss": -5.3663, "num_tokens": 59429539.0, "residual_var": 0.05273441970348358, "reward": 0.716796875, "reward_std": 0.1961509883403778, "rewards/drgrpo_math_reward/mean": 0.716796875, "rewards/drgrpo_math_reward/std": 0.4509948492050171, "rho2": 0.43749961256980896, "step": 156 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 3.5234254757253314e-09, "advantages/std": 0.33040380477905273, "advantages/var": 0.10916667421247439, "completions/clipped_ratio": -2.4140625, "epoch": 0.9003584229390681, "grad_norm": 40.18367991396442, "learning_rate": 1.84720140894412e-06, "loss": -4.5327, "num_tokens": 59757269.0, "residual_var": 0.054583385586738586, "reward": 0.68359375, "reward_std": 0.2329733967781067, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.46552830934524536, "rho2": 0.49999961256980896, "step": 157 }, { "advantages/mean": 8.731149137020111e-10, "advantages/snr": 2.8838032103556524e-09, "advantages/std": 0.3027650713920593, "advantages/var": 0.09166668845503878, "completions/clipped_ratio": -2.15625, "epoch": 0.9060931899641577, "grad_norm": 40.62651427218035, "learning_rate": 1.845288478479079e-06, "loss": -0.9751, "num_tokens": 60099644.0, "residual_var": 0.04869794845581055, "reward": 0.69921875, "reward_std": 0.2026558220386505, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "rho2": 0.4687497615814209, "step": 158 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.703888723375309e-10, "advantages/std": 0.2474873960018158, "advantages/var": 0.06125001117975959, "completions/clipped_ratio": -2.5078125, "epoch": 0.9118279569892473, "grad_norm": 28.337298759357747, "learning_rate": 1.8433646514864622e-06, "loss": -0.4044, "num_tokens": 60398378.0, "residual_var": 0.038281265646219254, "reward": 0.802734375, "reward_std": 0.14674167335033417, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "rho2": 0.37499985098838806, "step": 159 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.22546251118183136, "advantages/var": 0.05083334394841743, "completions/clipped_ratio": -2.453125, "epoch": 0.9175627240143369, "grad_norm": 29.80459833915241, "learning_rate": 1.841429952766127e-06, "loss": -2.1933, "num_tokens": 60704923.0, "residual_var": 0.03494793549180031, "reward": 0.80078125, "reward_std": 0.11855455487966537, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "rho2": 0.31249991059303284, "step": 160 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.442399351516589e-09, "advantages/std": 0.31446605920791626, "advantages/var": 0.0988889023937567, "completions/clipped_ratio": -2.265625, "epoch": 0.9232974910394265, "grad_norm": 38.49379688291279, "learning_rate": 1.8394844072580772e-06, "loss": -1.3907, "num_tokens": 61046432.0, "residual_var": 0.052534762769937515, "reward": 0.75390625, "reward_std": 0.2125454992055893, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "rho2": 0.4687497615814209, "step": 161 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.996482923379791e-09, "advantages/std": 0.23303553462028503, "advantages/var": 0.054305560395762065, "completions/clipped_ratio": -2.4609375, "epoch": 0.9290322580645162, "grad_norm": 32.133383049582775, "learning_rate": 1.8375280400421418e-06, "loss": -2.473, "num_tokens": 61388947.0, "residual_var": 0.032243940979242325, "reward": 0.771484375, "reward_std": 0.1386905461549759, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "rho2": 0.40624985098838806, "step": 162 }, { "advantages/mean": -5.820766091346741e-11, "advantages/snr": 2.2022284563901245e-10, "advantages/std": 0.2643125355243683, "advantages/var": 0.06986111643532045, "completions/clipped_ratio": -2.2890625, "epoch": 0.9347670250896057, "grad_norm": 33.44241356396429, "learning_rate": 1.8355608763376506e-06, "loss": -2.0501, "num_tokens": 61741597.0, "residual_var": 0.03929691016674042, "reward": 0.767578125, "reward_std": 0.16086195409297943, "rewards/drgrpo_math_reward/mean": 0.767578125, "rewards/drgrpo_math_reward/std": 0.42278963327407837, "rho2": 0.43749961256980896, "step": 163 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 4.481115930760375e-09, "advantages/std": 0.2078327238559723, "advantages/var": 0.04319444110539283, "completions/clipped_ratio": -2.359375, "epoch": 0.9405017921146953, "grad_norm": 26.784891548433972, "learning_rate": 1.833582941503111e-06, "loss": -2.2134, "num_tokens": 62086731.0, "residual_var": 0.031046023592352867, "reward": 0.865234375, "reward_std": 0.10210912674665451, "rewards/drgrpo_math_reward/mean": 0.865234375, "rewards/drgrpo_math_reward/std": 0.3418070077896118, "rho2": 0.28124988079071045, "step": 164 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 2.750526549712868e-09, "advantages/std": 0.2962731719017029, "advantages/var": 0.08777779238869599, "completions/clipped_ratio": -2.1171875, "epoch": 0.946236559139785, "grad_norm": 42.68266951009528, "learning_rate": 1.8315942610358788e-06, "loss": -3.0715, "num_tokens": 62459211.0, "residual_var": 0.05211809277534485, "reward": 0.72265625, "reward_std": 0.1834416389465332, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4481254518032074, "rho2": 0.4062497019767761, "step": 165 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.749650228156056e-09, "advantages/std": 0.26614534854888916, "advantages/var": 0.0708333465542097, "completions/clipped_ratio": -2.3515625, "epoch": 0.9519713261648746, "grad_norm": 33.890020487122975, "learning_rate": 1.8295948605718311e-06, "loss": -2.8072, "num_tokens": 62822726.0, "residual_var": 0.03984377905726433, "reward": 0.84375, "reward_std": 0.16864262521266937, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.36344730854034424, "rho2": 0.4374997615814209, "step": 166 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 6.570342965980063e-09, "advantages/std": 0.23033791780471802, "advantages/var": 0.053055556378613034, "completions/clipped_ratio": -2.2265625, "epoch": 0.9577060931899641, "grad_norm": 32.72878021095955, "learning_rate": 1.8275847658850357e-06, "loss": -2.6428, "num_tokens": 63176463.0, "residual_var": 0.036475714296102524, "reward": 0.921875, "reward_std": 0.12257213890552521, "rewards/drgrpo_math_reward/mean": 0.921875, "rewards/drgrpo_math_reward/std": 0.26863065361976624, "rho2": 0.31249985098838806, "step": 167 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 2.0912321022543545e-09, "advantages/std": 0.22267316281795502, "advantages/var": 0.049583337439351505, "completions/clipped_ratio": -2.171875, "epoch": 0.9634408602150538, "grad_norm": 27.99962794838936, "learning_rate": 1.8255640028874178e-06, "loss": -1.8965, "num_tokens": 63567533.0, "residual_var": 0.03253908082842827, "reward": 0.748046875, "reward_std": 0.12323734164237976, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "rho2": 0.3437498211860657, "step": 168 }, { "advantages/mean": 5.820766091346741e-11, "advantages/snr": 2.1597102872951344e-10, "advantages/std": 0.2695160508155823, "advantages/var": 0.07263890164722753, "completions/clipped_ratio": -1.921875, "epoch": 0.9691756272401434, "grad_norm": 52.53775524955538, "learning_rate": 1.8235325976284273e-06, "loss": -1.3777, "num_tokens": 63974289.0, "residual_var": 0.04312937334179878, "reward": 0.662109375, "reward_std": 0.1663278490304947, "rewards/drgrpo_math_reward/mean": 0.662109375, "rewards/drgrpo_math_reward/std": 0.4734536409378052, "rho2": 0.4062497913837433, "step": 169 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.25819888710975647, "advantages/var": 0.06666666530471677, "completions/clipped_ratio": -2.03125, "epoch": 0.974910394265233, "grad_norm": 32.43502504499752, "learning_rate": 1.8214905762947024e-06, "loss": -0.1963, "num_tokens": 64371003.0, "residual_var": 0.03958335146307945, "reward": 0.6484375, "reward_std": 0.15933895111083984, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4779251217842102, "rho2": 0.4062498211860657, "step": 170 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 4.032744941432326e-09, "advantages/std": 0.2309401035308838, "advantages/var": 0.05333333141885532, "completions/clipped_ratio": -2.6015625, "epoch": 0.9806451612903225, "grad_norm": 30.289980939758703, "learning_rate": 1.8194379652097318e-06, "loss": -2.2162, "num_tokens": 64737763.0, "residual_var": 0.036666687577962875, "reward": 0.91015625, "reward_std": 0.12506677210330963, "rewards/drgrpo_math_reward/mean": 0.91015625, "rewards/drgrpo_math_reward/std": 0.2862374484539032, "rho2": 0.3124998211860657, "step": 171 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 7.576208412102917e-10, "advantages/std": 0.30731815099716187, "advantages/var": 0.09444444593231438, "completions/clipped_ratio": -2.140625, "epoch": 0.9863799283154122, "grad_norm": 37.37291569267018, "learning_rate": 1.8173747908335156e-06, "loss": -1.8143, "num_tokens": 65115557.0, "residual_var": 0.06197919324040413, "reward": 0.796875, "reward_std": 0.1927589476108551, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.4027182459831238, "rho2": 0.3437498211860657, "step": 172 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.3186342418193817, "advantages/var": 0.10152778005981222, "completions/clipped_ratio": -1.7734375, "epoch": 0.9921146953405018, "grad_norm": 42.82568657002979, "learning_rate": 1.8153010797622244e-06, "loss": -1.4481, "num_tokens": 65548328.0, "residual_var": 0.044418442994356155, "reward": 0.611328125, "reward_std": 0.23046398162841797, "rewards/drgrpo_math_reward/mean": 0.611328125, "rewards/drgrpo_math_reward/std": 0.4879252314567566, "rho2": 0.5624997019767761, "step": 173 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.087143547892589e-09, "advantages/std": 0.27888670563697815, "advantages/var": 0.0777777945810465, "completions/clipped_ratio": -1.953125, "epoch": 0.9978494623655914, "grad_norm": 35.72571779968094, "learning_rate": 1.813216858727856e-06, "loss": -1.385, "num_tokens": 65956000.0, "residual_var": 0.048611145466566086, "reward": 0.75390625, "reward_std": 0.16881583631038666, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "rho2": 0.3749997615814209, "step": 174 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.6047982372613341e-09, "advantages/std": 0.21762607991695404, "advantages/var": 0.04736111066002047, "completions/clipped_ratio": -2.234375, "epoch": 1.0057347670250896, "grad_norm": 27.335118940668234, "learning_rate": 1.8111221545978911e-06, "loss": -0.924, "num_tokens": 66345060.0, "residual_var": 0.03256078064441681, "reward": 0.806640625, "reward_std": 0.1195635050535202, "rewards/drgrpo_math_reward/mean": 0.806640625, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "rho2": 0.31249988079071045, "step": 175 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 5.475684124266863e-09, "advantages/std": 0.2763853967189789, "advantages/var": 0.07638888751950734, "completions/clipped_ratio": -1.921875, "epoch": 1.0114695340501791, "grad_norm": 39.91756605658793, "learning_rate": 1.8090169943749474e-06, "loss": -0.4324, "num_tokens": 66763021.0, "residual_var": 0.04058162122964859, "reward": 0.71484375, "reward_std": 0.17473086714744568, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.45193037390708923, "rho2": 0.4687497317790985, "step": 176 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 7.454057297076664e-09, "advantages/std": 0.28111881017684937, "advantages/var": 0.07902778543524747, "completions/clipped_ratio": -1.71875, "epoch": 1.0172043010752687, "grad_norm": 37.08259916273741, "learning_rate": 1.8069014051964305e-06, "loss": -1.0803, "num_tokens": 67188085.0, "residual_var": 0.04445316269993782, "reward": 0.716796875, "reward_std": 0.17930860817432404, "rewards/drgrpo_math_reward/mean": 0.716796875, "rewards/drgrpo_math_reward/std": 0.4509948492050171, "rho2": 0.4374997019767761, "step": 177 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 8.563594726542867e-09, "advantages/std": 0.29907265305519104, "advantages/var": 0.08944445180547067, "completions/clipped_ratio": -1.3125, "epoch": 1.0229390681003585, "grad_norm": 43.629666398834274, "learning_rate": 1.8047754143341844e-06, "loss": -1.1702, "num_tokens": 67661246.0, "residual_var": 0.04192712530493736, "reward": 0.75, "reward_std": 0.20328712463378906, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.43343618512153625, "rho2": 0.5312496423721313, "step": 178 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 4.656612686812885e-10, "advantages/std": 0.25, "advantages/var": 0.0625, "completions/clipped_ratio": -1.40625, "epoch": 1.028673835125448, "grad_norm": 35.58111839260163, "learning_rate": 1.8026390491941412e-06, "loss": -0.0072, "num_tokens": 68138906.0, "residual_var": 0.04101565107703209, "reward": 0.796875, "reward_std": 0.14283469319343567, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.4027182459831238, "rho2": 0.3437497615814209, "step": 179 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.8515811469894048e-09, "advantages/std": 0.24494898319244385, "advantages/var": 0.06000000436701214, "completions/clipped_ratio": -1.375, "epoch": 1.0344086021505376, "grad_norm": 34.98221513678247, "learning_rate": 1.8004923373159655e-06, "loss": -0.0516, "num_tokens": 68565852.0, "residual_var": 0.039375025779008865, "reward": 0.86328125, "reward_std": 0.13461080193519592, "rewards/drgrpo_math_reward/mean": 0.86328125, "rewards/drgrpo_math_reward/std": 0.3438861668109894, "rho2": 0.3437497913837433, "step": 180 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.24664415419101715, "advantages/var": 0.06083333879660224, "completions/clipped_ratio": -0.8671875, "epoch": 1.0401433691756272, "grad_norm": 32.35431859136387, "learning_rate": 1.7983353063727014e-06, "loss": -1.1143, "num_tokens": 69054532.0, "residual_var": 0.036119814962148666, "reward": 0.6640625, "reward_std": 0.14567705988883972, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4727790653705597, "rho2": 0.4062497615814209, "step": 181 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.6772192227275952e-09, "advantages/std": 0.27763888239860535, "advantages/var": 0.07708334901954661, "completions/clipped_ratio": -1.0078125, "epoch": 1.0458781362007168, "grad_norm": 40.534688745867136, "learning_rate": 1.796167984170415e-06, "loss": -2.5626, "num_tokens": 69538012.0, "residual_var": 0.03854169696569443, "reward": 0.701171875, "reward_std": 0.18121296167373657, "rewards/drgrpo_math_reward/mean": 0.701171875, "rewards/drgrpo_math_reward/std": 0.45819199085235596, "rho2": 0.4999997019767761, "step": 182 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.574480804631703e-09, "advantages/std": 0.27131369709968567, "advantages/var": 0.07361112223389998, "completions/clipped_ratio": -0.7421875, "epoch": 1.0516129032258064, "grad_norm": 39.10469891670816, "learning_rate": 1.7939903986478354e-06, "loss": -0.2293, "num_tokens": 70034563.0, "residual_var": 0.03910592198371887, "reward": 0.73828125, "reward_std": 0.18091422319412231, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "rho2": 0.46874988079071045, "step": 183 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.4546711045741827e-09, "advantages/std": 0.28455621004104614, "advantages/var": 0.08097223667292397, "completions/clipped_ratio": -0.2265625, "epoch": 1.0573476702508962, "grad_norm": 42.62694288360159, "learning_rate": 1.7918025778759956e-06, "loss": -1.2351, "num_tokens": 70560907.0, "residual_var": 0.03795576095581055, "reward": 0.818359375, "reward_std": 0.19277161359786987, "rewards/drgrpo_math_reward/mean": 0.818359375, "rewards/drgrpo_math_reward/std": 0.38592514395713806, "rho2": 0.5312497019767761, "step": 184 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 7.999105258803513e-10, "advantages/std": 0.2910708487033844, "advantages/var": 0.08472223896490849, "completions/clipped_ratio": -0.484375, "epoch": 1.0630824372759857, "grad_norm": 43.804075553572744, "learning_rate": 1.7896045500578705e-06, "loss": 0.3015, "num_tokens": 71062221.0, "residual_var": 0.04500873014330864, "reward": 0.74609375, "reward_std": 0.18721503019332886, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "rho2": 0.4687495827674866, "step": 185 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 2.9950962291949284e-09, "advantages/std": 0.2720804810523987, "advantages/var": 0.07402778816970468, "completions/clipped_ratio": -0.5703125, "epoch": 1.0688172043010753, "grad_norm": 44.921592995907695, "learning_rate": 1.787396343528012e-06, "loss": -0.4542, "num_tokens": 71570714.0, "residual_var": 0.039327286183834076, "reward": 0.658203125, "reward_std": 0.1791955828666687, "rewards/drgrpo_math_reward/mean": 0.658203125, "rewards/drgrpo_math_reward/std": 0.4747757613658905, "rho2": 0.4687497615814209, "step": 186 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.332551247874325e-09, "advantages/std": 0.29556915163993835, "advantages/var": 0.08736112340115287, "completions/clipped_ratio": -1.046875, "epoch": 1.0745519713261649, "grad_norm": 42.582030842468136, "learning_rate": 1.7851779867521854e-06, "loss": -0.5294, "num_tokens": 72041322.0, "residual_var": 0.035490479320287704, "reward": 0.837890625, "reward_std": 0.2138943374156952, "rewards/drgrpo_math_reward/mean": 0.837890625, "rewards/drgrpo_math_reward/std": 0.3689115643501282, "rho2": 0.5937497615814209, "step": 187 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 9.191477804489327e-10, "advantages/std": 0.253311425447464, "advantages/var": 0.0641666782622261, "completions/clipped_ratio": -1.3828125, "epoch": 1.0802867383512544, "grad_norm": 40.0307497091435, "learning_rate": 1.782949508327002e-06, "loss": -0.6652, "num_tokens": 72483431.0, "residual_var": 0.03408856689929962, "reward": 0.75, "reward_std": 0.16229984164237976, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.43343618512153625, "rho2": 0.4687497615814209, "step": 188 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.62885944368034e-09, "advantages/std": 0.21441134810447693, "advantages/var": 0.04597222619597918, "completions/clipped_ratio": -1.953125, "epoch": 1.086021505376344, "grad_norm": 35.90228073059747, "learning_rate": 1.7807109369795494e-06, "loss": -0.5281, "num_tokens": 72906198.0, "residual_var": 0.03016928769648075, "reward": 0.802734375, "reward_std": 0.11662977933883667, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "rho2": 0.34374988079071045, "step": 189 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.8091607326709192e-09, "advantages/std": 0.2573907673358917, "advantages/var": 0.06625000710975915, "completions/clipped_ratio": -1.9140625, "epoch": 1.0917562724014336, "grad_norm": 43.20595417724851, "learning_rate": 1.7784623015670235e-06, "loss": 0.1994, "num_tokens": 73334962.0, "residual_var": 0.03312502056360245, "reward": 0.681640625, "reward_std": 0.16922760009765625, "rewards/drgrpo_math_reward/mean": 0.681640625, "rewards/drgrpo_math_reward/std": 0.46629536151885986, "rho2": 0.4999998211860657, "step": 190 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.1750843509500268e-09, "advantages/std": 0.2972092628479004, "advantages/var": 0.08833334592259234, "completions/clipped_ratio": -1.7734375, "epoch": 1.0974910394265234, "grad_norm": 51.40227058498769, "learning_rate": 1.776203631076353e-06, "loss": -1.2671, "num_tokens": 73742491.0, "residual_var": 0.06072920188307762, "reward": 0.7265625, "reward_std": 0.17972251772880554, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.4461594223976135, "rho2": 0.3124997913837433, "step": 191 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 3.4880131395642857e-09, "advantages/std": 0.23363077640533447, "advantages/var": 0.05458333968375939, "completions/clipped_ratio": -1.5234375, "epoch": 1.103225806451613, "grad_norm": 34.075992485450655, "learning_rate": 1.7739349546238294e-06, "loss": -0.8448, "num_tokens": 74160575.0, "residual_var": 0.03752605989575386, "reward": 0.771484375, "reward_std": 0.12644492089748383, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "rho2": 0.3124998211860657, "step": 192 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 3.58712290480235e-09, "advantages/std": 0.1947220414876938, "advantages/var": 0.03791667344113514, "completions/clipped_ratio": -2.3671875, "epoch": 1.1089605734767025, "grad_norm": 31.359241618796272, "learning_rate": 1.7716563014547295e-06, "loss": -0.1286, "num_tokens": 74526494.0, "residual_var": 0.027252621948719025, "reward": 0.822265625, "reward_std": 0.0961657464504242, "rewards/drgrpo_math_reward/mean": 0.822265625, "rewards/drgrpo_math_reward/std": 0.3826628625392914, "rho2": 0.2812498211860657, "step": 193 }, { "advantages/mean": 1.7462298274040222e-10, "advantages/snr": 7.112507551890247e-10, "advantages/std": 0.2455153465270996, "advantages/var": 0.0602777853803218, "completions/clipped_ratio": -2.0703125, "epoch": 1.114695340501792, "grad_norm": 37.270384833239724, "learning_rate": 1.7693677009429386e-06, "loss": -0.4958, "num_tokens": 74928344.0, "residual_var": 0.03578995540738106, "reward": 0.81640625, "reward_std": 0.14507855474948883, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.3875311613082886, "rho2": 0.4062497317790985, "step": 194 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.7445268373702257e-09, "advantages/std": 0.26692697405815125, "advantages/var": 0.07125000947984095, "completions/clipped_ratio": -1.890625, "epoch": 1.1204301075268817, "grad_norm": 48.434144125213436, "learning_rate": 1.767069182590573e-06, "loss": -0.2347, "num_tokens": 75302177.0, "residual_var": 0.04675783962011337, "reward": 0.775390625, "reward_std": 0.155318945646286, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "rho2": 0.3437498211860657, "step": 195 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.45223785573061e-09, "advantages/std": 0.26977357268333435, "advantages/var": 0.07277778051833028, "completions/clipped_ratio": -1.8359375, "epoch": 1.1261648745519715, "grad_norm": 44.77097627811664, "learning_rate": 1.7647607760275985e-06, "loss": 0.2234, "num_tokens": 75704140.0, "residual_var": 0.04548614099621773, "reward": 0.80078125, "reward_std": 0.1574428230524063, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "rho2": 0.3749997615814209, "step": 196 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 2.1150645653507737e-09, "advantages/std": 0.22016409039497375, "advantages/var": 0.048472226699446175, "completions/clipped_ratio": -2.265625, "epoch": 1.131899641577061, "grad_norm": 37.148349335924635, "learning_rate": 1.7624425110114479e-06, "loss": -0.8504, "num_tokens": 76083489.0, "residual_var": 0.03332467004656792, "reward": 0.759765625, "reward_std": 0.11529842019081116, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "rho2": 0.31249985098838806, "step": 197 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.551618881733193e-10, "advantages/std": 0.2557668387889862, "advantages/var": 0.06541667582411126, "completions/clipped_ratio": -1.9375, "epoch": 1.1376344086021506, "grad_norm": 38.93865390623534, "learning_rate": 1.7601144174266397e-06, "loss": -1.3159, "num_tokens": 76504184.0, "residual_var": 0.03475262597203255, "reward": 0.732421875, "reward_std": 0.16130761802196503, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "rho2": 0.4687497913837433, "step": 198 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 2.0033717328249483e-09, "advantages/std": 0.23243877291679382, "advantages/var": 0.054027783155064846, "completions/clipped_ratio": -1.8359375, "epoch": 1.1433691756272402, "grad_norm": 38.755954870657675, "learning_rate": 1.7577765252843907e-06, "loss": -1.4317, "num_tokens": 76914764.0, "residual_var": 0.03376738354563713, "reward": 0.724609375, "reward_std": 0.1325943022966385, "rewards/drgrpo_math_reward/mean": 0.724609375, "rewards/drgrpo_math_reward/std": 0.44714778661727905, "rho2": 0.3749998211860657, "step": 199 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.2006236860991244e-10, "advantages/std": 0.2771381735801697, "advantages/var": 0.07680556725535226, "completions/clipped_ratio": -1.7890625, "epoch": 1.1491039426523297, "grad_norm": 46.392307153451014, "learning_rate": 1.7554288647222299e-06, "loss": -0.5734, "num_tokens": 77307120.0, "residual_var": 0.04560332000255585, "reward": 0.759765625, "reward_std": 0.17210491001605988, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "rho2": 0.40624988079071045, "step": 200 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.7941776090040613e-09, "advantages/std": 0.25954023003578186, "advantages/var": 0.06736113100702656, "completions/clipped_ratio": -1.953125, "epoch": 1.1548387096774193, "grad_norm": 44.23622242578184, "learning_rate": 1.753071466003611e-06, "loss": -0.5879, "num_tokens": 77733374.0, "residual_var": 0.03789064660668373, "reward": 0.775390625, "reward_std": 0.16054514050483704, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "rho2": 0.4374998211860657, "step": 201 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.6107334991676265e-10, "advantages/std": 0.2524876296520233, "advantages/var": 0.06375000312729728, "completions/clipped_ratio": -1.453125, "epoch": 1.1605734767025089, "grad_norm": 41.047597006381615, "learning_rate": 1.7507043595175194e-06, "loss": -0.9294, "num_tokens": 78191055.0, "residual_var": 0.03785158693790436, "reward": 0.794921875, "reward_std": 0.14958953857421875, "rewards/drgrpo_math_reward/mean": 0.794921875, "rewards/drgrpo_math_reward/std": 0.4041535556316376, "rho2": 0.4062497317790985, "step": 202 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 8.333706385104416e-10, "advantages/std": 0.2793842554092407, "advantages/var": 0.07805556217057585, "completions/clipped_ratio": -1.71875, "epoch": 1.1663082437275984, "grad_norm": 51.01632724505282, "learning_rate": 1.7483275757780845e-06, "loss": -0.7757, "num_tokens": 78606624.0, "residual_var": 0.039027802646160126, "reward": 0.73828125, "reward_std": 0.19064868986606598, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "rho2": 0.4999998211860657, "step": 203 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.5642443445922294e-09, "advantages/std": 0.22699731588363647, "advantages/var": 0.05152778141837544, "completions/clipped_ratio": -1.0859375, "epoch": 1.1720430107526882, "grad_norm": 41.37695895534963, "learning_rate": 1.7459411454241822e-06, "loss": -0.4536, "num_tokens": 79079917.0, "residual_var": 0.03220488503575325, "reward": 0.744140625, "reward_std": 0.1298380047082901, "rewards/drgrpo_math_reward/mean": 0.744140625, "rewards/drgrpo_math_reward/std": 0.43676990270614624, "rho2": 0.3749997913837433, "step": 204 }, { "advantages/mean": -6.402842700481415e-10, "advantages/snr": 2.088068447813922e-09, "advantages/std": 0.30663949251174927, "advantages/var": 0.09402777836786314, "completions/clipped_ratio": -0.90625, "epoch": 1.1777777777777778, "grad_norm": 50.210420451695626, "learning_rate": 1.7435450992190433e-06, "loss": -0.4702, "num_tokens": 79568895.0, "residual_var": 0.049952298402786255, "reward": 0.763671875, "reward_std": 0.20373284816741943, "rewards/drgrpo_math_reward/mean": 0.763671875, "rewards/drgrpo_math_reward/std": 0.42524150013923645, "rho2": 0.4687497019767761, "step": 205 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.7462297140716862e-09, "advantages/std": 0.20000000298023224, "advantages/var": 0.040000001192092904, "completions/clipped_ratio": -1.1640625, "epoch": 1.1835125448028674, "grad_norm": 33.132490773865776, "learning_rate": 1.7411394680498548e-06, "loss": 0.067, "num_tokens": 80025420.0, "residual_var": 0.02750001661479473, "reward": 0.734375, "reward_std": 0.1039782464504242, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44209739565849304, "rho2": 0.3124998211860657, "step": 206 }, { "advantages/mean": -3.3178366720676422e-09, "advantages/snr": 9.70785361800751e-09, "advantages/std": 0.3417682945728302, "advantages/var": 0.11680556717522084, "completions/clipped_ratio": -1.015625, "epoch": 1.189247311827957, "grad_norm": 58.294810893353706, "learning_rate": 1.7387242829273632e-06, "loss": -2.5913, "num_tokens": 80504509.0, "residual_var": 0.0547526590526104, "reward": 0.818359375, "reward_std": 0.24781876802444458, "rewards/drgrpo_math_reward/mean": 0.818359375, "rewards/drgrpo_math_reward/std": 0.38592514395713806, "rho2": 0.5312496423721313, "step": 207 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 4.136374980352267e-09, "advantages/std": 0.22515428066253662, "advantages/var": 0.05069445010066431, "completions/clipped_ratio": -1.15625, "epoch": 1.1949820788530465, "grad_norm": 39.80291568478663, "learning_rate": 1.7362995749854732e-06, "loss": -0.207, "num_tokens": 80964717.0, "residual_var": 0.03009984642267227, "reward": 0.814453125, "reward_std": 0.1314820945262909, "rewards/drgrpo_math_reward/mean": 0.814453125, "rewards/drgrpo_math_reward/std": 0.38912075757980347, "rho2": 0.4062497913837433, "step": 208 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 3.30021408532719e-09, "advantages/std": 0.24692556262016296, "advantages/var": 0.06097223347528402, "completions/clipped_ratio": -1.4296875, "epoch": 1.2007168458781363, "grad_norm": 42.27140857276864, "learning_rate": 1.7338653754808478e-06, "loss": -0.6305, "num_tokens": 81398183.0, "residual_var": 0.04001304507255554, "reward": 0.708984375, "reward_std": 0.13604727387428284, "rewards/drgrpo_math_reward/mean": 0.708984375, "rewards/drgrpo_math_reward/std": 0.45467492938041687, "rho2": 0.3437498211860657, "step": 209 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 8.962112228520511e-09, "advantages/std": 0.2857738137245178, "advantages/var": 0.08166667261065541, "completions/clipped_ratio": -0.8515625, "epoch": 1.206451612903226, "grad_norm": 50.14623917958379, "learning_rate": 1.7314217157925047e-06, "loss": -0.9199, "num_tokens": 81878064.0, "residual_var": 0.04848962649703026, "reward": 0.7109375, "reward_std": 0.1721491813659668, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45377036929130554, "rho2": 0.40624961256980896, "step": 210 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 4.250886205596455e-09, "advantages/std": 0.273861289024353, "advantages/var": 0.07500000562608022, "completions/clipped_ratio": -1.4609375, "epoch": 1.2121863799283155, "grad_norm": 54.455990219872284, "learning_rate": 1.7289686274214115e-06, "loss": -0.2861, "num_tokens": 82331539.0, "residual_var": 0.03984377905726433, "reward": 0.7734375, "reward_std": 0.17627525329589844, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4190165400505066, "rho2": 0.4687497615814209, "step": 211 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 1.0270829332028162e-09, "advantages/std": 0.22669117152690887, "advantages/var": 0.05138888724824242, "completions/clipped_ratio": -1.5703125, "epoch": 1.217921146953405, "grad_norm": 38.00294295737321, "learning_rate": 1.7265061419900803e-06, "loss": -1.2675, "num_tokens": 82748184.0, "residual_var": 0.03693578392267227, "reward": 0.79296875, "reward_std": 0.11337600648403168, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40557438135147095, "rho2": 0.28124988079071045, "step": 212 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 4.311185259890814e-09, "advantages/std": 0.2700308859348297, "advantages/var": 0.07291667935874901, "completions/clipped_ratio": -1.6171875, "epoch": 1.2236559139784946, "grad_norm": 42.160987118844886, "learning_rate": 1.7240342912421596e-06, "loss": -0.632, "num_tokens": 83197832.0, "residual_var": 0.04101564362645149, "reward": 0.759765625, "reward_std": 0.16703376173973083, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "rho2": 0.43749985098838806, "step": 213 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 4.395995206772153e-09, "advantages/std": 0.23833917081356049, "advantages/var": 0.05680556034409556, "completions/clipped_ratio": -1.359375, "epoch": 1.2293906810035842, "grad_norm": 41.38799838886004, "learning_rate": 1.721553107042026e-06, "loss": -2.0991, "num_tokens": 83647441.0, "residual_var": 0.03550349175930023, "reward": 0.759765625, "reward_std": 0.13589581847190857, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "rho2": 0.3749998211860657, "step": 214 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.115670863626888e-09, "advantages/std": 0.27512624859809875, "advantages/var": 0.07569445266766284, "completions/clipped_ratio": -1.5234375, "epoch": 1.2351254480286737, "grad_norm": 43.587455781908766, "learning_rate": 1.719062621374371e-06, "loss": -1.3686, "num_tokens": 84061695.0, "residual_var": 0.04494360089302063, "reward": 0.779296875, "reward_std": 0.1667918860912323, "rewards/drgrpo_math_reward/mean": 0.779296875, "rewards/drgrpo_math_reward/std": 0.4151262938976288, "rho2": 0.4062498211860657, "step": 215 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 2.3790728070570056e-09, "advantages/std": 0.24466530978679657, "advantages/var": 0.05986111381306913, "completions/clipped_ratio": -1.6015625, "epoch": 1.2408602150537635, "grad_norm": 40.24291962907663, "learning_rate": 1.716562866343792e-06, "loss": -1.4652, "num_tokens": 84509702.0, "residual_var": 0.03741321340203285, "reward": 0.787109375, "reward_std": 0.14563731849193573, "rewards/drgrpo_math_reward/mean": 0.787109375, "rewards/drgrpo_math_reward/std": 0.409751296043396, "rho2": 0.37499985098838806, "step": 216 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.331971692474985e-09, "advantages/std": 0.2622022330760956, "advantages/var": 0.06875001103009115, "completions/clipped_ratio": -1.3828125, "epoch": 1.246594982078853, "grad_norm": 42.90813595243895, "learning_rate": 1.7140538741743761e-06, "loss": -0.7692, "num_tokens": 84937511.0, "residual_var": 0.03867189586162567, "reward": 0.732421875, "reward_std": 0.16433528065681458, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "rho2": 0.43749985098838806, "step": 217 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 6.43632097823937e-09, "advantages/std": 0.28939592838287354, "advantages/var": 0.08375000336458527, "completions/clipped_ratio": -1.9453125, "epoch": 1.2523297491039427, "grad_norm": 50.59163625567646, "learning_rate": 1.7115356772092855e-06, "loss": -0.8469, "num_tokens": 85343134.0, "residual_var": 0.05234377458691597, "reward": 0.853515625, "reward_std": 0.17629116773605347, "rewards/drgrpo_math_reward/mean": 0.853515625, "rewards/drgrpo_math_reward/std": 0.35393697023391724, "rho2": 0.37499985098838806, "step": 218 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.5661839790079906e-10, "advantages/std": 0.25495100021362305, "advantages/var": 0.06500001250992682, "completions/clipped_ratio": -1.3203125, "epoch": 1.2580645161290323, "grad_norm": 45.32442588675835, "learning_rate": 1.7090083079103398e-06, "loss": -2.1202, "num_tokens": 85826996.0, "residual_var": 0.028437525033950806, "reward": 0.71484375, "reward_std": 0.1786910742521286, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.45193037390708923, "rho2": 0.5624997615814209, "step": 219 }, { "advantages/mean": -1.979060471057892e-09, "advantages/snr": 6.984918377363664e-09, "advantages/std": 0.28333336114883423, "advantages/var": 0.08027779353989573, "completions/clipped_ratio": -1.484375, "epoch": 1.2637992831541218, "grad_norm": 48.37751369486061, "learning_rate": 1.7064717988576e-06, "loss": -1.355, "num_tokens": 86264417.0, "residual_var": 0.04515627771615982, "reward": 0.8203125, "reward_std": 0.18267682194709778, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38430243730545044, "rho2": 0.4374998211860657, "step": 220 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.22669118642807007, "advantages/var": 0.05138889400416602, "completions/clipped_ratio": -2.09375, "epoch": 1.2695340501792114, "grad_norm": 37.181114133695104, "learning_rate": 1.7039261827489448e-06, "loss": -0.5855, "num_tokens": 86622157.0, "residual_var": 0.03532987833023071, "reward": 0.91015625, "reward_std": 0.12424316257238388, "rewards/drgrpo_math_reward/mean": 0.91015625, "rewards/drgrpo_math_reward/std": 0.2862374484539032, "rho2": 0.31249988079071045, "step": 221 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 4.542756666742103e-09, "advantages/std": 0.23063921928405762, "advantages/var": 0.053194449471959615, "completions/clipped_ratio": -1.34375, "epoch": 1.2752688172043012, "grad_norm": 38.55407567573268, "learning_rate": 1.7013714923996524e-06, "loss": -1.0357, "num_tokens": 87077482.0, "residual_var": 0.034908879548311234, "reward": 0.705078125, "reward_std": 0.12494020164012909, "rewards/drgrpo_math_reward/mean": 0.705078125, "rewards/drgrpo_math_reward/std": 0.4564536213874817, "rho2": 0.3437497019767761, "step": 222 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 5.7348462017722265e-09, "advantages/std": 0.22329604625701904, "advantages/var": 0.04986112427401679, "completions/clipped_ratio": -1.65625, "epoch": 1.2810035842293908, "grad_norm": 36.08633289893015, "learning_rate": 1.6988077607419752e-06, "loss": -0.5464, "num_tokens": 87517197.0, "residual_var": 0.03583769127726555, "reward": 0.650390625, "reward_std": 0.11145263910293579, "rewards/drgrpo_math_reward/mean": 0.650390625, "rewards/drgrpo_math_reward/std": 0.47731292247772217, "rho2": 0.28124988079071045, "step": 223 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.724340751630126e-09, "advantages/std": 0.2710576057434082, "advantages/var": 0.07347222563134892, "completions/clipped_ratio": -1.796875, "epoch": 1.2867383512544803, "grad_norm": 42.14315943310403, "learning_rate": 1.6962350208247167e-06, "loss": -0.6939, "num_tokens": 87950690.0, "residual_var": 0.04132814705371857, "reward": 0.798828125, "reward_std": 0.17054690420627594, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "rho2": 0.43749988079071045, "step": 224 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 3.613694989239888e-09, "advantages/std": 0.19329021871089935, "advantages/var": 0.037361108649307306, "completions/clipped_ratio": -2.203125, "epoch": 1.29247311827957, "grad_norm": 33.73663056472902, "learning_rate": 1.6936533058128049e-06, "loss": -0.5481, "num_tokens": 88320414.0, "residual_var": 0.02685331553220749, "reward": 0.791015625, "reward_std": 0.0946824848651886, "rewards/drgrpo_math_reward/mean": 0.791015625, "rewards/drgrpo_math_reward/std": 0.40698084235191345, "rho2": 0.2812498211860657, "step": 225 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.4318366345279654e-09, "advantages/std": 0.28722813725471497, "advantages/var": 0.08250000283081338, "completions/clipped_ratio": -1.75, "epoch": 1.2982078853046595, "grad_norm": 54.877854160698895, "learning_rate": 1.6910626489868648e-06, "loss": -1.3473, "num_tokens": 88719014.0, "residual_var": 0.041250042617321014, "reward": 0.78515625, "reward_std": 0.19028910994529724, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4111155867576599, "rho2": 0.49999967217445374, "step": 226 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.5200680289906245e-09, "advantages/std": 0.26457515358924866, "advantages/var": 0.07000001189677452, "completions/clipped_ratio": -2.21875, "epoch": 1.303942652329749, "grad_norm": 41.54882720124493, "learning_rate": 1.6884630837427888e-06, "loss": -0.5348, "num_tokens": 89100122.0, "residual_var": 0.04375002160668373, "reward": 0.77734375, "reward_std": 0.15271230041980743, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41643625497817993, "rho2": 0.37499985098838806, "step": 227 }, { "advantages/mean": -2.2118911147117615e-09, "advantages/snr": 8.978198768654259e-09, "advantages/std": 0.24636244773864746, "advantages/var": 0.0606944556557778, "completions/clipped_ratio": -2.0234375, "epoch": 1.3096774193548386, "grad_norm": 47.08376018383834, "learning_rate": 1.685854643591308e-06, "loss": -0.0286, "num_tokens": 89488263.0, "residual_var": 0.04741755872964859, "reward": 0.833984375, "reward_std": 0.12311156839132309, "rewards/drgrpo_math_reward/mean": 0.833984375, "rewards/drgrpo_math_reward/std": 0.3724585771560669, "rho2": 0.21874985098838806, "step": 228 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.287020412877018e-09, "advantages/std": 0.28333336114883423, "advantages/var": 0.08027779353989573, "completions/clipped_ratio": -1.84375, "epoch": 1.3154121863799282, "grad_norm": 55.8134965414817, "learning_rate": 1.6832373621575581e-06, "loss": -1.3498, "num_tokens": 89904046.0, "residual_var": 0.05268231779336929, "reward": 0.74609375, "reward_std": 0.1712391972541809, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "rho2": 0.34374988079071045, "step": 229 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 7.833895673000178e-10, "advantages/std": 0.2972092628479004, "advantages/var": 0.08833334592259234, "completions/clipped_ratio": -2.109375, "epoch": 1.321146953405018, "grad_norm": 60.24031563134192, "learning_rate": 1.6806112731806471e-06, "loss": -0.6831, "num_tokens": 90331936.0, "residual_var": 0.03864586353302002, "reward": 0.76171875, "reward_std": 0.21298879384994507, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.42644867300987244, "rho2": 0.5624997615814209, "step": 230 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.25792980194091797, "advantages/var": 0.06652778272928117, "completions/clipped_ratio": -2.1953125, "epoch": 1.3268817204301075, "grad_norm": 50.07804252954931, "learning_rate": 1.677976410513221e-06, "loss": -0.9577, "num_tokens": 90710538.0, "residual_var": 0.039500895887613297, "reward": 0.759765625, "reward_std": 0.1590118706226349, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "rho2": 0.4062497615814209, "step": 231 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 4.141142942820369e-09, "advantages/std": 0.28111881017684937, "advantages/var": 0.07902778543524747, "completions/clipped_ratio": -2.28125, "epoch": 1.3326164874551971, "grad_norm": 59.0557110463939, "learning_rate": 1.6753328081210244e-06, "loss": 0.1415, "num_tokens": 91109553.0, "residual_var": 0.04445316642522812, "reward": 0.759765625, "reward_std": 0.17318779230117798, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "rho2": 0.43749964237213135, "step": 232 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 5.049851050974445e-09, "advantages/std": 0.2535853981971741, "advantages/var": 0.06430555417881934, "completions/clipped_ratio": -2.0703125, "epoch": 1.3383512544802867, "grad_norm": 43.42172827815474, "learning_rate": 1.672680500082467e-06, "loss": -0.1926, "num_tokens": 91497324.0, "residual_var": 0.04019100219011307, "reward": 0.751953125, "reward_std": 0.14386671781539917, "rewards/drgrpo_math_reward/mean": 0.751953125, "rewards/drgrpo_math_reward/std": 0.4323015511035919, "rho2": 0.3749997019767761, "step": 233 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 9.302894010289538e-10, "advantages/std": 0.25027763843536377, "advantages/var": 0.06263889630078268, "completions/clipped_ratio": -1.9140625, "epoch": 1.3440860215053765, "grad_norm": 50.31662055819204, "learning_rate": 1.6700195205881811e-06, "loss": -0.2699, "num_tokens": 91905918.0, "residual_var": 0.04110679030418396, "reward": 0.736328125, "reward_std": 0.1395752727985382, "rewards/drgrpo_math_reward/mean": 0.736328125, "rewards/drgrpo_math_reward/std": 0.4410543739795685, "rho2": 0.34374985098838806, "step": 234 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 4.21443003951413e-09, "advantages/std": 0.24860724806785583, "advantages/var": 0.06180556379187241, "completions/clipped_ratio": -1.96875, "epoch": 1.349820788530466, "grad_norm": 46.65644683106695, "learning_rate": 1.667349903940582e-06, "loss": -0.2831, "num_tokens": 92297492.0, "residual_var": 0.04055992141366005, "reward": 0.814453125, "reward_std": 0.1419036090373993, "rewards/drgrpo_math_reward/mean": 0.814453125, "rewards/drgrpo_math_reward/std": 0.38912075757980347, "rho2": 0.3437498211860657, "step": 235 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23540747165679932, "advantages/var": 0.055416677711846773, "completions/clipped_ratio": -1.921875, "epoch": 1.3555555555555556, "grad_norm": 50.35182979138741, "learning_rate": 1.6646716845534256e-06, "loss": -0.5046, "num_tokens": 92687669.0, "residual_var": 0.03983075171709061, "reward": 0.767578125, "reward_std": 0.12451696395874023, "rewards/drgrpo_math_reward/mean": 0.767578125, "rewards/drgrpo_math_reward/std": 0.42278963327407837, "rho2": 0.28124991059303284, "step": 236 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 4.49007602564036e-09, "advantages/std": 0.2592724859714508, "advantages/var": 0.06722222198181615, "completions/clipped_ratio": -1.828125, "epoch": 1.3612903225806452, "grad_norm": 53.225555046622546, "learning_rate": 1.661984896951365e-06, "loss": -0.2609, "num_tokens": 93104611.0, "residual_var": 0.0441146083176136, "reward": 0.703125, "reward_std": 0.14872382581233978, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45732781291007996, "rho2": 0.3437497317790985, "step": 237 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 8.182237872194598e-10, "advantages/std": 0.28455618023872375, "advantages/var": 0.08097221971205304, "completions/clipped_ratio": -1.9921875, "epoch": 1.3670250896057348, "grad_norm": 55.42324462194506, "learning_rate": 1.6592895757695052e-06, "loss": -0.5441, "num_tokens": 93498346.0, "residual_var": 0.04048614203929901, "reward": 0.802734375, "reward_std": 0.1889267861843109, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "rho2": 0.4999998211860657, "step": 238 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.3604989488792995e-09, "advantages/std": 0.2771381735801697, "advantages/var": 0.07680556725535226, "completions/clipped_ratio": -1.7578125, "epoch": 1.3727598566308243, "grad_norm": 61.82505415437269, "learning_rate": 1.6565857557529564e-06, "loss": -0.4733, "num_tokens": 93921581.0, "residual_var": 0.05040368437767029, "reward": 0.650390625, "reward_std": 0.16475613415241241, "rewards/drgrpo_math_reward/mean": 0.650390625, "rewards/drgrpo_math_reward/std": 0.47731292247772217, "rho2": 0.3437497019767761, "step": 239 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 3.1635451249771334e-09, "advantages/std": 0.22079403698444366, "advantages/var": 0.04875000676788788, "completions/clipped_ratio": -2.328125, "epoch": 1.378494623655914, "grad_norm": 48.23121932390077, "learning_rate": 1.653873471756387e-06, "loss": -0.1985, "num_tokens": 94307823.0, "residual_var": 0.035039082169532776, "reward": 0.841796875, "reward_std": 0.11549167335033417, "rewards/drgrpo_math_reward/mean": 0.841796875, "rewards/drgrpo_math_reward/std": 0.36528825759887695, "rho2": 0.28124988079071045, "step": 240 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 6.211669404903374e-09, "advantages/std": 0.20615528523921967, "advantages/var": 0.04250000163206402, "completions/clipped_ratio": -1.96875, "epoch": 1.3842293906810035, "grad_norm": 38.04419984109611, "learning_rate": 1.6511527587435735e-06, "loss": -0.0497, "num_tokens": 94712668.0, "residual_var": 0.0292187687009573, "reward": 0.79296875, "reward_std": 0.10750485956668854, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40557438135147095, "rho2": 0.3124998211860657, "step": 241 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.6566188482458542e-09, "advantages/std": 0.21081852912902832, "advantages/var": 0.04444445222412696, "completions/clipped_ratio": -1.859375, "epoch": 1.3899641577060933, "grad_norm": 38.99219914686392, "learning_rate": 1.6484236517869496e-06, "loss": -0.0112, "num_tokens": 95148331.0, "residual_var": 0.03194446489214897, "reward": 0.78515625, "reward_std": 0.1049162894487381, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4111155867576599, "rho2": 0.2812498211860657, "step": 242 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 8.906842463101969e-10, "advantages/std": 0.26140648126602173, "advantages/var": 0.06833334844788297, "completions/clipped_ratio": -2.140625, "epoch": 1.3956989247311828, "grad_norm": 53.876386285146594, "learning_rate": 1.645686186067155e-06, "loss": -0.4296, "num_tokens": 95556500.0, "residual_var": 0.05125002562999725, "reward": 0.78125, "reward_std": 0.1394425630569458, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41380295157432556, "rho2": 0.24999991059303284, "step": 243 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.5142522222473678e-09, "advantages/std": 0.23063921928405762, "advantages/var": 0.053194449471959615, "completions/clipped_ratio": -2.0234375, "epoch": 1.4014336917562724, "grad_norm": 49.81376150044177, "learning_rate": 1.642940396872581e-06, "loss": -0.2353, "num_tokens": 95958675.0, "residual_var": 0.034908875823020935, "reward": 0.818359375, "reward_std": 0.12670421600341797, "rewards/drgrpo_math_reward/mean": 0.818359375, "rewards/drgrpo_math_reward/std": 0.38592514395713806, "rho2": 0.3437498211860657, "step": 244 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.3135261403064837e-09, "advantages/std": 0.26588428020477295, "advantages/var": 0.07069445046001022, "completions/clipped_ratio": -2.0234375, "epoch": 1.407168458781362, "grad_norm": 54.171402306267936, "learning_rate": 1.640186319598916e-06, "loss": -0.9363, "num_tokens": 96357429.0, "residual_var": 0.04860246926546097, "reward": 0.861328125, "reward_std": 0.14248207211494446, "rewards/drgrpo_math_reward/mean": 0.861328125, "rewards/drgrpo_math_reward/std": 0.34594178199768066, "rho2": 0.3124997019767761, "step": 245 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 2.710091137427512e-09, "advantages/std": 0.30069366097450256, "advantages/var": 0.09041667775024909, "completions/clipped_ratio": -1.859375, "epoch": 1.4129032258064516, "grad_norm": 59.34521279169363, "learning_rate": 1.6374239897486897e-06, "loss": -0.164, "num_tokens": 96774016.0, "residual_var": 0.042382847517728806, "reward": 0.755859375, "reward_std": 0.2116120457649231, "rewards/drgrpo_math_reward/mean": 0.755859375, "rewards/drgrpo_math_reward/std": 0.42999663949012756, "rho2": 0.5312497615814209, "step": 246 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.0428643138038885e-10, "advantages/std": 0.28795257210731506, "advantages/var": 0.08291668378321848, "completions/clipped_ratio": -2.0, "epoch": 1.4186379928315414, "grad_norm": 59.79410024725512, "learning_rate": 1.6346534429308141e-06, "loss": -0.1107, "num_tokens": 97182071.0, "residual_var": 0.046640679240226746, "reward": 0.814453125, "reward_std": 0.1831747442483902, "rewards/drgrpo_math_reward/mean": 0.814453125, "rewards/drgrpo_math_reward/std": 0.38912075757980347, "rho2": 0.4374995231628418, "step": 247 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.7130912878896828e-09, "advantages/std": 0.2718251347541809, "advantages/var": 0.07388890388412861, "completions/clipped_ratio": -2.0390625, "epoch": 1.424372759856631, "grad_norm": 52.974008357217826, "learning_rate": 1.6318747148601257e-06, "loss": -0.1117, "num_tokens": 97605926.0, "residual_var": 0.04848960414528847, "reward": 0.76171875, "reward_std": 0.15800951421260834, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.42644867300987244, "rho2": 0.34374988079071045, "step": 248 }, { "advantages/mean": -2.9103830456733704e-10, "advantages/snr": 1.0492077186605972e-09, "advantages/std": 0.27738863229751587, "advantages/var": 0.07694445332788646, "completions/clipped_ratio": -1.9765625, "epoch": 1.4301075268817205, "grad_norm": 59.02314762475839, "learning_rate": 1.6290878413569251e-06, "loss": -0.4523, "num_tokens": 98020959.0, "residual_var": 0.04087677597999573, "reward": 0.80859375, "reward_std": 0.17632853984832764, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.3937928080558777, "rho2": 0.46874967217445374, "step": 249 }, { "advantages/mean": -1.7462298274040222e-09, "advantages/snr": 6.93121755737813e-09, "advantages/std": 0.2519369423389435, "advantages/var": 0.06347222291509613, "completions/clipped_ratio": -1.9296875, "epoch": 1.43584229390681, "grad_norm": 53.41925290909598, "learning_rate": 1.6262928583465141e-06, "loss": -0.2448, "num_tokens": 98448364.0, "residual_var": 0.0357031486928463, "reward": 0.666015625, "reward_std": 0.156736820936203, "rewards/drgrpo_math_reward/mean": 0.666015625, "rewards/drgrpo_math_reward/std": 0.47209542989730835, "rho2": 0.4374998211860657, "step": 250 }, { "advantages/mean": 2.153683453798294e-09, "advantages/snr": 8.402673505084133e-09, "advantages/std": 0.25630930066108704, "advantages/var": 0.06569445760537551, "completions/clipped_ratio": -1.7421875, "epoch": 1.4415770609318996, "grad_norm": 54.70648398178565, "learning_rate": 1.6234898018587336e-06, "loss": -0.4094, "num_tokens": 98846598.0, "residual_var": 0.04311200603842735, "reward": 0.748046875, "reward_std": 0.14105165004730225, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "rho2": 0.3437498211860657, "step": 251 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.769223699122265e-10, "advantages/std": 0.24409699440002441, "advantages/var": 0.05958334267512555, "completions/clipped_ratio": -2.5, "epoch": 1.4473118279569892, "grad_norm": 50.39681812169105, "learning_rate": 1.620678708027499e-06, "loss": -0.2689, "num_tokens": 99227995.0, "residual_var": 0.03723959997296333, "reward": 0.802734375, "reward_std": 0.14150559902191162, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "rho2": 0.37499988079071045, "step": 252 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 4.802929081517551e-10, "advantages/std": 0.24238400161266327, "advantages/var": 0.05875000423776755, "completions/clipped_ratio": -2.3125, "epoch": 1.4530465949820788, "grad_norm": 56.36012809747618, "learning_rate": 1.6178596130903343e-06, "loss": -0.6744, "num_tokens": 99593541.0, "residual_var": 0.04039065167307854, "reward": 0.857421875, "reward_std": 0.12753017246723175, "rewards/drgrpo_math_reward/mean": 0.857421875, "rewards/drgrpo_math_reward/std": 0.3499840497970581, "rho2": 0.3124997615814209, "step": 253 }, { "advantages/mean": 2.2118911147117615e-09, "advantages/snr": 9.6154022603322e-09, "advantages/std": 0.23003624379634857, "advantages/var": 0.05291667345993312, "completions/clipped_ratio": -2.1875, "epoch": 1.4587813620071683, "grad_norm": 54.42875436519834, "learning_rate": 1.615032553387905e-06, "loss": -0.1637, "num_tokens": 99973384.0, "residual_var": 0.03307294100522995, "reward": 0.716796875, "reward_std": 0.13094235956668854, "rewards/drgrpo_math_reward/mean": 0.716796875, "rewards/drgrpo_math_reward/std": 0.4509948492050171, "rho2": 0.3749997913837433, "step": 254 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 6.091138397946409e-10, "advantages/std": 0.19112242758274078, "advantages/var": 0.036527782325119995, "completions/clipped_ratio": -2.6953125, "epoch": 1.4645161290322581, "grad_norm": 36.911387570379965, "learning_rate": 1.6121975653635488e-06, "loss": -0.0962, "num_tokens": 100310542.0, "residual_var": 0.02739585191011429, "reward": 0.865234375, "reward_std": 0.08961933106184006, "rewards/drgrpo_math_reward/mean": 0.865234375, "rewards/drgrpo_math_reward/std": 0.3418070077896118, "rho2": 0.24999991059303284, "step": 255 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.23363077640533447, "advantages/var": 0.05458333968375939, "completions/clipped_ratio": -2.0546875, "epoch": 1.4702508960573477, "grad_norm": 53.346044793983936, "learning_rate": 1.6093546855628081e-06, "loss": -0.2482, "num_tokens": 100703243.0, "residual_var": 0.03411460295319557, "reward": 0.763671875, "reward_std": 0.134678453207016, "rewards/drgrpo_math_reward/mean": 0.763671875, "rewards/drgrpo_math_reward/std": 0.42524150013923645, "rho2": 0.37499988079071045, "step": 256 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 4.142052747109938e-09, "advantages/std": 0.22484564781188965, "advantages/var": 0.050555565339948316, "completions/clipped_ratio": -2.5703125, "epoch": 1.4759856630824373, "grad_norm": 50.03640655471084, "learning_rate": 1.6065039506329559e-06, "loss": -0.506, "num_tokens": 101045457.0, "residual_var": 0.033177103847265244, "reward": 0.84375, "reward_std": 0.12422171235084534, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.36344730854034424, "rho2": 0.3437498211860657, "step": 257 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 3.092355562449119e-09, "advantages/std": 0.263523131608963, "advantages/var": 0.06944444089299484, "completions/clipped_ratio": -2.2421875, "epoch": 1.4817204301075269, "grad_norm": 59.266144823030935, "learning_rate": 1.6036453973225256e-06, "loss": -0.4354, "num_tokens": 101395350.0, "residual_var": 0.04991321638226509, "reward": 0.74609375, "reward_std": 0.14747364819049835, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "rho2": 0.28124985098838806, "step": 258 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.865255422030312e-09, "advantages/std": 0.24094721674919128, "advantages/var": 0.058055561259181765, "completions/clipped_ratio": -2.2734375, "epoch": 1.4874551971326164, "grad_norm": 53.944881105871, "learning_rate": 1.6007790624808365e-06, "loss": -0.5276, "num_tokens": 101739760.0, "residual_var": 0.04354168847203255, "reward": 0.73828125, "reward_std": 0.12557920813560486, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "rho2": 0.24999982118606567, "step": 259 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.0597275960276614e-09, "advantages/std": 0.3295620083808899, "advantages/var": 0.10861111736804574, "completions/clipped_ratio": -2.21875, "epoch": 1.4931899641577062, "grad_norm": 69.66710547527994, "learning_rate": 1.5979049830575188e-06, "loss": -0.3316, "num_tokens": 102120543.0, "residual_var": 0.05430559813976288, "reward": 0.6953125, "reward_std": 0.230571448802948, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4607250988483429, "rho2": 0.4999997615814209, "step": 260 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 4.890421485060267e-09, "advantages/std": 0.23804761469364166, "advantages/var": 0.05666666686133248, "completions/clipped_ratio": -2.671875, "epoch": 1.4989247311827958, "grad_norm": 52.574827453992754, "learning_rate": 1.595023196102037e-06, "loss": -0.1869, "num_tokens": 102419585.0, "residual_var": 0.0389583557844162, "reward": 0.8984375, "reward_std": 0.1256462037563324, "rewards/drgrpo_math_reward/mean": 0.8984375, "rewards/drgrpo_math_reward/std": 0.30236753821372986, "rho2": 0.3124998211860657, "step": 261 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.29976844787597656, "advantages/var": 0.08986112234197208, "completions/clipped_ratio": -2.453125, "epoch": 1.5046594982078854, "grad_norm": 66.80598667486676, "learning_rate": 1.5921337387632133e-06, "loss": -0.0947, "num_tokens": 102779224.0, "residual_var": 0.05335506424307823, "reward": 0.751953125, "reward_std": 0.18921317160129547, "rewards/drgrpo_math_reward/mean": 0.751953125, "rewards/drgrpo_math_reward/std": 0.4323015511035919, "rho2": 0.40624988079071045, "step": 262 }, { "advantages/mean": 7.566995918750763e-10, "advantages/snr": 2.6234712889574733e-09, "advantages/std": 0.2884344756603241, "advantages/var": 0.0831944467494461, "completions/clipped_ratio": -2.4375, "epoch": 1.510394265232975, "grad_norm": 66.33732617673724, "learning_rate": 1.589236648288747e-06, "loss": -0.2181, "num_tokens": 103124653.0, "residual_var": 0.04679691046476364, "reward": 0.728515625, "reward_std": 0.18394052982330322, "rewards/drgrpo_math_reward/mean": 0.728515625, "rewards/drgrpo_math_reward/std": 0.44516023993492126, "rho2": 0.4374997019767761, "step": 263 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.9634002501456464e-09, "advantages/std": 0.23717083036899567, "advantages/var": 0.05625000277791892, "completions/clipped_ratio": -2.6796875, "epoch": 1.5161290322580645, "grad_norm": 44.34966748652034, "learning_rate": 1.5863319620247364e-06, "loss": -0.0838, "num_tokens": 103419320.0, "residual_var": 0.03339845687150955, "reward": 0.759765625, "reward_std": 0.1408853381872177, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "rho2": 0.4062498211860657, "step": 264 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.4653317355907177e-09, "advantages/std": 0.23833917081356049, "advantages/var": 0.05680556034409556, "completions/clipped_ratio": -2.796875, "epoch": 1.521863799283154, "grad_norm": 51.37731014357014, "learning_rate": 1.5834197174151957e-06, "loss": 0.1908, "num_tokens": 103726676.0, "residual_var": 0.03372832387685776, "reward": 0.681640625, "reward_std": 0.14133787155151367, "rewards/drgrpo_math_reward/mean": 0.681640625, "rewards/drgrpo_math_reward/std": 0.46629536151885986, "rho2": 0.4062497317790985, "step": 265 }, { "advantages/mean": 1.1059455573558807e-09, "advantages/snr": 3.768808942373478e-09, "advantages/std": 0.29344695806503296, "advantages/var": 0.08611111719762121, "completions/clipped_ratio": -2.6328125, "epoch": 1.5275985663082436, "grad_norm": 61.46214496674834, "learning_rate": 1.5804999520015733e-06, "loss": -0.2806, "num_tokens": 104057464.0, "residual_var": 0.03767364099621773, "reward": 0.671875, "reward_std": 0.20619486272335052, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4699897766113281, "rho2": 0.5624997615814209, "step": 266 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 6.9849184427990146e-09, "advantages/std": 0.21666668355464935, "advantages/var": 0.04694445176257056, "completions/clipped_ratio": -2.8671875, "epoch": 1.5333333333333332, "grad_norm": 45.984815592260055, "learning_rate": 1.5775727034222674e-06, "loss": 0.0075, "num_tokens": 104324751.0, "residual_var": 0.03374134376645088, "reward": 0.81640625, "reward_std": 0.10816451907157898, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.3875311613082886, "rho2": 0.2812497913837433, "step": 267 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 7.41335457342705e-09, "advantages/std": 0.21984843909740448, "advantages/var": 0.04833333617356517, "completions/clipped_ratio": -2.7265625, "epoch": 1.5390681003584228, "grad_norm": 49.360747942014775, "learning_rate": 1.5746380094121409e-06, "loss": -0.3293, "num_tokens": 104619779.0, "residual_var": 0.03776043280959129, "reward": 0.81640625, "reward_std": 0.1043786108493805, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.3875311613082886, "rho2": 0.21874994039535522, "step": 268 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 3.7551154878888765e-10, "advantages/std": 0.3100179433822632, "advantages/var": 0.09611112521896814, "completions/clipped_ratio": -2.78125, "epoch": 1.5448028673835126, "grad_norm": 69.31257710408642, "learning_rate": 1.5716959078020354e-06, "loss": -0.3013, "num_tokens": 104915231.0, "residual_var": 0.05706600844860077, "reward": 0.83984375, "reward_std": 0.2014968991279602, "rewards/drgrpo_math_reward/mean": 0.83984375, "rewards/drgrpo_math_reward/std": 0.3671095669269562, "rho2": 0.4062497913837433, "step": 269 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.839293472028012e-09, "advantages/std": 0.2886751592159271, "advantages/var": 0.08333334754834087, "completions/clipped_ratio": -2.4296875, "epoch": 1.5505376344086022, "grad_norm": 60.24477511870747, "learning_rate": 1.5687464365182819e-06, "loss": -0.4246, "num_tokens": 105276307.0, "residual_var": 0.03906252235174179, "reward": 0.703125, "reward_std": 0.19893690943717957, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45732781291007996, "rho2": 0.5312498807907104, "step": 270 }, { "advantages/mean": 7.566995918750763e-10, "advantages/snr": 2.783773342820735e-09, "advantages/std": 0.2718251347541809, "advantages/var": 0.07388890388412861, "completions/clipped_ratio": -2.8203125, "epoch": 1.5562724014336917, "grad_norm": 60.61853214430673, "learning_rate": 1.5657896335822145e-06, "loss": 0.0073, "num_tokens": 105598018.0, "residual_var": 0.04848961532115936, "reward": 0.8046875, "reward_std": 0.15615186095237732, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "rho2": 0.3437497019767761, "step": 271 }, { "advantages/mean": -1.979060471057892e-09, "advantages/snr": 7.301253888882922e-09, "advantages/std": 0.2710576057434082, "advantages/var": 0.07347222563134892, "completions/clipped_ratio": -2.796875, "epoch": 1.5620071684587815, "grad_norm": 54.163777890936515, "learning_rate": 1.5628255371096781e-06, "loss": 0.0071, "num_tokens": 105927003.0, "residual_var": 0.039032138884067535, "reward": 0.732421875, "reward_std": 0.17495277523994446, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "rho2": 0.46874988079071045, "step": 272 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 6.389654501606959e-10, "advantages/std": 0.18219344317913055, "advantages/var": 0.033194450737467074, "completions/clipped_ratio": -2.71875, "epoch": 1.567741935483871, "grad_norm": 40.417224810426184, "learning_rate": 1.5598541853105384e-06, "loss": -0.1454, "num_tokens": 106255478.0, "residual_var": 0.02385851927101612, "reward": 0.794921875, "reward_std": 0.09001073986291885, "rewards/drgrpo_math_reward/mean": 0.794921875, "rewards/drgrpo_math_reward/std": 0.4041535556316376, "rho2": 0.28124988079071045, "step": 273 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 5.870267279546583e-09, "advantages/std": 0.27763888239860535, "advantages/var": 0.07708334901954661, "completions/clipped_ratio": -2.3984375, "epoch": 1.5734767025089607, "grad_norm": 61.201246116678135, "learning_rate": 1.556875616488188e-06, "loss": -0.3844, "num_tokens": 106652934.0, "residual_var": 0.04817712679505348, "reward": 0.681640625, "reward_std": 0.16910633444786072, "rewards/drgrpo_math_reward/mean": 0.681640625, "rewards/drgrpo_math_reward/std": 0.46629536151885986, "rho2": 0.37499964237213135, "step": 274 }, { "advantages/mean": 1.5133991837501526e-09, "advantages/snr": 6.825242037435063e-09, "advantages/std": 0.22173559665679932, "advantages/var": 0.04916667482474679, "completions/clipped_ratio": -2.3671875, "epoch": 1.5792114695340502, "grad_norm": 47.62010697628326, "learning_rate": 1.553889869039054e-06, "loss": -0.4979, "num_tokens": 107029556.0, "residual_var": 0.03533856198191643, "reward": 0.703125, "reward_std": 0.10964228957891464, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45732781291007996, "rho2": 0.2812498211860657, "step": 275 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 8.203371509409415e-10, "advantages/std": 0.2838231027126312, "advantages/var": 0.08055555363342481, "completions/clipped_ratio": -2.3671875, "epoch": 1.5849462365591398, "grad_norm": 141.39850194301653, "learning_rate": 1.5508969814521024e-06, "loss": -0.0007, "num_tokens": 107443148.0, "residual_var": 0.04027780890464783, "reward": 0.80859375, "reward_std": 0.19141492247581482, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.3937928080558777, "rho2": 0.4999997317790985, "step": 276 }, { "advantages/mean": -4.0745362639427185e-10, "advantages/snr": 1.5764196348421162e-09, "advantages/std": 0.25846773386001587, "advantages/var": 0.066805569446732, "completions/clipped_ratio": -2.4140625, "epoch": 1.5906810035842294, "grad_norm": 59.26994150576121, "learning_rate": 1.5478969923083417e-06, "loss": -0.4853, "num_tokens": 107815545.0, "residual_var": 0.0438411720097065, "reward": 0.755859375, "reward_std": 0.14424702525138855, "rewards/drgrpo_math_reward/mean": 0.755859375, "rewards/drgrpo_math_reward/std": 0.42999663949012756, "rho2": 0.3437497615814209, "step": 277 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.4374844083348e-09, "advantages/std": 0.24295634031295776, "advantages/var": 0.059027783298265746, "completions/clipped_ratio": -2.7265625, "epoch": 1.596415770609319, "grad_norm": 51.8961083277923, "learning_rate": 1.5448899402803264e-06, "loss": -0.1511, "num_tokens": 108167396.0, "residual_var": 0.03873700276017189, "reward": 0.810546875, "reward_std": 0.1332515925168991, "rewards/drgrpo_math_reward/mean": 0.810546875, "rewards/drgrpo_math_reward/std": 0.3922513723373413, "rho2": 0.3437497317790985, "step": 278 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.399507892341932e-09, "advantages/std": 0.2910708487033844, "advantages/var": 0.08472223896490849, "completions/clipped_ratio": -2.6328125, "epoch": 1.6021505376344085, "grad_norm": 68.8103468670342, "learning_rate": 1.5418758641316572e-06, "loss": -0.574, "num_tokens": 108494524.0, "residual_var": 0.047656286507844925, "reward": 0.75390625, "reward_std": 0.18139266967773438, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "rho2": 0.4374997615814209, "step": 279 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 1.0134785399957934e-09, "advantages/std": 0.2297341525554657, "advantages/var": 0.05277778085037799, "completions/clipped_ratio": -2.4765625, "epoch": 1.607885304659498, "grad_norm": 55.01746277279755, "learning_rate": 1.5388548027164822e-06, "loss": 0.0295, "num_tokens": 108857565.0, "residual_var": 0.03628474846482277, "reward": 0.8515625, "reward_std": 0.12026290595531464, "rewards/drgrpo_math_reward/mean": 0.8515625, "rewards/drgrpo_math_reward/std": 0.35588082671165466, "rho2": 0.3124997615814209, "step": 280 }, { "advantages/mean": 1.979060471057892e-09, "advantages/snr": 7.570816093636673e-09, "advantages/std": 0.26140648126602173, "advantages/var": 0.06833334844788297, "completions/clipped_ratio": -2.6484375, "epoch": 1.6136200716845877, "grad_norm": 60.064285790552745, "learning_rate": 1.5358267949789964e-06, "loss": -0.1366, "num_tokens": 109201374.0, "residual_var": 0.04911460727453232, "reward": 0.8046875, "reward_std": 0.1443926990032196, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "rho2": 0.28124988079071045, "step": 281 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.6521076184151136e-09, "advantages/std": 0.2818589210510254, "advantages/var": 0.07944445137604816, "completions/clipped_ratio": -2.3828125, "epoch": 1.6193548387096774, "grad_norm": 62.6750125900511, "learning_rate": 1.532791879952939e-06, "loss": -0.4059, "num_tokens": 109600360.0, "residual_var": 0.0397222526371479, "reward": 0.79296875, "reward_std": 0.18726930022239685, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40557438135147095, "rho2": 0.4999997913837433, "step": 282 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 7.236642930683677e-09, "advantages/std": 0.2573907673358917, "advantages/var": 0.06625000710975915, "completions/clipped_ratio": -2.5546875, "epoch": 1.625089605734767, "grad_norm": 61.91074301552538, "learning_rate": 1.5297500967610891e-06, "loss": -0.2868, "num_tokens": 109958377.0, "residual_var": 0.03933596611022949, "reward": 0.744140625, "reward_std": 0.1531161367893219, "rewards/drgrpo_math_reward/mean": 0.744140625, "rewards/drgrpo_math_reward/std": 0.43676990270614624, "rho2": 0.4062497317790985, "step": 283 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.288456568626398e-09, "advantages/std": 0.2710576057434082, "advantages/var": 0.07347222563134892, "completions/clipped_ratio": -2.5546875, "epoch": 1.6308243727598566, "grad_norm": 66.97840462322483, "learning_rate": 1.5267014846147645e-06, "loss": -0.3889, "num_tokens": 110359173.0, "residual_var": 0.0436241589486599, "reward": 0.759765625, "reward_std": 0.16251389682292938, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "rho2": 0.4062497615814209, "step": 284 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 5.114720757352465e-09, "advantages/std": 0.2276083528995514, "advantages/var": 0.051805562309646724, "completions/clipped_ratio": -2.625, "epoch": 1.6365591397849464, "grad_norm": 54.881793060530256, "learning_rate": 1.5236460828133134e-06, "loss": -0.4058, "num_tokens": 110747161.0, "residual_var": 0.035616349428892136, "reward": 0.732421875, "reward_std": 0.1179073303937912, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "rho2": 0.3124997019767761, "step": 285 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.745317741195837e-09, "advantages/std": 0.29439204931259155, "advantages/var": 0.08666667869846734, "completions/clipped_ratio": -2.4765625, "epoch": 1.642293906810036, "grad_norm": 74.00750508725662, "learning_rate": 1.5205839307436086e-06, "loss": -0.2126, "num_tokens": 111161497.0, "residual_var": 0.04333336651325226, "reward": 0.640625, "reward_std": 0.19938677549362183, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.48028653860092163, "rho2": 0.4999997615814209, "step": 286 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 3.4616882124408073e-09, "advantages/std": 0.23540745675563812, "advantages/var": 0.05541667069615763, "completions/clipped_ratio": -2.6796875, "epoch": 1.6480286738351255, "grad_norm": 58.42794149920188, "learning_rate": 1.5175150678795402e-06, "loss": -0.1936, "num_tokens": 111536957.0, "residual_var": 0.03636721521615982, "reward": 0.701171875, "reward_std": 0.12940721213817596, "rewards/drgrpo_math_reward/mean": 0.701171875, "rewards/drgrpo_math_reward/std": 0.45819199085235596, "rho2": 0.3437497615814209, "step": 287 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.68650727814765e-09, "advantages/std": 0.21666666865348816, "advantages/var": 0.04694444530540043, "completions/clipped_ratio": -2.6015625, "epoch": 1.653763440860215, "grad_norm": 52.49225125373135, "learning_rate": 1.5144395337815063e-06, "loss": -0.214, "num_tokens": 111903140.0, "residual_var": 0.030807310715317726, "reward": 0.7734375, "reward_std": 0.11849889904260635, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4190165400505066, "rho2": 0.3437498211860657, "step": 288 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 2.146033615322476e-09, "advantages/std": 0.21698693931102753, "advantages/var": 0.04708333183156754, "completions/clipped_ratio": -2.484375, "epoch": 1.6594982078853047, "grad_norm": 57.527852433809215, "learning_rate": 1.5113573680959038e-06, "loss": -0.128, "num_tokens": 112263039.0, "residual_var": 0.030898453667759895, "reward": 0.763671875, "reward_std": 0.1188259869813919, "rewards/drgrpo_math_reward/mean": 0.763671875, "rewards/drgrpo_math_reward/std": 0.42524150013923645, "rho2": 0.34374985098838806, "step": 289 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 4.218060451308702e-09, "advantages/std": 0.22079402208328247, "advantages/var": 0.04875000018771303, "completions/clipped_ratio": -2.3515625, "epoch": 1.6652329749103942, "grad_norm": 53.69687724923325, "learning_rate": 1.5082686105546159e-06, "loss": -0.1113, "num_tokens": 112650474.0, "residual_var": 0.0304687712341547, "reward": 0.615234375, "reward_std": 0.12576617300510406, "rewards/drgrpo_math_reward/mean": 0.615234375, "rewards/drgrpo_math_reward/std": 0.4870156943798065, "rho2": 0.3749998211860657, "step": 290 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 3.2703682052289916e-09, "advantages/std": 0.3203730583190918, "advantages/var": 0.1026388964967282, "completions/clipped_ratio": -2.421875, "epoch": 1.6709677419354838, "grad_norm": 83.51060003066874, "learning_rate": 1.5051733009745012e-06, "loss": -0.8512, "num_tokens": 113048861.0, "residual_var": 0.04169710725545883, "reward": 0.630859375, "reward_std": 0.23122908174991608, "rewards/drgrpo_math_reward/mean": 0.630859375, "rewards/drgrpo_math_reward/std": 0.4830440282821655, "rho2": 0.5937495231628418, "step": 291 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.7604325006065655e-09, "advantages/std": 0.25303712487220764, "advantages/var": 0.0640277865635932, "completions/clipped_ratio": -2.609375, "epoch": 1.6767025089605734, "grad_norm": 63.43475481127461, "learning_rate": 1.5020714792568794e-06, "loss": -0.1444, "num_tokens": 113407595.0, "residual_var": 0.044019125401973724, "reward": 0.806640625, "reward_std": 0.13931599259376526, "rewards/drgrpo_math_reward/mean": 0.806640625, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "rho2": 0.3124998211860657, "step": 292 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.4076385338187645e-09, "advantages/std": 0.2901149392127991, "advantages/var": 0.0841666779544461, "completions/clipped_ratio": -2.5390625, "epoch": 1.682437275985663, "grad_norm": 70.66773348081196, "learning_rate": 1.4989631853870184e-06, "loss": -0.3354, "num_tokens": 113771007.0, "residual_var": 0.04471356421709061, "reward": 0.73828125, "reward_std": 0.187490314245224, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "rho2": 0.46874985098838806, "step": 293 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.3277032342542236e-10, "advantages/std": 0.26900023221969604, "advantages/var": 0.0723611249342504, "completions/clipped_ratio": -2.5859375, "epoch": 1.6881720430107527, "grad_norm": 70.11754852937156, "learning_rate": 1.4958484594336178e-06, "loss": -0.0684, "num_tokens": 114146859.0, "residual_var": 0.04748699814081192, "reward": 0.748046875, "reward_std": 0.16029149293899536, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "rho2": 0.34374988079071045, "step": 294 }, { "advantages/mean": 5.820766091346741e-11, "advantages/snr": 2.0490770628672686e-10, "advantages/std": 0.28406769037246704, "advantages/var": 0.0806944527135478, "completions/clipped_ratio": -2.5859375, "epoch": 1.6939068100358423, "grad_norm": 62.338709072703566, "learning_rate": 1.4927273415482915e-06, "loss": -0.6503, "num_tokens": 114513123.0, "residual_var": 0.052955761551856995, "reward": 0.791015625, "reward_std": 0.16604849696159363, "rewards/drgrpo_math_reward/mean": 0.791015625, "rewards/drgrpo_math_reward/std": 0.40698084235191345, "rho2": 0.3437497615814209, "step": 295 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.1140344460155885e-09, "advantages/std": 0.29907265305519104, "advantages/var": 0.08944445180547067, "completions/clipped_ratio": -2.265625, "epoch": 1.6996415770609319, "grad_norm": 97.29624338170805, "learning_rate": 1.4895998719650523e-06, "loss": -0.0915, "num_tokens": 114916761.0, "residual_var": 0.05310767889022827, "reward": 0.6796875, "reward_std": 0.18860673904418945, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4670529365539551, "rho2": 0.4062497317790985, "step": 296 }, { "advantages/mean": 5.820766091346741e-11, "advantages/snr": 2.6325402431319694e-10, "advantages/std": 0.2211083322763443, "advantages/var": 0.04888889460202628, "completions/clipped_ratio": -2.703125, "epoch": 1.7053763440860215, "grad_norm": 118.5984486034822, "learning_rate": 1.4864660909997916e-06, "loss": 0.4068, "num_tokens": 115261910.0, "residual_var": 0.03513890504837036, "reward": 0.734375, "reward_std": 0.11653674393892288, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44209739565849304, "rho2": 0.2812499403953552, "step": 297 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 5.060738031753789e-10, "advantages/std": 0.23003624379634857, "advantages/var": 0.05291667345993312, "completions/clipped_ratio": -2.5390625, "epoch": 1.7111111111111112, "grad_norm": 73.29220676498036, "learning_rate": 1.4833260390497604e-06, "loss": -0.2304, "num_tokens": 115655530.0, "residual_var": 0.03803388029336929, "reward": 0.794921875, "reward_std": 0.11543179303407669, "rewards/drgrpo_math_reward/mean": 0.794921875, "rewards/drgrpo_math_reward/std": 0.4041535556316376, "rho2": 0.2812497913837433, "step": 298 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 2.694490725312989e-09, "advantages/std": 0.216024711728096, "advantages/var": 0.04666667607720698, "completions/clipped_ratio": -2.46875, "epoch": 1.7168458781362008, "grad_norm": 80.98924172001904, "learning_rate": 1.4801797565930479e-06, "loss": -0.3545, "num_tokens": 116011987.0, "residual_var": 0.03645835444331169, "reward": 0.75390625, "reward_std": 0.10075695812702179, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "rho2": 0.2187497913837433, "step": 299 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.641167593971404e-10, "advantages/std": 0.2508319616317749, "advantages/var": 0.0629166729760442, "completions/clipped_ratio": -2.171875, "epoch": 1.7225806451612904, "grad_norm": 102.93259559504042, "learning_rate": 1.4770272841880607e-06, "loss": -0.2538, "num_tokens": 116383201.0, "residual_var": 0.041289087384939194, "reward": 0.701171875, "reward_std": 0.13786455988883972, "rewards/drgrpo_math_reward/mean": 0.701171875, "rewards/drgrpo_math_reward/std": 0.45819199085235596, "rho2": 0.3437497913837433, "step": 300 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 7.883635353995264e-10, "advantages/std": 0.2953341007232666, "advantages/var": 0.08722223105002058, "completions/clipped_ratio": -2.125, "epoch": 1.72831541218638, "grad_norm": 100.06438478697434, "learning_rate": 1.4738686624729987e-06, "loss": -0.5862, "num_tokens": 116785102.0, "residual_var": 0.04361114650964737, "reward": 0.78125, "reward_std": 0.20214760303497314, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41380295157432556, "rho2": 0.4999997615814209, "step": 301 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 2.2122846519797083e-09, "advantages/std": 0.21048885583877563, "advantages/var": 0.04430555843231687, "completions/clipped_ratio": -2.4375, "epoch": 1.7340501792114695, "grad_norm": 91.33239515870918, "learning_rate": 1.4707039321653328e-06, "loss": 0.3049, "num_tokens": 117135828.0, "residual_var": 0.035998281091451645, "reward": 0.798828125, "reward_std": 0.09377043694257736, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "rho2": 0.18749988079071045, "step": 302 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.6742054422372178e-09, "advantages/std": 0.278138667345047, "advantages/var": 0.07736111827247871, "completions/clipped_ratio": -1.8125, "epoch": 1.739784946236559, "grad_norm": 96.06617485806908, "learning_rate": 1.467533134061278e-06, "loss": -0.3064, "num_tokens": 117573999.0, "residual_var": 0.045933179557323456, "reward": 0.681640625, "reward_std": 0.1772492527961731, "rewards/drgrpo_math_reward/mean": 0.681640625, "rewards/drgrpo_math_reward/std": 0.46629536151885986, "rho2": 0.40624985098838806, "step": 303 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.4529509225202624e-09, "advantages/std": 0.24037009477615356, "advantages/var": 0.05777778246269705, "completions/clipped_ratio": -2.1015625, "epoch": 1.7455197132616487, "grad_norm": 80.93532291897421, "learning_rate": 1.4643563090352697e-06, "loss": 0.0797, "num_tokens": 117985473.0, "residual_var": 0.03611113131046295, "reward": 0.703125, "reward_std": 0.14108555018901825, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45732781291007996, "rho2": 0.3749998211860657, "step": 304 }, { "advantages/mean": 1.4551915228366852e-09, "advantages/snr": 5.801459492464255e-09, "advantages/std": 0.2508319616317749, "advantages/var": 0.0629166729760442, "completions/clipped_ratio": -1.8671875, "epoch": 1.7512544802867382, "grad_norm": 74.36035268198877, "learning_rate": 1.4611734980394356e-06, "loss": -0.1696, "num_tokens": 118405597.0, "residual_var": 0.037356793880462646, "reward": 0.791015625, "reward_std": 0.14885765314102173, "rewards/drgrpo_math_reward/mean": 0.791015625, "rewards/drgrpo_math_reward/std": 0.40698084235191345, "rho2": 0.4062497615814209, "step": 305 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 2.8540089667006097e-09, "advantages/std": 0.2855307161808014, "advantages/var": 0.08152778988272136, "completions/clipped_ratio": -2.171875, "epoch": 1.7569892473118278, "grad_norm": 98.8554469876464, "learning_rate": 1.4579847421030676e-06, "loss": -0.4504, "num_tokens": 118808119.0, "residual_var": 0.040763918310403824, "reward": 0.763671875, "reward_std": 0.18841999769210815, "rewards/drgrpo_math_reward/mean": 0.763671875, "rewards/drgrpo_math_reward/std": 0.42524150013923645, "rho2": 0.4999997317790985, "step": 306 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.6451666111689598e-09, "advantages/std": 0.25549519062042236, "advantages/var": 0.06527779243016596, "completions/clipped_ratio": -1.859375, "epoch": 1.7627240143369176, "grad_norm": 77.33602590256984, "learning_rate": 1.4547900823320929e-06, "loss": -0.0431, "num_tokens": 119213144.0, "residual_var": 0.038758713752031326, "reward": 0.7421875, "reward_std": 0.15239764750003815, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "rho2": 0.4062497019767761, "step": 307 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 3.5448146573517204e-09, "advantages/std": 0.29556915163993835, "advantages/var": 0.08736112340115287, "completions/clipped_ratio": -1.953125, "epoch": 1.7684587813620072, "grad_norm": 89.94725781615777, "learning_rate": 1.451589559908545e-06, "loss": -0.3155, "num_tokens": 119639938.0, "residual_var": 0.03276044875383377, "reward": 0.662109375, "reward_std": 0.2173064947128296, "rewards/drgrpo_math_reward/mean": 0.662109375, "rewards/drgrpo_math_reward/std": 0.4734536409378052, "rho2": 0.6249997019767761, "step": 308 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.4399056415832834e-10, "advantages/std": 0.2622022330760956, "advantages/var": 0.06875001103009115, "completions/clipped_ratio": -2.4375, "epoch": 1.7741935483870968, "grad_norm": 79.6848238618587, "learning_rate": 1.4483832160900325e-06, "loss": -0.2981, "num_tokens": 119992386.0, "residual_var": 0.04726565256714821, "reward": 0.849609375, "reward_std": 0.14520511031150818, "rewards/drgrpo_math_reward/mean": 0.849609375, "rewards/drgrpo_math_reward/std": 0.35780346393585205, "rho2": 0.3124998211860657, "step": 309 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.788654356049171e-09, "advantages/std": 0.26034167408943176, "advantages/var": 0.0677777872676879, "completions/clipped_ratio": -1.8984375, "epoch": 1.7799283154121865, "grad_norm": 86.49981763028393, "learning_rate": 1.4451710922092056e-06, "loss": -0.3402, "num_tokens": 120398226.0, "residual_var": 0.04024307429790497, "reward": 0.671875, "reward_std": 0.15678554773330688, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4699897766113281, "rho2": 0.40624985098838806, "step": 310 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 4.223932764381209e-09, "advantages/std": 0.24804794788360596, "advantages/var": 0.0615277844492681, "completions/clipped_ratio": -2.046875, "epoch": 1.7856630824372761, "grad_norm": 72.49710119605653, "learning_rate": 1.4419532296732268e-06, "loss": -0.2557, "num_tokens": 120821883.0, "residual_var": 0.04230036959052086, "reward": 0.685546875, "reward_std": 0.13695251941680908, "rewards/drgrpo_math_reward/mean": 0.685546875, "rewards/drgrpo_math_reward/std": 0.4647517800331116, "rho2": 0.3124998211860657, "step": 311 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.545821712611014e-09, "advantages/std": 0.27436795830726624, "advantages/var": 0.07527777654569778, "completions/clipped_ratio": -2.3125, "epoch": 1.7913978494623657, "grad_norm": 83.42790282577374, "learning_rate": 1.4387296699632332e-06, "loss": -0.2721, "num_tokens": 121230378.0, "residual_var": 0.039991337805986404, "reward": 0.63671875, "reward_std": 0.18637633323669434, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.4814152419567108, "rho2": 0.46874985098838806, "step": 312 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 2.036843688300116e-09, "advantages/std": 0.2857738137245178, "advantages/var": 0.08166667261065541, "completions/clipped_ratio": -2.4375, "epoch": 1.7971326164874553, "grad_norm": 84.93496608789505, "learning_rate": 1.4355004546338045e-06, "loss": 0.057, "num_tokens": 121628149.0, "residual_var": 0.04338545352220535, "reward": 0.71875, "reward_std": 0.18806010484695435, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45004892349243164, "rho2": 0.4687497019767761, "step": 313 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 3.0892674721251433e-09, "advantages/std": 0.26378655433654785, "advantages/var": 0.06958334624874851, "completions/clipped_ratio": -2.21875, "epoch": 1.8028673835125448, "grad_norm": 76.19106108360742, "learning_rate": 1.4322656253124264e-06, "loss": -0.1837, "num_tokens": 122064312.0, "residual_var": 0.03696617856621742, "reward": 0.693359375, "reward_std": 0.1663198471069336, "rewards/drgrpo_math_reward/mean": 0.693359375, "rewards/drgrpo_math_reward/std": 0.4615498185157776, "rho2": 0.4687497019767761, "step": 314 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.21794496476650238, "advantages/var": 0.047500007667071964, "completions/clipped_ratio": -2.4609375, "epoch": 1.8086021505376344, "grad_norm": 62.5063960959858, "learning_rate": 1.4290252236989535e-06, "loss": -0.1652, "num_tokens": 122450598.0, "residual_var": 0.029687514528632164, "reward": 0.78515625, "reward_std": 0.12615203857421875, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4111155867576599, "rho2": 0.37499988079071045, "step": 315 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 5.3064896326683645e-09, "advantages/std": 0.26325950026512146, "advantages/var": 0.06930556447984149, "completions/clipped_ratio": -2.3984375, "epoch": 1.814336917562724, "grad_norm": 80.32769306380419, "learning_rate": 1.4257792915650725e-06, "loss": -0.241, "num_tokens": 122843729.0, "residual_var": 0.04115019738674164, "reward": 0.708984375, "reward_std": 0.16345852613449097, "rewards/drgrpo_math_reward/mean": 0.708984375, "rewards/drgrpo_math_reward/std": 0.45467492938041687, "rho2": 0.4062498211860657, "step": 316 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.5323635504420908e-09, "advantages/std": 0.2279132455587387, "advantages/var": 0.05194444750111793, "completions/clipped_ratio": -2.65625, "epoch": 1.8200716845878135, "grad_norm": 60.22537796683005, "learning_rate": 1.4225278707537638e-06, "loss": -0.0226, "num_tokens": 123186775.0, "residual_var": 0.0389583557844162, "reward": 0.83984375, "reward_std": 0.11274328827857971, "rewards/drgrpo_math_reward/mean": 0.83984375, "rewards/drgrpo_math_reward/std": 0.3671095669269562, "rho2": 0.24999982118606567, "step": 317 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 3.5280276628678478e-09, "advantages/std": 0.2969755232334137, "advantages/var": 0.08819446139975984, "completions/clipped_ratio": -2.0, "epoch": 1.8258064516129031, "grad_norm": 91.84439531353381, "learning_rate": 1.4192710031787617e-06, "loss": -0.3509, "num_tokens": 123593922.0, "residual_var": 0.044097259640693665, "reward": 0.787109375, "reward_std": 0.20324639976024628, "rewards/drgrpo_math_reward/mean": 0.787109375, "rewards/drgrpo_math_reward/std": 0.409751296043396, "rho2": 0.4999997019767761, "step": 318 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.7514940923734022e-09, "advantages/std": 0.25385913252830505, "advantages/var": 0.06444445916802355, "completions/clipped_ratio": -2.1328125, "epoch": 1.8315412186379927, "grad_norm": 79.93651214231198, "learning_rate": 1.4160087308240133e-06, "loss": -0.3137, "num_tokens": 124002376.0, "residual_var": 0.04027780145406723, "reward": 0.78125, "reward_std": 0.14836521446704865, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41380295157432556, "rho2": 0.37499988079071045, "step": 319 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.1579157083250831e-09, "advantages/std": 0.30161604285240173, "advantages/var": 0.09097223730594184, "completions/clipped_ratio": -2.5078125, "epoch": 1.8372759856630825, "grad_norm": 93.38978196047681, "learning_rate": 1.4127410957431396e-06, "loss": -0.1751, "num_tokens": 124371189.0, "residual_var": 0.04832902550697327, "reward": 0.775390625, "reward_std": 0.19502206146717072, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "rho2": 0.4687497615814209, "step": 320 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 1.3728266137116581e-08, "advantages/std": 0.22047927975654602, "advantages/var": 0.048611112801965284, "completions/clipped_ratio": -2.4140625, "epoch": 1.843010752688172, "grad_norm": 59.257573050838594, "learning_rate": 1.4094681400588907e-06, "loss": -0.1452, "num_tokens": 124755375.0, "residual_var": 0.03342015668749809, "reward": 0.91796875, "reward_std": 0.11586953699588776, "rewards/drgrpo_math_reward/mean": 0.91796875, "rewards/drgrpo_math_reward/std": 0.2746807038784027, "rho2": 0.31249988079071045, "step": 321 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 3.6570012544888086e-09, "advantages/std": 0.2865019142627716, "advantages/var": 0.08208334687623253, "completions/clipped_ratio": -2.3359375, "epoch": 1.8487455197132616, "grad_norm": 90.04983722883188, "learning_rate": 1.4061899059626043e-06, "loss": -0.9468, "num_tokens": 125149987.0, "residual_var": 0.04873700439929962, "reward": 0.759765625, "reward_std": 0.17744740843772888, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "rho2": 0.4062498211860657, "step": 322 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 5.203240900914091e-09, "advantages/std": 0.26848340034484863, "advantages/var": 0.07208333626073227, "completions/clipped_ratio": -2.1640625, "epoch": 1.8544802867383514, "grad_norm": 78.66846646933797, "learning_rate": 1.4029064357136626e-06, "loss": -0.4385, "num_tokens": 125542578.0, "residual_var": 0.04505210369825363, "reward": 0.755859375, "reward_std": 0.16071709990501404, "rewards/drgrpo_math_reward/mean": 0.755859375, "rewards/drgrpo_math_reward/std": 0.42999663949012756, "rho2": 0.37499988079071045, "step": 323 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 4.878478986422071e-10, "advantages/std": 0.23863035440444946, "advantages/var": 0.05694444604319315, "completions/clipped_ratio": -2.375, "epoch": 1.860215053763441, "grad_norm": 79.81009848564727, "learning_rate": 1.3996177716389452e-06, "loss": -0.0232, "num_tokens": 125913825.0, "residual_var": 0.03736981004476547, "reward": 0.82421875, "reward_std": 0.1365666538476944, "rewards/drgrpo_math_reward/mean": 0.82421875, "rewards/drgrpo_math_reward/std": 0.3810062110424042, "rho2": 0.34374988079071045, "step": 324 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 3.993012436034352e-09, "advantages/std": 0.29154759645462036, "advantages/var": 0.08500000099846616, "completions/clipped_ratio": -2.3828125, "epoch": 1.8659498207885306, "grad_norm": 90.5475624785007, "learning_rate": 1.3963239561322857e-06, "loss": -0.3855, "num_tokens": 126284947.0, "residual_var": 0.05046878755092621, "reward": 0.80859375, "reward_std": 0.18637537956237793, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.3937928080558777, "rho2": 0.4062497019767761, "step": 325 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.93908306047682e-09, "advantages/std": 0.2592725157737732, "advantages/var": 0.06722223743566147, "completions/clipped_ratio": -2.1640625, "epoch": 1.8716845878136201, "grad_norm": 99.37985089935707, "learning_rate": 1.3930250316539235e-06, "loss": -0.4802, "num_tokens": 126666934.0, "residual_var": 0.03781251609325409, "reward": 0.734375, "reward_std": 0.16070452332496643, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44209739565849304, "rho2": 0.43749988079071045, "step": 326 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.8463055313168485e-09, "advantages/std": 0.2522124648094177, "advantages/var": 0.06361112740524177, "completions/clipped_ratio": -2.234375, "epoch": 1.8774193548387097, "grad_norm": 69.2017662972246, "learning_rate": 1.3897210407299583e-06, "loss": -0.2052, "num_tokens": 127046936.0, "residual_var": 0.037769123911857605, "reward": 0.8046875, "reward_std": 0.15200483798980713, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "rho2": 0.4062498211860657, "step": 327 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 3.666131464156088e-10, "advantages/std": 0.3175426721572876, "advantages/var": 0.10083334864079063, "completions/clipped_ratio": -2.328125, "epoch": 1.8831541218637993, "grad_norm": 104.64543360717904, "learning_rate": 1.386412025951799e-06, "loss": -0.6721, "num_tokens": 127438081.0, "residual_var": 0.028359429910779, "reward": 0.7265625, "reward_std": 0.24851016700267792, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.4461594223976135, "rho2": 0.7187495231628418, "step": 328 }, { "advantages/mean": -4.0745362639427185e-10, "advantages/snr": 1.5339547094693413e-09, "advantages/std": 0.26562297344207764, "advantages/var": 0.07055556402021068, "completions/clipped_ratio": -2.40625, "epoch": 1.8888888888888888, "grad_norm": 59.60120628258996, "learning_rate": 1.3830980299756188e-06, "loss": -0.7721, "num_tokens": 127794790.0, "residual_var": 0.04409724846482277, "reward": 0.7734375, "reward_std": 0.15928564965724945, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4190165400505066, "rho2": 0.3749998211860657, "step": 329 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 2.5399704644459935e-09, "advantages/std": 0.18333333730697632, "advantages/var": 0.033611112568113555, "completions/clipped_ratio": -2.765625, "epoch": 1.8946236559139784, "grad_norm": 51.579691112900406, "learning_rate": 1.379779095521801e-06, "loss": -0.6478, "num_tokens": 128134664.0, "residual_var": 0.026258695870637894, "reward": 0.875, "reward_std": 0.07992979884147644, "rewards/drgrpo_math_reward/mean": 0.875, "rewards/drgrpo_math_reward/std": 0.3310423493385315, "rho2": 0.21874991059303284, "step": 330 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 1.8394107276834965e-09, "advantages/std": 0.31644731760025024, "advantages/var": 0.10013890481639365, "completions/clipped_ratio": -1.9609375, "epoch": 1.900358422939068, "grad_norm": 69.83896466239963, "learning_rate": 1.3764552653743919e-06, "loss": -0.7294, "num_tokens": 128571780.0, "residual_var": 0.056328170001506805, "reward": 0.630859375, "reward_std": 0.20179690420627594, "rewards/drgrpo_math_reward/mean": 0.630859375, "rewards/drgrpo_math_reward/std": 0.4830440282821655, "rho2": 0.4374997019767761, "step": 331 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.691330240045686e-09, "advantages/std": 0.2064918726682663, "advantages/var": 0.0426388934780475, "completions/clipped_ratio": -2.5703125, "epoch": 1.9060931899641576, "grad_norm": 44.58342133333614, "learning_rate": 1.3731265823805468e-06, "loss": 0.3634, "num_tokens": 128960780.0, "residual_var": 0.03197918459773064, "reward": 0.810546875, "reward_std": 0.09633443504571915, "rewards/drgrpo_math_reward/mean": 0.810546875, "rewards/drgrpo_math_reward/std": 0.3922513723373413, "rho2": 0.24999986588954926, "step": 332 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 3.028504248828846e-09, "advantages/std": 0.2306392341852188, "advantages/var": 0.053194456345544205, "completions/clipped_ratio": -2.3828125, "epoch": 1.9118279569892473, "grad_norm": 50.36844072847442, "learning_rate": 1.3697930894499784e-06, "loss": -1.3056, "num_tokens": 129322362.0, "residual_var": 0.02825956791639328, "reward": 0.701171875, "reward_std": 0.14634226262569427, "rewards/drgrpo_math_reward/mean": 0.701171875, "rewards/drgrpo_math_reward/std": 0.45819199085235596, "rho2": 0.4687497615814209, "step": 333 }, { "advantages/mean": -1.7462298274040222e-09, "advantages/snr": 5.767606420711305e-09, "advantages/std": 0.3027650713920593, "advantages/var": 0.09166668845503878, "completions/clipped_ratio": -2.421875, "epoch": 1.917562724014337, "grad_norm": 58.491984057355204, "learning_rate": 1.3664548295544046e-06, "loss": -0.3199, "num_tokens": 129703808.0, "residual_var": 0.0544271245598793, "reward": 0.7421875, "reward_std": 0.19048281013965607, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "rho2": 0.40624964237213135, "step": 334 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 9.709708203770096e-10, "advantages/std": 0.23979158699512482, "advantages/var": 0.05750000519364051, "completions/clipped_ratio": -2.671875, "epoch": 1.9232974910394265, "grad_norm": 48.50781213866919, "learning_rate": 1.3631118457269927e-06, "loss": -0.7354, "num_tokens": 130037099.0, "residual_var": 0.03414065018296242, "reward": 0.80078125, "reward_std": 0.14314822852611542, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "rho2": 0.4062497615814209, "step": 335 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 8.378552191343982e-10, "advantages/std": 0.27788886427879333, "advantages/var": 0.07722222089015762, "completions/clipped_ratio": -2.578125, "epoch": 1.9290322580645163, "grad_norm": 59.71557588373033, "learning_rate": 1.359764181061807e-06, "loss": -0.5605, "num_tokens": 130365781.0, "residual_var": 0.050677116960287094, "reward": 0.734375, "reward_std": 0.16705745458602905, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44209739565849304, "rho2": 0.3437497615814209, "step": 336 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 8.126669454419574e-10, "advantages/std": 0.2865019142627716, "advantages/var": 0.08208334687623253, "completions/clipped_ratio": -2.578125, "epoch": 1.9347670250896059, "grad_norm": 54.14887252060116, "learning_rate": 1.3564118787132506e-06, "loss": -0.1152, "num_tokens": 130714811.0, "residual_var": 0.043606799095869064, "reward": 0.728515625, "reward_std": 0.1893252432346344, "rewards/drgrpo_math_reward/mean": 0.728515625, "rewards/drgrpo_math_reward/std": 0.44516023993492126, "rho2": 0.4687497615814209, "step": 337 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 1.0730167339738786e-09, "advantages/std": 0.21698695421218872, "advantages/var": 0.047083338298282484, "completions/clipped_ratio": -2.7421875, "epoch": 1.9405017921146954, "grad_norm": 44.655668006849396, "learning_rate": 1.353054981895512e-06, "loss": 0.129, "num_tokens": 131027422.0, "residual_var": 0.030898455530405045, "reward": 0.845703125, "reward_std": 0.11795921623706818, "rewards/drgrpo_math_reward/mean": 0.845703125, "rewards/drgrpo_math_reward/std": 0.36158639192581177, "rho2": 0.34374985098838806, "step": 338 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.4357961868592553e-09, "advantages/std": 0.24324201047420502, "advantages/var": 0.05916667565953326, "completions/clipped_ratio": -2.28125, "epoch": 1.946236559139785, "grad_norm": 47.67221013393229, "learning_rate": 1.349693533882005e-06, "loss": -0.0754, "num_tokens": 131413929.0, "residual_var": 0.042526066303253174, "reward": 0.7421875, "reward_std": 0.13240733742713928, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "rho2": 0.2812498211860657, "step": 339 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 5.028177490921764e-09, "advantages/std": 0.2546784579753876, "advantages/var": 0.06486111695672125, "completions/clipped_ratio": -2.3515625, "epoch": 1.9519713261648746, "grad_norm": 45.496324911021865, "learning_rate": 1.3463275780048135e-06, "loss": -0.6376, "num_tokens": 131781541.0, "residual_var": 0.038511306047439575, "reward": 0.619140625, "reward_std": 0.15289105474948883, "rewards/drgrpo_math_reward/mean": 0.619140625, "rewards/drgrpo_math_reward/std": 0.48607301712036133, "rho2": 0.40624988079071045, "step": 340 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.707745501563748e-09, "advantages/std": 0.2967415750026703, "advantages/var": 0.0880555623350654, "completions/clipped_ratio": -2.3984375, "epoch": 1.9577060931899641, "grad_norm": 56.404438032443515, "learning_rate": 1.3429571576541314e-06, "loss": -0.4719, "num_tokens": 132166460.0, "residual_var": 0.04953128471970558, "reward": 0.734375, "reward_std": 0.1868361383676529, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44209739565849304, "rho2": 0.4374997317790985, "step": 341 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.724340751630126e-09, "advantages/std": 0.2710576057434082, "advantages/var": 0.07347222563134892, "completions/clipped_ratio": -2.0546875, "epoch": 1.9634408602150537, "grad_norm": 51.811476714311006, "learning_rate": 1.3395823162777038e-06, "loss": -0.8087, "num_tokens": 132581556.0, "residual_var": 0.039032142609357834, "reward": 0.845703125, "reward_std": 0.17292287945747375, "rewards/drgrpo_math_reward/mean": 0.845703125, "rewards/drgrpo_math_reward/std": 0.36158639192581177, "rho2": 0.4687498211860657, "step": 342 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 3.2778898342612143e-09, "advantages/std": 0.24860726296901703, "advantages/var": 0.061805571200945986, "completions/clipped_ratio": -2.375, "epoch": 1.9691756272401433, "grad_norm": 53.736942555069994, "learning_rate": 1.3362030973802669e-06, "loss": -0.176, "num_tokens": 132964123.0, "residual_var": 0.03862850368022919, "reward": 0.748046875, "reward_std": 0.1409589648246765, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "rho2": 0.3749997019767761, "step": 343 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.73776501244438e-09, "advantages/std": 0.2679656147956848, "advantages/var": 0.07180557071282934, "completions/clipped_ratio": -2.171875, "epoch": 1.9749103942652328, "grad_norm": 55.88199911189342, "learning_rate": 1.3328195445229867e-06, "loss": -0.422, "num_tokens": 133369195.0, "residual_var": 0.04263458028435707, "reward": 0.810546875, "reward_std": 0.16190052032470703, "rewards/drgrpo_math_reward/mean": 0.810546875, "rewards/drgrpo_math_reward/std": 0.3922513723373413, "rho2": 0.4062497317790985, "step": 344 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 5.44600965853743e-10, "advantages/std": 0.21376259624958038, "advantages/var": 0.04569444755536112, "completions/clipped_ratio": -2.4921875, "epoch": 1.9806451612903224, "grad_norm": 40.34538934990838, "learning_rate": 1.329431701322898e-06, "loss": 0.1427, "num_tokens": 133753509.0, "residual_var": 0.031414955854415894, "reward": 0.822265625, "reward_std": 0.1111922338604927, "rewards/drgrpo_math_reward/mean": 0.822265625, "rewards/drgrpo_math_reward/std": 0.3826628625392914, "rho2": 0.3124997615814209, "step": 345 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.395906268505378e-09, "advantages/std": 0.29130932688713074, "advantages/var": 0.08486112393143319, "completions/clipped_ratio": -2.1796875, "epoch": 1.9863799283154122, "grad_norm": 56.77628087918797, "learning_rate": 1.3260396114523417e-06, "loss": -0.5622, "num_tokens": 134153772.0, "residual_var": 0.05038632079958916, "reward": 0.798828125, "reward_std": 0.1830447018146515, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "rho2": 0.4062497913837433, "step": 346 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.3727916623021474e-09, "advantages/std": 0.2544056475162506, "advantages/var": 0.06472223348816275, "completions/clipped_ratio": -2.2734375, "epoch": 1.9921146953405018, "grad_norm": 50.980595691377765, "learning_rate": 1.322643318638403e-06, "loss": -0.3581, "num_tokens": 134538700.0, "residual_var": 0.044496551156044006, "reward": 0.80859375, "reward_std": 0.144730806350708, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.3937928080558777, "rho2": 0.3124998211860657, "step": 347 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.4483799666947197e-09, "advantages/std": 0.28528738021850586, "advantages/var": 0.08138888931193833, "completions/clipped_ratio": -2.234375, "epoch": 1.9978494623655914, "grad_norm": 52.74809142012798, "learning_rate": 1.3192428666623462e-06, "loss": -0.8855, "num_tokens": 134933547.0, "residual_var": 0.04323789104819298, "reward": 0.73046875, "reward_std": 0.18336734175682068, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44415023922920227, "rho2": 0.4687495827674866, "step": 348 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 8.263392628867272e-09, "advantages/std": 0.2535853981971741, "advantages/var": 0.06430555417881934, "completions/clipped_ratio": -2.0703125, "epoch": 2.0057347670250896, "grad_norm": 51.39385576757077, "learning_rate": 1.3158382993590506e-06, "loss": -0.532, "num_tokens": 135345872.0, "residual_var": 0.034162357449531555, "reward": 0.880859375, "reward_std": 0.16037645936012268, "rewards/drgrpo_math_reward/mean": 0.880859375, "rewards/drgrpo_math_reward/std": 0.32427072525024414, "rho2": 0.46874964237213135, "step": 349 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 2.373571985033162e-09, "advantages/std": 0.24523232877254486, "advantages/var": 0.060138895075205534, "completions/clipped_ratio": -2.2109375, "epoch": 2.011469534050179, "grad_norm": 47.88417227313692, "learning_rate": 1.3124296606164462e-06, "loss": -0.3239, "num_tokens": 135727330.0, "residual_var": 0.03570748493075371, "reward": 0.802734375, "reward_std": 0.1517515480518341, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "rho2": 0.40624985098838806, "step": 350 }, { "advantages/mean": 1.7462298274040222e-09, "advantages/snr": 6.491554851381335e-09, "advantages/std": 0.26900023221969604, "advantages/var": 0.0723611249342504, "completions/clipped_ratio": -2.1171875, "epoch": 2.0172043010752687, "grad_norm": 53.318133564563155, "learning_rate": 1.3090169943749473e-06, "loss": -1.8827, "num_tokens": 136132570.0, "residual_var": 0.03391929715871811, "reward": 0.775390625, "reward_std": 0.18226401507854462, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "rho2": 0.5312497615814209, "step": 351 }, { "advantages/mean": -9.89530235528946e-10, "advantages/snr": 3.465582316707883e-09, "advantages/std": 0.2855307161808014, "advantages/var": 0.08152778988272136, "completions/clipped_ratio": -2.140625, "epoch": 2.0229390681003583, "grad_norm": 57.05664053017318, "learning_rate": 1.3056003446268868e-06, "loss": -0.8735, "num_tokens": 136530569.0, "residual_var": 0.04331166297197342, "reward": 0.814453125, "reward_std": 0.18799343705177307, "rewards/drgrpo_math_reward/mean": 0.814453125, "rewards/drgrpo_math_reward/std": 0.38912075757980347, "rho2": 0.4687497317790985, "step": 352 }, { "advantages/mean": -5.820766091346741e-11, "advantages/snr": 2.1849217451879611e-10, "advantages/std": 0.2664061486721039, "advantages/var": 0.07097223605030312, "completions/clipped_ratio": -2.015625, "epoch": 2.028673835125448, "grad_norm": 46.792399943188734, "learning_rate": 1.302179755415948e-06, "loss": -0.0726, "num_tokens": 136967705.0, "residual_var": 0.04435766488313675, "reward": 0.689453125, "reward_std": 0.1590128242969513, "rewards/drgrpo_math_reward/mean": 0.689453125, "rewards/drgrpo_math_reward/std": 0.46317005157470703, "rho2": 0.3749998211860657, "step": 353 }, { "advantages/mean": 8.731149137020111e-10, "advantages/snr": 3.431979155755368e-09, "advantages/std": 0.2544056475162506, "advantages/var": 0.06472223348816275, "completions/clipped_ratio": -2.453125, "epoch": 2.0344086021505374, "grad_norm": 56.68279278121475, "learning_rate": 1.2987552708365974e-06, "loss": -0.6944, "num_tokens": 137346736.0, "residual_var": 0.04651912301778793, "reward": 0.84375, "reward_std": 0.13534927368164062, "rewards/drgrpo_math_reward/mean": 0.84375, "rewards/drgrpo_math_reward/std": 0.36344730854034424, "rho2": 0.2812498211860657, "step": 354 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 3.646098217565215e-09, "advantages/std": 0.31928738951683044, "advantages/var": 0.10194443710447221, "completions/clipped_ratio": -1.765625, "epoch": 2.0401433691756274, "grad_norm": 61.52909128407179, "learning_rate": 1.2953269350335168e-06, "loss": -0.5336, "num_tokens": 137795882.0, "residual_var": 0.044600728899240494, "reward": 0.6640625, "reward_std": 0.22583432495594025, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4727790653705597, "rho2": 0.5624997615814209, "step": 355 }, { "advantages/mean": 1.5133991837501526e-09, "advantages/snr": 5.855272929413575e-09, "advantages/std": 0.25846773386001587, "advantages/var": 0.066805569446732, "completions/clipped_ratio": -1.9609375, "epoch": 2.045878136200717, "grad_norm": 51.95812370526967, "learning_rate": 1.2918947922010336e-06, "loss": -0.4348, "num_tokens": 138224666.0, "residual_var": 0.035490483045578, "reward": 0.724609375, "reward_std": 0.1641634702682495, "rewards/drgrpo_math_reward/mean": 0.724609375, "rewards/drgrpo_math_reward/std": 0.44714778661727905, "rho2": 0.4687497019767761, "step": 356 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 3.942515003780859e-09, "advantages/std": 0.17716911435127258, "advantages/var": 0.0313888950800143, "completions/clipped_ratio": -2.09375, "epoch": 2.0516129032258066, "grad_norm": 34.13547277389914, "learning_rate": 1.28845888658255e-06, "loss": -0.0075, "num_tokens": 138627278.0, "residual_var": 0.025503486394882202, "reward": 0.765625, "reward_std": 0.07355421781539917, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42402184009552, "rho2": 0.18749994039535522, "step": 357 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.619861341571255e-09, "advantages/std": 0.2874698340892792, "advantages/var": 0.0826389055113177, "completions/clipped_ratio": -1.6484375, "epoch": 2.057347670250896, "grad_norm": 56.25172523478906, "learning_rate": 1.285019262469976e-06, "loss": -0.474, "num_tokens": 139075531.0, "residual_var": 0.054231803864240646, "reward": 0.728515625, "reward_std": 0.17930275201797485, "rewards/drgrpo_math_reward/mean": 0.728515625, "rewards/drgrpo_math_reward/std": 0.44516023993492126, "rho2": 0.3437497615814209, "step": 358 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 7.388450007721587e-10, "advantages/std": 0.3151278495788574, "advantages/var": 0.09930556158019499, "completions/clipped_ratio": -1.71875, "epoch": 2.0630824372759857, "grad_norm": 67.36803386402565, "learning_rate": 1.2815759642031551e-06, "loss": -1.4688, "num_tokens": 139522389.0, "residual_var": 0.046549517661333084, "reward": 0.666015625, "reward_std": 0.22692254185676575, "rewards/drgrpo_math_reward/mean": 0.666015625, "rewards/drgrpo_math_reward/std": 0.47209542989730835, "rho2": 0.5312497019767761, "step": 359 }, { "advantages/mean": 7.566995918750763e-10, "advantages/snr": 2.6707040854625774e-09, "advantages/std": 0.28333336114883423, "advantages/var": 0.08027779353989573, "completions/clipped_ratio": -2.1875, "epoch": 2.0688172043010753, "grad_norm": 61.613483006046366, "learning_rate": 1.2781290361692937e-06, "loss": -1.2389, "num_tokens": 139922575.0, "residual_var": 0.055191006511449814, "reward": 0.7421875, "reward_std": 0.15900084376335144, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "rho2": 0.3124997913837433, "step": 360 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 4.361317093425564e-10, "advantages/std": 0.26692697405815125, "advantages/var": 0.07125000947984095, "completions/clipped_ratio": -1.859375, "epoch": 2.074551971326165, "grad_norm": 51.45280445491275, "learning_rate": 1.2746785228023901e-06, "loss": -1.1403, "num_tokens": 140338449.0, "residual_var": 0.04675783962011337, "reward": 0.677734375, "reward_std": 0.14947649836540222, "rewards/drgrpo_math_reward/mean": 0.677734375, "rewards/drgrpo_math_reward/std": 0.46780112385749817, "rho2": 0.3437498211860657, "step": 361 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.5642443445922294e-09, "advantages/std": 0.22699731588363647, "advantages/var": 0.05152778141837544, "completions/clipped_ratio": -2.3828125, "epoch": 2.0802867383512544, "grad_norm": 46.61190096076828, "learning_rate": 1.27122446858266e-06, "loss": -0.4038, "num_tokens": 140727287.0, "residual_var": 0.03381512314081192, "reward": 0.783203125, "reward_std": 0.12749597430229187, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "rho2": 0.3437497615814209, "step": 362 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 2.4364555491406494e-09, "advantages/std": 0.1911224126815796, "advantages/var": 0.036527776629228015, "completions/clipped_ratio": -2.40625, "epoch": 2.086021505376344, "grad_norm": 38.94695144235124, "learning_rate": 1.2677669180359642e-06, "loss": 0.0083, "num_tokens": 141099499.0, "residual_var": 0.02739585004746914, "reward": 0.732421875, "reward_std": 0.08961933851242065, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "rho2": 0.24999991059303284, "step": 363 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 1.8193347035254061e-09, "advantages/std": 0.3199392557144165, "advantages/var": 0.1023611273470948, "completions/clipped_ratio": -2.0625, "epoch": 2.0917562724014336, "grad_norm": 61.04995302387311, "learning_rate": 1.2643059157332337e-06, "loss": -0.4115, "num_tokens": 141524838.0, "residual_var": 0.041584257036447525, "reward": 0.712890625, "reward_std": 0.23002517223358154, "rewards/drgrpo_math_reward/mean": 0.712890625, "rewards/drgrpo_math_reward/std": 0.45285552740097046, "rho2": 0.5937495827674866, "step": 364 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 1.0369385950567983e-09, "advantages/std": 0.22453656792640686, "advantages/var": 0.05041667033616992, "completions/clipped_ratio": -2.0859375, "epoch": 2.097491039426523, "grad_norm": 44.792470226379606, "learning_rate": 1.2608415062898969e-06, "loss": -0.8778, "num_tokens": 141926526.0, "residual_var": 0.03781251981854439, "reward": 0.677734375, "reward_std": 0.1121911108493805, "rewards/drgrpo_math_reward/mean": 0.677734375, "rewards/drgrpo_math_reward/std": 0.46780112385749817, "rho2": 0.24999991059303284, "step": 365 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.3698552871990758e-09, "advantages/std": 0.33993464708328247, "advantages/var": 0.1155555642876358, "completions/clipped_ratio": -2.265625, "epoch": 2.1032258064516127, "grad_norm": 59.375675807100464, "learning_rate": 1.2573737343653023e-06, "loss": -0.4409, "num_tokens": 142311990.0, "residual_var": 0.05055560544133186, "reward": 0.765625, "reward_std": 0.246641144156456, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42402184009552, "rho2": 0.5624996423721313, "step": 366 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 4.0702275268216437e-10, "advantages/std": 0.2860167324542999, "advantages/var": 0.08180557124383459, "completions/clipped_ratio": -2.1796875, "epoch": 2.1089605734767023, "grad_norm": 55.17362784011663, "learning_rate": 1.2539026446621445e-06, "loss": 0.8416, "num_tokens": 142704258.0, "residual_var": 0.03834637999534607, "reward": 0.767578125, "reward_std": 0.19492703676223755, "rewards/drgrpo_math_reward/mean": 0.767578125, "rewards/drgrpo_math_reward/std": 0.42278963327407837, "rho2": 0.5312497615814209, "step": 367 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 6.419946072108308e-10, "advantages/std": 0.36266759037971497, "advantages/var": 0.13152778111182872, "completions/clipped_ratio": -1.8828125, "epoch": 2.1146953405017923, "grad_norm": 61.628823867377065, "learning_rate": 1.2504282819258865e-06, "loss": -0.2128, "num_tokens": 143120622.0, "residual_var": 0.0657639354467392, "reward": 0.662109375, "reward_std": 0.2727823853492737, "rewards/drgrpo_math_reward/mean": 0.662109375, "rewards/drgrpo_math_reward/std": 0.4734536409378052, "rho2": 0.4999997317790985, "step": 368 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 8.622370519781628e-10, "advantages/std": 0.2700308859348297, "advantages/var": 0.07291667935874901, "completions/clipped_ratio": -2.328125, "epoch": 2.120430107526882, "grad_norm": 44.20871500847943, "learning_rate": 1.2469506909441838e-06, "loss": -1.7361, "num_tokens": 143486154.0, "residual_var": 0.03873700648546219, "reward": 0.806640625, "reward_std": 0.17252814769744873, "rewards/drgrpo_math_reward/mean": 0.806640625, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "rho2": 0.4687497913837433, "step": 369 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.7415199780104634e-09, "advantages/std": 0.294627845287323, "advantages/var": 0.08680556721865074, "completions/clipped_ratio": -2.203125, "epoch": 2.1261648745519715, "grad_norm": 53.360594204033994, "learning_rate": 1.2434699165463078e-06, "loss": -1.1886, "num_tokens": 143855900.0, "residual_var": 0.04882817715406418, "reward": 0.794921875, "reward_std": 0.1824372261762619, "rewards/drgrpo_math_reward/mean": 0.794921875, "rewards/drgrpo_math_reward/std": 0.4041535556316376, "rho2": 0.4374995231628418, "step": 370 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.583538511807429e-09, "advantages/std": 0.2793842554092407, "advantages/var": 0.07805556217057585, "completions/clipped_ratio": -1.8125, "epoch": 2.131899641577061, "grad_norm": 47.040685447844496, "learning_rate": 1.2399860036025658e-06, "loss": -1.1814, "num_tokens": 144269620.0, "residual_var": 0.03658856078982353, "reward": 0.65234375, "reward_std": 0.1923348307609558, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.47669193148612976, "rho2": 0.5312498807907104, "step": 371 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 3.5194982067760704e-09, "advantages/std": 0.2315407395362854, "advantages/var": 0.05361111406500996, "completions/clipped_ratio": -2.109375, "epoch": 2.1376344086021506, "grad_norm": 36.82323678769803, "learning_rate": 1.2364989970237248e-06, "loss": 0.1582, "num_tokens": 144640090.0, "residual_var": 0.038533009588718414, "reward": 0.83203125, "reward_std": 0.12105467915534973, "rewards/drgrpo_math_reward/mean": 0.83203125, "rewards/drgrpo_math_reward/std": 0.374204158782959, "rho2": 0.2812498211860657, "step": 372 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 4.1087755160962727e-10, "advantages/std": 0.28333336114883423, "advantages/var": 0.08027779353989573, "completions/clipped_ratio": -1.6484375, "epoch": 2.14336917562724, "grad_norm": 51.576346545771074, "learning_rate": 1.2330089417604304e-06, "loss": -1.7265, "num_tokens": 145106665.0, "residual_var": 0.042647603899240494, "reward": 0.6953125, "reward_std": 0.1831432282924652, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4607250988483429, "rho2": 0.4687497019767761, "step": 373 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 3.631286334969895e-10, "advantages/std": 0.32058975100517273, "advantages/var": 0.10277778844955865, "completions/clipped_ratio": -1.609375, "epoch": 2.1491039426523297, "grad_norm": 51.6751164726718, "learning_rate": 1.2295158828026292e-06, "loss": -0.5206, "num_tokens": 145540569.0, "residual_var": 0.05460074543952942, "reward": 0.75390625, "reward_std": 0.2141927033662796, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "rho2": 0.4687495827674866, "step": 374 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 3.4092934548284855e-09, "advantages/std": 0.30731818079948425, "advantages/var": 0.09444446424990449, "completions/clipped_ratio": -1.5625, "epoch": 2.1548387096774193, "grad_norm": 53.36106575644517, "learning_rate": 1.2260198651789884e-06, "loss": -1.1355, "num_tokens": 145994634.0, "residual_var": 0.04427088424563408, "reward": 0.6640625, "reward_std": 0.21038015186786652, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4727790653705597, "rho2": 0.5312495827674866, "step": 375 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 3.707532309219526e-09, "advantages/std": 0.2825970947742462, "advantages/var": 0.0798611179748443, "completions/clipped_ratio": -1.765625, "epoch": 2.160573476702509, "grad_norm": 46.17442377637242, "learning_rate": 1.2225209339563143e-06, "loss": -1.2664, "num_tokens": 146414197.0, "residual_var": 0.03993059694766998, "reward": 0.791015625, "reward_std": 0.18656235933303833, "rewards/drgrpo_math_reward/mean": 0.791015625, "rewards/drgrpo_math_reward/std": 0.40698084235191345, "rho2": 0.49999967217445374, "step": 376 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.36693747918836e-09, "advantages/std": 0.25549519062042236, "advantages/var": 0.06527779243016596, "completions/clipped_ratio": -1.9453125, "epoch": 2.1663082437275984, "grad_norm": 44.14175733741532, "learning_rate": 1.2190191342389726e-06, "loss": -1.6864, "num_tokens": 146842275.0, "residual_var": 0.03671877831220627, "reward": 0.859375, "reward_std": 0.15728996694087982, "rewards/drgrpo_math_reward/mean": 0.859375, "rewards/drgrpo_math_reward/std": 0.3479743003845215, "rho2": 0.4374997615814209, "step": 377 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 3.678821455366993e-10, "advantages/std": 0.31644731760025024, "advantages/var": 0.10013890481639365, "completions/clipped_ratio": -2.0234375, "epoch": 2.172043010752688, "grad_norm": 56.99387932933516, "learning_rate": 1.2155145111683066e-06, "loss": -1.1482, "num_tokens": 147263441.0, "residual_var": 0.05006949603557587, "reward": 0.818359375, "reward_std": 0.22208544611930847, "rewards/drgrpo_math_reward/mean": 0.818359375, "rewards/drgrpo_math_reward/std": 0.38592514395713806, "rho2": 0.49999967217445374, "step": 378 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.403118581123502e-09, "advantages/std": 0.29083216190338135, "advantages/var": 0.08458334639739462, "completions/clipped_ratio": -1.8671875, "epoch": 2.1777777777777776, "grad_norm": 47.46858172543204, "learning_rate": 1.2120071099220547e-06, "loss": -0.1954, "num_tokens": 147686862.0, "residual_var": 0.03964846953749657, "reward": 0.779296875, "reward_std": 0.20330065488815308, "rewards/drgrpo_math_reward/mean": 0.779296875, "rewards/drgrpo_math_reward/std": 0.4151262938976288, "rho2": 0.5312497615814209, "step": 379 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.5020461951555506e-09, "advantages/std": 0.3100179433822632, "advantages/var": 0.09611112521896814, "completions/clipped_ratio": -2.0234375, "epoch": 2.183512544802867, "grad_norm": 48.280975115460734, "learning_rate": 1.2084969757137685e-06, "loss": -0.2054, "num_tokens": 148097654.0, "residual_var": 0.03604169934988022, "reward": 0.8125, "reward_std": 0.22958697378635406, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39069411158561707, "rho2": 0.6249997615814209, "step": 380 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.1979037308103055e-09, "advantages/std": 0.29154759645462036, "advantages/var": 0.08500000099846616, "completions/clipped_ratio": -1.640625, "epoch": 2.189247311827957, "grad_norm": 50.05914379932426, "learning_rate": 1.2049841537922305e-06, "loss": -0.0656, "num_tokens": 148539904.0, "residual_var": 0.039843782782554626, "reward": 0.78125, "reward_std": 0.1994413435459137, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41380295157432556, "rho2": 0.5312497615814209, "step": 381 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.160576516553562e-09, "advantages/std": 0.30092453956604004, "advantages/var": 0.0905555785130332, "completions/clipped_ratio": -1.859375, "epoch": 2.1949820788530467, "grad_norm": 52.83715969138398, "learning_rate": 1.2014686894408693e-06, "loss": -1.167, "num_tokens": 148973807.0, "residual_var": 0.04810766875743866, "reward": 0.765625, "reward_std": 0.20507681369781494, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42402184009552, "rho2": 0.4687498211860657, "step": 382 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 2.1704078927667033e-09, "advantages/std": 0.37546271085739136, "advantages/var": 0.14097224724438107, "completions/clipped_ratio": -1.28125, "epoch": 2.2007168458781363, "grad_norm": 65.54211634659485, "learning_rate": 1.1979506279771778e-06, "loss": -3.1305, "num_tokens": 149455070.0, "residual_var": 0.02643236517906189, "reward": 0.748046875, "reward_std": 0.3186452388763428, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "rho2": 0.8124995231628418, "step": 383 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.1121629347170463e-09, "advantages/std": 0.31402409076690674, "advantages/var": 0.09861112958198248, "completions/clipped_ratio": -1.484375, "epoch": 2.206451612903226, "grad_norm": 49.87820697763209, "learning_rate": 1.1944300147521275e-06, "loss": -1.0777, "num_tokens": 149903873.0, "residual_var": 0.04930559918284416, "reward": 0.73046875, "reward_std": 0.21982157230377197, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44415023922920227, "rho2": 0.4999997019767761, "step": 384 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 2.60608192950419e-09, "advantages/std": 0.31269440054893494, "advantages/var": 0.09777778813465776, "completions/clipped_ratio": -1.109375, "epoch": 2.2121863799283155, "grad_norm": 55.323478795332626, "learning_rate": 1.1909068951495848e-06, "loss": -0.8824, "num_tokens": 150400716.0, "residual_var": 0.033611152321100235, "reward": 0.71484375, "reward_std": 0.23718586564064026, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.45193037390708923, "rho2": 0.6562496423721313, "step": 385 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.2914718265653375e-09, "advantages/std": 0.3048223853111267, "advantages/var": 0.092916686586765, "completions/clipped_ratio": -1.7109375, "epoch": 2.217921146953405, "grad_norm": 54.66895989931206, "learning_rate": 1.1873813145857248e-06, "loss": -1.3005, "num_tokens": 150825835.0, "residual_var": 0.03484378010034561, "reward": 0.783203125, "reward_std": 0.2321348488330841, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "rho2": 0.6249997615814209, "step": 386 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 3.780759266793585e-09, "advantages/std": 0.33870670199394226, "advantages/var": 0.11472222997561321, "completions/clipped_ratio": -1.140625, "epoch": 2.2236559139784946, "grad_norm": 59.87211667992196, "learning_rate": 1.1838533185084466e-06, "loss": -0.5116, "num_tokens": 151312395.0, "residual_var": 0.03943582996726036, "reward": 0.6640625, "reward_std": 0.2611224055290222, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4727790653705597, "rho2": 0.6562495231628418, "step": 387 }, { "advantages/mean": -2.2118911147117615e-09, "advantages/snr": 7.555920555072644e-09, "advantages/std": 0.29273614287376404, "advantages/var": 0.08569444934460879, "completions/clipped_ratio": -1.03125, "epoch": 2.229390681003584, "grad_norm": 50.11952665400386, "learning_rate": 1.1803229523967888e-06, "loss": -1.2524, "num_tokens": 151818018.0, "residual_var": 0.040169306099414825, "reward": 0.681640625, "reward_std": 0.20572291314601898, "rewards/drgrpo_math_reward/mean": 0.681640625, "rewards/drgrpo_math_reward/std": 0.46629536151885986, "rho2": 0.5312497615814209, "step": 388 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 6.232909775544687e-09, "advantages/std": 0.29884037375450134, "advantages/var": 0.08930556898573005, "completions/clipped_ratio": -0.6796875, "epoch": 2.2351254480286737, "grad_norm": 51.18416506530321, "learning_rate": 1.1767902617603402e-06, "loss": -0.8007, "num_tokens": 152323194.0, "residual_var": 0.033489614725112915, "reward": 0.759765625, "reward_std": 0.22259947657585144, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "rho2": 0.6249997615814209, "step": 389 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 2.493510754122176e-09, "advantages/std": 0.32681119441986084, "advantages/var": 0.10680555679813608, "completions/clipped_ratio": -0.9375, "epoch": 2.2408602150537633, "grad_norm": 60.95991315065249, "learning_rate": 1.173255292138656e-06, "loss": -1.7583, "num_tokens": 152811209.0, "residual_var": 0.0333767831325531, "reward": 0.814453125, "reward_std": 0.25185802578926086, "rewards/drgrpo_math_reward/mean": 0.814453125, "rewards/drgrpo_math_reward/std": 0.38912075757980347, "rho2": 0.6874996423721313, "step": 390 }, { "advantages/mean": 2.9103830456733704e-10, "advantages/snr": 1.0048422439764403e-09, "advantages/std": 0.2896358072757721, "advantages/var": 0.0838889008562882, "completions/clipped_ratio": -1.046875, "epoch": 2.246594982078853, "grad_norm": 50.15512479974175, "learning_rate": 1.1697180891006689e-06, "loss": -0.2451, "num_tokens": 153285299.0, "residual_var": 0.03932293877005577, "reward": 0.8359375, "reward_std": 0.20252102613449097, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.37069445848464966, "rho2": 0.5312498211860657, "step": 391 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 6.904474948712746e-09, "advantages/std": 0.26977360248565674, "advantages/var": 0.07277779659808914, "completions/clipped_ratio": -0.6328125, "epoch": 2.252329749103943, "grad_norm": 46.9687615051108, "learning_rate": 1.1661786982441026e-06, "loss": -0.3056, "num_tokens": 153794090.0, "residual_var": 0.031840309500694275, "reward": 0.7578125, "reward_std": 0.1872817873954773, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "rho2": 0.5624997019767761, "step": 392 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.4408787244552653e-09, "advantages/std": 0.24238400161266327, "advantages/var": 0.05875000423776755, "completions/clipped_ratio": -0.9765625, "epoch": 2.258064516129032, "grad_norm": 45.05804133440101, "learning_rate": 1.1626371651948836e-06, "loss": -0.2836, "num_tokens": 154276866.0, "residual_var": 0.03671877086162567, "reward": 0.771484375, "reward_std": 0.1401205062866211, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "rho2": 0.3749998211860657, "step": 393 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 3.6994025893107055e-09, "advantages/std": 0.3146868050098419, "advantages/var": 0.09902778524730227, "completions/clipped_ratio": -1.1328125, "epoch": 2.263799283154122, "grad_norm": 63.70221720808448, "learning_rate": 1.1590935356065535e-06, "loss": -0.7218, "num_tokens": 154777768.0, "residual_var": 0.05879777669906616, "reward": 0.732421875, "reward_std": 0.20518332719802856, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "rho2": 0.4062497317790985, "step": 394 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.022700611399606e-10, "advantages/std": 0.28939592838287354, "advantages/var": 0.08375000336458527, "completions/clipped_ratio": -0.9453125, "epoch": 2.2695340501792116, "grad_norm": 56.91418145810234, "learning_rate": 1.1555478551596793e-06, "loss": -1.3171, "num_tokens": 155277574.0, "residual_var": 0.04187503084540367, "reward": 0.748046875, "reward_std": 0.19117532670497894, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "rho2": 0.4999997317790985, "step": 395 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 3.602196737564972e-10, "advantages/std": 0.3231786787509918, "advantages/var": 0.10444445839923677, "completions/clipped_ratio": -0.5390625, "epoch": 2.275268817204301, "grad_norm": 67.72799201503261, "learning_rate": 1.1520001695612673e-06, "loss": -0.9105, "num_tokens": 155780927.0, "residual_var": 0.042430609464645386, "reward": 0.6875, "reward_std": 0.2350688874721527, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4639657139778137, "rho2": 0.5937495827674866, "step": 396 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.2674467861652374, "advantages/var": 0.07152778343011423, "completions/clipped_ratio": -1.171875, "epoch": 2.2810035842293908, "grad_norm": 50.85619981951322, "learning_rate": 1.1484505245441695e-06, "loss": -0.9871, "num_tokens": 156286480.0, "residual_var": 0.03129342943429947, "reward": 0.787109375, "reward_std": 0.1870487928390503, "rewards/drgrpo_math_reward/mean": 0.787109375, "rewards/drgrpo_math_reward/std": 0.409751296043396, "rho2": 0.5624997615814209, "step": 397 }, { "advantages/mean": 2.444721758365631e-09, "advantages/snr": 7.69357972595432e-09, "advantages/std": 0.31776127219200134, "advantages/var": 0.10097222610507917, "completions/clipped_ratio": -1.1328125, "epoch": 2.2867383512544803, "grad_norm": 63.92157438698838, "learning_rate": 1.1448989658664984e-06, "loss": -0.8675, "num_tokens": 156785005.0, "residual_var": 0.04102000594139099, "reward": 0.728515625, "reward_std": 0.23034435510635376, "rewards/drgrpo_math_reward/mean": 0.728515625, "rewards/drgrpo_math_reward/std": 0.44516023993492126, "rho2": 0.5937497019767761, "step": 398 }, { "advantages/mean": 1.7462298274040222e-10, "advantages/snr": 8.361858093861743e-10, "advantages/std": 0.2088327407836914, "advantages/var": 0.04361111362322845, "completions/clipped_ratio": -1.40625, "epoch": 2.29247311827957, "grad_norm": 39.12621200875256, "learning_rate": 1.1413455393110348e-06, "loss": 0.1523, "num_tokens": 157254598.0, "residual_var": 0.03134550154209137, "reward": 0.71484375, "reward_std": 0.10807153582572937, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.45193037390708923, "rho2": 0.28124988079071045, "step": 399 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.4654437573246324e-09, "advantages/std": 0.31776127219200134, "advantages/var": 0.10097222610507917, "completions/clipped_ratio": -0.6953125, "epoch": 2.2982078853046595, "grad_norm": 63.796568741362606, "learning_rate": 1.137790290684638e-06, "loss": -0.4449, "num_tokens": 157777800.0, "residual_var": 0.04733077064156532, "reward": 0.697265625, "reward_std": 0.22457481920719147, "rewards/drgrpo_math_reward/mean": 0.697265625, "rewards/drgrpo_math_reward/std": 0.45989060401916504, "rho2": 0.5312497019767761, "step": 400 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 4.4762245286947386e-09, "advantages/std": 0.26007479429244995, "advantages/var": 0.06763889862626016, "completions/clipped_ratio": -1.25, "epoch": 2.303942652329749, "grad_norm": 60.325595355403074, "learning_rate": 1.1342332658176555e-06, "loss": 0.2124, "num_tokens": 158239345.0, "residual_var": 0.04227432608604431, "reward": 0.775390625, "reward_std": 0.15460288524627686, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "rho2": 0.37499985098838806, "step": 401 }, { "advantages/mean": 5.820766091346741e-11, "advantages/snr": 2.0420596942541568e-10, "advantages/std": 0.285043865442276, "advantages/var": 0.08125000522627435, "completions/clipped_ratio": -1.1171875, "epoch": 2.3096774193548386, "grad_norm": 59.20615049322778, "learning_rate": 1.1306745105633319e-06, "loss": -0.6522, "num_tokens": 158735379.0, "residual_var": 0.04062503203749657, "reward": 0.720703125, "reward_std": 0.19433605670928955, "rewards/drgrpo_math_reward/mean": 0.720703125, "rewards/drgrpo_math_reward/std": 0.44909247756004333, "rho2": 0.4999997615814209, "step": 402 }, { "advantages/mean": 7.566995918750763e-10, "advantages/snr": 3.1787737663062165e-09, "advantages/std": 0.23804762959480286, "advantages/var": 0.05666667395570446, "completions/clipped_ratio": -1.4609375, "epoch": 2.315412186379928, "grad_norm": 56.67990997048625, "learning_rate": 1.1271140707972187e-06, "loss": -0.5351, "num_tokens": 159222068.0, "residual_var": 0.037187520414590836, "reward": 0.73828125, "reward_std": 0.1365133672952652, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "rho2": 0.3437498211860657, "step": 403 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.8442933996670506e-09, "advantages/std": 0.2524876296520233, "advantages/var": 0.06375000312729728, "completions/clipped_ratio": -1.859375, "epoch": 2.3211469534050178, "grad_norm": 56.3291823076648, "learning_rate": 1.1235519924165812e-06, "loss": -0.2051, "num_tokens": 159665355.0, "residual_var": 0.03386720269918442, "reward": 0.802734375, "reward_std": 0.16814923286437988, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "rho2": 0.46874985098838806, "step": 404 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.0117210122455504e-09, "advantages/std": 0.3472111225128174, "advantages/var": 0.12055556359661068, "completions/clipped_ratio": -0.96875, "epoch": 2.3268817204301078, "grad_norm": 81.97868852085662, "learning_rate": 1.119988321339809e-06, "loss": -0.7714, "num_tokens": 160159369.0, "residual_var": 0.04897579923272133, "reward": 0.59765625, "reward_std": 0.25142383575439453, "rewards/drgrpo_math_reward/mean": 0.59765625, "rewards/drgrpo_math_reward/std": 0.4908501207828522, "rho2": 0.5937492251396179, "step": 405 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 8.341130546057245e-10, "advantages/std": 0.2791355848312378, "advantages/var": 0.07791667471907715, "completions/clipped_ratio": -1.9609375, "epoch": 2.332616487455197, "grad_norm": 62.16835135312051, "learning_rate": 1.1164231035058227e-06, "loss": -0.6242, "num_tokens": 160605599.0, "residual_var": 0.04139326512813568, "reward": 0.763671875, "reward_std": 0.1764281690120697, "rewards/drgrpo_math_reward/mean": 0.763671875, "rewards/drgrpo_math_reward/std": 0.42524150013923645, "rho2": 0.4687497019767761, "step": 406 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.0047121915583932e-09, "advantages/std": 0.2903541922569275, "advantages/var": 0.08430555696117281, "completions/clipped_ratio": -1.6171875, "epoch": 2.338351254480287, "grad_norm": 68.72179862281877, "learning_rate": 1.1128563848734815e-06, "loss": -0.8852, "num_tokens": 161048730.0, "residual_var": 0.04478736221790314, "reward": 0.716796875, "reward_std": 0.18810126185417175, "rewards/drgrpo_math_reward/mean": 0.716796875, "rewards/drgrpo_math_reward/std": 0.4509948492050171, "rho2": 0.4687497615814209, "step": 407 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.3129377823894333e-09, "advantages/std": 0.25166115164756775, "advantages/var": 0.06333333524858009, "completions/clipped_ratio": -2.140625, "epoch": 2.3440860215053765, "grad_norm": 54.118624188908356, "learning_rate": 1.109288211420992e-06, "loss": -0.41, "num_tokens": 161473479.0, "residual_var": 0.03958335891366005, "reward": 0.82421875, "reward_std": 0.1449122428894043, "rewards/drgrpo_math_reward/mean": 0.82421875, "rewards/drgrpo_math_reward/std": 0.3810062110424042, "rho2": 0.3749997615814209, "step": 408 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 2.0723934533341967e-09, "advantages/std": 0.2808716595172882, "advantages/var": 0.07888888911999548, "completions/clipped_ratio": -2.015625, "epoch": 2.349820788530466, "grad_norm": 61.08432201024244, "learning_rate": 1.1057186291453136e-06, "loss": -0.6587, "num_tokens": 161896583.0, "residual_var": 0.04684031382203102, "reward": 0.78125, "reward_std": 0.1777448207139969, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41380295157432556, "rho2": 0.4062497019767761, "step": 409 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 2.6400511948865318e-09, "advantages/std": 0.17638342082500458, "advantages/var": 0.03111111114193066, "completions/clipped_ratio": -2.34375, "epoch": 2.3555555555555556, "grad_norm": 38.05249828194833, "learning_rate": 1.102147684061568e-06, "loss": -0.44, "num_tokens": 162287754.0, "residual_var": 0.023333348333835602, "reward": 0.90234375, "reward_std": 0.08219823986291885, "rewards/drgrpo_math_reward/mean": 0.90234375, "rewards/drgrpo_math_reward/std": 0.29713961482048035, "rho2": 0.24999991059303284, "step": 410 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 6.05651474289182e-09, "advantages/std": 0.3075440526008606, "advantages/var": 0.09458334429016091, "completions/clipped_ratio": -2.0, "epoch": 2.361290322580645, "grad_norm": 150.31334795311386, "learning_rate": 1.0985754222024436e-06, "loss": -0.5024, "num_tokens": 162714647.0, "residual_var": 0.05615890398621559, "reward": 0.775390625, "reward_std": 0.1935105323791504, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "rho2": 0.40624961256980896, "step": 411 }, { "advantages/mean": 1.7462298274040222e-10, "advantages/snr": 5.941175880761482e-10, "advantages/std": 0.2939198911190033, "advantages/var": 0.08638890239540675, "completions/clipped_ratio": -2.2734375, "epoch": 2.3670250896057348, "grad_norm": 75.21930580895636, "learning_rate": 1.0950018896176042e-06, "loss": -0.1272, "num_tokens": 163146394.0, "residual_var": 0.0539930984377861, "reward": 0.73828125, "reward_std": 0.18256942927837372, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "rho2": 0.37499964237213135, "step": 412 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.6747725376194594e-09, "advantages/std": 0.2611406743526459, "advantages/var": 0.06819445180135464, "completions/clipped_ratio": -1.8515625, "epoch": 2.3727598566308243, "grad_norm": 64.85321341345697, "learning_rate": 1.0914271323730934e-06, "loss": -0.1018, "num_tokens": 163582537.0, "residual_var": 0.03835940361022949, "reward": 0.771484375, "reward_std": 0.16655293107032776, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "rho2": 0.4374997913837433, "step": 413 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.374562480526464e-09, "advantages/std": 0.2941560447216034, "advantages/var": 0.08652777864625794, "completions/clipped_ratio": -2.109375, "epoch": 2.378494623655914, "grad_norm": 74.9359755143715, "learning_rate": 1.0878511965507434e-06, "loss": -0.8417, "num_tokens": 163989420.0, "residual_var": 0.0486719086766243, "reward": 0.771484375, "reward_std": 0.1949605643749237, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "rho2": 0.4374997019767761, "step": 414 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.8413982138462765e-09, "advantages/std": 0.18966345489025116, "advantages/var": 0.03597222612090634, "completions/clipped_ratio": -2.2734375, "epoch": 2.3842293906810035, "grad_norm": 48.86688080555522, "learning_rate": 1.0842741282475768e-06, "loss": 0.0917, "num_tokens": 164393571.0, "residual_var": 0.024730918928980827, "reward": 0.775390625, "reward_std": 0.09738312661647797, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "rho2": 0.3124997317790985, "step": 415 }, { "advantages/mean": -1.7462298274040222e-10, "advantages/snr": 6.60012755435742e-10, "advantages/std": 0.26457515358924866, "advantages/var": 0.07000001189677452, "completions/clipped_ratio": -1.7890625, "epoch": 2.389964157706093, "grad_norm": 65.01775100907336, "learning_rate": 1.0806959735752173e-06, "loss": -0.0412, "num_tokens": 164841200.0, "residual_var": 0.03718752786517143, "reward": 0.70703125, "reward_std": 0.16961485147476196, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.455569326877594, "rho2": 0.4687497615814209, "step": 416 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 8.210452128730375e-10, "advantages/std": 0.2835783362388611, "advantages/var": 0.08041667278400055, "completions/clipped_ratio": -2.1796875, "epoch": 2.3956989247311826, "grad_norm": 78.59303931195502, "learning_rate": 1.0771167786592916e-06, "loss": 0.1923, "num_tokens": 165233699.0, "residual_var": 0.04523440822958946, "reward": 0.779296875, "reward_std": 0.18326979875564575, "rewards/drgrpo_math_reward/mean": 0.779296875, "rewards/drgrpo_math_reward/std": 0.4151262938976288, "rho2": 0.4374997913837433, "step": 417 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 5.397018992473179e-10, "advantages/std": 0.21570299565792084, "advantages/var": 0.046527782335801016, "completions/clipped_ratio": -2.59375, "epoch": 2.4014336917562726, "grad_norm": 50.96431045489158, "learning_rate": 1.0735365896388359e-06, "loss": -0.2047, "num_tokens": 165609257.0, "residual_var": 0.03198786452412605, "reward": 0.853515625, "reward_std": 0.11355571448802948, "rewards/drgrpo_math_reward/mean": 0.853515625, "rewards/drgrpo_math_reward/std": 0.35393697023391724, "rho2": 0.31249991059303284, "step": 418 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 1.004270855688633e-09, "advantages/std": 0.23184047639369965, "advantages/var": 0.0537500064944576, "completions/clipped_ratio": -2.0703125, "epoch": 2.4071684587813618, "grad_norm": 62.01746119768433, "learning_rate": 1.0699554526657028e-06, "loss": -0.3115, "num_tokens": 166024198.0, "residual_var": 0.036953143775463104, "reward": 0.650390625, "reward_std": 0.12301129102706909, "rewards/drgrpo_math_reward/mean": 0.650390625, "rewards/drgrpo_math_reward/std": 0.47731292247772217, "rho2": 0.31249985098838806, "step": 419 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 5.080780781930131e-10, "advantages/std": 0.22912879288196564, "advantages/var": 0.05250000372754671, "completions/clipped_ratio": -2.25, "epoch": 2.412903225806452, "grad_norm": 85.05747641576848, "learning_rate": 1.0663734139039632e-06, "loss": -0.3774, "num_tokens": 166420163.0, "residual_var": 0.03281252458691597, "reward": 0.703125, "reward_std": 0.13005205988883972, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45732781291007996, "rho2": 0.3749997913837433, "step": 420 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 8.970889075123879e-10, "advantages/std": 0.2595402002334595, "advantages/var": 0.06736111553722424, "completions/clipped_ratio": -2.1171875, "epoch": 2.4186379928315414, "grad_norm": 62.437938117336955, "learning_rate": 1.0627905195293135e-06, "loss": 0.2502, "num_tokens": 166840012.0, "residual_var": 0.04841582849621773, "reward": 0.697265625, "reward_std": 0.14230301976203918, "rewards/drgrpo_math_reward/mean": 0.697265625, "rewards/drgrpo_math_reward/std": 0.45989060401916504, "rho2": 0.2812497615814209, "step": 421 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.902452130417222e-10, "advantages/std": 0.23746344447135925, "advantages/var": 0.05638888746020232, "completions/clipped_ratio": -2.3671875, "epoch": 2.424372759856631, "grad_norm": 53.34371736228326, "learning_rate": 1.0592068157284795e-06, "loss": -0.4756, "num_tokens": 167230284.0, "residual_var": 0.0334809273481369, "reward": 0.74609375, "reward_std": 0.1407259702682495, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "rho2": 0.4062497615814209, "step": 422 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 6.214494139362271e-09, "advantages/std": 0.24352732300758362, "advantages/var": 0.059305557051239965, "completions/clipped_ratio": -2.375, "epoch": 2.4301075268817205, "grad_norm": 58.68633314311732, "learning_rate": 1.0556223486986218e-06, "loss": -0.3224, "num_tokens": 167603388.0, "residual_var": 0.044479191303253174, "reward": 0.787109375, "reward_std": 0.12275198847055435, "rewards/drgrpo_math_reward/mean": 0.787109375, "rewards/drgrpo_math_reward/std": 0.409751296043396, "rho2": 0.24999983608722687, "step": 423 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 4.3957267766967165e-09, "advantages/std": 0.26483747363090515, "advantages/var": 0.07013888743920038, "completions/clipped_ratio": -2.2421875, "epoch": 2.43584229390681, "grad_norm": 64.7802841244938, "learning_rate": 1.0520371646467393e-06, "loss": -0.2237, "num_tokens": 168024056.0, "residual_var": 0.04164499044418335, "reward": 0.806640625, "reward_std": 0.15910032391548157, "rewards/drgrpo_math_reward/mean": 0.806640625, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "rho2": 0.4062498211860657, "step": 424 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 3.956153653838679e-09, "advantages/std": 0.26483750343322754, "advantages/var": 0.07013890322474481, "completions/clipped_ratio": -2.1796875, "epoch": 2.4415770609318996, "grad_norm": 63.77402078972581, "learning_rate": 1.0484513097890737e-06, "loss": -0.6672, "num_tokens": 168471109.0, "residual_var": 0.04164499044418335, "reward": 0.775390625, "reward_std": 0.15998506546020508, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "rho2": 0.4062498211860657, "step": 425 }, { "advantages/mean": -1.7462298274040222e-10, "advantages/snr": 7.162188264488527e-10, "advantages/std": 0.24381232261657715, "advantages/var": 0.0594444486596899, "completions/clipped_ratio": -1.921875, "epoch": 2.447311827956989, "grad_norm": 58.383952615902345, "learning_rate": 1.044864830350515e-06, "loss": -0.5591, "num_tokens": 168875422.0, "residual_var": 0.03715279698371887, "reward": 0.70703125, "reward_std": 0.14613619446754456, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.455569326877594, "rho2": 0.37499988079071045, "step": 426 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.981238703440824e-09, "advantages/std": 0.23392783105373383, "advantages/var": 0.054722230141504236, "completions/clipped_ratio": -2.1015625, "epoch": 2.4530465949820788, "grad_norm": 64.63341645630022, "learning_rate": 1.041277772564003e-06, "loss": -0.0671, "num_tokens": 169268917.0, "residual_var": 0.03420140594244003, "reward": 0.82421875, "reward_std": 0.13528938591480255, "rewards/drgrpo_math_reward/mean": 0.82421875, "rewards/drgrpo_math_reward/std": 0.3810062110424042, "rho2": 0.37499985098838806, "step": 427 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 8.739686980751845e-10, "advantages/std": 0.2664061486721039, "advantages/var": 0.07097223605030312, "completions/clipped_ratio": -2.2109375, "epoch": 2.4587813620071683, "grad_norm": 65.16919567088678, "learning_rate": 1.0376901826699347e-06, "loss": -0.452, "num_tokens": 169651575.0, "residual_var": 0.04213979095220566, "reward": 0.779296875, "reward_std": 0.15927115082740784, "rewards/drgrpo_math_reward/mean": 0.779296875, "rewards/drgrpo_math_reward/std": 0.4151262938976288, "rho2": 0.40624967217445374, "step": 428 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.056477028513483e-10, "advantages/std": 0.28698626160621643, "advantages/var": 0.0823611143507117, "completions/clipped_ratio": -2.359375, "epoch": 2.464516129032258, "grad_norm": 68.33334812605142, "learning_rate": 1.0341021069155647e-06, "loss": -0.3977, "num_tokens": 170094319.0, "residual_var": 0.043754372745752335, "reward": 0.744140625, "reward_std": 0.18585413694381714, "rewards/drgrpo_math_reward/mean": 0.744140625, "rewards/drgrpo_math_reward/std": 0.43676990270614624, "rho2": 0.4687497615814209, "step": 429 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.6387916886932293e-09, "advantages/std": 0.21311186254024506, "advantages/var": 0.045416665955372304, "completions/clipped_ratio": -2.3828125, "epoch": 2.4702508960573475, "grad_norm": 50.96320293206426, "learning_rate": 1.0305135915544123e-06, "loss": -0.0329, "num_tokens": 170493318.0, "residual_var": 0.031223976984620094, "reward": 0.748046875, "reward_std": 0.11256247013807297, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "rho2": 0.3124998211860657, "step": 430 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.9720535657030054e-09, "advantages/std": 0.31335994601249695, "advantages/var": 0.098194455764955, "completions/clipped_ratio": -1.8046875, "epoch": 2.4759856630824375, "grad_norm": 78.54579216958615, "learning_rate": 1.026924682845663e-06, "loss": -0.6166, "num_tokens": 170957994.0, "residual_var": 0.04909728467464447, "reward": 0.626953125, "reward_std": 0.20862311124801636, "rewards/drgrpo_math_reward/mean": 0.626953125, "rewards/drgrpo_math_reward/std": 0.48408737778663635, "rho2": 0.4999995529651642, "step": 431 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 2.0681874901761335e-09, "advantages/std": 0.22515428066253662, "advantages/var": 0.05069445010066431, "completions/clipped_ratio": -2.359375, "epoch": 2.481720430107527, "grad_norm": 53.94454187091519, "learning_rate": 1.0233354270535726e-06, "loss": -0.2503, "num_tokens": 171369491.0, "residual_var": 0.03643665835261345, "reward": 0.751953125, "reward_std": 0.11245040595531464, "rewards/drgrpo_math_reward/mean": 0.751953125, "rewards/drgrpo_math_reward/std": 0.4323015511035919, "rho2": 0.2812497615814209, "step": 432 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.9199740837378565e-09, "advantages/std": 0.23921167850494385, "advantages/var": 0.057222227133152614, "completions/clipped_ratio": -2.1953125, "epoch": 2.4874551971326166, "grad_norm": 56.63491301816543, "learning_rate": 1.0197458704468718e-06, "loss": -0.3353, "num_tokens": 171783573.0, "residual_var": 0.0411284938454628, "reward": 0.640625, "reward_std": 0.12180182337760925, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.48028653860092163, "rho2": 0.28124988079071045, "step": 433 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.4599870418689282e-09, "advantages/std": 0.23921167850494385, "advantages/var": 0.057222227133152614, "completions/clipped_ratio": -2.1640625, "epoch": 2.493189964157706, "grad_norm": 56.71968392033658, "learning_rate": 1.0161560592981686e-06, "loss": 0.1748, "num_tokens": 172204776.0, "residual_var": 0.03576390817761421, "reward": 0.69921875, "reward_std": 0.13956832885742188, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "rho2": 0.37499988079071045, "step": 434 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 2.5270548234336427e-09, "advantages/std": 0.2303379327058792, "advantages/var": 0.05305556324321814, "completions/clipped_ratio": -1.921875, "epoch": 2.498924731182796, "grad_norm": 53.83539551120417, "learning_rate": 1.0125660398833527e-06, "loss": -0.1056, "num_tokens": 172625112.0, "residual_var": 0.03315974771976471, "reward": 0.69921875, "reward_std": 0.13153532147407532, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "rho2": 0.3749997913837433, "step": 435 }, { "advantages/mean": -1.2223608791828156e-09, "advantages/snr": 4.509597990192393e-09, "advantages/std": 0.2710576057434082, "advantages/var": 0.07347222563134892, "completions/clipped_ratio": -2.2734375, "epoch": 2.5046594982078854, "grad_norm": 64.031599874025, "learning_rate": 1.0089758584809977e-06, "loss": -0.8288, "num_tokens": 173049390.0, "residual_var": 0.041328150779008865, "reward": 0.697265625, "reward_std": 0.16917571425437927, "rewards/drgrpo_math_reward/mean": 0.697265625, "rewards/drgrpo_math_reward/std": 0.45989060401916504, "rho2": 0.4374998211860657, "step": 436 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 8.739686980751845e-10, "advantages/std": 0.2664061486721039, "advantages/var": 0.07097223605030312, "completions/clipped_ratio": -2.1015625, "epoch": 2.510394265232975, "grad_norm": 64.82953085567293, "learning_rate": 1.005385561371767e-06, "loss": -0.4404, "num_tokens": 173464005.0, "residual_var": 0.037704020738601685, "reward": 0.716796875, "reward_std": 0.1750306487083435, "rewards/drgrpo_math_reward/mean": 0.716796875, "rewards/drgrpo_math_reward/std": 0.4509948492050171, "rho2": 0.4687497615814209, "step": 437 }, { "advantages/mean": -3.026798367500305e-09, "advantages/snr": 1.0168089382092045e-08, "advantages/std": 0.2976762056350708, "advantages/var": 0.08861112340129296, "completions/clipped_ratio": -1.8515625, "epoch": 2.5161290322580645, "grad_norm": 67.03839499847, "learning_rate": 1.0017951948378134e-06, "loss": -0.9453, "num_tokens": 173904182.0, "residual_var": 0.03599829226732254, "reward": 0.73046875, "reward_std": 0.2161572277545929, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44415023922920227, "rho2": 0.5937497615814209, "step": 438 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.5680122946079323e-09, "advantages/std": 0.2969755232334137, "advantages/var": 0.08819446139975984, "completions/clipped_ratio": -2.171875, "epoch": 2.521863799283154, "grad_norm": 72.50799125594739, "learning_rate": 9.982048051621867e-07, "loss": -0.6906, "num_tokens": 174344950.0, "residual_var": 0.046853337436914444, "reward": 0.677734375, "reward_std": 0.19543945789337158, "rewards/drgrpo_math_reward/mean": 0.677734375, "rewards/drgrpo_math_reward/std": 0.46780112385749817, "rho2": 0.46874964237213135, "step": 439 }, { "advantages/mean": -2.2118911147117615e-09, "advantages/snr": 7.746595766758797e-09, "advantages/std": 0.2855307161808014, "advantages/var": 0.08152778988272136, "completions/clipped_ratio": -2.1640625, "epoch": 2.5275985663082436, "grad_norm": 69.72516324773257, "learning_rate": 9.946144386282334e-07, "loss": -0.7951, "num_tokens": 174756443.0, "residual_var": 0.053502634167671204, "reward": 0.798828125, "reward_std": 0.16578121483325958, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "rho2": 0.3437498211860657, "step": 440 }, { "advantages/mean": -2.444721758365631e-09, "advantages/snr": 8.951761400749556e-09, "advantages/std": 0.27309951186180115, "advantages/var": 0.07458334337915407, "completions/clipped_ratio": -1.9609375, "epoch": 2.533333333333333, "grad_norm": 70.10475979826845, "learning_rate": 9.91024141519002e-07, "loss": -0.5444, "num_tokens": 175182330.0, "residual_var": 0.04428388178348541, "reward": 0.740234375, "reward_std": 0.16356495022773743, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "rho2": 0.4062497317790985, "step": 441 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 3.2488173157799826e-09, "advantages/std": 0.2508319616317749, "advantages/var": 0.0629166729760442, "completions/clipped_ratio": -1.9296875, "epoch": 2.539068100358423, "grad_norm": 57.6120829633273, "learning_rate": 9.874339601166472e-07, "loss": -1.1459, "num_tokens": 175643293.0, "residual_var": 0.03735679015517235, "reward": 0.791015625, "reward_std": 0.15084770321846008, "rewards/drgrpo_math_reward/mean": 0.791015625, "rewards/drgrpo_math_reward/std": 0.40698084235191345, "rho2": 0.40624985098838806, "step": 442 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.137508448374966e-10, "advantages/std": 0.2813657522201538, "advantages/var": 0.07916668652241299, "completions/clipped_ratio": -1.9921875, "epoch": 2.5448028673835124, "grad_norm": 73.7557478840485, "learning_rate": 9.838439407018315e-07, "loss": -1.6443, "num_tokens": 176065220.0, "residual_var": 0.047005247324705124, "reward": 0.68359375, "reward_std": 0.17027214169502258, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.46552830934524536, "rho2": 0.4062497019767761, "step": 443 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 5.280102003572122e-09, "advantages/std": 0.22047929465770721, "advantages/var": 0.04861111937276008, "completions/clipped_ratio": -2.140625, "epoch": 2.5505376344086024, "grad_norm": 54.110897844044956, "learning_rate": 9.80254129553128e-07, "loss": 0.039, "num_tokens": 176461000.0, "residual_var": 0.03342015668749809, "reward": 0.67578125, "reward_std": 0.11470641195774078, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.4685399830341339, "rho2": 0.31249988079071045, "step": 444 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.204813847269187e-09, "advantages/std": 0.28987544775009155, "advantages/var": 0.08402777520831606, "completions/clipped_ratio": -1.71875, "epoch": 2.5562724014336915, "grad_norm": 74.73692586393744, "learning_rate": 9.766645729464275e-07, "loss": 0.1518, "num_tokens": 176921349.0, "residual_var": 0.04201391711831093, "reward": 0.650390625, "reward_std": 0.19145467877388, "rewards/drgrpo_math_reward/mean": 0.650390625, "rewards/drgrpo_math_reward/std": 0.47731292247772217, "rho2": 0.4999997913837433, "step": 445 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.5182214095261368e-09, "advantages/std": 0.23003624379634857, "advantages/var": 0.05291667345993312, "completions/clipped_ratio": -2.390625, "epoch": 2.5620071684587815, "grad_norm": 59.9755775608547, "learning_rate": 9.730753171543374e-07, "loss": -0.159, "num_tokens": 177347932.0, "residual_var": 0.038033876568078995, "reward": 0.814453125, "reward_std": 0.12459483742713928, "rewards/drgrpo_math_reward/mean": 0.814453125, "rewards/drgrpo_math_reward/std": 0.38912075757980347, "rho2": 0.2812498211860657, "step": 446 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.7816318963127715e-09, "advantages/std": 0.25110867619514465, "advantages/var": 0.06305556726047801, "completions/clipped_ratio": -2.3203125, "epoch": 2.567741935483871, "grad_norm": 61.58079123725055, "learning_rate": 9.694864084455876e-07, "loss": -1.1138, "num_tokens": 177721387.0, "residual_var": 0.043350715190172195, "reward": 0.7890625, "reward_std": 0.13947676122188568, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4083731174468994, "rho2": 0.31249988079071045, "step": 447 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 5.077610072704531e-09, "advantages/std": 0.27512624859809875, "advantages/var": 0.07569445266766284, "completions/clipped_ratio": -1.9921875, "epoch": 2.5734767025089607, "grad_norm": 75.11846854361673, "learning_rate": 9.658978930844352e-07, "loss": -0.4102, "num_tokens": 178146547.0, "residual_var": 0.04494359344244003, "reward": 0.720703125, "reward_std": 0.17045296728610992, "rewards/drgrpo_math_reward/mean": 0.720703125, "rewards/drgrpo_math_reward/std": 0.44909247756004333, "rho2": 0.40624985098838806, "step": 448 }, { "advantages/mean": 2.153683453798294e-09, "advantages/snr": 7.820810913194381e-09, "advantages/std": 0.2753785252571106, "advantages/var": 0.0758333321727811, "completions/clipped_ratio": -2.40625, "epoch": 2.5792114695340502, "grad_norm": 72.25720058721578, "learning_rate": 9.623098173300653e-07, "loss": -0.5389, "num_tokens": 178553127.0, "residual_var": 0.049765653908252716, "reward": 0.703125, "reward_std": 0.15868166089057922, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45732781291007996, "rho2": 0.3437497913837433, "step": 449 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 2.172906997144492e-09, "advantages/std": 0.16072751581668854, "advantages/var": 0.025833334340603864, "completions/clipped_ratio": -2.5, "epoch": 2.58494623655914, "grad_norm": 33.02613204598104, "learning_rate": 9.58722227435997e-07, "loss": -0.1657, "num_tokens": 178910232.0, "residual_var": 0.020989596843719482, "reward": 0.8203125, "reward_std": 0.06464291363954544, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38430243730545044, "rho2": 0.18749986588954926, "step": 450 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 4.0260402643293343e-10, "advantages/std": 0.28915587067604065, "advantages/var": 0.08361111754641914, "completions/clipped_ratio": -2.296875, "epoch": 2.5906810035842294, "grad_norm": 72.5434422347627, "learning_rate": 9.551351696494853e-07, "loss": -0.6403, "num_tokens": 179295692.0, "residual_var": 0.054869815707206726, "reward": 0.73828125, "reward_std": 0.17874369025230408, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "rho2": 0.34374988079071045, "step": 451 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 5.193753065894795e-09, "advantages/std": 0.3138028383255005, "advantages/var": 0.0984722213411402, "completions/clipped_ratio": -2.0078125, "epoch": 2.596415770609319, "grad_norm": 77.04176272646433, "learning_rate": 9.515486902109261e-07, "loss": -1.1445, "num_tokens": 179699328.0, "residual_var": 0.04923616722226143, "reward": 0.697265625, "reward_std": 0.21540358662605286, "rewards/drgrpo_math_reward/mean": 0.697265625, "rewards/drgrpo_math_reward/std": 0.45989060401916504, "rho2": 0.4999995827674866, "step": 452 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.6829815340737563e-09, "advantages/std": 0.26034167408943176, "advantages/var": 0.0677777872676879, "completions/clipped_ratio": -2.3125, "epoch": 2.6021505376344085, "grad_norm": 67.48426153884103, "learning_rate": 9.479628353532608e-07, "loss": -0.1501, "num_tokens": 180093179.0, "residual_var": 0.04447919875383377, "reward": 0.69921875, "reward_std": 0.14408627152442932, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "rho2": 0.3437497019767761, "step": 453 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.5995053982454237e-09, "advantages/std": 0.258736252784729, "advantages/var": 0.06694444850508319, "completions/clipped_ratio": -2.3203125, "epoch": 2.607885304659498, "grad_norm": 65.78416831284387, "learning_rate": 9.443776513013783e-07, "loss": -0.4929, "num_tokens": 180488239.0, "residual_var": 0.039748284965753555, "reward": 0.75390625, "reward_std": 0.15575343370437622, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "rho2": 0.40624988079071045, "step": 454 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 1.937267816735752e-09, "advantages/std": 0.30046263337135315, "advantages/var": 0.09027779405244818, "completions/clipped_ratio": -2.09375, "epoch": 2.6136200716845877, "grad_norm": 72.88669586938197, "learning_rate": 9.407931842715202e-07, "loss": -0.0035, "num_tokens": 180925787.0, "residual_var": 0.045138921588659286, "reward": 0.74609375, "reward_std": 0.20015192031860352, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "rho2": 0.4999997615814209, "step": 455 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.7066873778236524e-09, "advantages/std": 0.2728450894355774, "advantages/var": 0.07444444282910823, "completions/clipped_ratio": -1.9765625, "epoch": 2.6193548387096772, "grad_norm": 69.25331379052419, "learning_rate": 9.372094804706866e-07, "loss": -1.2174, "num_tokens": 181365120.0, "residual_var": 0.04885420203208923, "reward": 0.78125, "reward_std": 0.1556052267551422, "rewards/drgrpo_math_reward/mean": 0.78125, "rewards/drgrpo_math_reward/std": 0.41380295157432556, "rho2": 0.3437497019767761, "step": 456 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.6024501501900382e-09, "advantages/std": 0.29059329628944397, "advantages/var": 0.08444446384836457, "completions/clipped_ratio": -1.5546875, "epoch": 2.6250896057347672, "grad_norm": 77.58166588538124, "learning_rate": 9.336265860960369e-07, "loss": -1.6388, "num_tokens": 181842000.0, "residual_var": 0.044861141592264175, "reward": 0.7421875, "reward_std": 0.1930510401725769, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "rho2": 0.4687497615814209, "step": 457 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.2725904583930969, "advantages/var": 0.0743055580069587, "completions/clipped_ratio": -1.609375, "epoch": 2.6308243727598564, "grad_norm": 67.64046611971021, "learning_rate": 9.300445473342972e-07, "loss": -0.8913, "num_tokens": 182308863.0, "residual_var": 0.030186664313077927, "reward": 0.712890625, "reward_std": 0.1950400471687317, "rewards/drgrpo_math_reward/mean": 0.712890625, "rewards/drgrpo_math_reward/std": 0.45285552740097046, "rho2": 0.5937496423721313, "step": 458 }, { "advantages/mean": -2.444721758365631e-09, "advantages/snr": 7.656805542199846e-09, "advantages/std": 0.31928741931915283, "advantages/var": 0.10194445613548453, "completions/clipped_ratio": -1.3671875, "epoch": 2.6365591397849464, "grad_norm": 78.19841224403235, "learning_rate": 9.264634103611637e-07, "loss": -1.9761, "num_tokens": 182771419.0, "residual_var": 0.04778648912906647, "reward": 0.75390625, "reward_std": 0.22828257083892822, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "rho2": 0.5312498211860657, "step": 459 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.984611592205463e-09, "advantages/std": 0.35059472918510437, "advantages/var": 0.12291666413237667, "completions/clipped_ratio": -1.421875, "epoch": 2.642293906810036, "grad_norm": 87.48350103945812, "learning_rate": 9.228832213407084e-07, "loss": -0.9918, "num_tokens": 183254781.0, "residual_var": 0.04993493854999542, "reward": 0.658203125, "reward_std": 0.2730935513973236, "rewards/drgrpo_math_reward/mean": 0.658203125, "rewards/drgrpo_math_reward/std": 0.4747757613658905, "rho2": 0.5937497615814209, "step": 460 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 4.2274649009740366e-10, "advantages/std": 0.275378555059433, "advantages/var": 0.07583334858662116, "completions/clipped_ratio": -1.3828125, "epoch": 2.6480286738351255, "grad_norm": 64.0019516060153, "learning_rate": 9.193040264247828e-07, "loss": -0.6081, "num_tokens": 183735335.0, "residual_var": 0.03791669011116028, "reward": 0.67578125, "reward_std": 0.18806010484695435, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.4685399830341339, "rho2": 0.4999998211860657, "step": 461 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.2417595067861428e-09, "advantages/std": 0.3115819990634918, "advantages/var": 0.09708334214040182, "completions/clipped_ratio": -1.671875, "epoch": 2.653763440860215, "grad_norm": 74.2774157312089, "learning_rate": 9.157258717524234e-07, "loss": -0.5196, "num_tokens": 184198643.0, "residual_var": 0.039440155029296875, "reward": 0.787109375, "reward_std": 0.2234230935573578, "rewards/drgrpo_math_reward/mean": 0.787109375, "rewards/drgrpo_math_reward/std": 0.409751296043396, "rho2": 0.5937495827674866, "step": 462 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 4.34125870353226e-09, "advantages/std": 0.3486083745956421, "advantages/var": 0.12152779883821552, "completions/clipped_ratio": -1.5546875, "epoch": 2.6594982078853047, "grad_norm": 80.6253274161068, "learning_rate": 9.121488034492568e-07, "loss": -1.8405, "num_tokens": 184664268.0, "residual_var": 0.05316847190260887, "reward": 0.751953125, "reward_std": 0.2536480128765106, "rewards/drgrpo_math_reward/mean": 0.751953125, "rewards/drgrpo_math_reward/std": 0.4323015511035919, "rho2": 0.5624995231628418, "step": 463 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 4.9699852876689385e-09, "advantages/std": 0.2576604187488556, "advantages/var": 0.06638889138983561, "completions/clipped_ratio": -1.8515625, "epoch": 2.6652329749103942, "grad_norm": 63.19212422109767, "learning_rate": 9.085728676269066e-07, "loss": -0.4268, "num_tokens": 185101414.0, "residual_var": 0.035269126296043396, "reward": 0.828125, "reward_std": 0.16466236114501953, "rewards/drgrpo_math_reward/mean": 0.828125, "rewards/drgrpo_math_reward/std": 0.3776407241821289, "rho2": 0.4687497615814209, "step": 464 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 7.320198485139096e-09, "advantages/std": 0.28625941276550293, "advantages/var": 0.08194445139685058, "completions/clipped_ratio": -1.578125, "epoch": 2.670967741935484, "grad_norm": 65.10103158278667, "learning_rate": 9.049981103823959e-07, "loss": -0.9931, "num_tokens": 185546122.0, "residual_var": 0.03585072234272957, "reward": 0.671875, "reward_std": 0.19949322938919067, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4699897766113281, "rho2": 0.5624997615814209, "step": 465 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.2078120792988004e-09, "advantages/std": 0.28915587067604065, "advantages/var": 0.08361111754641914, "completions/clipped_ratio": -1.2265625, "epoch": 2.6767025089605734, "grad_norm": 86.52446006130594, "learning_rate": 9.014245777975564e-07, "loss": -1.3851, "num_tokens": 186019320.0, "residual_var": 0.04441843926906586, "reward": 0.671875, "reward_std": 0.19064772129058838, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4699897766113281, "rho2": 0.4687497019767761, "step": 466 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 6.281949785823441e-09, "advantages/std": 0.29650747776031494, "advantages/var": 0.08791668436778366, "completions/clipped_ratio": -1.3671875, "epoch": 2.682437275985663, "grad_norm": 69.17954258866713, "learning_rate": 8.978523159384322e-07, "loss": -0.5942, "num_tokens": 186470417.0, "residual_var": 0.043958354741334915, "reward": 0.650390625, "reward_std": 0.1992083340883255, "rewards/drgrpo_math_reward/mean": 0.650390625, "rewards/drgrpo_math_reward/std": 0.47731292247772217, "rho2": 0.49999985098838806, "step": 467 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 9.333989992413056e-10, "advantages/std": 0.24944384396076202, "advantages/var": 0.06222223128992099, "completions/clipped_ratio": -1.7265625, "epoch": 2.688172043010753, "grad_norm": 55.31791151236499, "learning_rate": 8.942813708546866e-07, "loss": -0.5399, "num_tokens": 186899052.0, "residual_var": 0.03888891637325287, "reward": 0.765625, "reward_std": 0.14408722519874573, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42402184009552, "rho2": 0.3749998211860657, "step": 468 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 6.362880467429595e-09, "advantages/std": 0.29273614287376404, "advantages/var": 0.08569444934460879, "completions/clipped_ratio": -2.265625, "epoch": 2.693906810035842, "grad_norm": 62.79701082519547, "learning_rate": 8.907117885790083e-07, "loss": -0.1179, "num_tokens": 187288519.0, "residual_var": 0.05088111758232117, "reward": 0.787109375, "reward_std": 0.18091976642608643, "rewards/drgrpo_math_reward/mean": 0.787109375, "rewards/drgrpo_math_reward/std": 0.409751296043396, "rho2": 0.40624964237213135, "step": 469 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 6.653689342975278e-09, "advantages/std": 0.24494898319244385, "advantages/var": 0.06000000436701214, "completions/clipped_ratio": -2.1640625, "epoch": 2.699641577060932, "grad_norm": 56.48204743838804, "learning_rate": 8.871436151265182e-07, "loss": -0.8356, "num_tokens": 187717581.0, "residual_var": 0.0337500236928463, "reward": 0.77734375, "reward_std": 0.15162497758865356, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41643625497817993, "rho2": 0.4374997615814209, "step": 470 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.0206664023149988e-09, "advantages/std": 0.3421744406223297, "advantages/var": 0.11708334781520424, "completions/clipped_ratio": -1.7265625, "epoch": 2.7053763440860212, "grad_norm": 88.61645506282255, "learning_rate": 8.835768964941772e-07, "loss": -1.3452, "num_tokens": 188146276.0, "residual_var": 0.04390630125999451, "reward": 0.720703125, "reward_std": 0.2601764500141144, "rewards/drgrpo_math_reward/mean": 0.720703125, "rewards/drgrpo_math_reward/std": 0.44909247756004333, "rho2": 0.6249996423721313, "step": 471 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.5303190561455407e-09, "advantages/std": 0.2282177358865738, "advantages/var": 0.05208333497319395, "completions/clipped_ratio": -2.1875, "epoch": 2.7111111111111112, "grad_norm": 56.52678918892596, "learning_rate": 8.800116786601908e-07, "loss": -0.2648, "num_tokens": 188554427.0, "residual_var": 0.03580730780959129, "reward": 0.720703125, "reward_std": 0.12965798377990723, "rewards/drgrpo_math_reward/mean": 0.720703125, "rewards/drgrpo_math_reward/std": 0.44909247756004333, "rho2": 0.31249988079071045, "step": 472 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.2848184659172622e-09, "advantages/std": 0.2718251347541809, "advantages/var": 0.07388890388412861, "completions/clipped_ratio": -1.984375, "epoch": 2.716845878136201, "grad_norm": 75.9117013158768, "learning_rate": 8.764480075834186e-07, "loss": -0.3814, "num_tokens": 188968272.0, "residual_var": 0.03694448247551918, "reward": 0.63671875, "reward_std": 0.18350042402744293, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.4814152419567108, "rho2": 0.49999967217445374, "step": 473 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 1.9124207806914912e-09, "advantages/std": 0.3043663799762726, "advantages/var": 0.09263889325986074, "completions/clipped_ratio": -1.90625, "epoch": 2.7225806451612904, "grad_norm": 71.7509423710507, "learning_rate": 8.728859292027814e-07, "loss": -0.8233, "num_tokens": 189387301.0, "residual_var": 0.04342452064156532, "reward": 0.783203125, "reward_std": 0.20802319049835205, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "rho2": 0.5312496423721313, "step": 474 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.2835783362388611, "advantages/var": 0.08041667278400055, "completions/clipped_ratio": -1.09375, "epoch": 2.72831541218638, "grad_norm": 69.52883901811138, "learning_rate": 8.693254894366682e-07, "loss": -0.8353, "num_tokens": 189868994.0, "residual_var": 0.05026044696569443, "reward": 0.638671875, "reward_std": 0.17083090543746948, "rewards/drgrpo_math_reward/mean": 0.638671875, "rewards/drgrpo_math_reward/std": 0.48085519671440125, "rho2": 0.3749997317790985, "step": 475 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.4184220323620412e-09, "advantages/std": 0.3282952904701233, "advantages/var": 0.10777779774486262, "completions/clipped_ratio": -1.234375, "epoch": 2.7340501792114695, "grad_norm": 80.32586328278164, "learning_rate": 8.657667341823448e-07, "loss": -0.4026, "num_tokens": 190349924.0, "residual_var": 0.03031253255903721, "reward": 0.71875, "reward_std": 0.26384237408638, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45004892349243164, "rho2": 0.7187497615814209, "step": 476 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 4.658751773432817e-09, "advantages/std": 0.2748737335205078, "advantages/var": 0.07555556937950314, "completions/clipped_ratio": -1.078125, "epoch": 2.739784946236559, "grad_norm": 65.83262457928704, "learning_rate": 8.62209709315362e-07, "loss": -1.1978, "num_tokens": 190834338.0, "residual_var": 0.044861145317554474, "reward": 0.77734375, "reward_std": 0.1788024753332138, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41643625497817993, "rho2": 0.4062497019767761, "step": 477 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.3798421122351524e-09, "advantages/std": 0.3374743163585663, "advantages/var": 0.11388891420168168, "completions/clipped_ratio": -0.6171875, "epoch": 2.7455197132616487, "grad_norm": 75.95196195484908, "learning_rate": 8.58654460688965e-07, "loss": -0.4815, "num_tokens": 191349946.0, "residual_var": 0.032031282782554626, "reward": 0.7734375, "reward_std": 0.2670985758304596, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4190165400505066, "rho2": 0.7187497615814209, "step": 478 }, { "advantages/mean": -2.9103830456733704e-10, "advantages/snr": 9.966259420485386e-10, "advantages/std": 0.29202359914779663, "advantages/var": 0.08527778245923301, "completions/clipped_ratio": 0.1328125, "epoch": 2.7512544802867382, "grad_norm": 65.60735432218996, "learning_rate": 8.551010341335015e-07, "loss": 0.0763, "num_tokens": 191896919.0, "residual_var": 0.026649337261915207, "reward": 0.6640625, "reward_std": 0.22573786973953247, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4727790653705597, "rho2": 0.6874997019767761, "step": 479 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 8.045401222799212e-10, "advantages/std": 0.28939592838287354, "advantages/var": 0.08375000336458527, "completions/clipped_ratio": -0.25, "epoch": 2.756989247311828, "grad_norm": 71.41121800874254, "learning_rate": 8.515494754558308e-07, "loss": -2.6011, "num_tokens": 192430432.0, "residual_var": 0.034023467451334, "reward": 0.705078125, "reward_std": 0.20894183218479156, "rewards/drgrpo_math_reward/mean": 0.705078125, "rewards/drgrpo_math_reward/std": 0.4564536213874817, "rho2": 0.5937497615814209, "step": 480 }, { "advantages/mean": -1.979060471057892e-09, "advantages/snr": 6.024413450935841e-09, "advantages/std": 0.328506737947464, "advantages/var": 0.10791667687688378, "completions/clipped_ratio": 0.1796875, "epoch": 2.762724014336918, "grad_norm": 75.8395080430503, "learning_rate": 8.479998304387328e-07, "loss": -1.0252, "num_tokens": 192975583.0, "residual_var": 0.03709638863801956, "reward": 0.732421875, "reward_std": 0.26108264923095703, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "rho2": 0.6562497615814209, "step": 481 }, { "advantages/mean": -2.2118911147117615e-09, "advantages/snr": 6.72883931476161e-09, "advantages/std": 0.32871806621551514, "advantages/var": 0.1080555670564678, "completions/clipped_ratio": 0.5625, "epoch": 2.768458781362007, "grad_norm": 79.00027185064818, "learning_rate": 8.444521448403206e-07, "loss": -0.637, "num_tokens": 193542397.0, "residual_var": 0.04052088037133217, "reward": 0.703125, "reward_std": 0.2449316829442978, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45732781291007996, "rho2": 0.6249996423721313, "step": 482 }, { "advantages/mean": -1.3387762010097504e-09, "advantages/snr": 3.98164033938817e-09, "advantages/std": 0.3362373411655426, "advantages/var": 0.11305554959407349, "completions/clipped_ratio": 0.2421875, "epoch": 2.774193548387097, "grad_norm": 79.8645402407394, "learning_rate": 8.409064643934467e-07, "loss": -1.7737, "num_tokens": 194096722.0, "residual_var": 0.03886289894580841, "reward": 0.65625, "reward_std": 0.25814229249954224, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.4754233956336975, "rho2": 0.6562496423721313, "step": 483 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 2.2020945426790343e-09, "advantages/std": 0.3700600564479828, "advantages/var": 0.1369444453782842, "completions/clipped_ratio": 0.1328125, "epoch": 2.7799283154121865, "grad_norm": 87.95322278942685, "learning_rate": 8.373628348051163e-07, "loss": -1.4067, "num_tokens": 194636642.0, "residual_var": 0.021397622302174568, "reward": 0.734375, "reward_std": 0.32368212938308716, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44209739565849304, "rho2": 0.8437496423721313, "step": 484 }, { "advantages/mean": 1.979060471057892e-09, "advantages/snr": 5.871494281123812e-09, "advantages/std": 0.3370624780654907, "advantages/var": 0.11361111411964941, "completions/clipped_ratio": 0.21875, "epoch": 2.785663082437276, "grad_norm": 81.11521486306856, "learning_rate": 8.338213017558972e-07, "loss": -1.952, "num_tokens": 195185151.0, "residual_var": 0.028402816504240036, "reward": 0.66015625, "reward_std": 0.27670520544052124, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.4741191864013672, "rho2": 0.7499997019767761, "step": 485 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 1.7892411830433553e-09, "advantages/std": 0.32532036304473877, "advantages/var": 0.10583333861156063, "completions/clipped_ratio": 0.1640625, "epoch": 2.7913978494623657, "grad_norm": 87.40345207912942, "learning_rate": 8.302819108993311e-07, "loss": -1.9064, "num_tokens": 195734857.0, "residual_var": 0.03968752175569534, "reward": 0.76171875, "reward_std": 0.25426650047302246, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.42644867300987244, "rho2": 0.6249998807907104, "step": 486 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 3.6094035106628276e-10, "advantages/std": 0.32253339886665344, "advantages/var": 0.10402779338447576, "completions/clipped_ratio": 0.1796875, "epoch": 2.7971326164874553, "grad_norm": 82.64764092064844, "learning_rate": 8.267447078613441e-07, "loss": -1.6324, "num_tokens": 196289151.0, "residual_var": 0.03575959429144859, "reward": 0.708984375, "reward_std": 0.24465788900852203, "rewards/drgrpo_math_reward/mean": 0.708984375, "rewards/drgrpo_math_reward/std": 0.45467492938041687, "rho2": 0.6562496423721313, "step": 487 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 2.59139962171801e-09, "advantages/std": 0.31446605920791626, "advantages/var": 0.0988889023937567, "completions/clipped_ratio": 0.15625, "epoch": 2.802867383512545, "grad_norm": 75.44431020880204, "learning_rate": 8.232097382396597e-07, "loss": -0.72, "num_tokens": 196840545.0, "residual_var": 0.03708336874842644, "reward": 0.6953125, "reward_std": 0.23320671916007996, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4607250988483429, "rho2": 0.6249997615814209, "step": 488 }, { "advantages/mean": 2.9103830456733704e-09, "advantages/snr": 7.4324472585792306e-09, "advantages/std": 0.3915780186653137, "advantages/var": 0.15333334470185278, "completions/clipped_ratio": -0.0078125, "epoch": 2.8086021505376344, "grad_norm": 92.41137718476863, "learning_rate": 8.196770476032114e-07, "loss": -1.4189, "num_tokens": 197383638.0, "residual_var": 0.028750097379088402, "reward": 0.65625, "reward_std": 0.33114296197891235, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.4754233956336975, "rho2": 0.8124994039535522, "step": 489 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 6.350250223051015e-09, "advantages/std": 0.3299831748008728, "advantages/var": 0.10888889565166338, "completions/clipped_ratio": -0.046875, "epoch": 2.814336917562724, "grad_norm": 91.37670531227323, "learning_rate": 8.161466814915533e-07, "loss": -2.3927, "num_tokens": 197920276.0, "residual_var": 0.04423614963889122, "reward": 0.7109375, "reward_std": 0.24433428049087524, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45377036929130554, "rho2": 0.5937497615814209, "step": 490 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 2.6713881198566725e-09, "advantages/std": 0.3050501048564911, "advantages/var": 0.09305556647295621, "completions/clipped_ratio": -0.078125, "epoch": 2.8200716845878135, "grad_norm": 85.55731596710582, "learning_rate": 8.126186854142751e-07, "loss": -1.5654, "num_tokens": 198471977.0, "residual_var": 0.029079891741275787, "reward": 0.65625, "reward_std": 0.23887482285499573, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.4754233956336975, "rho2": 0.6874997615814209, "step": 491 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.7845957574467256e-09, "advantages/std": 0.39140063524246216, "advantages/var": 0.1531944572682029, "completions/clipped_ratio": -0.375, "epoch": 2.825806451612903, "grad_norm": 103.42018509100016, "learning_rate": 8.090931048504151e-07, "loss": -1.0911, "num_tokens": 198985021.0, "residual_var": 0.033511366695165634, "reward": 0.560546875, "reward_std": 0.32877489924430847, "rewards/drgrpo_math_reward/mean": 0.560546875, "rewards/drgrpo_math_reward/std": 0.49680593609809875, "rho2": 0.7812495231628418, "step": 492 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 1.983746920090409e-09, "advantages/std": 0.41079193353652954, "advantages/var": 0.1687500126586805, "completions/clipped_ratio": -0.515625, "epoch": 2.8315412186379927, "grad_norm": 109.82653952621, "learning_rate": 8.055699852478724e-07, "loss": -0.3232, "num_tokens": 199515710.0, "residual_var": 0.010546967387199402, "reward": 0.591796875, "reward_std": 0.37540900707244873, "rewards/drgrpo_math_reward/mean": 0.591796875, "rewards/drgrpo_math_reward/std": 0.49198177456855774, "rho2": 0.937499463558197, "step": 493 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 1.6518765187303217e-09, "advantages/std": 0.352372944355011, "advantages/var": 0.12416669191341967, "completions/clipped_ratio": -1.2734375, "epoch": 2.8372759856630827, "grad_norm": 127.73396612935338, "learning_rate": 8.020493720228223e-07, "loss": 0.2227, "num_tokens": 199978028.0, "residual_var": 0.031041719019412994, "reward": 0.56640625, "reward_std": 0.28817349672317505, "rewards/drgrpo_math_reward/mean": 0.56640625, "rewards/drgrpo_math_reward/std": 0.4960552453994751, "rho2": 0.7499996423721313, "step": 494 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 8.689760048362368e-10, "advantages/std": 0.4019051790237427, "advantages/var": 0.16152777292610665, "completions/clipped_ratio": -0.921875, "epoch": 2.843010752688172, "grad_norm": 114.45597498594871, "learning_rate": 7.985313105591307e-07, "loss": 0.5696, "num_tokens": 200475377.0, "residual_var": 0.040382031351327896, "reward": 0.560546875, "reward_std": 0.3444315195083618, "rewards/drgrpo_math_reward/mean": 0.560546875, "rewards/drgrpo_math_reward/std": 0.49680593609809875, "rho2": 0.7499995231628418, "step": 495 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.1487987912891781e-09, "advantages/std": 0.40534624457359314, "advantages/var": 0.16430557798991519, "completions/clipped_ratio": -0.84375, "epoch": 2.848745519713262, "grad_norm": 106.09939868532756, "learning_rate": 7.950158462077697e-07, "loss": -0.6535, "num_tokens": 200965726.0, "residual_var": 0.04107644036412239, "reward": 0.580078125, "reward_std": 0.3481616973876953, "rewards/drgrpo_math_reward/mean": 0.580078125, "rewards/drgrpo_math_reward/std": 0.4940285086631775, "rho2": 0.7499997615814209, "step": 496 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 2.8177367746180163e-09, "advantages/std": 0.4131518602371216, "advantages/var": 0.17069445961739405, "completions/clipped_ratio": -0.484375, "epoch": 2.8544802867383514, "grad_norm": 113.03951316453029, "learning_rate": 7.915030242862316e-07, "loss": -0.9323, "num_tokens": 201466992.0, "residual_var": 0.026671061292290688, "reward": 0.658203125, "reward_std": 0.3711739182472229, "rewards/drgrpo_math_reward/mean": 0.658203125, "rewards/drgrpo_math_reward/std": 0.4747757613658905, "rho2": 0.8437497019767761, "step": 497 }, { "advantages/mean": 1.979060471057892e-09, "advantages/snr": 5.070163490419205e-09, "advantages/std": 0.3903346359729767, "advantages/var": 0.15236112804015622, "completions/clipped_ratio": -0.5859375, "epoch": 2.860215053763441, "grad_norm": 115.38926652228395, "learning_rate": 7.879928900779455e-07, "loss": -1.0482, "num_tokens": 201960221.0, "residual_var": 0.028567789122462273, "reward": 0.693359375, "reward_std": 0.3388535678386688, "rewards/drgrpo_math_reward/mean": 0.693359375, "rewards/drgrpo_math_reward/std": 0.4615498185157776, "rho2": 0.8124995231628418, "step": 498 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 6.527676749411163e-10, "advantages/std": 0.3566822409629822, "advantages/var": 0.12722222101837488, "completions/clipped_ratio": -0.125, "epoch": 2.8659498207885306, "grad_norm": 97.34853577794802, "learning_rate": 7.844854888316932e-07, "loss": -0.1379, "num_tokens": 202481969.0, "residual_var": 0.02782989852130413, "reward": 0.671875, "reward_std": 0.30239546298980713, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4699897766113281, "rho2": 0.7812497615814209, "step": 499 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.881792769754559e-09, "advantages/std": 0.3711842894554138, "advantages/var": 0.13777777673852043, "completions/clipped_ratio": 0.1484375, "epoch": 2.87168458781362, "grad_norm": 104.64464650038471, "learning_rate": 7.809808657610273e-07, "loss": 0.2991, "num_tokens": 203018482.0, "residual_var": 0.03444450721144676, "reward": 0.75, "reward_std": 0.3053758442401886, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.43343618512153625, "rho2": 0.7499995827674866, "step": 500 }, { "advantages/mean": 1.7462298274040222e-10, "advantages/snr": 4.764906347496868e-10, "advantages/std": 0.36647725105285645, "advantages/var": 0.13430557553925837, "completions/clipped_ratio": -0.1015625, "epoch": 2.8774193548387097, "grad_norm": 98.23630148229847, "learning_rate": 7.774790660436857e-07, "loss": -0.5791, "num_tokens": 203548053.0, "residual_var": 0.037773486226797104, "reward": 0.681640625, "reward_std": 0.30663371086120605, "rewards/drgrpo_math_reward/mean": 0.681640625, "rewards/drgrpo_math_reward/std": 0.46629536151885986, "rho2": 0.7187497019767761, "step": 501 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 2.326987379755552e-09, "advantages/std": 0.3501983880996704, "advantages/var": 0.12263891102760738, "completions/clipped_ratio": 0.171875, "epoch": 2.8831541218637993, "grad_norm": 101.18016910610736, "learning_rate": 7.739801348210115e-07, "loss": 0.2262, "num_tokens": 204082235.0, "residual_var": 0.04598964378237724, "reward": 0.658203125, "reward_std": 0.26820850372314453, "rewards/drgrpo_math_reward/mean": 0.658203125, "rewards/drgrpo_math_reward/std": 0.4747757613658905, "rho2": 0.6249995827674866, "step": 502 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 6.222660449822228e-09, "advantages/std": 0.37416574358940125, "advantages/var": 0.14000000367580956, "completions/clipped_ratio": 0.09375, "epoch": 2.888888888888889, "grad_norm": 101.36409205183902, "learning_rate": 7.704841171973706e-07, "loss": -0.3326, "num_tokens": 204620815.0, "residual_var": 0.021875057369470596, "reward": 0.71875, "reward_std": 0.32317209243774414, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45004892349243164, "rho2": 0.8437496423721313, "step": 503 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 6.096936965451374e-09, "advantages/std": 0.38188132643699646, "advantages/var": 0.14583334748127985, "completions/clipped_ratio": 0.265625, "epoch": 2.8946236559139784, "grad_norm": 98.05084212723511, "learning_rate": 7.669910582395698e-07, "loss": -0.587, "num_tokens": 205154871.0, "residual_var": 0.022786526009440422, "reward": 0.68359375, "reward_std": 0.3349953591823578, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.46552830934524536, "rho2": 0.8437495827674866, "step": 504 }, { "advantages/mean": -2.444721758365631e-09, "advantages/snr": 6.612993466356039e-09, "advantages/std": 0.3696845769882202, "advantages/var": 0.13666668646295932, "completions/clipped_ratio": 0.4296875, "epoch": 2.900358422939068, "grad_norm": 108.31440254864808, "learning_rate": 7.635010029762755e-07, "loss": 0.443, "num_tokens": 205696526.0, "residual_var": 0.034166738390922546, "reward": 0.70703125, "reward_std": 0.30672910809516907, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.455569326877594, "rho2": 0.7499995231628418, "step": 505 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 6.644774372406567e-10, "advantages/std": 0.35039660334587097, "advantages/var": 0.12277777963632364, "completions/clipped_ratio": 0.46875, "epoch": 2.9060931899641576, "grad_norm": 87.22588660957456, "learning_rate": 7.60013996397434e-07, "loss": 0.0374, "num_tokens": 206239642.0, "residual_var": 0.038368094712495804, "reward": 0.72265625, "reward_std": 0.2803157567977905, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4481254518032074, "rho2": 0.6874997615814209, "step": 506 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.7569382033352228e-09, "advantages/std": 0.39756202697753906, "advantages/var": 0.1580555652944895, "completions/clipped_ratio": 0.46875, "epoch": 2.9118279569892476, "grad_norm": 118.88425222687864, "learning_rate": 7.565300834536923e-07, "loss": -0.8488, "num_tokens": 206786814.0, "residual_var": 0.034574758261442184, "reward": 0.58984375, "reward_std": 0.34076446294784546, "rewards/drgrpo_math_reward/mean": 0.58984375, "rewards/drgrpo_math_reward/std": 0.49234291911125183, "rho2": 0.7812494039535522, "step": 507 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.676135456649597e-09, "advantages/std": 0.34801024198532104, "advantages/var": 0.12111112852668171, "completions/clipped_ratio": 0.4609375, "epoch": 2.9175627240143367, "grad_norm": 110.83367003809674, "learning_rate": 7.530493090558162e-07, "loss": -0.4949, "num_tokens": 207321837.0, "residual_var": 0.02649311162531376, "reward": 0.7265625, "reward_std": 0.28756949305534363, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.4461594223976135, "rho2": 0.7812495827674866, "step": 508 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 9.437547280011995e-10, "advantages/std": 0.3700600862503052, "advantages/var": 0.1369444674355833, "completions/clipped_ratio": 0.34375, "epoch": 2.9232974910394267, "grad_norm": 115.80478082914378, "learning_rate": 7.495717180741139e-07, "loss": -0.7876, "num_tokens": 207851017.0, "residual_var": 0.025677137076854706, "reward": 0.6640625, "reward_std": 0.3152969479560852, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4727790653705597, "rho2": 0.8124996423721313, "step": 509 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 3.5388420148065656e-09, "advantages/std": 0.39475733041763306, "advantages/var": 0.15583334991845632, "completions/clipped_ratio": 0.40625, "epoch": 2.9290322580645163, "grad_norm": 112.36438167382424, "learning_rate": 7.460973553378556e-07, "loss": -0.6669, "num_tokens": 208387562.0, "residual_var": 0.03895839676260948, "reward": 0.58203125, "reward_std": 0.3463805913925171, "rewards/drgrpo_math_reward/mean": 0.58203125, "rewards/drgrpo_math_reward/std": 0.4937073290348053, "rho2": 0.7499996423721313, "step": 510 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 1.4699392943200092e-09, "advantages/std": 0.3959868252277374, "advantages/var": 0.15680556575394267, "completions/clipped_ratio": 0.3515625, "epoch": 2.934767025089606, "grad_norm": 138.6978798045077, "learning_rate": 7.426262656346978e-07, "loss": -0.0088, "num_tokens": 208909138.0, "residual_var": 0.019600754603743553, "reward": 0.693359375, "reward_std": 0.35419565439224243, "rewards/drgrpo_math_reward/mean": 0.693359375, "rewards/drgrpo_math_reward/std": 0.4615498185157776, "rho2": 0.8749996423721313, "step": 511 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 4.080805592785745e-09, "advantages/std": 0.3138028681278229, "advantages/var": 0.0984722400452478, "completions/clipped_ratio": 0.2734375, "epoch": 2.9405017921146954, "grad_norm": 104.92688347616733, "learning_rate": 7.391584937101033e-07, "loss": -0.2291, "num_tokens": 209439857.0, "residual_var": 0.030772609636187553, "reward": 0.638671875, "reward_std": 0.24266879260540009, "rewards/drgrpo_math_reward/mean": 0.638671875, "rewards/drgrpo_math_reward/std": 0.48085519671440125, "rho2": 0.6874996423721313, "step": 512 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.3520956224820837e-09, "advantages/std": 0.34439966082572937, "advantages/var": 0.11861112637687743, "completions/clipped_ratio": 0.0703125, "epoch": 2.946236559139785, "grad_norm": 139.77549613048777, "learning_rate": 7.356940842667663e-07, "loss": -0.0347, "num_tokens": 209943397.0, "residual_var": 0.022239631041884422, "reward": 0.78515625, "reward_std": 0.289497435092926, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4111155867576599, "rho2": 0.8124996423721313, "step": 513 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 3.155464536996448e-09, "advantages/std": 0.36893242597579956, "advantages/var": 0.13611113493638882, "completions/clipped_ratio": 0.5546875, "epoch": 2.9519713261648746, "grad_norm": 115.10399372315393, "learning_rate": 7.322330819640359e-07, "loss": -1.4132, "num_tokens": 210499143.0, "residual_var": 0.02552090398967266, "reward": 0.671875, "reward_std": 0.3119371235370636, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4699897766113281, "rho2": 0.8124995231628418, "step": 514 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 2.501656791989435e-09, "advantages/std": 0.325747013092041, "advantages/var": 0.10611111653838634, "completions/clipped_ratio": 0.765625, "epoch": 2.957706093189964, "grad_norm": 87.82918530524225, "learning_rate": 7.287755314173401e-07, "loss": -0.9137, "num_tokens": 211063365.0, "residual_var": 0.026527808979153633, "reward": 0.6484375, "reward_std": 0.2649092674255371, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4779251217842102, "rho2": 0.7499997615814209, "step": 515 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 3.289069762750083e-10, "advantages/std": 0.3539460301399231, "advantages/var": 0.12527779225181135, "completions/clipped_ratio": 0.796875, "epoch": 2.9634408602150537, "grad_norm": 91.80247505294213, "learning_rate": 7.2532147719761e-07, "loss": 0.1367, "num_tokens": 211627553.0, "residual_var": 0.03131949529051781, "reward": 0.7421875, "reward_std": 0.2917748689651489, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "rho2": 0.7499996423721313, "step": 516 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 6.7134756192623e-10, "advantages/std": 0.34681087732315063, "advantages/var": 0.12027778462965344, "completions/clipped_ratio": 0.796875, "epoch": 2.9691756272401433, "grad_norm": 84.72651859193958, "learning_rate": 7.21870963830706e-07, "loss": 0.1882, "num_tokens": 212193721.0, "residual_var": 0.03758684918284416, "reward": 0.73046875, "reward_std": 0.27680712938308716, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44415023922920227, "rho2": 0.6874997019767761, "step": 517 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 3.420529257775194e-09, "advantages/std": 0.34034299850463867, "advantages/var": 0.11583335663112848, "completions/clipped_ratio": 0.890625, "epoch": 2.974910394265233, "grad_norm": 87.67644578648706, "learning_rate": 7.18424035796845e-07, "loss": -0.3737, "num_tokens": 212756260.0, "residual_var": 0.025338582694530487, "reward": 0.58984375, "reward_std": 0.28218716382980347, "rewards/drgrpo_math_reward/mean": 0.58984375, "rewards/drgrpo_math_reward/std": 0.49234291911125183, "rho2": 0.7812497019767761, "step": 518 }, { "advantages/mean": -1.3387762010097504e-09, "advantages/snr": 4.455715978492229e-09, "advantages/std": 0.30046263337135315, "advantages/var": 0.09027779405244818, "completions/clipped_ratio": 0.7890625, "epoch": 2.9806451612903224, "grad_norm": 66.6679531444923, "learning_rate": 7.149807375300238e-07, "loss": -1.1645, "num_tokens": 213316509.0, "residual_var": 0.031033026054501534, "reward": 0.6015625, "reward_std": 0.23131582140922546, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4900552034378052, "rho2": 0.6562496423721313, "step": 519 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 7.119688562806996e-10, "advantages/std": 0.32702362537384033, "advantages/var": 0.10694445155264987, "completions/clipped_ratio": 0.8828125, "epoch": 2.9863799283154124, "grad_norm": 82.88980629889518, "learning_rate": 7.115411134174499e-07, "loss": -0.6315, "num_tokens": 213888110.0, "residual_var": 0.053472261875867844, "reward": 0.69140625, "reward_std": 0.23198208212852478, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.4623647928237915, "rho2": 0.4999997317790985, "step": 520 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.4881330316366535e-09, "advantages/std": 0.31291642785072327, "advantages/var": 0.0979166908188569, "completions/clipped_ratio": 0.890625, "epoch": 2.9921146953405016, "grad_norm": 76.80580360387677, "learning_rate": 7.081052077989667e-07, "loss": -0.3089, "num_tokens": 214454406.0, "residual_var": 0.03977866843342781, "reward": 0.763671875, "reward_std": 0.23324155807495117, "rewards/drgrpo_math_reward/mean": 0.763671875, "rewards/drgrpo_math_reward/std": 0.42524150013923645, "rho2": 0.5937498807907104, "step": 521 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 5.31901321665873e-09, "advantages/std": 0.306412935256958, "advantages/var": 0.09388888689278474, "completions/clipped_ratio": 0.8828125, "epoch": 2.9978494623655916, "grad_norm": 67.00783346029033, "learning_rate": 7.046730649664831e-07, "loss": 0.1262, "num_tokens": 215022560.0, "residual_var": 0.032274335622787476, "reward": 0.7890625, "reward_std": 0.23044493794441223, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4083731174468994, "rho2": 0.6562497615814209, "step": 522 }, { "advantages/mean": 2.2118911147117615e-09, "advantages/snr": 6.681768669157786e-09, "advantages/std": 0.33103376626968384, "advantages/var": 0.10958335441069167, "completions/clipped_ratio": 0.796875, "epoch": 3.0057347670250896, "grad_norm": 76.5167605647889, "learning_rate": 7.012447291634027e-07, "loss": 0.2918, "num_tokens": 215584651.0, "residual_var": 0.027395877987146378, "reward": 0.759765625, "reward_std": 0.27261391282081604, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "rho2": 0.7499996423721313, "step": 523 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 7.257670446544843e-09, "advantages/std": 0.3208062946796417, "advantages/var": 0.10291667870608112, "completions/clipped_ratio": 0.8828125, "epoch": 3.011469534050179, "grad_norm": 66.33730201897887, "learning_rate": 6.97820244584052e-07, "loss": 0.1563, "num_tokens": 216162860.0, "residual_var": 0.03859379515051842, "reward": 0.638671875, "reward_std": 0.24094659090042114, "rewards/drgrpo_math_reward/mean": 0.638671875, "rewards/drgrpo_math_reward/std": 0.48085519671440125, "rho2": 0.6249996423721313, "step": 524 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.201195446288939e-09, "advantages/std": 0.3173238933086395, "advantages/var": 0.10069445326455284, "completions/clipped_ratio": 0.828125, "epoch": 3.0172043010752687, "grad_norm": 66.71005256046635, "learning_rate": 6.943996553731131e-07, "loss": -0.57, "num_tokens": 216733327.0, "residual_var": 0.044053856283426285, "reward": 0.740234375, "reward_std": 0.23091748356819153, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "rho2": 0.5624997615814209, "step": 525 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 1.1044125654972361e-09, "advantages/std": 0.21081852912902832, "advantages/var": 0.04444445222412696, "completions/clipped_ratio": 0.90625, "epoch": 3.0229390681003583, "grad_norm": 39.929476680293284, "learning_rate": 6.909830056250526e-07, "loss": -0.6999, "num_tokens": 217294176.0, "residual_var": 0.027777792885899544, "reward": 0.73046875, "reward_std": 0.12136822193861008, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44415023922920227, "rho2": 0.37499988079071045, "step": 526 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.1998657888205269e-09, "advantages/std": 0.2910708487033844, "advantages/var": 0.08472223896490849, "completions/clipped_ratio": 0.890625, "epoch": 3.028673835125448, "grad_norm": 64.33140236019935, "learning_rate": 6.875703393835541e-07, "loss": -0.5199, "num_tokens": 217856113.0, "residual_var": 0.03441842645406723, "reward": 0.80078125, "reward_std": 0.2092304825782776, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "rho2": 0.5937498211860657, "step": 527 }, { "advantages/mean": -1.979060471057892e-09, "advantages/snr": 5.889521206028938e-09, "advantages/std": 0.3360307812690735, "advantages/var": 0.11291668596030391, "completions/clipped_ratio": 0.9375, "epoch": 3.0344086021505374, "grad_norm": 69.14578337134546, "learning_rate": 6.841617006409493e-07, "loss": -0.8677, "num_tokens": 218430042.0, "residual_var": 0.038815151900053024, "reward": 0.759765625, "reward_std": 0.26725661754608154, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "rho2": 0.6562496423721313, "step": 528 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 1.760593162885191e-09, "advantages/std": 0.330613911151886, "advantages/var": 0.10930555824714716, "completions/clipped_ratio": 0.9296875, "epoch": 3.0401433691756274, "grad_norm": 68.4487729955992, "learning_rate": 6.807571333376538e-07, "loss": 0.2521, "num_tokens": 218994418.0, "residual_var": 0.027326446026563644, "reward": 0.802734375, "reward_std": 0.2673768997192383, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "rho2": 0.7499995231628418, "step": 529 }, { "advantages/mean": 7.566995918750763e-10, "advantages/snr": 2.6169259020963316e-09, "advantages/std": 0.28915590047836304, "advantages/var": 0.08361113478145299, "completions/clipped_ratio": 0.96875, "epoch": 3.045878136200717, "grad_norm": 60.909533061827425, "learning_rate": 6.77356681361597e-07, "loss": -0.4377, "num_tokens": 219575568.0, "residual_var": 0.028741346672177315, "reward": 0.79296875, "reward_std": 0.2187976837158203, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40557438135147095, "rho2": 0.6562497615814209, "step": 530 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 3.2101194310892205e-09, "advantages/std": 0.3263859450817108, "advantages/var": 0.10652778514688155, "completions/clipped_ratio": 0.953125, "epoch": 3.0516129032258066, "grad_norm": 66.12071396305181, "learning_rate": 6.739603885476582e-07, "loss": -0.4071, "num_tokens": 220163577.0, "residual_var": 0.04660593345761299, "reward": 0.791015625, "reward_std": 0.24707302451133728, "rewards/drgrpo_math_reward/mean": 0.791015625, "rewards/drgrpo_math_reward/std": 0.40698084235191345, "rho2": 0.5624998211860657, "step": 531 }, { "advantages/mean": 1.6880221664905548e-09, "advantages/snr": 5.394482753456179e-09, "advantages/std": 0.3129163980484009, "advantages/var": 0.09791667216758526, "completions/clipped_ratio": 0.953125, "epoch": 3.057347670250896, "grad_norm": 92.17600135015113, "learning_rate": 6.70568298677102e-07, "loss": -1.2565, "num_tokens": 220740598.0, "residual_var": 0.052018266171216965, "reward": 0.732421875, "reward_std": 0.2201756238937378, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "rho2": 0.4687497913837433, "step": 532 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.7065797887348874e-09, "advantages/std": 0.2720804810523987, "advantages/var": 0.07402778816970468, "completions/clipped_ratio": 0.9453125, "epoch": 3.0630824372759857, "grad_norm": 53.70345852378651, "learning_rate": 6.671804554770134e-07, "loss": -0.5636, "num_tokens": 221307532.0, "residual_var": 0.03701391443610191, "reward": 0.740234375, "reward_std": 0.18161220848560333, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "rho2": 0.4999998211860657, "step": 533 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 6.7134756192623e-10, "advantages/std": 0.34681087732315063, "advantages/var": 0.12027778462965344, "completions/clipped_ratio": 0.984375, "epoch": 3.0688172043010753, "grad_norm": 81.11847969462569, "learning_rate": 6.637969026197332e-07, "loss": -1.2032, "num_tokens": 221904911.0, "residual_var": 0.037586864084005356, "reward": 0.65234375, "reward_std": 0.2721153199672699, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.47669193148612976, "rho2": 0.6874995827674866, "step": 534 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.348416367695141e-09, "advantages/std": 0.25900453329086304, "advantages/var": 0.06708334826521778, "completions/clipped_ratio": 0.9765625, "epoch": 3.074551971326165, "grad_norm": 52.388397760118735, "learning_rate": 6.604176837222959e-07, "loss": -0.362, "num_tokens": 222477144.0, "residual_var": 0.03563803806900978, "reward": 0.650390625, "reward_std": 0.16647270321846008, "rewards/drgrpo_math_reward/mean": 0.650390625, "rewards/drgrpo_math_reward/std": 0.47731292247772217, "rho2": 0.46874985098838806, "step": 535 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 2.8065967801091088e-09, "advantages/std": 0.2903542220592499, "advantages/var": 0.08430557426763219, "completions/clipped_ratio": 1.0, "epoch": 3.0802867383512544, "grad_norm": 59.4322504067654, "learning_rate": 6.570428423458686e-07, "loss": -0.2252, "num_tokens": 223052584.0, "residual_var": 0.026345523074269295, "reward": 0.826171875, "reward_std": 0.22595645487308502, "rewards/drgrpo_math_reward/mean": 0.826171875, "rewards/drgrpo_math_reward/std": 0.3793322443962097, "rho2": 0.6874996423721313, "step": 536 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.016454939781137e-09, "advantages/std": 0.34359216690063477, "advantages/var": 0.11805557715547366, "completions/clipped_ratio": 0.9609375, "epoch": 3.086021505376344, "grad_norm": 105.355564999717, "learning_rate": 6.536724219951865e-07, "loss": -0.0533, "num_tokens": 223631713.0, "residual_var": 0.029513930901885033, "reward": 0.76953125, "reward_std": 0.2803475856781006, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.42154473066329956, "rho2": 0.7499997019767761, "step": 537 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 1.95540180424847e-09, "advantages/std": 0.2976762056350708, "advantages/var": 0.08861112340129296, "completions/clipped_ratio": 0.9609375, "epoch": 3.0917562724014336, "grad_norm": 63.7668954335473, "learning_rate": 6.50306466117995e-07, "loss": -0.3598, "num_tokens": 224197907.0, "residual_var": 0.03322920948266983, "reward": 0.640625, "reward_std": 0.22452709078788757, "rewards/drgrpo_math_reward/mean": 0.640625, "rewards/drgrpo_math_reward/std": 0.48028653860092163, "rho2": 0.6249996423721313, "step": 538 }, { "advantages/mean": 7.566995918750763e-10, "advantages/snr": 2.2260148416984982e-09, "advantages/std": 0.33993464708328247, "advantages/var": 0.1155555642876358, "completions/clipped_ratio": 0.9375, "epoch": 3.097491039426523, "grad_norm": 75.44863181517792, "learning_rate": 6.46945018104488e-07, "loss": -0.816, "num_tokens": 224766524.0, "residual_var": 0.03611117601394653, "reward": 0.6640625, "reward_std": 0.2692118287086487, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4727790653705597, "rho2": 0.6874995231628418, "step": 539 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 3.3341418776813738e-09, "advantages/std": 0.3142451345920563, "advantages/var": 0.09875000461477956, "completions/clipped_ratio": 0.9453125, "epoch": 3.1032258064516127, "grad_norm": 65.46877167347884, "learning_rate": 6.435881212867493e-07, "loss": -0.4085, "num_tokens": 225333435.0, "residual_var": 0.02777346782386303, "reward": 0.681640625, "reward_std": 0.24899107217788696, "rewards/drgrpo_math_reward/mean": 0.681640625, "rewards/drgrpo_math_reward/std": 0.46629536151885986, "rho2": 0.7187497615814209, "step": 540 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 3.5974160300133395e-10, "advantages/std": 0.3236081600189209, "advantages/var": 0.10472224123083151, "completions/clipped_ratio": 0.9375, "epoch": 3.1089605734767023, "grad_norm": 71.2369836654289, "learning_rate": 6.402358189381933e-07, "loss": -0.9778, "num_tokens": 225910871.0, "residual_var": 0.022908028215169907, "reward": 0.69921875, "reward_std": 0.26633232831954956, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "rho2": 0.7812496423721313, "step": 541 }, { "advantages/mean": 2.444721758365631e-09, "advantages/snr": 7.427597419993766e-09, "advantages/std": 0.329140305519104, "advantages/var": 0.10833334071720913, "completions/clipped_ratio": 0.9140625, "epoch": 3.1146953405017923, "grad_norm": 64.52640325874074, "learning_rate": 6.368881542730071e-07, "loss": 0.3869, "num_tokens": 226474223.0, "residual_var": 0.016927137970924377, "reward": 0.81640625, "reward_std": 0.27987319231033325, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.3875311613082886, "rho2": 0.8437495231628418, "step": 542 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.3123037475965224e-09, "advantages/std": 0.30207619071006775, "advantages/var": 0.09125002499390522, "completions/clipped_ratio": 0.921875, "epoch": 3.120430107526882, "grad_norm": 63.09697028711273, "learning_rate": 6.335451704455957e-07, "loss": -0.6503, "num_tokens": 227058228.0, "residual_var": 0.03707033768296242, "reward": 0.771484375, "reward_std": 0.22305315732955933, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "rho2": 0.5937498211860657, "step": 543 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 2.4280853110120208e-09, "advantages/std": 0.33561721444129944, "advantages/var": 0.11263891462933717, "completions/clipped_ratio": 0.90625, "epoch": 3.1261648745519715, "grad_norm": 72.56831491642677, "learning_rate": 6.302069105500216e-07, "loss": -1.0331, "num_tokens": 227642241.0, "residual_var": 0.042239636182785034, "reward": 0.603515625, "reward_std": 0.2527496814727783, "rewards/drgrpo_math_reward/mean": 0.603515625, "rewards/drgrpo_math_reward/std": 0.4896455705165863, "rho2": 0.6249996423721313, "step": 544 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 8.21391135731705e-09, "advantages/std": 0.31180480122566223, "advantages/var": 0.09722223406737474, "completions/clipped_ratio": 0.84375, "epoch": 3.131899641577061, "grad_norm": 60.73808876518364, "learning_rate": 6.268734176194534e-07, "loss": 0.104, "num_tokens": 228205910.0, "residual_var": 0.039496567100286484, "reward": 0.71484375, "reward_std": 0.2298709899187088, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.45193037390708923, "rho2": 0.5937497019767761, "step": 545 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.548342311987969e-09, "advantages/std": 0.2624669373035431, "advantages/var": 0.06888889317750202, "completions/clipped_ratio": 0.828125, "epoch": 3.1376344086021506, "grad_norm": 52.052894306750275, "learning_rate": 6.23544734625608e-07, "loss": -0.1298, "num_tokens": 228776371.0, "residual_var": 0.03229169547557831, "reward": 0.6328125, "reward_std": 0.1780368983745575, "rewards/drgrpo_math_reward/mean": 0.6328125, "rewards/drgrpo_math_reward/std": 0.48250964283943176, "rho2": 0.5312497019767761, "step": 546 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 1.6239606302534103e-09, "advantages/std": 0.35843023657798767, "advantages/var": 0.1284722344933522, "completions/clipped_ratio": 0.859375, "epoch": 3.14336917562724, "grad_norm": 93.55094222216873, "learning_rate": 6.202209044781989e-07, "loss": -1.2787, "num_tokens": 229355041.0, "residual_var": 0.04014763981103897, "reward": 0.740234375, "reward_std": 0.2906298041343689, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "rho2": 0.6874995231628418, "step": 547 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 3.2479212605068205e-09, "advantages/std": 0.35843023657798767, "advantages/var": 0.1284722344933522, "completions/clipped_ratio": 0.8671875, "epoch": 3.1491039426523297, "grad_norm": 67.30131454655962, "learning_rate": 6.169019700243815e-07, "loss": -0.4315, "num_tokens": 229933265.0, "residual_var": 0.028103336691856384, "reward": 0.576171875, "reward_std": 0.3001125156879425, "rewards/drgrpo_math_reward/mean": 0.576171875, "rewards/drgrpo_math_reward/std": 0.4946470856666565, "rho2": 0.7812497615814209, "step": 548 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.257120817899704, "advantages/var": 0.06611111499741273, "completions/clipped_ratio": 0.8359375, "epoch": 3.1548387096774193, "grad_norm": 45.552675700560236, "learning_rate": 6.13587974048201e-07, "loss": 0.4829, "num_tokens": 230493637.0, "residual_var": 0.03305557370185852, "reward": 0.6875, "reward_std": 0.1721501350402832, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4639657139778137, "rho2": 0.49999988079071045, "step": 549 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 3.5087414333222824e-09, "advantages/std": 0.29860788583755493, "advantages/var": 0.08916666948437424, "completions/clipped_ratio": 0.8125, "epoch": 3.160573476702509, "grad_norm": 57.74097245034948, "learning_rate": 6.10278959270042e-07, "loss": 0.5873, "num_tokens": 231066665.0, "residual_var": 0.030651075765490532, "reward": 0.75, "reward_std": 0.2308090329170227, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.43343618512153625, "rho2": 0.6562497019767761, "step": 550 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 6.012797320256524e-09, "advantages/std": 0.2710576057434082, "advantages/var": 0.07347222563134892, "completions/clipped_ratio": 0.8515625, "epoch": 3.1663082437275984, "grad_norm": 52.93361547201495, "learning_rate": 6.069749683460764e-07, "loss": 0.1296, "num_tokens": 231630639.0, "residual_var": 0.034440141171216965, "reward": 0.658203125, "reward_std": 0.18419982492923737, "rewards/drgrpo_math_reward/mean": 0.658203125, "rewards/drgrpo_math_reward/std": 0.4747757613658905, "rho2": 0.5312496423721313, "step": 551 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 3.631286334969895e-10, "advantages/std": 0.32058975100517273, "advantages/var": 0.10277778844955865, "completions/clipped_ratio": 0.8515625, "epoch": 3.172043010752688, "grad_norm": 66.00623732784459, "learning_rate": 6.036760438677144e-07, "loss": -0.6361, "num_tokens": 232189895.0, "residual_var": 0.04175351932644844, "reward": 0.69921875, "reward_std": 0.23112353682518005, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "rho2": 0.5937496423721313, "step": 552 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.159882380313828e-09, "advantages/std": 0.32339349389076233, "advantages/var": 0.10458335189087453, "completions/clipped_ratio": 0.8046875, "epoch": 3.1777777777777776, "grad_norm": 68.87045174759595, "learning_rate": 6.003822283610546e-07, "loss": -0.6627, "num_tokens": 232752793.0, "residual_var": 0.032682325690984726, "reward": 0.751953125, "reward_std": 0.2568758726119995, "rewards/drgrpo_math_reward/mean": 0.751953125, "rewards/drgrpo_math_reward/std": 0.4323015511035919, "rho2": 0.6874997615814209, "step": 553 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 5.029716445139067e-09, "advantages/std": 0.3240370452404022, "advantages/var": 0.10500000668813048, "completions/clipped_ratio": 0.765625, "epoch": 3.183512544802867, "grad_norm": 67.47355210698046, "learning_rate": 5.970935642863374e-07, "loss": -0.3713, "num_tokens": 233319379.0, "residual_var": 0.03281253203749657, "reward": 0.75, "reward_std": 0.25021636486053467, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.43343618512153625, "rho2": 0.6874997615814209, "step": 554 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.272862523630875e-09, "advantages/std": 0.30731815099716187, "advantages/var": 0.09444444593231438, "completions/clipped_ratio": 0.7265625, "epoch": 3.189247311827957, "grad_norm": 62.08844906144912, "learning_rate": 5.938100940373956e-07, "loss": -0.7908, "num_tokens": 233875825.0, "residual_var": 0.03836808726191521, "reward": 0.6015625, "reward_std": 0.22093512117862701, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4900552034378052, "rho2": 0.5937497615814209, "step": 555 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.866550295864263e-09, "advantages/std": 0.32489314675331116, "advantages/var": 0.10555555680726858, "completions/clipped_ratio": 0.7265625, "epoch": 3.1949820788530467, "grad_norm": 66.40240529397907, "learning_rate": 5.905318599411097e-07, "loss": -0.5874, "num_tokens": 234451682.0, "residual_var": 0.03628476709127426, "reward": 0.6796875, "reward_std": 0.24658125638961792, "rewards/drgrpo_math_reward/mean": 0.6796875, "rewards/drgrpo_math_reward/std": 0.4670529365539551, "rho2": 0.6562496423721313, "step": 556 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 4.1855136055930444e-09, "advantages/std": 0.278138667345047, "advantages/var": 0.07736111827247871, "completions/clipped_ratio": 0.640625, "epoch": 3.2007168458781363, "grad_norm": 58.116481451477924, "learning_rate": 5.872589042568604e-07, "loss": -0.2548, "num_tokens": 235021444.0, "residual_var": 0.04351566359400749, "reward": 0.798828125, "reward_std": 0.17788508534431458, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "rho2": 0.4374997019767761, "step": 557 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 1.8968435030880188e-09, "advantages/std": 0.3068659007549286, "advantages/var": 0.09416668104613368, "completions/clipped_ratio": 0.6484375, "epoch": 3.206451612903226, "grad_norm": 60.080546717745165, "learning_rate": 5.839912691759866e-07, "loss": -0.5501, "num_tokens": 235587272.0, "residual_var": 0.029427126049995422, "reward": 0.6953125, "reward_std": 0.23629873991012573, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4607250988483429, "rho2": 0.6874996423721313, "step": 558 }, { "advantages/mean": -1.8044374883174896e-09, "advantages/snr": 5.413312141223831e-09, "advantages/std": 0.3333333432674408, "advantages/var": 0.11111111773384952, "completions/clipped_ratio": 0.7265625, "epoch": 3.2121863799283155, "grad_norm": 67.59247069889048, "learning_rate": 5.807289968212383e-07, "loss": -0.7583, "num_tokens": 236138442.0, "residual_var": 0.034722257405519485, "reward": 0.8046875, "reward_std": 0.26010650396347046, "rewards/drgrpo_math_reward/mean": 0.8046875, "rewards/drgrpo_math_reward/std": 0.3968288004398346, "rho2": 0.6874997615814209, "step": 559 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 3.6288352232724215e-09, "advantages/std": 0.3208062946796417, "advantages/var": 0.10291667870608112, "completions/clipped_ratio": 0.6171875, "epoch": 3.217921146953405, "grad_norm": 63.91375944352659, "learning_rate": 5.774721292462356e-07, "loss": -0.2583, "num_tokens": 236695784.0, "residual_var": 0.041809938848018646, "reward": 0.751953125, "reward_std": 0.2420426607131958, "rewards/drgrpo_math_reward/mean": 0.751953125, "rewards/drgrpo_math_reward/std": 0.4323015511035919, "rho2": 0.5937497019767761, "step": 560 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.5357306709709106e-09, "advantages/std": 0.3079953193664551, "advantages/var": 0.09486111675164466, "completions/clipped_ratio": 0.6640625, "epoch": 3.2236559139784946, "grad_norm": 54.35901737936862, "learning_rate": 5.742207084349273e-07, "loss": -0.3006, "num_tokens": 237263678.0, "residual_var": 0.02964414469897747, "reward": 0.619140625, "reward_std": 0.2363450825214386, "rewards/drgrpo_math_reward/mean": 0.619140625, "rewards/drgrpo_math_reward/std": 0.48607301712036133, "rho2": 0.6874995827674866, "step": 561 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 3.86266854005349e-09, "advantages/std": 0.3013857305049896, "advantages/var": 0.09083335855202623, "completions/clipped_ratio": 0.6640625, "epoch": 3.229390681003584, "grad_norm": 61.51244607719289, "learning_rate": 5.709747763010466e-07, "loss": -0.729, "num_tokens": 237817667.0, "residual_var": 0.03122398629784584, "reward": 0.82421875, "reward_std": 0.2275945544242859, "rewards/drgrpo_math_reward/mean": 0.82421875, "rewards/drgrpo_math_reward/std": 0.3810062110424042, "rho2": 0.6562497615814209, "step": 562 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 5.833594469573092e-09, "advantages/std": 0.2793842554092407, "advantages/var": 0.07805556217057585, "completions/clipped_ratio": 0.5546875, "epoch": 3.2351254480286737, "grad_norm": 49.22919680208576, "learning_rate": 5.677343746875738e-07, "loss": -0.3458, "num_tokens": 238378345.0, "residual_var": 0.026831623166799545, "reward": 0.8125, "reward_std": 0.2114243358373642, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39069411158561707, "rho2": 0.6562497615814209, "step": 563 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 3.0739664186037866e-09, "advantages/std": 0.26509958505630493, "advantages/var": 0.07027778999702505, "completions/clipped_ratio": 0.5546875, "epoch": 3.2408602150537633, "grad_norm": 47.78384040225743, "learning_rate": 5.644995453661954e-07, "loss": 0.482, "num_tokens": 238927492.0, "residual_var": 0.03294273465871811, "reward": 0.8203125, "reward_std": 0.18355467915534973, "rewards/drgrpo_math_reward/mean": 0.8203125, "rewards/drgrpo_math_reward/std": 0.38430243730545044, "rho2": 0.5312497615814209, "step": 564 }, { "advantages/mean": 1.7462298274040222e-09, "advantages/snr": 5.98463072644124e-09, "advantages/std": 0.29178571701049805, "advantages/var": 0.08513890465133045, "completions/clipped_ratio": 0.5234375, "epoch": 3.246594982078853, "grad_norm": 55.529719060107574, "learning_rate": 5.612703300367668e-07, "loss": -0.843, "num_tokens": 239474339.0, "residual_var": 0.03458770364522934, "reward": 0.779296875, "reward_std": 0.2117578685283661, "rewards/drgrpo_math_reward/mean": 0.779296875, "rewards/drgrpo_math_reward/std": 0.4151262938976288, "rho2": 0.5937497615814209, "step": 565 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 3.6021970697460285e-09, "advantages/std": 0.32317864894866943, "advantages/var": 0.10444443913628731, "completions/clipped_ratio": 0.5390625, "epoch": 3.252329749103943, "grad_norm": 64.10099460450083, "learning_rate": 5.580467703267735e-07, "loss": -0.9295, "num_tokens": 240037812.0, "residual_var": 0.03263893723487854, "reward": 0.71875, "reward_std": 0.25451892614364624, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45004892349243164, "rho2": 0.6874996423721313, "step": 566 }, { "advantages/mean": -1.7462298274040222e-09, "advantages/snr": 5.616464410718476e-09, "advantages/std": 0.3109126389026642, "advantages/var": 0.09666666902941845, "completions/clipped_ratio": 0.640625, "epoch": 3.258064516129032, "grad_norm": 63.32814268848118, "learning_rate": 5.548289077907943e-07, "loss": -0.043, "num_tokens": 240598492.0, "residual_var": 0.0453125536441803, "reward": 0.67578125, "reward_std": 0.21411389112472534, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.4685399830341339, "rho2": 0.5312495827674866, "step": 567 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 3.9576041342107736e-10, "advantages/std": 0.2941560447216034, "advantages/var": 0.08652777864625794, "completions/clipped_ratio": 0.53125, "epoch": 3.263799283154122, "grad_norm": 59.958218161921515, "learning_rate": 5.51616783909968e-07, "loss": -0.5034, "num_tokens": 241162808.0, "residual_var": 0.04055992513895035, "reward": 0.732421875, "reward_std": 0.20549547672271729, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "rho2": 0.5312497615814209, "step": 568 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 4.71430583211036e-09, "advantages/std": 0.3210226893424988, "advantages/var": 0.10305556707269048, "completions/clipped_ratio": 0.4296875, "epoch": 3.2695340501792116, "grad_norm": 61.17632917605665, "learning_rate": 5.484104400914552e-07, "loss": -1.4676, "num_tokens": 241725822.0, "residual_var": 0.03542537987232208, "reward": 0.77734375, "reward_std": 0.24415996670722961, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41643625497817993, "rho2": 0.6562497615814209, "step": 569 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.1951480612480313e-09, "advantages/std": 0.3181980848312378, "advantages/var": 0.1012500211902676, "completions/clipped_ratio": 0.4140625, "epoch": 3.275268817204301, "grad_norm": 66.52067371957013, "learning_rate": 5.452099176679071e-07, "loss": -1.1239, "num_tokens": 242276929.0, "residual_var": 0.03796880692243576, "reward": 0.822265625, "reward_std": 0.2386215329170227, "rewards/drgrpo_math_reward/mean": 0.822265625, "rewards/drgrpo_math_reward/std": 0.3826628625392914, "rho2": 0.6249995231628418, "step": 570 }, { "advantages/mean": -1.979060471057892e-09, "advantages/snr": 6.267048533453025e-09, "advantages/std": 0.31578826904296875, "advantages/var": 0.09972223086515442, "completions/clipped_ratio": 0.546875, "epoch": 3.2810035842293908, "grad_norm": 57.78562066149918, "learning_rate": 5.420152578969325e-07, "loss": -0.46, "num_tokens": 242843533.0, "residual_var": 0.034279558807611465, "reward": 0.72265625, "reward_std": 0.2382698804140091, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4481254518032074, "rho2": 0.6562496423721313, "step": 571 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 3.661092162665558e-10, "advantages/std": 0.31797975301742554, "advantages/var": 0.10111112332902294, "completions/clipped_ratio": 0.4296875, "epoch": 3.2867383512544803, "grad_norm": 57.280851847533576, "learning_rate": 5.388265019605641e-07, "loss": -1.936, "num_tokens": 243406815.0, "residual_var": 0.04107644408941269, "reward": 0.7109375, "reward_std": 0.22716139256954193, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45377036929130554, "rho2": 0.5937495231628418, "step": 572 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 3.479435839467984e-09, "advantages/std": 0.33458101749420166, "advantages/var": 0.11194445726745528, "completions/clipped_ratio": 0.4453125, "epoch": 3.29247311827957, "grad_norm": 64.7647311057191, "learning_rate": 5.356436909647302e-07, "loss": -0.1065, "num_tokens": 243950027.0, "residual_var": 0.041979216039180756, "reward": 0.70703125, "reward_std": 0.2525302767753601, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.455569326877594, "rho2": 0.6249996423721313, "step": 573 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.583538511807429e-09, "advantages/std": 0.2793842554092407, "advantages/var": 0.07805556217057585, "completions/clipped_ratio": 0.421875, "epoch": 3.2982078853046595, "grad_norm": 59.03415855454835, "learning_rate": 5.324668659387221e-07, "loss": -0.6116, "num_tokens": 244495925.0, "residual_var": 0.051223982125520706, "reward": 0.66796875, "reward_std": 0.17451104521751404, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.47140273451805115, "rho2": 0.34374985098838806, "step": 574 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 3.5392206649390232e-09, "advantages/std": 0.32892924547195435, "advantages/var": 0.1081944485267492, "completions/clipped_ratio": 0.3984375, "epoch": 3.303942652329749, "grad_norm": 66.25230902018909, "learning_rate": 5.292960678346674e-07, "loss": -0.913, "num_tokens": 245048784.0, "residual_var": 0.04733510687947273, "reward": 0.662109375, "reward_std": 0.25109297037124634, "rewards/drgrpo_math_reward/mean": 0.662109375, "rewards/drgrpo_math_reward/std": 0.4734536409378052, "rho2": 0.5624997615814209, "step": 575 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 5.737415776028352e-09, "advantages/std": 0.28406769037246704, "advantages/var": 0.0806944527135478, "completions/clipped_ratio": 0.421875, "epoch": 3.3096774193548386, "grad_norm": 50.07373499626851, "learning_rate": 5.261313375270013e-07, "loss": -0.2678, "num_tokens": 245595092.0, "residual_var": 0.03782555088400841, "reward": 0.775390625, "reward_std": 0.19359758496284485, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "rho2": 0.5312497615814209, "step": 576 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.225235816552494e-09, "advantages/std": 0.285043865442276, "advantages/var": 0.08125000522627435, "completions/clipped_ratio": 0.328125, "epoch": 3.315412186379928, "grad_norm": 52.41372394865049, "learning_rate": 5.229727158119396e-07, "loss": -0.8822, "num_tokens": 246136184.0, "residual_var": 0.030468784272670746, "reward": 0.732421875, "reward_std": 0.20867595076560974, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "rho2": 0.6249997019767761, "step": 577 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.8773034508360926e-09, "advantages/std": 0.24804794788360596, "advantages/var": 0.0615277844492681, "completions/clipped_ratio": 0.390625, "epoch": 3.3211469534050178, "grad_norm": 46.607086484899845, "learning_rate": 5.198202434069519e-07, "loss": -0.2009, "num_tokens": 246681970.0, "residual_var": 0.03460940346121788, "reward": 0.740234375, "reward_std": 0.15300408005714417, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "rho2": 0.4374997913837433, "step": 578 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.607110376452312e-09, "advantages/std": 0.3032234311103821, "advantages/var": 0.09194444917435263, "completions/clipped_ratio": 0.5, "epoch": 3.3268817204301078, "grad_norm": 55.77613788381252, "learning_rate": 5.166739609502396e-07, "loss": -0.5418, "num_tokens": 247246433.0, "residual_var": 0.04022574797272682, "reward": 0.66015625, "reward_std": 0.21202974021434784, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.4741191864013672, "rho2": 0.5624995231628418, "step": 579 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 6.574495902897344e-10, "advantages/std": 0.3541421890258789, "advantages/var": 0.12541669004804135, "completions/clipped_ratio": 0.453125, "epoch": 3.332616487455197, "grad_norm": 61.42006939079426, "learning_rate": 5.135339090002084e-07, "loss": -1.1403, "num_tokens": 247794465.0, "residual_var": 0.04311203584074974, "reward": 0.736328125, "reward_std": 0.28023284673690796, "rewards/drgrpo_math_reward/mean": 0.736328125, "rewards/drgrpo_math_reward/std": 0.4410543739795685, "rho2": 0.6562496423721313, "step": 580 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 6.8135790857532976e-09, "advantages/std": 0.3075440526008606, "advantages/var": 0.09458334429016091, "completions/clipped_ratio": 0.3671875, "epoch": 3.338351254480287, "grad_norm": 53.94493195814452, "learning_rate": 5.104001280349479e-07, "loss": -0.8328, "num_tokens": 248341161.0, "residual_var": 0.03842451050877571, "reward": 0.740234375, "reward_std": 0.22291725873947144, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "rho2": 0.5937497615814209, "step": 581 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.1215178633995786e-09, "advantages/std": 0.2743679881095886, "advantages/var": 0.07527779289930336, "completions/clipped_ratio": 0.2421875, "epoch": 3.3440860215053765, "grad_norm": 45.53107665061829, "learning_rate": 5.072726584517085e-07, "loss": -0.6864, "num_tokens": 248880407.0, "residual_var": 0.0399913527071476, "reward": 0.7421875, "reward_std": 0.17775756120681763, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "rho2": 0.4687497615814209, "step": 582 }, { "advantages/mean": 2.6775524020195007e-09, "advantages/snr": 7.895670705089187e-09, "advantages/std": 0.33911651372909546, "advantages/var": 0.11500000988377579, "completions/clipped_ratio": 0.2734375, "epoch": 3.349820788530466, "grad_norm": 59.009677842623546, "learning_rate": 5.041515405663821e-07, "loss": -2.3923, "num_tokens": 249423355.0, "residual_var": 0.039531297981739044, "reward": 0.66015625, "reward_std": 0.26331719756126404, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.4741191864013672, "rho2": 0.6562496423721313, "step": 583 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 2.2000425181191404e-09, "advantages/std": 0.26457515358924866, "advantages/var": 0.07000001189677452, "completions/clipped_ratio": 0.0859375, "epoch": 3.3555555555555556, "grad_norm": 43.06354585197565, "learning_rate": 5.010368146129814e-07, "loss": -1.1462, "num_tokens": 249940418.0, "residual_var": 0.0350000225007534, "reward": 0.77734375, "reward_std": 0.1743982434272766, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41643625497817993, "rho2": 0.4999997615814209, "step": 584 }, { "advantages/mean": 1.5133991837501526e-09, "advantages/snr": 4.615823783050576e-09, "advantages/std": 0.3278719484806061, "advantages/var": 0.10750001460046921, "completions/clipped_ratio": 0.2265625, "epoch": 3.361290322580645, "grad_norm": 59.955386117300385, "learning_rate": 4.979285207431203e-07, "loss": -2.1362, "num_tokens": 250494883.0, "residual_var": 0.030234409496188164, "reward": 0.8125, "reward_std": 0.2608744502067566, "rewards/drgrpo_math_reward/mean": 0.8125, "rewards/drgrpo_math_reward/std": 0.39069411158561707, "rho2": 0.7187497615814209, "step": 585 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 8.025461779395881e-10, "advantages/std": 0.2901149392127991, "advantages/var": 0.0841666779544461, "completions/clipped_ratio": -0.0703125, "epoch": 3.3670250896057348, "grad_norm": 47.214142791824344, "learning_rate": 4.948266990254988e-07, "loss": -1.9951, "num_tokens": 251019608.0, "residual_var": 0.042083363980054855, "reward": 0.78515625, "reward_std": 0.20320621132850647, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4111155867576599, "rho2": 0.4999997317790985, "step": 586 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.4211718062729363e-09, "advantages/std": 0.32766008377075195, "advantages/var": 0.10736113049665619, "completions/clipped_ratio": 0.1015625, "epoch": 3.3727598566308243, "grad_norm": 53.987913509426456, "learning_rate": 4.917313894453841e-07, "loss": -2.7089, "num_tokens": 251553967.0, "residual_var": 0.030195359140634537, "reward": 0.771484375, "reward_std": 0.26409682631492615, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "rho2": 0.7187496423721313, "step": 587 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 1.8242919797438261e-09, "advantages/std": 0.31906986236572266, "advantages/var": 0.1018055770700812, "completions/clipped_ratio": 0.09375, "epoch": 3.378494623655914, "grad_norm": 52.4338554685557, "learning_rate": 4.886426319040964e-07, "loss": -1.6207, "num_tokens": 252080633.0, "residual_var": 0.03499569371342659, "reward": 0.814453125, "reward_std": 0.24676111340522766, "rewards/drgrpo_math_reward/mean": 0.814453125, "rewards/drgrpo_math_reward/std": 0.38912075757980347, "rho2": 0.6562497615814209, "step": 588 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.104703777922391e-09, "advantages/std": 0.3318718373775482, "advantages/var": 0.11013891644434981, "completions/clipped_ratio": -0.0078125, "epoch": 3.3842293906810035, "grad_norm": 58.88763512441123, "learning_rate": 4.855604662184934e-07, "loss": -2.6303, "num_tokens": 252608584.0, "residual_var": 0.0413021445274353, "reward": 0.712890625, "reward_std": 0.24505165219306946, "rewards/drgrpo_math_reward/mean": 0.712890625, "rewards/drgrpo_math_reward/std": 0.45285552740097046, "rho2": 0.6249995231628418, "step": 589 }, { "advantages/mean": -1.57160684466362e-09, "advantages/snr": 5.0841214251339094e-09, "advantages/std": 0.30912062525749207, "advantages/var": 0.09555556095958284, "completions/clipped_ratio": -0.015625, "epoch": 3.389964157706093, "grad_norm": 53.03640022462273, "learning_rate": 4.8248493212046e-07, "loss": -3.9159, "num_tokens": 253137082.0, "residual_var": 0.04180559143424034, "reward": 0.76171875, "reward_std": 0.22178566455841064, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.42644867300987244, "rho2": 0.5624997019767761, "step": 590 }, { "advantages/mean": -2.9103830456733704e-10, "advantages/snr": 9.184324890274977e-10, "advantages/std": 0.31688588857650757, "advantages/var": 0.10041666637892277, "completions/clipped_ratio": -0.125, "epoch": 3.3956989247311826, "grad_norm": 50.60238196148447, "learning_rate": 4.794160692563917e-07, "loss": -1.9858, "num_tokens": 253656300.0, "residual_var": 0.03765628859400749, "reward": 0.783203125, "reward_std": 0.2338705062866211, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "rho2": 0.6249997019767761, "step": 591 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.1005977231444695e-09, "advantages/std": 0.3173238933086395, "advantages/var": 0.10069445326455284, "completions/clipped_ratio": 0.21875, "epoch": 3.4014336917562726, "grad_norm": 54.01015997421203, "learning_rate": 4.7635391718668693e-07, "loss": -2.5793, "num_tokens": 254210325.0, "residual_var": 0.044053856283426285, "reward": 0.669921875, "reward_std": 0.22614558041095734, "rewards/drgrpo_math_reward/mean": 0.669921875, "rewards/drgrpo_math_reward/std": 0.47070086002349854, "rho2": 0.5624997615814209, "step": 592 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 2.5336963499894834e-09, "advantages/std": 0.2297341525554657, "advantages/var": 0.05277778085037799, "completions/clipped_ratio": 0.140625, "epoch": 3.4071684587813618, "grad_norm": 42.40034108279046, "learning_rate": 4.7329851538523545e-07, "loss": -2.4422, "num_tokens": 254755406.0, "residual_var": 0.02803821489214897, "reward": 0.76953125, "reward_std": 0.14711953699588776, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.42154473066329956, "rho2": 0.4687497913837433, "step": 593 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.440066161713032e-09, "advantages/std": 0.28625941276550293, "advantages/var": 0.08194445139685058, "completions/clipped_ratio": 0.09375, "epoch": 3.412903225806452, "grad_norm": 46.82756751123162, "learning_rate": 4.7024990323891103e-07, "loss": -1.5409, "num_tokens": 255299516.0, "residual_var": 0.03585072234272957, "reward": 0.77734375, "reward_std": 0.20129705965518951, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41643625497817993, "rho2": 0.5624997615814209, "step": 594 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 2.7354195649464033e-09, "advantages/std": 0.2979094088077545, "advantages/var": 0.0887500158561858, "completions/clipped_ratio": 0.3125, "epoch": 3.4186379928315414, "grad_norm": 49.46541007948821, "learning_rate": 4.672081200470611e-07, "loss": -2.4231, "num_tokens": 255843505.0, "residual_var": 0.03605472669005394, "reward": 0.724609375, "reward_std": 0.21411941945552826, "rewards/drgrpo_math_reward/mean": 0.724609375, "rewards/drgrpo_math_reward/std": 0.44714778661727905, "rho2": 0.5937496423721313, "step": 595 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.4380008363258722e-09, "advantages/std": 0.2865019142627716, "advantages/var": 0.08208334687623253, "completions/clipped_ratio": 0.46875, "epoch": 3.424372759856631, "grad_norm": 48.88408422621982, "learning_rate": 4.641732050210031e-07, "loss": -0.8136, "num_tokens": 256397114.0, "residual_var": 0.03591148182749748, "reward": 0.744140625, "reward_std": 0.20725850760936737, "rewards/drgrpo_math_reward/mean": 0.744140625, "rewards/drgrpo_math_reward/std": 0.43676990270614624, "rho2": 0.5624998211860657, "step": 596 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.2469554913566726e-10, "advantages/std": 0.2741147577762604, "advantages/var": 0.0751389004307379, "completions/clipped_ratio": 0.234375, "epoch": 3.4301075268817205, "grad_norm": 45.58256832806369, "learning_rate": 4.611451972835175e-07, "loss": -1.9567, "num_tokens": 256926796.0, "residual_var": 0.03287329152226448, "reward": 0.771484375, "reward_std": 0.19365635514259338, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "rho2": 0.5624997615814209, "step": 597 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 5.85132065517757e-10, "advantages/std": 0.19895562529563904, "advantages/var": 0.039583340836778724, "completions/clipped_ratio": 0.34375, "epoch": 3.43584229390681, "grad_norm": 35.91342765843857, "learning_rate": 4.5812413586834275e-07, "loss": -3.1903, "num_tokens": 257470972.0, "residual_var": 0.025976581498980522, "reward": 0.720703125, "reward_std": 0.10849019140005112, "rewards/drgrpo_math_reward/mean": 0.720703125, "rewards/drgrpo_math_reward/std": 0.44909247756004333, "rho2": 0.3437497913837433, "step": 598 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.065134523018892e-09, "advantages/std": 0.2818589210510254, "advantages/var": 0.07944445137604816, "completions/clipped_ratio": 0.3984375, "epoch": 3.4415770609318996, "grad_norm": 48.08708841706313, "learning_rate": 4.5511005971967366e-07, "loss": -2.3604, "num_tokens": 258015200.0, "residual_var": 0.01986113376915455, "reward": 0.83984375, "reward_std": 0.2304096221923828, "rewards/drgrpo_math_reward/mean": 0.83984375, "rewards/drgrpo_math_reward/std": 0.3671095669269562, "rho2": 0.7499997615814209, "step": 599 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.169992346986152e-09, "advantages/std": 0.32188680768013, "advantages/var": 0.103611116958505, "completions/clipped_ratio": 0.453125, "epoch": 3.447311827956989, "grad_norm": 57.805101907795475, "learning_rate": 4.5210300769165797e-07, "loss": -3.7452, "num_tokens": 258566155.0, "residual_var": 0.032378509640693665, "reward": 0.7265625, "reward_std": 0.24977071583271027, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.4461594223976135, "rho2": 0.6874997019767761, "step": 600 }, { "advantages/mean": -2.2118911147117615e-09, "advantages/snr": 6.414946875054919e-09, "advantages/std": 0.34480270743370056, "advantages/var": 0.1188889070536101, "completions/clipped_ratio": 0.609375, "epoch": 3.4530465949820788, "grad_norm": 57.260146979135236, "learning_rate": 4.4910301854789755e-07, "loss": -2.5664, "num_tokens": 259122492.0, "residual_var": 0.026006992906332016, "reward": 0.76171875, "reward_std": 0.28907182812690735, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.42644867300987244, "rho2": 0.7812496423721313, "step": 601 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 3.418480567097896e-10, "advantages/std": 0.34054696559906006, "advantages/var": 0.1159722357787274, "completions/clipped_ratio": 0.40625, "epoch": 3.4587813620071683, "grad_norm": 62.80592521929181, "learning_rate": 4.461101309609461e-07, "loss": -3.3671, "num_tokens": 259681979.0, "residual_var": 0.04348963871598244, "reward": 0.736328125, "reward_std": 0.25533509254455566, "rewards/drgrpo_math_reward/mean": 0.736328125, "rewards/drgrpo_math_reward/std": 0.4410543739795685, "rho2": 0.6249996423721313, "step": 602 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 3.119901018045968e-09, "advantages/std": 0.33582407236099243, "advantages/var": 0.11277780757712108, "completions/clipped_ratio": 0.71875, "epoch": 3.464516129032258, "grad_norm": 62.42241115394677, "learning_rate": 4.431243835118124e-07, "loss": -5.2576, "num_tokens": 260250248.0, "residual_var": 0.052864622324705124, "reward": 0.53125, "reward_std": 0.2481713891029358, "rewards/drgrpo_math_reward/mean": 0.53125, "rewards/drgrpo_math_reward/std": 0.4995105266571045, "rho2": 0.5312497615814209, "step": 603 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.4738402710196424e-09, "advantages/std": 0.2823512554168701, "advantages/var": 0.07972223143548263, "completions/clipped_ratio": 0.6640625, "epoch": 3.4702508960573475, "grad_norm": 51.68612580370042, "learning_rate": 4.401458146894618e-07, "loss": -3.9494, "num_tokens": 260826875.0, "residual_var": 0.03986114636063576, "reward": 0.6484375, "reward_std": 0.18799927830696106, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4779251217842102, "rho2": 0.4999997019767761, "step": 604 }, { "advantages/mean": 2.561137080192566e-09, "advantages/snr": 8.087800390791659e-09, "advantages/std": 0.31666669249534607, "advantages/var": 0.10027779413594207, "completions/clipped_ratio": 0.4921875, "epoch": 3.4759856630824375, "grad_norm": 61.081567513478966, "learning_rate": 4.37174462890322e-07, "loss": -5.4503, "num_tokens": 261383813.0, "residual_var": 0.037604209035634995, "reward": 0.6953125, "reward_std": 0.23305147886276245, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4607250988483429, "rho2": 0.6249996423721313, "step": 605 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 8.800170072476561e-10, "advantages/std": 0.26457515358924866, "advantages/var": 0.07000001189677452, "completions/clipped_ratio": 0.6875, "epoch": 3.481720430107527, "grad_norm": 45.53873884322227, "learning_rate": 4.3421036641778553e-07, "loss": -2.1612, "num_tokens": 261940177.0, "residual_var": 0.03281252831220627, "reward": 0.65625, "reward_std": 0.18145284056663513, "rewards/drgrpo_math_reward/mean": 0.65625, "rewards/drgrpo_math_reward/std": 0.4754233956336975, "rho2": 0.5312497615814209, "step": 606 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 4.3744803213578465e-09, "advantages/std": 0.29273614287376404, "advantages/var": 0.08569444934460879, "completions/clipped_ratio": 0.6796875, "epoch": 3.4874551971326166, "grad_norm": 49.100984824718296, "learning_rate": 4.3125356348171813e-07, "loss": -4.9726, "num_tokens": 262493066.0, "residual_var": 0.034813400357961655, "reward": 0.720703125, "reward_std": 0.21164387464523315, "rewards/drgrpo_math_reward/mean": 0.720703125, "rewards/drgrpo_math_reward/std": 0.44909247756004333, "rho2": 0.5937497019767761, "step": 607 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 7.332262928312176e-10, "advantages/std": 0.3175426721572876, "advantages/var": 0.10083334864079063, "completions/clipped_ratio": 0.703125, "epoch": 3.493189964157706, "grad_norm": 56.33711335953415, "learning_rate": 4.283040921979646e-07, "loss": -4.2296, "num_tokens": 263043616.0, "residual_var": 0.040963590145111084, "reward": 0.76953125, "reward_std": 0.23531542718410492, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.42154473066329956, "rho2": 0.5937496423721313, "step": 608 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 2.630482176824459e-09, "advantages/std": 0.3097938597202301, "advantages/var": 0.09597223552035761, "completions/clipped_ratio": 0.765625, "epoch": 3.498924731182796, "grad_norm": 53.90828369842717, "learning_rate": 4.253619905878588e-07, "loss": -3.4256, "num_tokens": 263621847.0, "residual_var": 0.023993084207177162, "reward": 0.701171875, "reward_std": 0.25351446866989136, "rewards/drgrpo_math_reward/mean": 0.701171875, "rewards/drgrpo_math_reward/std": 0.45819199085235596, "rho2": 0.7499997615814209, "step": 609 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.223146700014547e-09, "advantages/std": 0.2855307161808014, "advantages/var": 0.08152778988272136, "completions/clipped_ratio": 0.625, "epoch": 3.5046594982078854, "grad_norm": 50.62322092397635, "learning_rate": 4.224272965777326e-07, "loss": -4.0381, "num_tokens": 264181806.0, "residual_var": 0.04076393321156502, "reward": 0.681640625, "reward_std": 0.19444715976715088, "rewards/drgrpo_math_reward/mean": 0.681640625, "rewards/drgrpo_math_reward/std": 0.46629536151885986, "rho2": 0.49999967217445374, "step": 610 }, { "advantages/mean": -4.0745362639427185e-10, "advantages/snr": 1.3519341227038006e-09, "advantages/std": 0.30138570070266724, "advantages/var": 0.09083334058803771, "completions/clipped_ratio": 0.5859375, "epoch": 3.510394265232975, "grad_norm": 55.664478203309805, "learning_rate": 4.195000479984264e-07, "loss": -5.4936, "num_tokens": 264735253.0, "residual_var": 0.03973960876464844, "reward": 0.734375, "reward_std": 0.2176605463027954, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44209739565849304, "rho2": 0.5624998211860657, "step": 611 }, { "advantages/mean": 3.14321368932724e-09, "advantages/snr": 9.240971143175652e-09, "advantages/std": 0.34013888239860535, "advantages/var": 0.11569445931937228, "completions/clipped_ratio": 0.7734375, "epoch": 3.5161290322580645, "grad_norm": 68.03781613038481, "learning_rate": 4.1658028258480426e-07, "loss": -5.8869, "num_tokens": 265299592.0, "residual_var": 0.032539110630750656, "reward": 0.751953125, "reward_std": 0.2744312286376953, "rewards/drgrpo_math_reward/mean": 0.751953125, "rewards/drgrpo_math_reward/std": 0.4323015511035919, "rho2": 0.7187496423721313, "step": 612 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 5.975803216420473e-10, "advantages/std": 0.3896223306655884, "advantages/var": 0.1518055605532851, "completions/clipped_ratio": 0.7578125, "epoch": 3.521863799283154, "grad_norm": 64.38201117366431, "learning_rate": 4.1366803797526373e-07, "loss": -1.8941, "num_tokens": 265859986.0, "residual_var": 0.014231827110052109, "reward": 0.642578125, "reward_std": 0.3519251346588135, "rewards/drgrpo_math_reward/mean": 0.642578125, "rewards/drgrpo_math_reward/std": 0.4797092080116272, "rho2": 0.9062496423721313, "step": 613 }, { "advantages/mean": 2.0954757928848267e-09, "advantages/snr": 6.1385794168054335e-09, "advantages/std": 0.3413616716861725, "advantages/var": 0.11652779089637821, "completions/clipped_ratio": 0.796875, "epoch": 3.5275985663082436, "grad_norm": 62.65678532340262, "learning_rate": 4.1076335171125286e-07, "loss": -6.5157, "num_tokens": 266430553.0, "residual_var": 0.04005648195743561, "reward": 0.705078125, "reward_std": 0.25851887464523315, "rewards/drgrpo_math_reward/mean": 0.705078125, "rewards/drgrpo_math_reward/std": 0.4564536213874817, "rho2": 0.6562495827674866, "step": 614 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.1863293152027555e-09, "advantages/std": 0.29439207911491394, "advantages/var": 0.08666669624560175, "completions/clipped_ratio": 0.71875, "epoch": 3.533333333333333, "grad_norm": 52.07873704593748, "learning_rate": 4.078662612367868e-07, "loss": -2.6079, "num_tokens": 267012673.0, "residual_var": 0.0297916941344738, "reward": 0.76953125, "reward_std": 0.2250671237707138, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.42154473066329956, "rho2": 0.6562497615814209, "step": 615 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.5086006648268286e-09, "advantages/std": 0.3086709976196289, "advantages/var": 0.09527778477149695, "completions/clipped_ratio": 0.65625, "epoch": 3.539068100358423, "grad_norm": 53.72043078784469, "learning_rate": 4.049768038979631e-07, "loss": -4.2782, "num_tokens": 267588110.0, "residual_var": 0.041684072464704514, "reward": 0.76953125, "reward_std": 0.2212802618741989, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.42154473066329956, "rho2": 0.5624996423721313, "step": 616 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.651916560108933e-09, "advantages/std": 0.3511884808540344, "advantages/var": 0.1233333490845645, "completions/clipped_ratio": 0.75, "epoch": 3.5448028673835124, "grad_norm": 67.04114711633068, "learning_rate": 4.020950169424815e-07, "loss": -0.704, "num_tokens": 268166282.0, "residual_var": 0.034687552601099014, "reward": 0.70703125, "reward_std": 0.28241097927093506, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.455569326877594, "rho2": 0.7187496423721313, "step": 617 }, { "advantages/mean": 1.57160684466362e-09, "advantages/snr": 4.8089135972356244e-09, "advantages/std": 0.32681119441986084, "advantages/var": 0.10680555679813608, "completions/clipped_ratio": 0.75, "epoch": 3.5505376344086024, "grad_norm": 65.94498822850325, "learning_rate": 3.992209375191634e-07, "loss": -5.0446, "num_tokens": 268730354.0, "residual_var": 0.046727459877729416, "reward": 0.705078125, "reward_std": 0.24286863207817078, "rewards/drgrpo_math_reward/mean": 0.705078125, "rewards/drgrpo_math_reward/std": 0.4564536213874817, "rho2": 0.5624998211860657, "step": 618 }, { "advantages/mean": -1.1059455573558807e-09, "advantages/snr": 4.235056517897478e-09, "advantages/std": 0.2611406743526459, "advantages/var": 0.06819445180135464, "completions/clipped_ratio": 0.7109375, "epoch": 3.5562724014336915, "grad_norm": 49.75890019731969, "learning_rate": 3.963546026774741e-07, "loss": -3.4091, "num_tokens": 269297807.0, "residual_var": 0.031966179609298706, "reward": 0.681640625, "reward_std": 0.17737171053886414, "rewards/drgrpo_math_reward/mean": 0.681640625, "rewards/drgrpo_math_reward/std": 0.46629536151885986, "rho2": 0.5312496423721313, "step": 619 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 4.022700611399606e-10, "advantages/std": 0.28939592838287354, "advantages/var": 0.08375000336458527, "completions/clipped_ratio": 0.71875, "epoch": 3.5620071684587815, "grad_norm": 54.89323331192145, "learning_rate": 3.934960493670441e-07, "loss": -5.8738, "num_tokens": 269852751.0, "residual_var": 0.036640655249357224, "reward": 0.587890625, "reward_std": 0.20185019075870514, "rewards/drgrpo_math_reward/mean": 0.587890625, "rewards/drgrpo_math_reward/std": 0.49269601702690125, "rho2": 0.5624997615814209, "step": 620 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.2813657224178314, "advantages/var": 0.07916666975170816, "completions/clipped_ratio": 0.7578125, "epoch": 3.567741935483871, "grad_norm": 50.42961379445049, "learning_rate": 3.9064531443719194e-07, "loss": -1.4525, "num_tokens": 270415397.0, "residual_var": 0.044531289488077164, "reward": 0.71484375, "reward_std": 0.17914927005767822, "rewards/drgrpo_math_reward/mean": 0.71484375, "rewards/drgrpo_math_reward/std": 0.45193037390708923, "rho2": 0.43749964237213135, "step": 621 }, { "advantages/mean": -1.979060471057892e-09, "advantages/snr": 7.3855013028886146e-09, "advantages/std": 0.2679656147956848, "advantages/var": 0.07180557071282934, "completions/clipped_ratio": 0.71875, "epoch": 3.5734767025089607, "grad_norm": 54.58831441383573, "learning_rate": 3.8780243463645093e-07, "loss": -2.9022, "num_tokens": 270971761.0, "residual_var": 0.03365888074040413, "reward": 0.685546875, "reward_std": 0.1830367147922516, "rewards/drgrpo_math_reward/mean": 0.685546875, "rewards/drgrpo_math_reward/std": 0.4647517800331116, "rho2": 0.5312498211860657, "step": 622 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 7.68432293038127e-10, "advantages/std": 0.30299434065818787, "advantages/var": 0.09180557047089, "completions/clipped_ratio": 0.7578125, "epoch": 3.5792114695340502, "grad_norm": 55.80729983707128, "learning_rate": 3.849674466120951e-07, "loss": -5.273, "num_tokens": 271543340.0, "residual_var": 0.04303388670086861, "reward": 0.697265625, "reward_std": 0.2128678560256958, "rewards/drgrpo_math_reward/mean": 0.697265625, "rewards/drgrpo_math_reward/std": 0.45989060401916504, "rho2": 0.5312497615814209, "step": 623 }, { "advantages/mean": -8.731149137020111e-10, "advantages/snr": 2.548644428413931e-09, "advantages/std": 0.342580109834671, "advantages/var": 0.11736113165433526, "completions/clipped_ratio": 0.609375, "epoch": 3.58494623655914, "grad_norm": 63.98928568837254, "learning_rate": 3.8214038690966577e-07, "loss": -6.5847, "num_tokens": 272109549.0, "residual_var": 0.036675386130809784, "reward": 0.650390625, "reward_std": 0.26814505457878113, "rewards/drgrpo_math_reward/mean": 0.650390625, "rewards/drgrpo_math_reward/std": 0.47731292247772217, "rho2": 0.6874997615814209, "step": 624 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 3.355446283709112e-09, "advantages/std": 0.31224992871284485, "advantages/var": 0.09750001798117669, "completions/clipped_ratio": 0.4609375, "epoch": 3.5906810035842294, "grad_norm": 57.44252378704577, "learning_rate": 3.79321291972501e-07, "loss": -6.3942, "num_tokens": 272657803.0, "residual_var": 0.051796913146972656, "reward": 0.67578125, "reward_std": 0.21366894245147705, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.4685399830341339, "rho2": 0.4687497615814209, "step": 625 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.761368783015557e-09, "advantages/std": 0.3372684419155121, "advantages/var": 0.11375000191211715, "completions/clipped_ratio": 0.5, "epoch": 3.596415770609319, "grad_norm": 62.81440510982691, "learning_rate": 3.765101981412665e-07, "loss": -3.786, "num_tokens": 273205958.0, "residual_var": 0.03554692491889, "reward": 0.779296875, "reward_std": 0.262253999710083, "rewards/drgrpo_math_reward/mean": 0.779296875, "rewards/drgrpo_math_reward/std": 0.4151262938976288, "rho2": 0.6874996423721313, "step": 626 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 9.686339483468415e-10, "advantages/std": 0.24037009477615356, "advantages/var": 0.05777778246269705, "completions/clipped_ratio": 0.515625, "epoch": 3.6021505376344085, "grad_norm": 47.85429432872602, "learning_rate": 3.7370714165348616e-07, "loss": -0.9323, "num_tokens": 273747439.0, "residual_var": 0.03430557623505592, "reward": 0.734375, "reward_std": 0.1432015299797058, "rewards/drgrpo_math_reward/mean": 0.734375, "rewards/drgrpo_math_reward/std": 0.44209739565849304, "rho2": 0.4062498211860657, "step": 627 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.6559074817016132e-09, "advantages/std": 0.26299557089805603, "advantages/var": 0.06916667031199442, "completions/clipped_ratio": 0.6171875, "epoch": 3.607885304659498, "grad_norm": 50.85223192967286, "learning_rate": 3.709121586430752e-07, "loss": -6.0863, "num_tokens": 274292737.0, "residual_var": 0.041067738085985184, "reward": 0.7578125, "reward_std": 0.1568962037563324, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "rho2": 0.4062497615814209, "step": 628 }, { "advantages/mean": 1.979060471057892e-09, "advantages/snr": 6.9252685402203945e-09, "advantages/std": 0.2857738137245178, "advantages/var": 0.08166667261065541, "completions/clipped_ratio": 0.4765625, "epoch": 3.6136200716845877, "grad_norm": 50.77110834514133, "learning_rate": 3.681252851398743e-07, "loss": -5.1995, "num_tokens": 274839673.0, "residual_var": 0.035729195922613144, "reward": 0.6015625, "reward_std": 0.1992138773202896, "rewards/drgrpo_math_reward/mean": 0.6015625, "rewards/drgrpo_math_reward/std": 0.4900552034378052, "rho2": 0.5624997615814209, "step": 629 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.103646436610098e-09, "advantages/std": 0.31644731760025024, "advantages/var": 0.10013890481639365, "completions/clipped_ratio": 0.3125, "epoch": 3.6193548387096772, "grad_norm": 61.956972894167954, "learning_rate": 3.6534655706918605e-07, "loss": -7.6569, "num_tokens": 275374481.0, "residual_var": 0.04381079971790314, "reward": 0.708984375, "reward_std": 0.22866389155387878, "rewards/drgrpo_math_reward/mean": 0.708984375, "rewards/drgrpo_math_reward/std": 0.45467492938041687, "rho2": 0.5624997615814209, "step": 630 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.318751946000626e-09, "advantages/std": 0.2806243300437927, "advantages/var": 0.07875001461252751, "completions/clipped_ratio": 0.3671875, "epoch": 3.6250896057347672, "grad_norm": 60.80157962760746, "learning_rate": 3.625760102513102e-07, "loss": -4.8638, "num_tokens": 275917943.0, "residual_var": 0.0418359711766243, "reward": 0.728515625, "reward_std": 0.18503239750862122, "rewards/drgrpo_math_reward/mean": 0.728515625, "rewards/drgrpo_math_reward/std": 0.44516023993492126, "rho2": 0.4687497019767761, "step": 631 }, { "advantages/mean": -5.238689482212067e-10, "advantages/snr": 1.6801182321784876e-09, "advantages/std": 0.31180480122566223, "advantages/var": 0.09722223406737474, "completions/clipped_ratio": 0.1875, "epoch": 3.6308243727598564, "grad_norm": 53.82829356509245, "learning_rate": 3.598136804010836e-07, "loss": -2.6743, "num_tokens": 276447397.0, "residual_var": 0.036458373069763184, "reward": 0.77734375, "reward_std": 0.23653282225131989, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41643625497817993, "rho2": 0.6249997019767761, "step": 632 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.3292891054001202e-09, "advantages/std": 0.2627313733100891, "advantages/var": 0.0690277745214054, "completions/clipped_ratio": 0.2265625, "epoch": 3.6365591397849464, "grad_norm": 51.2587689760945, "learning_rate": 3.570596031274189e-07, "loss": -7.2652, "num_tokens": 276984013.0, "residual_var": 0.03667103499174118, "reward": 0.724609375, "reward_std": 0.16789719462394714, "rewards/drgrpo_math_reward/mean": 0.724609375, "rewards/drgrpo_math_reward/std": 0.44714778661727905, "rho2": 0.4687497913837433, "step": 633 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 3.8363620957032045e-09, "advantages/std": 0.30345237255096436, "advantages/var": 0.09208334240680927, "completions/clipped_ratio": 0.015625, "epoch": 3.642293906810036, "grad_norm": 54.951289795891334, "learning_rate": 3.5431381393284497e-07, "loss": -4.8569, "num_tokens": 277494385.0, "residual_var": 0.04028650000691414, "reward": 0.845703125, "reward_std": 0.21489115059375763, "rewards/drgrpo_math_reward/mean": 0.845703125, "rewards/drgrpo_math_reward/std": 0.36158639192581177, "rho2": 0.5624996423721313, "step": 634 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 2.097114638065524e-09, "advantages/std": 0.2220485508441925, "advantages/var": 0.049305558932005944, "completions/clipped_ratio": -0.078125, "epoch": 3.6480286738351255, "grad_norm": 41.43726900858742, "learning_rate": 3.515763482130505e-07, "loss": -3.1155, "num_tokens": 278003598.0, "residual_var": 0.029275190085172653, "reward": 0.755859375, "reward_std": 0.13418962061405182, "rewards/drgrpo_math_reward/mean": 0.755859375, "rewards/drgrpo_math_reward/std": 0.42999663949012756, "rho2": 0.40624988079071045, "step": 635 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 5.238271132666591e-09, "advantages/std": 0.3111359477043152, "advantages/var": 0.09680557795386235, "completions/clipped_ratio": -0.1171875, "epoch": 3.653763440860215, "grad_norm": 60.53863810232421, "learning_rate": 3.488472412564264e-07, "loss": -7.1862, "num_tokens": 278519175.0, "residual_var": 0.04537764564156532, "reward": 0.775390625, "reward_std": 0.21887236833572388, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "rho2": 0.5312497019767761, "step": 636 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 2.5733910334337096e-09, "advantages/std": 0.31666669249534607, "advantages/var": 0.10027779413594207, "completions/clipped_ratio": 0.0234375, "epoch": 3.6594982078853047, "grad_norm": 57.39803090473713, "learning_rate": 3.4612652824361297e-07, "loss": -6.5146, "num_tokens": 279034301.0, "residual_var": 0.050138939172029495, "reward": 0.7578125, "reward_std": 0.21617728471755981, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "rho2": 0.49999967217445374, "step": 637 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 8.835300607796899e-10, "advantages/std": 0.2635231614112854, "advantages/var": 0.06944445660019838, "completions/clipped_ratio": -0.1484375, "epoch": 3.6652329749103942, "grad_norm": 46.01844535784208, "learning_rate": 3.434142442470437e-07, "loss": -2.1311, "num_tokens": 279562216.0, "residual_var": 0.03689238056540489, "reward": 0.63671875, "reward_std": 0.17081737518310547, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.4814152419567108, "rho2": 0.46874985098838806, "step": 638 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.2312505263685483e-09, "advantages/std": 0.26087459921836853, "advantages/var": 0.0680555565173444, "completions/clipped_ratio": -0.296875, "epoch": 3.670967741935484, "grad_norm": 55.50079917902629, "learning_rate": 3.407104242304951e-07, "loss": -2.0701, "num_tokens": 280062687.0, "residual_var": 0.03615453839302063, "reward": 0.65234375, "reward_std": 0.16745707392692566, "rewards/drgrpo_math_reward/mean": 0.65234375, "rewards/drgrpo_math_reward/std": 0.47669193148612976, "rho2": 0.4687498211860657, "step": 639 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.2467327840668737e-09, "advantages/std": 0.28012895584106445, "advantages/var": 0.07847223190060504, "completions/clipped_ratio": -0.3203125, "epoch": 3.6767025089605734, "grad_norm": 54.157781671491286, "learning_rate": 3.38015103048635e-07, "loss": -3.1612, "num_tokens": 280560743.0, "residual_var": 0.03433162346482277, "reward": 0.755859375, "reward_std": 0.19844113290309906, "rewards/drgrpo_math_reward/mean": 0.755859375, "rewards/drgrpo_math_reward/std": 0.42999663949012756, "rho2": 0.5624997615814209, "step": 640 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.736423645836594e-09, "advantages/std": 0.3403429687023163, "advantages/var": 0.11583333634510584, "completions/clipped_ratio": -0.109375, "epoch": 3.682437275985663, "grad_norm": 61.65974524931173, "learning_rate": 3.3532831544657456e-07, "loss": -9.335, "num_tokens": 281073547.0, "residual_var": 0.057916704565286636, "reward": 0.75390625, "reward_std": 0.24700871109962463, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "rho2": 0.4999997317790985, "step": 641 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.294178839304053e-09, "advantages/std": 0.32532036304473877, "advantages/var": 0.10583333861156063, "completions/clipped_ratio": -0.4609375, "epoch": 3.688172043010753, "grad_norm": 62.862769229564954, "learning_rate": 3.3265009605941797e-07, "loss": -5.0745, "num_tokens": 281577873.0, "residual_var": 0.04299485310912132, "reward": 0.70703125, "reward_std": 0.23834869265556335, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.455569326877594, "rho2": 0.5937495231628418, "step": 642 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 3.98386122255242e-09, "advantages/std": 0.26299557089805603, "advantages/var": 0.06916667031199442, "completions/clipped_ratio": -0.140625, "epoch": 3.693906810035842, "grad_norm": 49.420269719308024, "learning_rate": 3.2998047941181893e-07, "loss": -1.8332, "num_tokens": 282098897.0, "residual_var": 0.04755210131406784, "reward": 0.7265625, "reward_std": 0.1508270502090454, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.4461594223976135, "rho2": 0.31249985098838806, "step": 643 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.22360680997371674, "advantages/var": 0.050000005466621866, "completions/clipped_ratio": -0.2109375, "epoch": 3.699641577060932, "grad_norm": 44.444833051572594, "learning_rate": 3.273194999175328e-07, "loss": -1.7158, "num_tokens": 282601696.0, "residual_var": 0.03125001862645149, "reward": 0.76953125, "reward_std": 0.12714432179927826, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.42154473066329956, "rho2": 0.3749998211860657, "step": 644 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 4.1742870957851774e-10, "advantages/std": 0.27888670563697815, "advantages/var": 0.0777777945810465, "completions/clipped_ratio": -0.140625, "epoch": 3.7053763440860212, "grad_norm": 49.13612272692265, "learning_rate": 3.246671918789755e-07, "loss": -1.8402, "num_tokens": 283123248.0, "residual_var": 0.04131946712732315, "reward": 0.7109375, "reward_std": 0.1792363077402115, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45377036929130554, "rho2": 0.4687498211860657, "step": 645 }, { "advantages/mean": 1.7462298274040222e-09, "advantages/snr": 6.184600677549105e-09, "advantages/std": 0.2823512554168701, "advantages/var": 0.07972223143548263, "completions/clipped_ratio": -0.0546875, "epoch": 3.7111111111111112, "grad_norm": 51.1640141189998, "learning_rate": 3.220235894867793e-07, "loss": -3.2455, "num_tokens": 283640558.0, "residual_var": 0.044843778014183044, "reward": 0.7265625, "reward_std": 0.1805747002363205, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.4461594223976135, "rho2": 0.4374997615814209, "step": 646 }, { "advantages/mean": 3.725290298461914e-09, "advantages/snr": 1.1246380237818753e-08, "advantages/std": 0.33124348521232605, "advantages/var": 0.10972224649560847, "completions/clipped_ratio": 0.078125, "epoch": 3.716845878136201, "grad_norm": 62.65544120122423, "learning_rate": 3.193887268193525e-07, "loss": -5.2428, "num_tokens": 284154654.0, "residual_var": 0.037717048078775406, "reward": 0.69140625, "reward_std": 0.2510807514190674, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.4623647928237915, "rho2": 0.6562497615814209, "step": 647 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.2008505221245916e-09, "advantages/std": 0.29083216190338135, "advantages/var": 0.08458334639739462, "completions/clipped_ratio": 0.15625, "epoch": 3.7225806451612904, "grad_norm": 53.0270647964177, "learning_rate": 3.1676263784244173e-07, "loss": -2.2122, "num_tokens": 284674566.0, "residual_var": 0.03964846953749657, "reward": 0.724609375, "reward_std": 0.19706541299819946, "rewards/drgrpo_math_reward/mean": 0.724609375, "rewards/drgrpo_math_reward/std": 0.44714778661727905, "rho2": 0.5312497615814209, "step": 648 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.7082816213376141e-09, "advantages/std": 0.2725904583930969, "advantages/var": 0.0743055580069587, "completions/clipped_ratio": 0.0390625, "epoch": 3.72831541218638, "grad_norm": 51.26141892067536, "learning_rate": 3.141453564086921e-07, "loss": -7.2701, "num_tokens": 285174327.0, "residual_var": 0.03483075648546219, "reward": 0.771484375, "reward_std": 0.19014981389045715, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "rho2": 0.5312497615814209, "step": 649 }, { "advantages/mean": -1.7462298274040222e-10, "advantages/snr": 5.75888797056539e-10, "advantages/std": 0.3032234311103821, "advantages/var": 0.09194444917435263, "completions/clipped_ratio": 0.3515625, "epoch": 3.7340501792114695, "grad_norm": 55.00037980960771, "learning_rate": 3.1153691625721133e-07, "loss": -5.7861, "num_tokens": 285715310.0, "residual_var": 0.037352465093135834, "reward": 0.7109375, "reward_std": 0.21891212463378906, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45377036929130554, "rho2": 0.5937497615814209, "step": 650 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.880132323426064e-09, "advantages/std": 0.28625941276550293, "advantages/var": 0.08194445139685058, "completions/clipped_ratio": 0.375, "epoch": 3.739784946236559, "grad_norm": 54.81513372143973, "learning_rate": 3.0893735101313535e-07, "loss": -7.1491, "num_tokens": 286245556.0, "residual_var": 0.04097224399447441, "reward": 0.80078125, "reward_std": 0.1913968026638031, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "rho2": 0.49999988079071045, "step": 651 }, { "advantages/mean": 3.3178366720676422e-09, "advantages/snr": 1.099181245850205e-08, "advantages/std": 0.3018461763858795, "advantages/var": 0.09111111419877549, "completions/clipped_ratio": 0.5, "epoch": 3.7455197132616487, "grad_norm": 59.24962647527454, "learning_rate": 3.0634669418719514e-07, "loss": -4.962, "num_tokens": 286797755.0, "residual_var": 0.0427083745598793, "reward": 0.74609375, "reward_std": 0.2096863090991974, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "rho2": 0.5312496423721313, "step": 652 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 5.461942658079832e-09, "advantages/std": 0.2557668387889862, "advantages/var": 0.06541667582411126, "completions/clipped_ratio": 0.359375, "epoch": 3.7512544802867382, "grad_norm": 48.0678178044986, "learning_rate": 3.0376497917528343e-07, "loss": -5.1512, "num_tokens": 287339990.0, "residual_var": 0.03884117677807808, "reward": 0.712890625, "reward_std": 0.15123911201953888, "rewards/drgrpo_math_reward/mean": 0.712890625, "rewards/drgrpo_math_reward/std": 0.45285552740097046, "rho2": 0.4062497019767761, "step": 653 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.926271277238225e-09, "advantages/std": 0.2835783362388611, "advantages/var": 0.08041667278400055, "completions/clipped_ratio": 0.3359375, "epoch": 3.756989247311828, "grad_norm": 51.63962065590274, "learning_rate": 3.0119223925802485e-07, "loss": -4.9344, "num_tokens": 287864136.0, "residual_var": 0.04272138327360153, "reward": 0.712890625, "reward_std": 0.18232516944408417, "rewards/drgrpo_math_reward/mean": 0.712890625, "rewards/drgrpo_math_reward/std": 0.45285552740097046, "rho2": 0.4687497317790985, "step": 654 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 5.520865001213131e-09, "advantages/std": 0.25303712487220764, "advantages/var": 0.0640277865635932, "completions/clipped_ratio": 0.4453125, "epoch": 3.762724014336918, "grad_norm": 47.06364705089816, "learning_rate": 2.986285076003474e-07, "loss": -3.6252, "num_tokens": 288411241.0, "residual_var": 0.0320139154791832, "reward": 0.693359375, "reward_std": 0.16487500071525574, "rewards/drgrpo_math_reward/mean": 0.693359375, "rewards/drgrpo_math_reward/std": 0.4615498185157776, "rho2": 0.4999997019767761, "step": 655 }, { "advantages/mean": -2.153683453798294e-09, "advantages/snr": 7.0079927811951985e-09, "advantages/std": 0.30731815099716187, "advantages/var": 0.09444444593231438, "completions/clipped_ratio": 0.453125, "epoch": 3.768458781362007, "grad_norm": 55.91449008805667, "learning_rate": 2.9607381725105507e-07, "loss": -4.7045, "num_tokens": 288953960.0, "residual_var": 0.041319482028484344, "reward": 0.7265625, "reward_std": 0.21936340630054474, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.4461594223976135, "rho2": 0.5624997019767761, "step": 656 }, { "advantages/mean": -4.0745362639427185e-10, "advantages/snr": 1.3807359249336013e-09, "advantages/std": 0.2950988709926605, "advantages/var": 0.0870833436611429, "completions/clipped_ratio": 0.3125, "epoch": 3.774193548387097, "grad_norm": 53.78612265404732, "learning_rate": 2.9352820114240005e-07, "loss": -5.2286, "num_tokens": 289483261.0, "residual_var": 0.04082034155726433, "reward": 0.732421875, "reward_std": 0.20207764208316803, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "rho2": 0.5312497615814209, "step": 657 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.1778655848418952e-09, "advantages/std": 0.29650747776031494, "advantages/var": 0.08791668436778366, "completions/clipped_ratio": 0.53125, "epoch": 3.7799283154121865, "grad_norm": 53.61814647116161, "learning_rate": 2.909916920896599e-07, "loss": -5.2824, "num_tokens": 290036091.0, "residual_var": 0.03296878561377525, "reward": 0.775390625, "reward_std": 0.21747520565986633, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "rho2": 0.6249997019767761, "step": 658 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.6857347172168432e-09, "advantages/std": 0.26007479429244995, "advantages/var": 0.06763889862626016, "completions/clipped_ratio": 0.4609375, "epoch": 3.785663082437276, "grad_norm": 49.22189227608608, "learning_rate": 2.8846432279071466e-07, "loss": -5.4418, "num_tokens": 290578282.0, "residual_var": 0.038046903908252716, "reward": 0.794921875, "reward_std": 0.16026344895362854, "rewards/drgrpo_math_reward/mean": 0.794921875, "rewards/drgrpo_math_reward/std": 0.4041535556316376, "rho2": 0.4374997019767761, "step": 659 }, { "advantages/mean": 5.238689482212067e-10, "advantages/snr": 1.8602479193853518e-09, "advantages/std": 0.28161242604255676, "advantages/var": 0.0793055585015745, "completions/clipped_ratio": 0.4453125, "epoch": 3.7913978494623657, "grad_norm": 48.061588118469686, "learning_rate": 2.8594612582562394e-07, "loss": -4.8876, "num_tokens": 291117640.0, "residual_var": 0.032217904925346375, "reward": 0.853515625, "reward_std": 0.2032204270362854, "rewards/drgrpo_math_reward/mean": 0.853515625, "rewards/drgrpo_math_reward/std": 0.35393697023391724, "rho2": 0.5937498211860657, "step": 660 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 3.4082913604614455e-10, "advantages/std": 0.3415650427341461, "advantages/var": 0.11666667841797906, "completions/clipped_ratio": 0.40625, "epoch": 3.7971326164874553, "grad_norm": 70.20116024821192, "learning_rate": 2.834371336562077e-07, "loss": -7.3726, "num_tokens": 291665201.0, "residual_var": 0.0583333857357502, "reward": 0.69921875, "reward_std": 0.24407950043678284, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "rho2": 0.4999997019767761, "step": 661 }, { "advantages/mean": 1.57160684466362e-09, "advantages/snr": 5.78712509942956e-09, "advantages/std": 0.27156952023506165, "advantages/var": 0.07375000432070156, "completions/clipped_ratio": 0.390625, "epoch": 3.802867383512545, "grad_norm": 51.70919172312582, "learning_rate": 2.8093737862562885e-07, "loss": -3.9095, "num_tokens": 292230154.0, "residual_var": 0.029960962012410164, "reward": 0.716796875, "reward_std": 0.19404324889183044, "rewards/drgrpo_math_reward/mean": 0.716796875, "rewards/drgrpo_math_reward/std": 0.4509948492050171, "rho2": 0.5937497615814209, "step": 662 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 2.0760514669870705e-09, "advantages/std": 0.28037676215171814, "advantages/var": 0.07861112875468113, "completions/clipped_ratio": 0.5078125, "epoch": 3.8086021505376344, "grad_norm": 51.399927237270084, "learning_rate": 2.784468929579741e-07, "loss": -7.8767, "num_tokens": 292794165.0, "residual_var": 0.039305586367845535, "reward": 0.69140625, "reward_std": 0.19224637746810913, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.4623647928237915, "rho2": 0.4999997913837433, "step": 663 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 2.2022284563901246e-09, "advantages/std": 0.2643125355243683, "advantages/var": 0.06986111643532045, "completions/clipped_ratio": 0.2578125, "epoch": 3.814336917562724, "grad_norm": 46.526796838475974, "learning_rate": 2.759657087578403e-07, "loss": -4.6761, "num_tokens": 293320557.0, "residual_var": 0.03929690644145012, "reward": 0.775390625, "reward_std": 0.16208168864250183, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "rho2": 0.4374997019767761, "step": 664 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 2.683430518256026e-09, "advantages/std": 0.3036811351776123, "advantages/var": 0.09222223186276324, "completions/clipped_ratio": 0.6015625, "epoch": 3.8200716845878135, "grad_norm": 56.56293701733064, "learning_rate": 2.734938580099196e-07, "loss": -8.6208, "num_tokens": 293874446.0, "residual_var": 0.04034726694226265, "reward": 0.6875, "reward_std": 0.21379470825195312, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4639657139778137, "rho2": 0.5624996423721313, "step": 665 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 8.065489120046687e-10, "advantages/std": 0.2886751592159271, "advantages/var": 0.08333334754834087, "completions/clipped_ratio": 0.4921875, "epoch": 3.825806451612903, "grad_norm": 56.61070428716292, "learning_rate": 2.7103137257858863e-07, "loss": -6.7568, "num_tokens": 294428893.0, "residual_var": 0.03906252235174179, "reward": 0.6484375, "reward_std": 0.19893690943717957, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4779251217842102, "rho2": 0.5312498807907104, "step": 666 }, { "advantages/mean": 2.9103830456733704e-10, "advantages/snr": 1.151429752438309e-09, "advantages/std": 0.2527625262737274, "advantages/var": 0.06388889468827674, "completions/clipped_ratio": 0.453125, "epoch": 3.8315412186379927, "grad_norm": 44.53550215655523, "learning_rate": 2.685782842074953e-07, "loss": -1.6803, "num_tokens": 294971980.0, "residual_var": 0.03793405368924141, "reward": 0.72265625, "reward_std": 0.15080136060714722, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4481254518032074, "rho2": 0.4062497615814209, "step": 667 }, { "advantages/mean": -1.2223608791828156e-09, "advantages/snr": 3.752485187984152e-09, "advantages/std": 0.325747013092041, "advantages/var": 0.10611111653838634, "completions/clipped_ratio": 0.3671875, "epoch": 3.8372759856630827, "grad_norm": 58.73885236985631, "learning_rate": 2.6613462451915227e-07, "loss": -5.8301, "num_tokens": 295510447.0, "residual_var": 0.05637158080935478, "reward": 0.74609375, "reward_std": 0.2199711799621582, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "rho2": 0.46874967217445374, "step": 668 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 4.311185259890814e-10, "advantages/std": 0.2700308859348297, "advantages/var": 0.07291667935874901, "completions/clipped_ratio": 0.453125, "epoch": 3.843010752688172, "grad_norm": 48.00303202034087, "learning_rate": 2.6370042501452674e-07, "loss": -2.5025, "num_tokens": 296038907.0, "residual_var": 0.03417971357703209, "reward": 0.705078125, "reward_std": 0.18883737921714783, "rewards/drgrpo_math_reward/mean": 0.705078125, "rewards/drgrpo_math_reward/std": 0.4564536213874817, "rho2": 0.5312497615814209, "step": 669 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.3428673586084216e-09, "advantages/std": 0.26007479429244995, "advantages/var": 0.06763889862626016, "completions/clipped_ratio": 0.1796875, "epoch": 3.848745519713262, "grad_norm": 49.547554181393295, "learning_rate": 2.6127571707263694e-07, "loss": -4.7947, "num_tokens": 296563160.0, "residual_var": 0.03804689273238182, "reward": 0.833984375, "reward_std": 0.16698512434959412, "rewards/drgrpo_math_reward/mean": 0.833984375, "rewards/drgrpo_math_reward/std": 0.3724585771560669, "rho2": 0.43749985098838806, "step": 670 }, { "advantages/mean": 1.3969838619232178e-09, "advantages/snr": 4.042112356989384e-09, "advantages/std": 0.34560737013816833, "advantages/var": 0.11944445429382089, "completions/clipped_ratio": 0.390625, "epoch": 3.8544802867383514, "grad_norm": 65.04937238806251, "learning_rate": 2.5886053195014534e-07, "loss": -8.6761, "num_tokens": 297108366.0, "residual_var": 0.04479171708226204, "reward": 0.72265625, "reward_std": 0.2622472047805786, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4481254518032074, "rho2": 0.6249996423721313, "step": 671 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 3.880066733901732e-09, "advantages/std": 0.2700308859348297, "advantages/var": 0.07291667935874901, "completions/clipped_ratio": 0.1640625, "epoch": 3.860215053763441, "grad_norm": 47.83684474549185, "learning_rate": 2.564549007809568e-07, "loss": -4.4864, "num_tokens": 297624485.0, "residual_var": 0.03645835816860199, "reward": 0.716796875, "reward_std": 0.18393957614898682, "rewards/drgrpo_math_reward/mean": 0.716796875, "rewards/drgrpo_math_reward/std": 0.4509948492050171, "rho2": 0.4999998211860657, "step": 672 }, { "advantages/mean": -1.979060471057892e-09, "advantages/snr": 6.943002960464133e-09, "advantages/std": 0.285043865442276, "advantages/var": 0.08125000522627435, "completions/clipped_ratio": 0.25, "epoch": 3.8659498207885306, "grad_norm": 51.14855973879682, "learning_rate": 2.540588545758179e-07, "loss": -6.8316, "num_tokens": 298155064.0, "residual_var": 0.038085971027612686, "reward": 0.697265625, "reward_std": 0.19866734743118286, "rewards/drgrpo_math_reward/mean": 0.697265625, "rewards/drgrpo_math_reward/std": 0.45989060401916504, "rho2": 0.5312497019767761, "step": 673 }, { "advantages/mean": 4.0745362639427185e-10, "advantages/snr": 1.3709317427750313e-09, "advantages/std": 0.2972092628479004, "advantages/var": 0.08833334592259234, "completions/clipped_ratio": 0.2265625, "epoch": 3.87168458781362, "grad_norm": 54.059486168672144, "learning_rate": 2.516724242219157e-07, "loss": -4.6012, "num_tokens": 298693370.0, "residual_var": 0.04968753829598427, "reward": 0.66015625, "reward_std": 0.19230203330516815, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.4741191864013672, "rho2": 0.4374997615814209, "step": 674 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.2165350023164762e-09, "advantages/std": 0.3151278495788574, "advantages/var": 0.09930556158019499, "completions/clipped_ratio": 0.2578125, "epoch": 3.8774193548387097, "grad_norm": 56.91574160774777, "learning_rate": 2.4929564048248066e-07, "loss": -4.5738, "num_tokens": 299236792.0, "residual_var": 0.040342915803194046, "reward": 0.681640625, "reward_std": 0.2289208173751831, "rewards/drgrpo_math_reward/mean": 0.681640625, "rewards/drgrpo_math_reward/std": 0.46629536151885986, "rho2": 0.5937497615814209, "step": 675 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.4923728996574992e-09, "advantages/std": 0.3120274245738983, "advantages/var": 0.0973611136862198, "completions/clipped_ratio": 0.3203125, "epoch": 3.8831541218637993, "grad_norm": 58.86567789279796, "learning_rate": 2.4692853399638913e-07, "loss": -4.0313, "num_tokens": 299762412.0, "residual_var": 0.048680584877729416, "reward": 0.744140625, "reward_std": 0.21900448203086853, "rewards/drgrpo_math_reward/mean": 0.744140625, "rewards/drgrpo_math_reward/std": 0.43676990270614624, "rho2": 0.4999998211860657, "step": 676 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.6117553506085985e-09, "advantages/std": 0.28891560435295105, "advantages/var": 0.08347222643863095, "completions/clipped_ratio": 0.203125, "epoch": 3.888888888888889, "grad_norm": 56.24898123002913, "learning_rate": 2.4457113527777007e-07, "loss": -8.1867, "num_tokens": 300288855.0, "residual_var": 0.0391276478767395, "reward": 0.783203125, "reward_std": 0.19629821181297302, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "rho2": 0.5312496423721313, "step": 677 }, { "advantages/mean": -9.89530235528946e-10, "advantages/snr": 4.141659315741221e-09, "advantages/std": 0.23892119526863098, "advantages/var": 0.057083337548591295, "completions/clipped_ratio": 0.2421875, "epoch": 3.8946236559139784, "grad_norm": 47.41454064155878, "learning_rate": 2.4222347471560934e-07, "loss": -4.9151, "num_tokens": 300812526.0, "residual_var": 0.03567710891366005, "reward": 0.783203125, "reward_std": 0.13644105195999146, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "rho2": 0.3749997913837433, "step": 678 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 6.6871205228317625e-09, "advantages/std": 0.31335994601249695, "advantages/var": 0.098194455764955, "completions/clipped_ratio": 0.2734375, "epoch": 3.900358422939068, "grad_norm": 53.637941439126564, "learning_rate": 2.3988558257336044e-07, "loss": -6.2634, "num_tokens": 301338321.0, "residual_var": 0.042960114777088165, "reward": 0.724609375, "reward_std": 0.23022502660751343, "rewards/drgrpo_math_reward/mean": 0.724609375, "rewards/drgrpo_math_reward/std": 0.44714778661727905, "rho2": 0.5624996423721313, "step": 679 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.45223785573061e-09, "advantages/std": 0.26977357268333435, "advantages/var": 0.07277778051833028, "completions/clipped_ratio": 0.3828125, "epoch": 3.9060931899641576, "grad_norm": 47.5809827533166, "learning_rate": 2.37557488988552e-07, "loss": -2.6853, "num_tokens": 301880955.0, "residual_var": 0.03184029832482338, "reward": 0.76953125, "reward_std": 0.19068288803100586, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.42154473066329956, "rho2": 0.5624998211860657, "step": 680 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.302859504876618e-09, "advantages/std": 0.2527625262737274, "advantages/var": 0.06388889468827674, "completions/clipped_ratio": 0.109375, "epoch": 3.9118279569892476, "grad_norm": 47.2870133725649, "learning_rate": 2.352392239724016e-07, "loss": -2.3593, "num_tokens": 302407565.0, "residual_var": 0.02994794026017189, "reward": 0.69140625, "reward_std": 0.17208899557590485, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.4623647928237915, "rho2": 0.5312497615814209, "step": 681 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 2.655581162228986e-09, "advantages/std": 0.3068658709526062, "advantages/var": 0.09416666275550156, "completions/clipped_ratio": 0.1953125, "epoch": 3.9175627240143367, "grad_norm": 56.98157623702133, "learning_rate": 2.3293081740942688e-07, "loss": -7.9046, "num_tokens": 302938002.0, "residual_var": 0.026484414935112, "reward": 0.79296875, "reward_std": 0.2426200956106186, "rewards/drgrpo_math_reward/mean": 0.79296875, "rewards/drgrpo_math_reward/std": 0.40557438135147095, "rho2": 0.7187496423721313, "step": 682 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 5.161481724361506e-09, "advantages/std": 0.2932102084159851, "advantages/var": 0.08597222631934542, "completions/clipped_ratio": 0.2109375, "epoch": 3.9232974910394267, "grad_norm": 57.44436911845629, "learning_rate": 2.3063229905706106e-07, "loss": -5.8495, "num_tokens": 303475212.0, "residual_var": 0.045672766864299774, "reward": 0.853515625, "reward_std": 0.20062555372714996, "rewards/drgrpo_math_reward/mean": 0.853515625, "rewards/drgrpo_math_reward/std": 0.35393697023391724, "rho2": 0.46874988079071045, "step": 683 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 3.1269432306354464e-09, "advantages/std": 0.2606082558631897, "advantages/var": 0.06791666302405375, "completions/clipped_ratio": 0.2109375, "epoch": 3.9290322580645163, "grad_norm": 50.81845291934262, "learning_rate": 2.2834369854527046e-07, "loss": -7.0504, "num_tokens": 304007332.0, "residual_var": 0.03395835682749748, "reward": 0.794921875, "reward_std": 0.1723628044128418, "rewards/drgrpo_math_reward/mean": 0.794921875, "rewards/drgrpo_math_reward/std": 0.4041535556316376, "rho2": 0.4999998211860657, "step": 684 }, { "advantages/mean": -1.9208528101444244e-09, "advantages/snr": 6.727306850080009e-09, "advantages/std": 0.2855307161808014, "advantages/var": 0.08152778988272136, "completions/clipped_ratio": 0.0390625, "epoch": 3.934767025089606, "grad_norm": 52.49581361461307, "learning_rate": 2.2606504537617065e-07, "loss": -4.027, "num_tokens": 304531936.0, "residual_var": 0.045859407633543015, "reward": 0.736328125, "reward_std": 0.1834360957145691, "rewards/drgrpo_math_reward/mean": 0.736328125, "rewards/drgrpo_math_reward/std": 0.4410543739795685, "rho2": 0.4374997615814209, "step": 685 }, { "advantages/mean": -8.731149137020111e-10, "advantages/snr": 2.8729416350136135e-09, "advantages/std": 0.3039097189903259, "advantages/var": 0.09236111729677887, "completions/clipped_ratio": 0.0078125, "epoch": 3.9405017921146954, "grad_norm": 53.43312394702333, "learning_rate": 2.2379636892364717e-07, "loss": -6.3065, "num_tokens": 305063124.0, "residual_var": 0.037521734833717346, "reward": 0.712890625, "reward_std": 0.2254187911748886, "rewards/drgrpo_math_reward/mean": 0.712890625, "rewards/drgrpo_math_reward/std": 0.45285552740097046, "rho2": 0.5937497615814209, "step": 686 }, { "advantages/mean": 1.7462298274040222e-10, "advantages/snr": 5.06147657656769e-10, "advantages/std": 0.34500402212142944, "advantages/var": 0.11902777527996378, "completions/clipped_ratio": 0.03125, "epoch": 3.946236559139785, "grad_norm": 67.94750865921226, "learning_rate": 2.2153769843297664e-07, "loss": -11.2329, "num_tokens": 305567696.0, "residual_var": 0.04463544860482216, "reward": 0.826171875, "reward_std": 0.2641918659210205, "rewards/drgrpo_math_reward/mean": 0.826171875, "rewards/drgrpo_math_reward/std": 0.3793322443962097, "rho2": 0.6249998211860657, "step": 687 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.1313717620832277e-09, "advantages/std": 0.27309951186180115, "advantages/var": 0.07458334337915407, "completions/clipped_ratio": -0.0078125, "epoch": 3.9519713261648746, "grad_norm": 45.92676485583065, "learning_rate": 2.1928906302045046e-07, "loss": -3.0583, "num_tokens": 306083650.0, "residual_var": 0.03496096655726433, "reward": 0.767578125, "reward_std": 0.18688249588012695, "rewards/drgrpo_math_reward/mean": 0.767578125, "rewards/drgrpo_math_reward/std": 0.42278963327407837, "rho2": 0.5312497615814209, "step": 688 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 4.998592723313688e-09, "advantages/std": 0.30276504158973694, "advantages/var": 0.09166667040883514, "completions/clipped_ratio": 0.0703125, "epoch": 3.957706093189964, "grad_norm": 56.59327260805284, "learning_rate": 2.1705049167299815e-07, "loss": -7.1729, "num_tokens": 306613635.0, "residual_var": 0.05729169398546219, "reward": 0.6640625, "reward_std": 0.1961938440799713, "rewards/drgrpo_math_reward/mean": 0.6640625, "rewards/drgrpo_math_reward/std": 0.4727790653705597, "rho2": 0.37499985098838806, "step": 689 }, { "advantages/mean": -2.7939677238464355e-09, "advantages/snr": 8.571501190719287e-09, "advantages/std": 0.3259601294994354, "advantages/var": 0.10625000602328871, "completions/clipped_ratio": -0.0546875, "epoch": 3.9634408602150537, "grad_norm": 60.21409152188406, "learning_rate": 2.1482201324781456e-07, "loss": -7.1748, "num_tokens": 307126751.0, "residual_var": 0.043164122849702835, "reward": 0.755859375, "reward_std": 0.23513007164001465, "rewards/drgrpo_math_reward/mean": 0.755859375, "rewards/drgrpo_math_reward/std": 0.42999663949012756, "rho2": 0.5937495231628418, "step": 690 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 3.5332551852438584e-09, "advantages/std": 0.23063921928405762, "advantages/var": 0.053194449471959615, "completions/clipped_ratio": -0.0390625, "epoch": 3.9691756272401433, "grad_norm": 41.27101886566809, "learning_rate": 2.1260365647198797e-07, "loss": -2.7969, "num_tokens": 307635253.0, "residual_var": 0.03158421814441681, "reward": 0.802734375, "reward_std": 0.13968734443187714, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "rho2": 0.40624988079071045, "step": 691 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.31972211599349976, "advantages/var": 0.10222223145536091, "completions/clipped_ratio": 0.0078125, "epoch": 3.974910394265233, "grad_norm": 63.015447581923745, "learning_rate": 2.1039544994212967e-07, "loss": -7.8752, "num_tokens": 308172091.0, "residual_var": 0.04152781888842583, "reward": 0.80078125, "reward_std": 0.23779098689556122, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "rho2": 0.5937497019767761, "step": 692 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 5.186679962915741e-09, "advantages/std": 0.29178571701049805, "advantages/var": 0.08513890465133045, "completions/clipped_ratio": -0.0546875, "epoch": 3.9806451612903224, "grad_norm": 49.10797773449693, "learning_rate": 2.0819742212400437e-07, "loss": -4.7297, "num_tokens": 308674595.0, "residual_var": 0.045230064541101456, "reward": 0.845703125, "reward_std": 0.19290609657764435, "rewards/drgrpo_math_reward/mean": 0.845703125, "rewards/drgrpo_math_reward/std": 0.36158639192581177, "rho2": 0.4687498211860657, "step": 693 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 2.035113975465271e-09, "advantages/std": 0.28601670265197754, "advantages/var": 0.08180555419590974, "completions/clipped_ratio": 0.1328125, "epoch": 3.9863799283154124, "grad_norm": 54.363542909737454, "learning_rate": 2.060096013521646e-07, "loss": -3.479, "num_tokens": 309203437.0, "residual_var": 0.03578996658325195, "reward": 0.779296875, "reward_std": 0.1990545094013214, "rewards/drgrpo_math_reward/mean": 0.779296875, "rewards/drgrpo_math_reward/std": 0.4151262938976288, "rho2": 0.5624996423721313, "step": 694 }, { "advantages/mean": -1.5133991837501526e-09, "advantages/snr": 5.116222781268769e-09, "advantages/std": 0.2958039939403534, "advantages/var": 0.08750000283106463, "completions/clipped_ratio": 0.046875, "epoch": 3.9921146953405016, "grad_norm": 52.83226725600399, "learning_rate": 2.038320158295851e-07, "loss": -5.693, "num_tokens": 309715184.0, "residual_var": 0.03007815219461918, "reward": 0.7890625, "reward_std": 0.2263432741165161, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4083731174468994, "rho2": 0.6562497615814209, "step": 695 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.3301048279422264e-09, "advantages/std": 0.29976844787597656, "advantages/var": 0.08986112234197208, "completions/clipped_ratio": 0.0859375, "epoch": 3.9978494623655916, "grad_norm": 63.83063870404085, "learning_rate": 2.0166469362729865e-07, "loss": -10.5821, "num_tokens": 310231525.0, "residual_var": 0.04212243854999542, "reward": 0.783203125, "reward_std": 0.20924952626228333, "rewards/drgrpo_math_reward/mean": 0.783203125, "rewards/drgrpo_math_reward/std": 0.4124660789966583, "rho2": 0.5312496423721313, "step": 696 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.2788230572499366e-09, "advantages/std": 0.27309951186180115, "advantages/var": 0.07458334337915407, "completions/clipped_ratio": 0.21875, "epoch": 4.00573476702509, "grad_norm": 49.07517146498918, "learning_rate": 1.9950766268403462e-07, "loss": -7.6414, "num_tokens": 310769385.0, "residual_var": 0.041953153908252716, "reward": 0.705078125, "reward_std": 0.18032687902450562, "rewards/drgrpo_math_reward/mean": 0.705078125, "rewards/drgrpo_math_reward/std": 0.4564536213874817, "rho2": 0.4374997913837433, "step": 697 }, { "advantages/mean": -1.57160684466362e-09, "advantages/snr": 5.946016832253336e-09, "advantages/std": 0.2643125355243683, "advantages/var": 0.06986111643532045, "completions/clipped_ratio": 0.1171875, "epoch": 4.011469534050179, "grad_norm": 48.56526514704082, "learning_rate": 1.973609508058588e-07, "loss": -6.6366, "num_tokens": 311295707.0, "residual_var": 0.041480064392089844, "reward": 0.728515625, "reward_std": 0.15856057405471802, "rewards/drgrpo_math_reward/mean": 0.728515625, "rewards/drgrpo_math_reward/std": 0.44516023993492126, "rho2": 0.4062497019767761, "step": 698 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 3.6362035827153185e-09, "advantages/std": 0.3201562166213989, "advantages/var": 0.10250000304132811, "completions/clipped_ratio": 0.09375, "epoch": 4.017204301075269, "grad_norm": 58.57463151122907, "learning_rate": 1.9522458566581557e-07, "loss": -5.8353, "num_tokens": 311827364.0, "residual_var": 0.04164065793156624, "reward": 0.75390625, "reward_std": 0.23857755959033966, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "rho2": 0.5937497615814209, "step": 699 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 1.8292902223707093e-09, "advantages/std": 0.3181980550289154, "advantages/var": 0.10125000222418468, "completions/clipped_ratio": 0.078125, "epoch": 4.022939068100358, "grad_norm": 64.65447603709022, "learning_rate": 1.9309859480356982e-07, "loss": -7.8579, "num_tokens": 312343638.0, "residual_var": 0.03796879202127457, "reward": 0.794921875, "reward_std": 0.23693251609802246, "rewards/drgrpo_math_reward/mean": 0.794921875, "rewards/drgrpo_math_reward/std": 0.4041535556316376, "rho2": 0.6249997019767761, "step": 700 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 8.533435957028186e-10, "advantages/std": 0.2728451192378998, "advantages/var": 0.07444445909194375, "completions/clipped_ratio": -0.03125, "epoch": 4.028673835125448, "grad_norm": 52.87295915407075, "learning_rate": 1.9098300562505264e-07, "loss": -5.3526, "num_tokens": 312858393.0, "residual_var": 0.03722225874662399, "reward": 0.71875, "reward_std": 0.18023407459259033, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45004892349243164, "rho2": 0.4999997019767761, "step": 701 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.484977021747515e-09, "advantages/std": 0.3135814666748047, "advantages/var": 0.09833333624192164, "completions/clipped_ratio": 0.0703125, "epoch": 4.034408602150537, "grad_norm": 57.57931144092849, "learning_rate": 1.8887784540210893e-07, "loss": -9.4108, "num_tokens": 313375483.0, "residual_var": 0.030729210004210472, "reward": 0.7109375, "reward_std": 0.24057221412658691, "rewards/drgrpo_math_reward/mean": 0.7109375, "rewards/drgrpo_math_reward/std": 0.45377036929130554, "rho2": 0.6874996423721313, "step": 702 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 3.444940560504379e-09, "advantages/std": 0.3041381537914276, "advantages/var": 0.09250001659165807, "completions/clipped_ratio": 0.140625, "epoch": 4.040143369175627, "grad_norm": 54.52132427426332, "learning_rate": 1.86783141272144e-07, "loss": -5.3943, "num_tokens": 313890906.0, "residual_var": 0.037578169256448746, "reward": 0.8359375, "reward_std": 0.2199089229106903, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.37069445848464966, "rho2": 0.5937496423721313, "step": 703 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 3.796488086934403e-09, "advantages/std": 0.30663949251174927, "advantages/var": 0.09402777836786314, "completions/clipped_ratio": 0.2265625, "epoch": 4.045878136200717, "grad_norm": 59.28858418754244, "learning_rate": 1.8469892023777568e-07, "loss": -5.9876, "num_tokens": 314432493.0, "residual_var": 0.041137177497148514, "reward": 0.623046875, "reward_std": 0.22271910309791565, "rewards/drgrpo_math_reward/mean": 0.623046875, "rewards/drgrpo_math_reward/std": 0.4850969910621643, "rho2": 0.5624998807907104, "step": 704 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 8.393661749284777e-10, "advantages/std": 0.27738863229751587, "advantages/var": 0.07694445332788646, "completions/clipped_ratio": 0.09375, "epoch": 4.051612903225807, "grad_norm": 52.86948652034123, "learning_rate": 1.8262520916648427e-07, "loss": -6.4687, "num_tokens": 314943593.0, "residual_var": 0.04087676480412483, "reward": 0.80078125, "reward_std": 0.17692944407463074, "rewards/drgrpo_math_reward/mean": 0.80078125, "rewards/drgrpo_math_reward/std": 0.39980348944664, "rho2": 0.4687497615814209, "step": 705 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 3.916947836500089e-10, "advantages/std": 0.2972092628479004, "advantages/var": 0.08833334592259234, "completions/clipped_ratio": 0.0703125, "epoch": 4.057347670250896, "grad_norm": 53.79778490569021, "learning_rate": 1.805620347902681e-07, "loss": -7.3392, "num_tokens": 315461249.0, "residual_var": 0.03312503546476364, "reward": 0.76171875, "reward_std": 0.21977651119232178, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.42644867300987244, "rho2": 0.6249997019767761, "step": 706 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 4.3463933131762586e-09, "advantages/std": 0.294627845287323, "advantages/var": 0.08680556721865074, "completions/clipped_ratio": 0.1953125, "epoch": 4.063082437275986, "grad_norm": 60.01049963539943, "learning_rate": 1.7850942370529755e-07, "loss": -9.6753, "num_tokens": 316000451.0, "residual_var": 0.046115487813949585, "reward": 0.767578125, "reward_std": 0.1948215365409851, "rewards/drgrpo_math_reward/mean": 0.767578125, "rewards/drgrpo_math_reward/std": 0.42278963327407837, "rho2": 0.4687497615814209, "step": 707 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 5.049795095405228e-09, "advantages/std": 0.3227486312389374, "advantages/var": 0.10416667896660758, "completions/clipped_ratio": 0.171875, "epoch": 4.068817204301075, "grad_norm": 66.11394900526818, "learning_rate": 1.7646740237157254e-07, "loss": -8.1883, "num_tokens": 316526189.0, "residual_var": 0.032552119344472885, "reward": 0.68359375, "reward_std": 0.25036922097206116, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.46552830934524536, "rho2": 0.6874997615814209, "step": 708 }, { "advantages/mean": 2.9103830456733704e-10, "advantages/snr": 1.0236580017018337e-09, "advantages/std": 0.28431203961372375, "advantages/var": 0.08083333586931563, "completions/clipped_ratio": -0.1015625, "epoch": 4.074551971326165, "grad_norm": 58.70759015275292, "learning_rate": 1.7443599711258217e-07, "loss": -3.7693, "num_tokens": 317043140.0, "residual_var": 0.03789065405726433, "reward": 0.68359375, "reward_std": 0.19415073096752167, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.46552830934524536, "rho2": 0.5312497615814209, "step": 709 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.4374844083348e-09, "advantages/std": 0.24295634031295776, "advantages/var": 0.059027783298265746, "completions/clipped_ratio": -0.0390625, "epoch": 4.080286738351255, "grad_norm": 42.84595766592432, "learning_rate": 1.724152341149645e-07, "loss": -3.5315, "num_tokens": 317556178.0, "residual_var": 0.03135853260755539, "reward": 0.798828125, "reward_std": 0.15388333797454834, "rewards/drgrpo_math_reward/mean": 0.798828125, "rewards/drgrpo_math_reward/std": 0.4012683033943176, "rho2": 0.4687497615814209, "step": 710 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.8187178125915768e-09, "advantages/std": 0.25603821873664856, "advantages/var": 0.0655555694538359, "completions/clipped_ratio": 0.0, "epoch": 4.086021505376344, "grad_norm": 43.131000011443504, "learning_rate": 1.7040513942816904e-07, "loss": -2.6295, "num_tokens": 318057566.0, "residual_var": 0.03687502443790436, "reward": 0.74609375, "reward_std": 0.15877225995063782, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "rho2": 0.4374997615814209, "step": 711 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 1.9788020671053867e-09, "advantages/std": 0.2941560447216034, "advantages/var": 0.08652777864625794, "completions/clipped_ratio": 0.171875, "epoch": 4.091756272401434, "grad_norm": 54.04716232701328, "learning_rate": 1.6840573896412126e-07, "loss": -4.5174, "num_tokens": 318593346.0, "residual_var": 0.04055993631482124, "reward": 0.744140625, "reward_std": 0.20080366730690002, "rewards/drgrpo_math_reward/mean": 0.744140625, "rewards/drgrpo_math_reward/std": 0.43676990270614624, "rho2": 0.5312496423721313, "step": 712 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.0230965694675723e-09, "advantages/std": 0.3413616716861725, "advantages/var": 0.11652779089637821, "completions/clipped_ratio": 0.09375, "epoch": 4.097491039426523, "grad_norm": 60.156999763374095, "learning_rate": 1.6641705849688914e-07, "loss": -7.9025, "num_tokens": 319102802.0, "residual_var": 0.05098094418644905, "reward": 0.669921875, "reward_std": 0.26154181361198425, "rewards/drgrpo_math_reward/mean": 0.669921875, "rewards/drgrpo_math_reward/std": 0.47070086002349854, "rho2": 0.5624997615814209, "step": 713 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 3.7124425543687873e-10, "advantages/std": 0.3135814666748047, "advantages/var": 0.09833333624192164, "completions/clipped_ratio": 0.015625, "epoch": 4.103225806451613, "grad_norm": 64.58977231717428, "learning_rate": 1.6443912366234925e-07, "loss": -11.7292, "num_tokens": 319616904.0, "residual_var": 0.03380212187767029, "reward": 0.75390625, "reward_std": 0.23894751071929932, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "rho2": 0.6562497019767761, "step": 714 }, { "advantages/mean": -3.259629011154175e-09, "advantages/snr": 1.0032926035524922e-08, "advantages/std": 0.32489314675331116, "advantages/var": 0.10555555680726858, "completions/clipped_ratio": 0.0, "epoch": 4.108960573476702, "grad_norm": 63.96595784778398, "learning_rate": 1.6247195995785835e-07, "loss": -5.1052, "num_tokens": 320140880.0, "residual_var": 0.042881984263658524, "reward": 0.57421875, "reward_std": 0.23557019233703613, "rewards/drgrpo_math_reward/mean": 0.57421875, "rewards/drgrpo_math_reward/std": 0.4949444830417633, "rho2": 0.5937497019767761, "step": 715 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.34197142720222473, "advantages/var": 0.11694445702272649, "completions/clipped_ratio": -0.046875, "epoch": 4.114695340501792, "grad_norm": 67.20452616571875, "learning_rate": 1.6051559274192273e-07, "loss": -12.4832, "num_tokens": 320653925.0, "residual_var": 0.047508735209703445, "reward": 0.77734375, "reward_std": 0.2593649923801422, "rewards/drgrpo_math_reward/mean": 0.77734375, "rewards/drgrpo_math_reward/std": 0.41643625497817993, "rho2": 0.5937496423721313, "step": 716 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 3.5618437207896963e-09, "advantages/std": 0.2941560447216034, "advantages/var": 0.08652777864625794, "completions/clipped_ratio": -0.1171875, "epoch": 4.120430107526881, "grad_norm": 55.87140335200285, "learning_rate": 1.58570047233873e-07, "loss": -9.5962, "num_tokens": 321165010.0, "residual_var": 0.035151947289705276, "reward": 0.822265625, "reward_std": 0.21573621034622192, "rewards/drgrpo_math_reward/mean": 0.822265625, "rewards/drgrpo_math_reward/std": 0.3826628625392914, "rho2": 0.5937496423721313, "step": 717 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.193644123751125e-09, "advantages/std": 0.3184162378311157, "advantages/var": 0.10138890051452165, "completions/clipped_ratio": -0.0390625, "epoch": 4.1261648745519715, "grad_norm": 59.689685667885605, "learning_rate": 1.5663534851353778e-07, "loss": -5.1423, "num_tokens": 321680633.0, "residual_var": 0.031684063374996185, "reward": 0.72265625, "reward_std": 0.2468416690826416, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4481254518032074, "rho2": 0.6874997019767761, "step": 718 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 7.317160889482837e-09, "advantages/std": 0.3181980550289154, "advantages/var": 0.10125000222418468, "completions/clipped_ratio": -0.0703125, "epoch": 4.131899641577061, "grad_norm": 62.8459327676018, "learning_rate": 1.547115215209207e-07, "loss": -4.8473, "num_tokens": 322216626.0, "residual_var": 0.041132852435112, "reward": 0.759765625, "reward_std": 0.23509493470191956, "rewards/drgrpo_math_reward/mean": 0.759765625, "rewards/drgrpo_math_reward/std": 0.4276435375213623, "rho2": 0.5937497019767761, "step": 719 }, { "advantages/mean": 1.7462298274040222e-09, "advantages/snr": 5.190256323127661e-09, "advantages/std": 0.33644384145736694, "advantages/var": 0.11319445845458986, "completions/clipped_ratio": -0.109375, "epoch": 4.137634408602151, "grad_norm": 62.137148018220714, "learning_rate": 1.527985910558799e-07, "loss": -5.644, "num_tokens": 322732199.0, "residual_var": 0.038910649716854095, "reward": 0.763671875, "reward_std": 0.25687479972839355, "rewards/drgrpo_math_reward/mean": 0.763671875, "rewards/drgrpo_math_reward/std": 0.42524150013923645, "rho2": 0.6562495231628418, "step": 720 }, { "advantages/mean": 6.402842700481415e-10, "advantages/snr": 2.09271023051e-09, "advantages/std": 0.3059593439102173, "advantages/var": 0.09361112012597061, "completions/clipped_ratio": 0.03125, "epoch": 4.14336917562724, "grad_norm": 59.23636055820033, "learning_rate": 1.508965817778065e-07, "loss": -7.4555, "num_tokens": 323249753.0, "residual_var": 0.03802954778075218, "reward": 0.71875, "reward_std": 0.2211625725030899, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45004892349243164, "rho2": 0.5937497615814209, "step": 721 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.6618202762837968e-09, "advantages/std": 0.21015867590904236, "advantages/var": 0.0441666690598419, "completions/clipped_ratio": 0.0234375, "epoch": 4.14910394265233, "grad_norm": 39.71902244619188, "learning_rate": 1.4900551820530827e-07, "loss": -1.0968, "num_tokens": 323753537.0, "residual_var": 0.028984392061829567, "reward": 0.78515625, "reward_std": 0.11641712486743927, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4111155867576599, "rho2": 0.34374991059303284, "step": 722 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.7468270516295885e-09, "advantages/std": 0.26977357268333435, "advantages/var": 0.07277778051833028, "completions/clipped_ratio": -0.0078125, "epoch": 4.15483870967742, "grad_norm": 50.41837287191254, "learning_rate": 1.4712542471589273e-07, "loss": -8.7934, "num_tokens": 324260646.0, "residual_var": 0.038663219660520554, "reward": 0.765625, "reward_std": 0.1778920441865921, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42402184009552, "rho2": 0.4687497615814209, "step": 723 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.7816318963127715e-09, "advantages/std": 0.25110867619514465, "advantages/var": 0.06305556726047801, "completions/clipped_ratio": -0.0859375, "epoch": 4.160573476702509, "grad_norm": 46.909535421145435, "learning_rate": 1.452563255456536e-07, "loss": -3.8354, "num_tokens": 324779815.0, "residual_var": 0.033498287200927734, "reward": 0.75, "reward_std": 0.16545413434505463, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.43343618512153625, "rho2": 0.4687498211860657, "step": 724 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 3.576223268034043e-09, "advantages/std": 0.29297327995300293, "advantages/var": 0.08583334276642063, "completions/clipped_ratio": 0.0546875, "epoch": 4.166308243727599, "grad_norm": 79.72853916046961, "learning_rate": 1.4339824478895757e-07, "loss": -6.1756, "num_tokens": 325301362.0, "residual_var": 0.04023441672325134, "reward": 0.76171875, "reward_std": 0.20368511974811554, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.42644867300987244, "rho2": 0.5312496423721313, "step": 725 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 2.96995404349503e-09, "advantages/std": 0.3135814666748047, "advantages/var": 0.09833333624192164, "completions/clipped_ratio": 0.171875, "epoch": 4.172043010752688, "grad_norm": 60.7545333800132, "learning_rate": 1.415512063981339e-07, "loss": -9.8749, "num_tokens": 325836006.0, "residual_var": 0.04302087426185608, "reward": 0.64453125, "reward_std": 0.23109637200832367, "rewards/drgrpo_math_reward/mean": 0.64453125, "rewards/drgrpo_math_reward/std": 0.47912323474884033, "rho2": 0.5624997019767761, "step": 726 }, { "advantages/mean": 1.979060471057892e-09, "advantages/snr": 5.585230216070688e-09, "advantages/std": 0.3543381989002228, "advantages/var": 0.12555555919985384, "completions/clipped_ratio": 0.078125, "epoch": 4.177777777777778, "grad_norm": 73.17663337313147, "learning_rate": 1.3971523418316643e-07, "loss": -8.7419, "num_tokens": 326360703.0, "residual_var": 0.05885420739650726, "reward": 0.72265625, "reward_std": 0.2689397931098938, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4481254518032074, "rho2": 0.5312497615814209, "step": 727 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 3.1430528566670674e-09, "advantages/std": 0.2592725157737732, "advantages/var": 0.06722223743566147, "completions/clipped_ratio": -0.109375, "epoch": 4.183512544802867, "grad_norm": 51.84657502273497, "learning_rate": 1.3789035181138596e-07, "loss": -6.7464, "num_tokens": 326860224.0, "residual_var": 0.03781251981854439, "reward": 0.78515625, "reward_std": 0.16273440420627594, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4111155867576599, "rho2": 0.43749988079071045, "step": 728 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.1543962476787851e-09, "advantages/std": 0.3025355935096741, "advantages/var": 0.09152778534025074, "completions/clipped_ratio": -0.0234375, "epoch": 4.189247311827957, "grad_norm": 58.96490897946397, "learning_rate": 1.3607658280716472e-07, "loss": -4.2655, "num_tokens": 327379661.0, "residual_var": 0.04862417280673981, "reward": 0.728515625, "reward_std": 0.20082512497901917, "rewards/drgrpo_math_reward/mean": 0.728515625, "rewards/drgrpo_math_reward/std": 0.44516023993492126, "rho2": 0.4687497019767761, "step": 729 }, { "advantages/mean": -1.3387762010097504e-09, "advantages/snr": 4.026406969674498e-09, "advantages/std": 0.3324989676475525, "advantages/var": 0.11055556348668816, "completions/clipped_ratio": 0.03125, "epoch": 4.194982078853046, "grad_norm": 61.512681445532735, "learning_rate": 1.3427395055161393e-07, "loss": -5.7477, "num_tokens": 327895668.0, "residual_var": 0.0414583757519722, "reward": 0.5625, "reward_std": 0.25225284695625305, "rewards/drgrpo_math_reward/mean": 0.5625, "rewards/drgrpo_math_reward/std": 0.49656352400779724, "rho2": 0.6249997019767761, "step": 730 }, { "advantages/mean": -1.862645149230957e-09, "advantages/snr": 5.902510174222863e-09, "advantages/std": 0.31556829810142517, "advantages/var": 0.09958335076662994, "completions/clipped_ratio": 0.1171875, "epoch": 4.200716845878136, "grad_norm": 57.40714924395863, "learning_rate": 1.3248247828228243e-07, "loss": -6.256, "num_tokens": 328451505.0, "residual_var": 0.04356774315237999, "reward": 0.744140625, "reward_std": 0.22826915979385376, "rewards/drgrpo_math_reward/mean": 0.744140625, "rewards/drgrpo_math_reward/std": 0.43676990270614624, "rho2": 0.5624997615814209, "step": 731 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.823242771086913e-09, "advantages/std": 0.2896358072757721, "advantages/var": 0.0838889008562882, "completions/clipped_ratio": 0.1015625, "epoch": 4.2064516129032254, "grad_norm": 53.25896374151797, "learning_rate": 1.3070218909285657e-07, "loss": -3.6461, "num_tokens": 328969049.0, "residual_var": 0.039322953671216965, "reward": 0.70703125, "reward_std": 0.19728496670722961, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.455569326877594, "rho2": 0.5312497019767761, "step": 732 }, { "advantages/mean": -5.238689482212067e-10, "advantages/snr": 1.526479383017565e-09, "advantages/std": 0.34318768978118896, "advantages/var": 0.11777779041734959, "completions/clipped_ratio": 0.0625, "epoch": 4.2121863799283155, "grad_norm": 64.20031795283984, "learning_rate": 1.2893310593286244e-07, "loss": -7.6396, "num_tokens": 329498501.0, "residual_var": 0.03680562227964401, "reward": 0.671875, "reward_std": 0.26710307598114014, "rewards/drgrpo_math_reward/mean": 0.671875, "rewards/drgrpo_math_reward/std": 0.4699897766113281, "rho2": 0.6874995231628418, "step": 733 }, { "advantages/mean": -7.566995918750763e-10, "advantages/snr": 2.5621814900484605e-09, "advantages/std": 0.2953341007232666, "advantages/var": 0.08722223105002058, "completions/clipped_ratio": 0.109375, "epoch": 4.217921146953405, "grad_norm": 53.81681875911166, "learning_rate": 1.2717525160737065e-07, "loss": -7.425, "num_tokens": 330021974.0, "residual_var": 0.027256984263658524, "reward": 0.75390625, "reward_std": 0.22843992710113525, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "rho2": 0.6874996423721313, "step": 734 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.4588924754407226e-09, "advantages/std": 0.28406769037246704, "advantages/var": 0.0806944527135478, "completions/clipped_ratio": -0.0078125, "epoch": 4.223655913978495, "grad_norm": 54.0522720635494, "learning_rate": 1.2542864877670245e-07, "loss": -6.9373, "num_tokens": 330546395.0, "residual_var": 0.03782555088400841, "reward": 0.767578125, "reward_std": 0.1997183859348297, "rewards/drgrpo_math_reward/mean": 0.767578125, "rewards/drgrpo_math_reward/std": 0.42278963327407837, "rho2": 0.5312497615814209, "step": 735 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.1758334609740122e-09, "advantages/std": 0.3210226893424988, "advantages/var": 0.10305556707269048, "completions/clipped_ratio": -0.1171875, "epoch": 4.229390681003585, "grad_norm": 62.415529680769176, "learning_rate": 1.2369331995613663e-07, "loss": -8.1566, "num_tokens": 331043149.0, "residual_var": 0.03542538732290268, "reward": 0.765625, "reward_std": 0.24388158321380615, "rewards/drgrpo_math_reward/mean": 0.765625, "rewards/drgrpo_math_reward/std": 0.42402184009552, "rho2": 0.6562497019767761, "step": 736 }, { "advantages/mean": 1.7462298274040222e-10, "advantages/snr": 5.469146815857034e-10, "advantages/std": 0.31928741931915283, "advantages/var": 0.10194445613548453, "completions/clipped_ratio": 0.109375, "epoch": 4.235125448028674, "grad_norm": 62.11160711014657, "learning_rate": 1.2196928751561964e-07, "loss": -6.0681, "num_tokens": 331557494.0, "residual_var": 0.044600728899240494, "reward": 0.6953125, "reward_std": 0.23206958174705505, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4607250988483429, "rho2": 0.5624997615814209, "step": 737 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 4.1780195890261013e-10, "advantages/std": 0.278637558221817, "advantages/var": 0.07763888885181647, "completions/clipped_ratio": 0.21875, "epoch": 4.240860215053764, "grad_norm": 54.89373551856166, "learning_rate": 1.202565736794785e-07, "loss": -2.3072, "num_tokens": 332091882.0, "residual_var": 0.03881947696208954, "reward": 0.634765625, "reward_std": 0.18358096480369568, "rewards/drgrpo_math_reward/mean": 0.634765625, "rewards/drgrpo_math_reward/std": 0.4819667339324951, "rho2": 0.4999997019767761, "step": 738 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 7.367869460847659e-10, "advantages/std": 0.3160080909729004, "advantages/var": 0.09986111356033689, "completions/clipped_ratio": 0.09375, "epoch": 4.246594982078853, "grad_norm": 64.85755386189227, "learning_rate": 1.1855520052613211e-07, "loss": -7.8109, "num_tokens": 332621400.0, "residual_var": 0.046809930354356766, "reward": 0.712890625, "reward_std": 0.22757768630981445, "rewards/drgrpo_math_reward/mean": 0.712890625, "rewards/drgrpo_math_reward/std": 0.45285552740097046, "rho2": 0.5312497615814209, "step": 739 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.188528925189731e-09, "advantages/std": 0.30573228001594543, "advantages/var": 0.09347222704374847, "completions/clipped_ratio": 0.2421875, "epoch": 4.252329749103943, "grad_norm": 57.64146986880267, "learning_rate": 1.1686518998780881e-07, "loss": -8.7478, "num_tokens": 333167354.0, "residual_var": 0.0350521095097065, "reward": 0.755859375, "reward_std": 0.2273828685283661, "rewards/drgrpo_math_reward/mean": 0.755859375, "rewards/drgrpo_math_reward/std": 0.42999663949012756, "rho2": 0.6249998211860657, "step": 740 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 3.029392263977957e-09, "advantages/std": 0.26900023221969604, "advantages/var": 0.0723611249342504, "completions/clipped_ratio": 0.0703125, "epoch": 4.258064516129032, "grad_norm": 49.27508175315792, "learning_rate": 1.1518656385026148e-07, "loss": -3.8049, "num_tokens": 333702017.0, "residual_var": 0.03391929715871811, "reward": 0.775390625, "reward_std": 0.18260450661182404, "rewards/drgrpo_math_reward/mean": 0.775390625, "rewards/drgrpo_math_reward/std": 0.41773295402526855, "rho2": 0.5312497615814209, "step": 741 }, { "advantages/mean": 2.444721758365631e-09, "advantages/snr": 7.677754433128938e-09, "advantages/std": 0.3184162378311157, "advantages/var": 0.10138890051452165, "completions/clipped_ratio": -0.0625, "epoch": 4.263799283154122, "grad_norm": 59.16065120911612, "learning_rate": 1.1351934375248906e-07, "loss": -6.7867, "num_tokens": 334214561.0, "residual_var": 0.04752609133720398, "reward": 0.70703125, "reward_std": 0.22000300884246826, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.455569326877594, "rho2": 0.5312496423721313, "step": 742 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 8.454930716966898e-10, "advantages/std": 0.2753785252571106, "advantages/var": 0.0758333321727811, "completions/clipped_ratio": 0.0546875, "epoch": 4.269534050179211, "grad_norm": 49.386417300559174, "learning_rate": 1.1186355118645552e-07, "loss": -1.5774, "num_tokens": 334737153.0, "residual_var": 0.040286485105752945, "reward": 0.74609375, "reward_std": 0.1820569932460785, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "rho2": 0.4687497615814209, "step": 743 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 2.989492837340825e-09, "advantages/std": 0.2725904583930969, "advantages/var": 0.0743055580069587, "completions/clipped_ratio": -0.09375, "epoch": 4.275268817204301, "grad_norm": 54.62862160827747, "learning_rate": 1.1021920749681402e-07, "loss": -6.4413, "num_tokens": 335238728.0, "residual_var": 0.032508715987205505, "reward": 0.806640625, "reward_std": 0.1878339648246765, "rewards/drgrpo_math_reward/mean": 0.806640625, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "rho2": 0.5624996423721313, "step": 744 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.7019309787096571e-09, "advantages/std": 0.27360761165618896, "advantages/var": 0.07486112515620391, "completions/clipped_ratio": 0.171875, "epoch": 4.28100358422939, "grad_norm": 51.46953294542059, "learning_rate": 1.085863338806312e-07, "loss": -1.9553, "num_tokens": 335759930.0, "residual_var": 0.03743058070540428, "reward": 0.751953125, "reward_std": 0.1859126091003418, "rewards/drgrpo_math_reward/mean": 0.751953125, "rewards/drgrpo_math_reward/std": 0.4323015511035919, "rho2": 0.4999998211860657, "step": 745 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 1.9789343716512578e-09, "advantages/std": 0.3529636561870575, "advantages/var": 0.12458334258893533, "completions/clipped_ratio": 0.2734375, "epoch": 4.28673835125448, "grad_norm": 71.92020668531775, "learning_rate": 1.069649513871147e-07, "loss": -6.0299, "num_tokens": 336309195.0, "residual_var": 0.038932330906391144, "reward": 0.630859375, "reward_std": 0.2772601544857025, "rewards/drgrpo_math_reward/mean": 0.630859375, "rewards/drgrpo_math_reward/std": 0.4830440282821655, "rho2": 0.6874997615814209, "step": 746 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.2694025181761328e-09, "advantages/std": 0.27512624859809875, "advantages/var": 0.07569445266766284, "completions/clipped_ratio": 0.1640625, "epoch": 4.29247311827957, "grad_norm": 54.22483127863891, "learning_rate": 1.0535508091734068e-07, "loss": -4.1918, "num_tokens": 336828515.0, "residual_var": 0.03548179194331169, "reward": 0.732421875, "reward_std": 0.19289124011993408, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "rho2": 0.5312498211860657, "step": 747 }, { "advantages/mean": -1.3387762010097504e-09, "advantages/snr": 4.337226911377132e-09, "advantages/std": 0.3086709976196289, "advantages/var": 0.09527778477149695, "completions/clipped_ratio": 0.03125, "epoch": 4.2982078853046595, "grad_norm": 55.9095327613768, "learning_rate": 1.0375674322398497e-07, "loss": -3.6905, "num_tokens": 337347606.0, "residual_var": 0.03572919964790344, "reward": 0.7421875, "reward_std": 0.23340092599391937, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "rho2": 0.6249997615814209, "step": 748 }, { "advantages/mean": 1.7462298274040222e-10, "advantages/snr": 5.694732130401605e-10, "advantages/std": 0.30663949251174927, "advantages/var": 0.09402777836786314, "completions/clipped_ratio": -0.109375, "epoch": 4.3039426523297495, "grad_norm": 59.257388264697404, "learning_rate": 1.0216995891105629e-07, "loss": -11.5485, "num_tokens": 337842696.0, "residual_var": 0.04407556727528572, "reward": 0.802734375, "reward_std": 0.2212052196264267, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "rho2": 0.5312496423721313, "step": 749 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.3096722531355764e-09, "advantages/std": 0.2666666805744171, "advantages/var": 0.07111111852857821, "completions/clipped_ratio": 0.3203125, "epoch": 4.309677419354839, "grad_norm": 55.377882033856835, "learning_rate": 1.0059474843362892e-07, "loss": -7.6076, "num_tokens": 338371098.0, "residual_var": 0.04000002518296242, "reward": 0.703125, "reward_std": 0.1689617931842804, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45732781291007996, "rho2": 0.4374997615814209, "step": 750 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 4.447150745184277e-09, "advantages/std": 0.28795257210731506, "advantages/var": 0.08291668378321848, "completions/clipped_ratio": 0.1640625, "epoch": 4.315412186379929, "grad_norm": 53.17785625641289, "learning_rate": 9.903113209758096e-08, "loss": -5.3924, "num_tokens": 338887051.0, "residual_var": 0.03627608343958855, "reward": 0.751953125, "reward_std": 0.2016282081604004, "rewards/drgrpo_math_reward/mean": 0.751953125, "rewards/drgrpo_math_reward/std": 0.4323015511035919, "rho2": 0.5624996423721313, "step": 751 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.1959511304582463e-09, "advantages/std": 0.29202359914779663, "advantages/var": 0.08527778245923301, "completions/clipped_ratio": 0.078125, "epoch": 4.321146953405018, "grad_norm": 53.93612724726197, "learning_rate": 9.747913005933061e-08, "loss": -5.2116, "num_tokens": 339400388.0, "residual_var": 0.04263891279697418, "reward": 0.7734375, "reward_std": 0.2049104869365692, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4190165400505066, "rho2": 0.4999998211860657, "step": 752 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.8091607326709192e-09, "advantages/std": 0.2573907673358917, "advantages/var": 0.06625000710975915, "completions/clipped_ratio": 0.171875, "epoch": 4.326881720430108, "grad_norm": 44.13696941893862, "learning_rate": 9.59387623255784e-08, "loss": -2.2483, "num_tokens": 339915125.0, "residual_var": 0.03726564720273018, "reward": 0.673828125, "reward_std": 0.16385088860988617, "rewards/drgrpo_math_reward/mean": 0.673828125, "rewards/drgrpo_math_reward/std": 0.4692695140838623, "rho2": 0.4374998211860657, "step": 753 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 3.216773695147253e-10, "advantages/std": 0.36190086603164673, "advantages/var": 0.1309722368344559, "completions/clipped_ratio": 0.140625, "epoch": 4.332616487455197, "grad_norm": 71.06586574609156, "learning_rate": 9.441004875304736e-08, "loss": -8.584, "num_tokens": 340442441.0, "residual_var": 0.04092887416481972, "reward": 0.658203125, "reward_std": 0.2938159108161926, "rewards/drgrpo_math_reward/mean": 0.658203125, "rewards/drgrpo_math_reward/std": 0.4747757613658905, "rho2": 0.6874996423721313, "step": 754 }, { "advantages/mean": 1.1641532182693481e-10, "advantages/snr": 3.8479874922626174e-10, "advantages/std": 0.3025355935096741, "advantages/var": 0.09152778534025074, "completions/clipped_ratio": 0.109375, "epoch": 4.338351254480287, "grad_norm": 60.43213939232953, "learning_rate": 9.289300904822827e-08, "loss": -6.5135, "num_tokens": 340948058.0, "residual_var": 0.051484398543834686, "reward": 0.697265625, "reward_std": 0.19526860117912292, "rewards/drgrpo_math_reward/mean": 0.697265625, "rewards/drgrpo_math_reward/std": 0.45989060401916504, "rho2": 0.43749985098838806, "step": 755 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.6720527389305906e-09, "advantages/std": 0.26140648126602173, "advantages/var": 0.06833334844788297, "completions/clipped_ratio": 0.0625, "epoch": 4.344086021505376, "grad_norm": 49.2290636536843, "learning_rate": 9.13876627671255e-08, "loss": -1.5702, "num_tokens": 341470380.0, "residual_var": 0.03416668623685837, "reward": 0.81640625, "reward_std": 0.17411985993385315, "rewards/drgrpo_math_reward/mean": 0.81640625, "rewards/drgrpo_math_reward/std": 0.3875311613082886, "rho2": 0.49999985098838806, "step": 756 }, { "advantages/mean": 1.7462298274040222e-09, "advantages/snr": 5.238689168926288e-09, "advantages/std": 0.3333333432674408, "advantages/var": 0.11111111773384952, "completions/clipped_ratio": 0.15625, "epoch": 4.349820788530466, "grad_norm": 61.78977715633126, "learning_rate": 8.989402931500434e-08, "loss": -8.7922, "num_tokens": 341988672.0, "residual_var": 0.04861116036772728, "reward": 0.76171875, "reward_std": 0.23564383387565613, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.42644867300987244, "rho2": 0.5624996423721313, "step": 757 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 2.621016473766188e-09, "advantages/std": 0.3109126687049866, "advantages/var": 0.09666668756125674, "completions/clipped_ratio": 0.140625, "epoch": 4.355555555555555, "grad_norm": 60.00140745535176, "learning_rate": 8.841212794614128e-08, "loss": -4.3498, "num_tokens": 342508389.0, "residual_var": 0.04229170083999634, "reward": 0.72265625, "reward_std": 0.22398187220096588, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4481254518032074, "rho2": 0.5624997615814209, "step": 758 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 4.923408936308493e-09, "advantages/std": 0.33103373646736145, "advantages/var": 0.10958333467954251, "completions/clipped_ratio": 0.1796875, "epoch": 4.361290322580645, "grad_norm": 60.52958498880444, "learning_rate": 8.694197776357559e-08, "loss": -3.8942, "num_tokens": 343039676.0, "residual_var": 0.05136724188923836, "reward": 0.771484375, "reward_std": 0.2351570427417755, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "rho2": 0.5312496423721313, "step": 759 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 4.4848712333866726e-09, "advantages/std": 0.2855307161808014, "advantages/var": 0.08152778988272136, "completions/clipped_ratio": 0.0703125, "epoch": 4.367025089605734, "grad_norm": 53.95232911088751, "learning_rate": 8.54835977188636e-08, "loss": -3.4007, "num_tokens": 343556835.0, "residual_var": 0.04331165924668312, "reward": 0.689453125, "reward_std": 0.18577814102172852, "rewards/drgrpo_math_reward/mean": 0.689453125, "rewards/drgrpo_math_reward/std": 0.46317005157470703, "rho2": 0.4687498211860657, "step": 760 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 1.9477843048577147e-09, "advantages/std": 0.29884037375450134, "advantages/var": 0.08930556898573005, "completions/clipped_ratio": 0.0625, "epoch": 4.372759856630824, "grad_norm": 59.958126720457166, "learning_rate": 8.403700661183355e-08, "loss": -6.8018, "num_tokens": 344065352.0, "residual_var": 0.04186201095581055, "reward": 0.779296875, "reward_std": 0.20973405241966248, "rewards/drgrpo_math_reward/mean": 0.779296875, "rewards/drgrpo_math_reward/std": 0.4151262938976288, "rho2": 0.5312497615814209, "step": 761 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.2800153935542278e-09, "advantages/std": 0.2728451192378998, "advantages/var": 0.07444445909194375, "completions/clipped_ratio": 0.1953125, "epoch": 4.378494623655914, "grad_norm": 56.290858508950166, "learning_rate": 8.260222309034393e-08, "loss": -8.6172, "num_tokens": 344584929.0, "residual_var": 0.03954864293336868, "reward": 0.6875, "reward_std": 0.17545069754123688, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4639657139778137, "rho2": 0.4687497019767761, "step": 762 }, { "advantages/mean": 4.0745362639427185e-10, "advantages/snr": 1.2975235469774688e-09, "advantages/std": 0.31402406096458435, "advantages/var": 0.09861111086468899, "completions/clipped_ratio": 0.0, "epoch": 4.3842293906810035, "grad_norm": 63.27165814868649, "learning_rate": 8.117926565004285e-08, "loss": -6.1288, "num_tokens": 345099551.0, "residual_var": 0.04006079584360123, "reward": 0.73046875, "reward_std": 0.23268702626228333, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44415023922920227, "rho2": 0.5937497615814209, "step": 763 }, { "advantages/mean": 1.7462298274040222e-09, "advantages/snr": 6.034050917099409e-09, "advantages/std": 0.28939592838287354, "advantages/var": 0.08375000336458527, "completions/clipped_ratio": 0.046875, "epoch": 4.3899641577060935, "grad_norm": 57.02762330064207, "learning_rate": 7.976815263412961e-08, "loss": -6.2342, "num_tokens": 345610589.0, "residual_var": 0.03664065897464752, "reward": 0.779296875, "reward_std": 0.206825852394104, "rewards/drgrpo_math_reward/mean": 0.779296875, "rewards/drgrpo_math_reward/std": 0.4151262938976288, "rho2": 0.5624997019767761, "step": 764 }, { "advantages/mean": 2.9103830456733704e-10, "advantages/snr": 9.216247843313271e-10, "advantages/std": 0.31578826904296875, "advantages/var": 0.09972223086515442, "completions/clipped_ratio": 0.296875, "epoch": 4.395698924731183, "grad_norm": 57.08167973251072, "learning_rate": 7.83689022331182e-08, "loss": -5.7578, "num_tokens": 346153581.0, "residual_var": 0.0436285175383091, "reward": 0.75390625, "reward_std": 0.22711610794067383, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "rho2": 0.5624996423721313, "step": 765 }, { "advantages/mean": -2.9103830456733704e-10, "advantages/snr": 1.0192889166787892e-09, "advantages/std": 0.2855307161808014, "advantages/var": 0.08152778988272136, "completions/clipped_ratio": 0.0078125, "epoch": 4.401433691756273, "grad_norm": 54.54845194651555, "learning_rate": 7.698153248460271e-08, "loss": -4.3905, "num_tokens": 346662986.0, "residual_var": 0.04331168159842491, "reward": 0.818359375, "reward_std": 0.18078528344631195, "rewards/drgrpo_math_reward/mean": 0.818359375, "rewards/drgrpo_math_reward/std": 0.38592514395713806, "rho2": 0.4687495231628418, "step": 766 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 6.878272809258787e-10, "advantages/std": 0.33850160241127014, "advantages/var": 0.11458333483499761, "completions/clipped_ratio": 0.0, "epoch": 4.407168458781362, "grad_norm": 63.23061342088889, "learning_rate": 7.560606127302527e-08, "loss": -9.6182, "num_tokens": 347173651.0, "residual_var": 0.04296880215406418, "reward": 0.716796875, "reward_std": 0.2532894015312195, "rewards/drgrpo_math_reward/mean": 0.716796875, "rewards/drgrpo_math_reward/std": 0.4509948492050171, "rho2": 0.6249996423721313, "step": 767 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.5483419090845536e-09, "advantages/std": 0.2624669671058655, "advantages/var": 0.06888890882175147, "completions/clipped_ratio": 0.234375, "epoch": 4.412903225806452, "grad_norm": 50.54844497343367, "learning_rate": 7.424250632944484e-08, "loss": -3.984, "num_tokens": 347703509.0, "residual_var": 0.03444446250796318, "reward": 0.63671875, "reward_std": 0.17396602034568787, "rewards/drgrpo_math_reward/mean": 0.63671875, "rewards/drgrpo_math_reward/std": 0.4814152419567108, "rho2": 0.49999985098838806, "step": 768 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 2.8736582450556313e-09, "advantages/std": 0.2835783362388611, "advantages/var": 0.08041667278400055, "completions/clipped_ratio": 0.1953125, "epoch": 4.418637992831541, "grad_norm": 47.526873793464574, "learning_rate": 7.289088523130926e-08, "loss": -3.3005, "num_tokens": 348220808.0, "residual_var": 0.0402083620429039, "reward": 0.837890625, "reward_std": 0.19273741543293, "rewards/drgrpo_math_reward/mean": 0.837890625, "rewards/drgrpo_math_reward/std": 0.3689115643501282, "rho2": 0.4999998211860657, "step": 769 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.729435889515722e-09, "advantages/std": 0.24972209334373474, "advantages/var": 0.06236112390397697, "completions/clipped_ratio": 0.0078125, "epoch": 4.424372759856631, "grad_norm": 46.407364690965615, "learning_rate": 7.155121540222808e-08, "loss": -3.8766, "num_tokens": 348729755.0, "residual_var": 0.03118058294057846, "reward": 0.849609375, "reward_std": 0.16575975716114044, "rewards/drgrpo_math_reward/mean": 0.849609375, "rewards/drgrpo_math_reward/std": 0.35780346393585205, "rho2": 0.4999997913837433, "step": 770 }, { "advantages/mean": -8.731149137020111e-10, "advantages/snr": 3.3000637771787104e-09, "advantages/std": 0.26457515358924866, "advantages/var": 0.07000001189677452, "completions/clipped_ratio": 0.109375, "epoch": 4.43010752688172, "grad_norm": 49.40897073077737, "learning_rate": 7.022351411174865e-08, "loss": -1.039, "num_tokens": 349244289.0, "residual_var": 0.0350000262260437, "reward": 0.796875, "reward_std": 0.17362791299819946, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.4027182459831238, "rho2": 0.4999997615814209, "step": 771 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 7.056056033830583e-09, "advantages/std": 0.2969754934310913, "advantages/var": 0.08819444369864016, "completions/clipped_ratio": 0.15625, "epoch": 4.43584229390681, "grad_norm": 56.78592860405079, "learning_rate": 6.890779847513295e-08, "loss": -9.2045, "num_tokens": 349780866.0, "residual_var": 0.03307294473052025, "reward": 0.771484375, "reward_std": 0.22487452626228333, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "rho2": 0.6249997615814209, "step": 772 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 1.7741710486421495e-09, "advantages/std": 0.32808369398117065, "advantages/var": 0.10763891025633043, "completions/clipped_ratio": 0.1484375, "epoch": 4.4415770609319, "grad_norm": 66.66319243814104, "learning_rate": 6.760408545313678e-08, "loss": -7.6347, "num_tokens": 350312305.0, "residual_var": 0.037000901997089386, "reward": 0.712890625, "reward_std": 0.25556445121765137, "rewards/drgrpo_math_reward/mean": 0.712890625, "rewards/drgrpo_math_reward/std": 0.45285552740097046, "rho2": 0.6562497615814209, "step": 773 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.2953341007232666, "advantages/var": 0.08722223105002058, "completions/clipped_ratio": 0.15625, "epoch": 4.447311827956989, "grad_norm": 55.888770867203334, "learning_rate": 6.631239185179205e-08, "loss": -4.6712, "num_tokens": 350825118.0, "residual_var": 0.0354340635240078, "reward": 0.7265625, "reward_std": 0.21302297711372375, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.4461594223976135, "rho2": 0.5937497019767761, "step": 774 }, { "advantages/mean": -1.6298145055770874e-09, "advantages/snr": 5.179163625034987e-09, "advantages/std": 0.3146868050098419, "advantages/var": 0.09902778524730227, "completions/clipped_ratio": 0.0625, "epoch": 4.453046594982079, "grad_norm": 63.14450091613359, "learning_rate": 6.503273432218914e-08, "loss": -11.1461, "num_tokens": 351355814.0, "residual_var": 0.05570317059755325, "reward": 0.658203125, "reward_std": 0.2042386829853058, "rewards/drgrpo_math_reward/mean": 0.658203125, "rewards/drgrpo_math_reward/std": 0.4747757613658905, "rho2": 0.43749967217445374, "step": 775 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 2.953315407010518e-09, "advantages/std": 0.31534814834594727, "advantages/var": 0.09944445466521756, "completions/clipped_ratio": 0.1171875, "epoch": 4.458781362007168, "grad_norm": 60.83802362419972, "learning_rate": 6.376512936026279e-08, "loss": -4.8408, "num_tokens": 351874969.0, "residual_var": 0.049722254276275635, "reward": 0.7265625, "reward_std": 0.2160642445087433, "rewards/drgrpo_math_reward/mean": 0.7265625, "rewards/drgrpo_math_reward/std": 0.4461594223976135, "rho2": 0.4999997317790985, "step": 776 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.552204177606606e-09, "advantages/std": 0.30000001192092896, "advantages/var": 0.09000000715255752, "completions/clipped_ratio": 0.0390625, "epoch": 4.464516129032258, "grad_norm": 61.10952172241609, "learning_rate": 6.250959330657924e-08, "loss": -8.1654, "num_tokens": 352387098.0, "residual_var": 0.04500003531575203, "reward": 0.82421875, "reward_std": 0.20517073571681976, "rewards/drgrpo_math_reward/mean": 0.82421875, "rewards/drgrpo_math_reward/std": 0.3810062110424042, "rho2": 0.4999997019767761, "step": 777 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 1.041250189422832e-09, "advantages/std": 0.22360680997371674, "advantages/var": 0.050000005466621866, "completions/clipped_ratio": 0.1171875, "epoch": 4.4702508960573475, "grad_norm": 41.561201953402325, "learning_rate": 6.126614234612593e-08, "loss": -3.8906, "num_tokens": 352908838.0, "residual_var": 0.031250014901161194, "reward": 0.76953125, "reward_std": 0.13326513767242432, "rewards/drgrpo_math_reward/mean": 0.76953125, "rewards/drgrpo_math_reward/std": 0.42154473066329956, "rho2": 0.37499988079071045, "step": 778 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.6378528027229338e-09, "advantages/std": 0.28431203961372375, "advantages/var": 0.08083333586931563, "completions/clipped_ratio": 0.2109375, "epoch": 4.4759856630824375, "grad_norm": 50.58246193023474, "learning_rate": 6.003479250810217e-08, "loss": -4.6667, "num_tokens": 353437637.0, "residual_var": 0.03536462038755417, "reward": 0.69921875, "reward_std": 0.2008703351020813, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "rho2": 0.5624997019767761, "step": 779 }, { "advantages/mean": -1.7462298274040222e-09, "advantages/snr": 5.829768905973321e-09, "advantages/std": 0.29953670501708984, "advantages/var": 0.0897222376524951, "completions/clipped_ratio": 0.265625, "epoch": 4.481720430107527, "grad_norm": 56.101505276272796, "learning_rate": 5.881555966571328e-08, "loss": -5.2574, "num_tokens": 353977036.0, "residual_var": 0.03644968941807747, "reward": 0.6484375, "reward_std": 0.21775254607200623, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4779251217842102, "rho2": 0.5937497019767761, "step": 780 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 3.246167126089907e-09, "advantages/std": 0.3586239218711853, "advantages/var": 0.12861111733827002, "completions/clipped_ratio": 0.28125, "epoch": 4.487455197132617, "grad_norm": 77.85480428897213, "learning_rate": 5.760845953596527e-08, "loss": -6.8554, "num_tokens": 354506380.0, "residual_var": 0.04421013966202736, "reward": 0.69140625, "reward_std": 0.27413833141326904, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.4623647928237915, "rho2": 0.6562495231628418, "step": 781 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 6.884795088094659e-09, "advantages/std": 0.27054473757743835, "advantages/var": 0.07319445503084498, "completions/clipped_ratio": 0.078125, "epoch": 4.493189964157706, "grad_norm": 52.62834254653075, "learning_rate": 5.6413507679463066e-08, "loss": -5.6182, "num_tokens": 355027997.0, "residual_var": 0.04117190092802048, "reward": 0.755859375, "reward_std": 0.17719264328479767, "rewards/drgrpo_math_reward/mean": 0.755859375, "rewards/drgrpo_math_reward/std": 0.42999663949012756, "rho2": 0.4374997615814209, "step": 782 }, { "advantages/mean": 2.3283064365386963e-09, "advantages/snr": 7.175823393469098e-09, "advantages/std": 0.32446539402008057, "advantages/var": 0.10527779191660613, "completions/clipped_ratio": 0.40625, "epoch": 4.498924731182796, "grad_norm": 63.18795843182846, "learning_rate": 5.523071950020908e-08, "loss": -5.1869, "num_tokens": 355568936.0, "residual_var": 0.04605906456708908, "reward": 0.73828125, "reward_std": 0.23482444882392883, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "rho2": 0.5624997615814209, "step": 783 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.5108045973188897e-09, "advantages/std": 0.3082207143306732, "advantages/var": 0.09500000874251047, "completions/clipped_ratio": 0.25, "epoch": 4.504659498207886, "grad_norm": 56.483891585763374, "learning_rate": 5.4060110245405136e-08, "loss": -6.0161, "num_tokens": 356096241.0, "residual_var": 0.03562503680586815, "reward": 0.76171875, "reward_std": 0.23394422233104706, "rewards/drgrpo_math_reward/mean": 0.76171875, "rewards/drgrpo_math_reward/std": 0.42644867300987244, "rho2": 0.6249997019767761, "step": 784 }, { "advantages/mean": 6.402842700481415e-10, "advantages/snr": 2.159420238876808e-09, "advantages/std": 0.29650747776031494, "advantages/var": 0.08791668436778366, "completions/clipped_ratio": -0.0546875, "epoch": 4.510394265232975, "grad_norm": 59.40030916367403, "learning_rate": 5.2901695005255765e-08, "loss": -2.617, "num_tokens": 356603036.0, "residual_var": 0.049453169107437134, "reward": 0.771484375, "reward_std": 0.19158118963241577, "rewards/drgrpo_math_reward/mean": 0.771484375, "rewards/drgrpo_math_reward/std": 0.4202871024608612, "rho2": 0.43749967217445374, "step": 785 }, { "advantages/mean": -1.7462298274040222e-09, "advantages/snr": 8.746346434406952e-09, "advantages/std": 0.19965247809886932, "advantages/var": 0.039861112011019495, "completions/clipped_ratio": 0.078125, "epoch": 4.516129032258064, "grad_norm": 36.49826611194047, "learning_rate": 5.175548871277358e-08, "loss": -2.5398, "num_tokens": 357119582.0, "residual_var": 0.02865019254386425, "reward": 0.849609375, "reward_std": 0.09820909798145294, "rewards/drgrpo_math_reward/mean": 0.849609375, "rewards/drgrpo_math_reward/std": 0.35780346393585205, "rho2": 0.2812498211860657, "step": 786 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.057582047799814e-09, "advantages/std": 0.3045944571495056, "advantages/var": 0.09277778332620201, "completions/clipped_ratio": 0.3203125, "epoch": 4.521863799283154, "grad_norm": 56.25523204437013, "learning_rate": 5.06215061435874e-08, "loss": -5.5809, "num_tokens": 357664392.0, "residual_var": 0.052187543362379074, "reward": 0.578125, "reward_std": 0.19631177186965942, "rewards/drgrpo_math_reward/mean": 0.578125, "rewards/drgrpo_math_reward/std": 0.49434176087379456, "rho2": 0.4374997019767761, "step": 787 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 4.18542046102e-09, "advantages/std": 0.3059593439102173, "advantages/var": 0.09361112012597061, "completions/clipped_ratio": 0.1796875, "epoch": 4.527598566308244, "grad_norm": 61.863643936229394, "learning_rate": 4.9499761915750335e-08, "loss": -7.9171, "num_tokens": 358202873.0, "residual_var": 0.04680559039115906, "reward": 0.66015625, "reward_std": 0.2049366980791092, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.4741191864013672, "rho2": 0.4999997615814209, "step": 788 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 8.638841149180537e-10, "advantages/std": 0.2695160508155823, "advantages/var": 0.07263890164722753, "completions/clipped_ratio": 0.1171875, "epoch": 4.533333333333333, "grad_norm": 48.613961063692116, "learning_rate": 4.8390270489553245e-08, "loss": -5.9743, "num_tokens": 358725128.0, "residual_var": 0.03631947562098503, "reward": 0.841796875, "reward_std": 0.17913666367530823, "rewards/drgrpo_math_reward/mean": 0.841796875, "rewards/drgrpo_math_reward/std": 0.36528825759887695, "rho2": 0.4999997615814209, "step": 789 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 2.8089116227885588e-09, "advantages/std": 0.2901149392127991, "advantages/var": 0.0841666779544461, "completions/clipped_ratio": 0.1015625, "epoch": 4.539068100358423, "grad_norm": 60.26928098921365, "learning_rate": 4.729304616733687e-08, "loss": -8.396, "num_tokens": 359254743.0, "residual_var": 0.03682294860482216, "reward": 0.6875, "reward_std": 0.20442989468574524, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4639657139778137, "rho2": 0.5624997615814209, "step": 790 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 6.003848609776889e-09, "advantages/std": 0.31024184823036194, "advantages/var": 0.09625000439339093, "completions/clipped_ratio": 0.125, "epoch": 4.544802867383512, "grad_norm": 61.017490873564505, "learning_rate": 4.620810309330803e-08, "loss": -7.7131, "num_tokens": 359778875.0, "residual_var": 0.054140664637088776, "reward": 0.802734375, "reward_std": 0.202193945646286, "rewards/drgrpo_math_reward/mean": 0.802734375, "rewards/drgrpo_math_reward/std": 0.3983237147331238, "rho2": 0.4374997615814209, "step": 791 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 9.354894458348092e-10, "advantages/std": 0.24888643622398376, "advantages/var": 0.06194445813627514, "completions/clipped_ratio": 0.2421875, "epoch": 4.550537634408602, "grad_norm": 47.4949500366903, "learning_rate": 4.513545525335705e-08, "loss": -1.2865, "num_tokens": 360306207.0, "residual_var": 0.034843768924474716, "reward": 0.82421875, "reward_std": 0.1586260050535202, "rewards/drgrpo_math_reward/mean": 0.82421875, "rewards/drgrpo_math_reward/std": 0.3810062110424042, "rho2": 0.4374998211860657, "step": 792 }, { "advantages/mean": 1.280568540096283e-09, "advantages/snr": 3.628046348027306e-09, "advantages/std": 0.3529636561870575, "advantages/var": 0.12458334258893533, "completions/clipped_ratio": 0.109375, "epoch": 4.5562724014336915, "grad_norm": 64.58587538777252, "learning_rate": 4.407511647487816e-08, "loss": -5.6907, "num_tokens": 360838161.0, "residual_var": 0.03893233463168144, "reward": 0.806640625, "reward_std": 0.2852780222892761, "rewards/drgrpo_math_reward/mean": 0.806640625, "rewards/drgrpo_math_reward/std": 0.39531853795051575, "rho2": 0.6874997019767761, "step": 793 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 3.608481574425997e-09, "advantages/std": 0.2903542220592499, "advantages/var": 0.08430557426763219, "completions/clipped_ratio": 0.1875, "epoch": 4.5620071684587815, "grad_norm": 56.427951400461616, "learning_rate": 4.30271004265903e-08, "loss": -5.8187, "num_tokens": 361362161.0, "residual_var": 0.03161461278796196, "reward": 0.810546875, "reward_std": 0.2142867147922516, "rewards/drgrpo_math_reward/mean": 0.810546875, "rewards/drgrpo_math_reward/std": 0.3922513723373413, "rho2": 0.6249997615814209, "step": 794 }, { "advantages/mean": 2.3283064365386963e-10, "advantages/snr": 7.042372651540764e-10, "advantages/std": 0.330613911151886, "advantages/var": 0.10930555824714716, "completions/clipped_ratio": 0.1953125, "epoch": 4.567741935483871, "grad_norm": 66.2271356973523, "learning_rate": 4.199142061836136e-08, "loss": -8.4452, "num_tokens": 361873477.0, "residual_var": 0.034158021211624146, "reward": 0.751953125, "reward_std": 0.2565297782421112, "rewards/drgrpo_math_reward/mean": 0.751953125, "rewards/drgrpo_math_reward/std": 0.4323015511035919, "rho2": 0.6874997615814209, "step": 795 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.4101948193331644e-09, "advantages/std": 0.27309951186180115, "advantages/var": 0.07458334337915407, "completions/clipped_ratio": 0.1328125, "epoch": 4.573476702508961, "grad_norm": 51.19551680364319, "learning_rate": 4.096809040103444e-08, "loss": -4.6844, "num_tokens": 362405515.0, "residual_var": 0.041953153908252716, "reward": 0.587890625, "reward_std": 0.17369882762432098, "rewards/drgrpo_math_reward/mean": 0.587890625, "rewards/drgrpo_math_reward/std": 0.49269601702690125, "rho2": 0.4374997615814209, "step": 796 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 3.5504638069721177e-09, "advantages/std": 0.2950988709926605, "advantages/var": 0.0870833436611429, "completions/clipped_ratio": 0.203125, "epoch": 4.57921146953405, "grad_norm": 53.011989561894296, "learning_rate": 3.995712296625475e-08, "loss": -5.2986, "num_tokens": 362920507.0, "residual_var": 0.04626305401325226, "reward": 0.720703125, "reward_std": 0.20145057141780853, "rewards/drgrpo_math_reward/mean": 0.720703125, "rewards/drgrpo_math_reward/std": 0.44909247756004333, "rho2": 0.4687497913837433, "step": 797 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 3.963971636371369e-09, "advantages/std": 0.2936835289001465, "advantages/var": 0.08625001514724318, "completions/clipped_ratio": 0.28125, "epoch": 4.58494623655914, "grad_norm": 58.154867660316036, "learning_rate": 3.895853134630034e-08, "loss": -6.1182, "num_tokens": 363455899.0, "residual_var": 0.04582035914063454, "reward": 0.681640625, "reward_std": 0.19007550179958344, "rewards/drgrpo_math_reward/mean": 0.681640625, "rewards/drgrpo_math_reward/std": 0.46629536151885986, "rho2": 0.4687495827674866, "step": 798 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.01062562920167e-09, "advantages/std": 0.30934518575668335, "advantages/var": 0.09569444395083693, "completions/clipped_ratio": 0.171875, "epoch": 4.59068100358423, "grad_norm": 56.97498107169034, "learning_rate": 3.797232841391407e-08, "loss": -7.6105, "num_tokens": 363989348.0, "residual_var": 0.03887590765953064, "reward": 0.763671875, "reward_std": 0.23215298354625702, "rewards/drgrpo_math_reward/mean": 0.763671875, "rewards/drgrpo_math_reward/std": 0.42524150013923645, "rho2": 0.5937497019767761, "step": 799 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 7.947162817853429e-10, "advantages/std": 0.29297327995300293, "advantages/var": 0.08583334276642063, "completions/clipped_ratio": 0.203125, "epoch": 4.596415770609319, "grad_norm": 56.62899267901839, "learning_rate": 3.699852688213745e-08, "loss": -5.6404, "num_tokens": 364515532.0, "residual_var": 0.04559898003935814, "reward": 0.72265625, "reward_std": 0.1952083706855774, "rewards/drgrpo_math_reward/mean": 0.72265625, "rewards/drgrpo_math_reward/std": 0.4481254518032074, "rho2": 0.46874985098838806, "step": 800 }, { "advantages/mean": 1.5133991837501526e-09, "advantages/snr": 4.9910362411566714e-09, "advantages/std": 0.3032234311103821, "advantages/var": 0.09194444917435263, "completions/clipped_ratio": 0.015625, "epoch": 4.602150537634409, "grad_norm": 61.8067213448404, "learning_rate": 3.6037139304146756e-08, "loss": -5.3575, "num_tokens": 365036396.0, "residual_var": 0.04597226157784462, "reward": 0.7734375, "reward_std": 0.207758367061615, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4190165400505066, "rho2": 0.4999997019767761, "step": 801 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.084996173493076e-09, "advantages/std": 0.32188680768013, "advantages/var": 0.103611116958505, "completions/clipped_ratio": 0.328125, "epoch": 4.607885304659498, "grad_norm": 62.88498362164929, "learning_rate": 3.508817807309094e-08, "loss": -6.3987, "num_tokens": 365572088.0, "residual_var": 0.04532988741993904, "reward": 0.703125, "reward_std": 0.23509034514427185, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45732781291007996, "rho2": 0.5624998807907104, "step": 802 }, { "advantages/mean": 8.149072527885437e-10, "advantages/snr": 2.5282707863187617e-09, "advantages/std": 0.32231801748275757, "advantages/var": 0.10388890439401521, "completions/clipped_ratio": -0.0078125, "epoch": 4.613620071684588, "grad_norm": 56.645852976263214, "learning_rate": 3.4151655421932654e-08, "loss": -6.7071, "num_tokens": 366086304.0, "residual_var": 0.04545143246650696, "reward": 0.66015625, "reward_std": 0.22693631052970886, "rewards/drgrpo_math_reward/mean": 0.66015625, "rewards/drgrpo_math_reward/std": 0.4741191864013672, "rho2": 0.5624997019767761, "step": 803 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.6378526310389964e-09, "advantages/std": 0.28431206941604614, "advantages/var": 0.08083335281563464, "completions/clipped_ratio": 0.21875, "epoch": 4.619354838709677, "grad_norm": 56.79874969898444, "learning_rate": 3.322758342329002e-08, "loss": -7.9438, "num_tokens": 366601519.0, "residual_var": 0.04799482226371765, "reward": 0.73046875, "reward_std": 0.17706844210624695, "rewards/drgrpo_math_reward/mean": 0.73046875, "rewards/drgrpo_math_reward/std": 0.44415023922920227, "rho2": 0.4062497615814209, "step": 804 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.287020412877018e-09, "advantages/std": 0.28333336114883423, "advantages/var": 0.08027779353989573, "completions/clipped_ratio": 0.1171875, "epoch": 4.625089605734767, "grad_norm": 54.32310284410401, "learning_rate": 3.2315973989280654e-08, "loss": -7.4377, "num_tokens": 367111095.0, "residual_var": 0.03763023391366005, "reward": 0.6875, "reward_std": 0.19321173429489136, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4639657139778137, "rho2": 0.5312498211860657, "step": 805 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.710901066001239e-09, "advantages/std": 0.2576604187488556, "advantages/var": 0.06638889138983561, "completions/clipped_ratio": 0.1875, "epoch": 4.630824372759856, "grad_norm": 49.1654623653037, "learning_rate": 3.141683887136892e-08, "loss": -5.5717, "num_tokens": 367639538.0, "residual_var": 0.03734377399086952, "reward": 0.69140625, "reward_std": 0.16003043949604034, "rewards/drgrpo_math_reward/mean": 0.69140625, "rewards/drgrpo_math_reward/std": 0.4623647928237915, "rho2": 0.4374997615814209, "step": 806 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 2.0163722800116715e-09, "advantages/std": 0.2886751592159271, "advantages/var": 0.08333334754834087, "completions/clipped_ratio": 0.203125, "epoch": 4.636559139784946, "grad_norm": 53.29764112678087, "learning_rate": 3.053018966021392e-08, "loss": -3.2225, "num_tokens": 368165471.0, "residual_var": 0.04166669398546219, "reward": 0.67578125, "reward_std": 0.19304540753364563, "rewards/drgrpo_math_reward/mean": 0.67578125, "rewards/drgrpo_math_reward/std": 0.4685399830341339, "rho2": 0.4999998211860657, "step": 807 }, { "advantages/mean": 1.979060471057892e-09, "advantages/snr": 6.844268449359868e-09, "advantages/std": 0.28915587067604065, "advantages/var": 0.08361111754641914, "completions/clipped_ratio": 0.203125, "epoch": 4.6422939068100355, "grad_norm": 55.49211416381325, "learning_rate": 2.9656037785520395e-08, "loss": -2.2882, "num_tokens": 368695721.0, "residual_var": 0.04180559143424034, "reward": 0.60546875, "reward_std": 0.19310419261455536, "rewards/drgrpo_math_reward/mean": 0.60546875, "rewards/drgrpo_math_reward/std": 0.4892277717590332, "rho2": 0.4999997019767761, "step": 808 }, { "advantages/mean": -8.731149137020111e-10, "advantages/snr": 2.8143040358329167e-09, "advantages/std": 0.31024184823036194, "advantages/var": 0.09625000439339093, "completions/clipped_ratio": 0.15625, "epoch": 4.6480286738351255, "grad_norm": 61.19256989332019, "learning_rate": 2.8794394515890607e-08, "loss": -9.1453, "num_tokens": 369224590.0, "residual_var": 0.03910159319639206, "reward": 0.779296875, "reward_std": 0.22468125820159912, "rewards/drgrpo_math_reward/mean": 0.779296875, "rewards/drgrpo_math_reward/std": 0.4151262938976288, "rho2": 0.5937497615814209, "step": 809 }, { "advantages/mean": -2.3283064365386963e-09, "advantages/snr": 8.408853958835503e-09, "advantages/std": 0.2768874764442444, "advantages/var": 0.07666667461166199, "completions/clipped_ratio": 0.1875, "epoch": 4.6537634408602155, "grad_norm": 51.37487962191091, "learning_rate": 2.79452709586806e-08, "loss": -5.4433, "num_tokens": 369751011.0, "residual_var": 0.033541690558195114, "reward": 0.73828125, "reward_std": 0.19585810601711273, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "rho2": 0.5624998211860657, "step": 810 }, { "advantages/mean": 1.5133991837501526e-09, "advantages/snr": 5.7201105471097645e-09, "advantages/std": 0.26457515358924866, "advantages/var": 0.07000001189677452, "completions/clipped_ratio": -0.078125, "epoch": 4.659498207885305, "grad_norm": 51.620401680676586, "learning_rate": 2.7108678059855062e-08, "loss": -3.7468, "num_tokens": 370269434.0, "residual_var": 0.037187520414590836, "reward": 0.75390625, "reward_std": 0.17044177651405334, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "rho2": 0.46874985098838806, "step": 811 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 4.0369000838057454e-09, "advantages/std": 0.2595402002334595, "advantages/var": 0.06736111553722424, "completions/clipped_ratio": 0.171875, "epoch": 4.665232974910394, "grad_norm": 49.42306590182485, "learning_rate": 2.6284626603848114e-08, "loss": -3.4845, "num_tokens": 370797084.0, "residual_var": 0.03789064660668373, "reward": 0.732421875, "reward_std": 0.1702524572610855, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "rho2": 0.4374998211860657, "step": 812 }, { "advantages/mean": -1.979060471057892e-09, "advantages/snr": 7.089960964148658e-09, "advantages/std": 0.2791355848312378, "advantages/var": 0.07791667471907715, "completions/clipped_ratio": 0.109375, "epoch": 4.670967741935484, "grad_norm": 53.48156843667528, "learning_rate": 2.5473127213422762e-08, "loss": -4.6014, "num_tokens": 371317369.0, "residual_var": 0.03408857807517052, "reward": 0.837890625, "reward_std": 0.1955866813659668, "rewards/drgrpo_math_reward/mean": 0.837890625, "rewards/drgrpo_math_reward/std": 0.3689115643501282, "rho2": 0.5624996423721313, "step": 813 }, { "advantages/mean": -2.0954757928848267e-09, "advantages/snr": 6.952803372096282e-09, "advantages/std": 0.3013857305049896, "advantages/var": 0.09083335855202623, "completions/clipped_ratio": 0.015625, "epoch": 4.676702508960574, "grad_norm": 58.107979913416386, "learning_rate": 2.4674190349535217e-08, "loss": -6.9159, "num_tokens": 371822866.0, "residual_var": 0.0369010753929615, "reward": 0.7734375, "reward_std": 0.22272510826587677, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4190165400505066, "rho2": 0.5937497615814209, "step": 814 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 3.782545504887009e-09, "advantages/std": 0.307769775390625, "advantages/var": 0.09472223464399576, "completions/clipped_ratio": 0.1171875, "epoch": 4.682437275985663, "grad_norm": 58.409129599531276, "learning_rate": 2.3887826311198898e-08, "loss": -9.1883, "num_tokens": 372336300.0, "residual_var": 0.04440108314156532, "reward": 0.7734375, "reward_std": 0.21561717987060547, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4190165400505066, "rho2": 0.5312496423721313, "step": 815 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 3.5476359092978686e-09, "advantages/std": 0.2953341007232666, "advantages/var": 0.08722223105002058, "completions/clipped_ratio": 0.0, "epoch": 4.688172043010753, "grad_norm": 56.08984764162968, "learning_rate": 2.311404523535243e-08, "loss": -2.2204, "num_tokens": 372847234.0, "residual_var": 0.032708365470170975, "reward": 0.7734375, "reward_std": 0.22354966402053833, "rewards/drgrpo_math_reward/mean": 0.7734375, "rewards/drgrpo_math_reward/std": 0.4190165400505066, "rho2": 0.6249997615814209, "step": 816 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 1.8982440434672016e-09, "advantages/std": 0.30663949251174927, "advantages/var": 0.09402777836786314, "completions/clipped_ratio": 0.09375, "epoch": 4.693906810035842, "grad_norm": 57.11403696825086, "learning_rate": 2.2352857096728627e-08, "loss": -7.2887, "num_tokens": 373371828.0, "residual_var": 0.03526047244668007, "reward": 0.748046875, "reward_std": 0.2241336703300476, "rewards/drgrpo_math_reward/mean": 0.748046875, "rewards/drgrpo_math_reward/std": 0.43455907702445984, "rho2": 0.6249995231628418, "step": 817 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 5.839947803689126e-09, "advantages/std": 0.23921169340610504, "advantages/var": 0.0572222342622164, "completions/clipped_ratio": -0.03125, "epoch": 4.699641577060932, "grad_norm": 44.461440144410076, "learning_rate": 2.1604271707726497e-08, "loss": -4.5627, "num_tokens": 373876718.0, "residual_var": 0.02682293765246868, "reward": 0.85546875, "reward_std": 0.16477538645267487, "rewards/drgrpo_math_reward/mean": 0.85546875, "rewards/drgrpo_math_reward/std": 0.35197147727012634, "rho2": 0.5312497615814209, "step": 818 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.4193368389137195e-09, "advantages/std": 0.32808369398117065, "advantages/var": 0.10763891025633043, "completions/clipped_ratio": 0.1171875, "epoch": 4.705376344086021, "grad_norm": 64.62213331530302, "learning_rate": 2.086829871828377e-08, "loss": -4.3331, "num_tokens": 374396558.0, "residual_var": 0.05381947383284569, "reward": 0.732421875, "reward_std": 0.24131838977336884, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "rho2": 0.4999998211860657, "step": 819 }, { "advantages/mean": 2.2118911147117615e-09, "advantages/snr": 7.766467464262707e-09, "advantages/std": 0.28480014204978943, "advantages/var": 0.08111112091158024, "completions/clipped_ratio": 0.0390625, "epoch": 4.711111111111111, "grad_norm": 51.591501179833735, "learning_rate": 2.0144947615753138e-08, "loss": -3.6626, "num_tokens": 374910022.0, "residual_var": 0.04309030994772911, "reward": 0.7421875, "reward_std": 0.18631097674369812, "rewards/drgrpo_math_reward/mean": 0.7421875, "rewards/drgrpo_math_reward/std": 0.43785804510116577, "rho2": 0.4687497615814209, "step": 820 }, { "advantages/mean": 7.566995918750763e-10, "advantages/snr": 2.871472697533992e-09, "advantages/std": 0.2635231614112854, "advantages/var": 0.06944445660019838, "completions/clipped_ratio": 0.15625, "epoch": 4.7168458781362, "grad_norm": 49.828706481832796, "learning_rate": 1.9434227724779984e-08, "loss": -7.7346, "num_tokens": 375440602.0, "residual_var": 0.036892395466566086, "reward": 0.6484375, "reward_std": 0.17134325206279755, "rewards/drgrpo_math_reward/mean": 0.6484375, "rewards/drgrpo_math_reward/std": 0.4779251217842102, "rho2": 0.46874967217445374, "step": 821 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 7.317160889482837e-10, "advantages/std": 0.3181980550289154, "advantages/var": 0.10125000222418468, "completions/clipped_ratio": 0.328125, "epoch": 4.72258064516129, "grad_norm": 60.406859221036704, "learning_rate": 1.8736148207181947e-08, "loss": -7.8622, "num_tokens": 375981059.0, "residual_var": 0.047460973262786865, "reward": 0.634765625, "reward_std": 0.22573891282081604, "rewards/drgrpo_math_reward/mean": 0.634765625, "rewards/drgrpo_math_reward/std": 0.4819667339324951, "rho2": 0.5312497615814209, "step": 822 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.0474136295424727e-09, "advantages/std": 0.3411581814289093, "advantages/var": 0.1163889047558806, "completions/clipped_ratio": 0.328125, "epoch": 4.7283154121863795, "grad_norm": 67.83444774328031, "learning_rate": 1.8050718061830894e-08, "loss": -13.7777, "num_tokens": 376525584.0, "residual_var": 0.032734423875808716, "reward": 0.6953125, "reward_std": 0.2759811580181122, "rewards/drgrpo_math_reward/mean": 0.6953125, "rewards/drgrpo_math_reward/std": 0.4607250988483429, "rho2": 0.7187496423721313, "step": 823 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.0598729494960375e-09, "advantages/std": 0.30436640977859497, "advantages/var": 0.09263891140151159, "completions/clipped_ratio": 0.234375, "epoch": 4.7340501792114695, "grad_norm": 58.73428826799727, "learning_rate": 1.7377946124536804e-08, "loss": -9.764, "num_tokens": 377051455.0, "residual_var": 0.03763459622859955, "reward": 0.669921875, "reward_std": 0.21901626884937286, "rewards/drgrpo_math_reward/mean": 0.669921875, "rewards/drgrpo_math_reward/std": 0.47070086002349854, "rho2": 0.5937495827674866, "step": 824 }, { "advantages/mean": -9.313225746154785e-10, "advantages/snr": 3.1434589139575635e-09, "advantages/std": 0.2962731719017029, "advantages/var": 0.08777779238869599, "completions/clipped_ratio": 0.203125, "epoch": 4.7397849462365595, "grad_norm": 52.722343544998346, "learning_rate": 1.6717841067934392e-08, "loss": -4.9873, "num_tokens": 377570781.0, "residual_var": 0.04114586487412453, "reward": 0.8359375, "reward_std": 0.20403945446014404, "rewards/drgrpo_math_reward/mean": 0.8359375, "rewards/drgrpo_math_reward/std": 0.37069445848464966, "rho2": 0.5312498211860657, "step": 825 }, { "advantages/mean": -5.820766091346741e-10, "advantages/snr": 1.8588485979747471e-09, "advantages/std": 0.3131382465362549, "advantages/var": 0.09805556144380034, "completions/clipped_ratio": 0.0859375, "epoch": 4.745519713261649, "grad_norm": 61.883696852531806, "learning_rate": 1.607041140137033e-08, "loss": -5.0523, "num_tokens": 378108895.0, "residual_var": 0.03677086532115936, "reward": 0.703125, "reward_std": 0.23194855451583862, "rewards/drgrpo_math_reward/mean": 0.703125, "rewards/drgrpo_math_reward/std": 0.45732781291007996, "rho2": 0.6249997615814209, "step": 826 }, { "advantages/mean": 0.0, "advantages/snr": 0.0, "advantages/std": 0.27738863229751587, "advantages/var": 0.07694445332788646, "completions/clipped_ratio": 0.0078125, "epoch": 4.751254480286739, "grad_norm": 56.79356343398058, "learning_rate": 1.543566547079467e-08, "loss": -3.9235, "num_tokens": 378622683.0, "residual_var": 0.028854191303253174, "reward": 0.68359375, "reward_std": 0.20521506667137146, "rewards/drgrpo_math_reward/mean": 0.68359375, "rewards/drgrpo_math_reward/std": 0.46552830934524536, "rho2": 0.6249997615814209, "step": 827 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.6041301117560946e-09, "advantages/std": 0.26822465658187866, "advantages/var": 0.07194446639846674, "completions/clipped_ratio": 0.1875, "epoch": 4.756989247311828, "grad_norm": 51.07633044157558, "learning_rate": 1.481361145865223e-08, "loss": -4.5254, "num_tokens": 379160463.0, "residual_var": 0.04046877101063728, "reward": 0.7578125, "reward_std": 0.17200195789337158, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "rho2": 0.4374998211860657, "step": 828 }, { "advantages/mean": -1.6880221664905548e-09, "advantages/snr": 6.317755383319019e-09, "advantages/std": 0.2671869993209839, "advantages/var": 0.07138889260615144, "completions/clipped_ratio": 0.09375, "epoch": 4.762724014336918, "grad_norm": 51.93637961333321, "learning_rate": 1.4204257383778062e-08, "loss": -5.7788, "num_tokens": 379686483.0, "residual_var": 0.03569447621703148, "reward": 0.69921875, "reward_std": 0.18119356036186218, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "rho2": 0.4999997615814209, "step": 829 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.5009621524442222e-09, "advantages/std": 0.31024184823036194, "advantages/var": 0.09625000439339093, "completions/clipped_ratio": 0.296875, "epoch": 4.768458781362007, "grad_norm": 59.396927371770495, "learning_rate": 1.3607611101293382e-08, "loss": -6.1665, "num_tokens": 380231694.0, "residual_var": 0.04511723294854164, "reward": 0.755859375, "reward_std": 0.21372802555561066, "rewards/drgrpo_math_reward/mean": 0.755859375, "rewards/drgrpo_math_reward/std": 0.42999663949012756, "rho2": 0.5312496423721313, "step": 830 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.1685393229733465e-09, "advantages/std": 0.32210248708724976, "advantages/var": 0.1037500121877919, "completions/clipped_ratio": -0.0390625, "epoch": 4.774193548387097, "grad_norm": 61.73459272613786, "learning_rate": 1.3023680302504336e-08, "loss": -5.8618, "num_tokens": 380749977.0, "residual_var": 0.051875039935112, "reward": 0.779296875, "reward_std": 0.2161424160003662, "rewards/drgrpo_math_reward/mean": 0.779296875, "rewards/drgrpo_math_reward/std": 0.4151262938976288, "rho2": 0.4999997019767761, "step": 831 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 8.647113004019937e-10, "advantages/std": 0.2692582309246063, "advantages/var": 0.07249999492064862, "completions/clipped_ratio": 0.2578125, "epoch": 4.779928315412186, "grad_norm": 49.4770426754396, "learning_rate": 1.2452472514803636e-08, "loss": -5.7302, "num_tokens": 381287590.0, "residual_var": 0.029453150928020477, "reward": 0.71875, "reward_std": 0.19492103159427643, "rewards/drgrpo_math_reward/mean": 0.71875, "rewards/drgrpo_math_reward/std": 0.45004892349243164, "rho2": 0.5937497615814209, "step": 832 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 3.886512603982214e-09, "advantages/std": 0.29953670501708984, "advantages/var": 0.0897222376524951, "completions/clipped_ratio": 0.125, "epoch": 4.785663082437276, "grad_norm": 50.54478114450151, "learning_rate": 1.1893995101572406e-08, "loss": -2.2417, "num_tokens": 381801749.0, "residual_var": 0.03084205463528633, "reward": 0.78515625, "reward_std": 0.2311471402645111, "rewards/drgrpo_math_reward/mean": 0.78515625, "rewards/drgrpo_math_reward/std": 0.4111155867576599, "rho2": 0.6562496423721313, "step": 833 }, { "advantages/mean": -2.5029294192790985e-09, "advantages/snr": 7.65366520501752e-09, "advantages/std": 0.32702362537384033, "advantages/var": 0.10694445155264987, "completions/clipped_ratio": 0.3203125, "epoch": 4.791397849462365, "grad_norm": 63.714215960119446, "learning_rate": 1.1348255262086048e-08, "loss": -8.3573, "num_tokens": 382342034.0, "residual_var": 0.030078157782554626, "reward": 0.7578125, "reward_std": 0.2650821805000305, "rewards/drgrpo_math_reward/mean": 0.7578125, "rewards/drgrpo_math_reward/std": 0.42882615327835083, "rho2": 0.7187497615814209, "step": 834 }, { "advantages/mean": -2.2118911147117615e-09, "advantages/snr": 7.8959742990902e-09, "advantages/std": 0.28012895584106445, "advantages/var": 0.07847223190060504, "completions/clipped_ratio": 0.2109375, "epoch": 4.797132616487455, "grad_norm": 53.44110491271108, "learning_rate": 1.0815260031421191e-08, "loss": -5.2229, "num_tokens": 382861646.0, "residual_var": 0.04904516041278839, "reward": 0.669921875, "reward_std": 0.1777593195438385, "rewards/drgrpo_math_reward/mean": 0.669921875, "rewards/drgrpo_math_reward/std": 0.47070086002349854, "rho2": 0.37499985098838806, "step": 835 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 8.509654893548286e-10, "advantages/std": 0.27360761165618896, "advantages/var": 0.07486112515620391, "completions/clipped_ratio": 0.0859375, "epoch": 4.802867383512545, "grad_norm": 52.342468327536196, "learning_rate": 1.029501628036511e-08, "loss": -7.466, "num_tokens": 383381337.0, "residual_var": 0.03976999968290329, "reward": 0.740234375, "reward_std": 0.1748373657464981, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "rho2": 0.4687497615814209, "step": 836 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.044464751239986e-09, "advantages/std": 0.34540635347366333, "advantages/var": 0.11930554901997326, "completions/clipped_ratio": 0.09375, "epoch": 4.808602150537634, "grad_norm": 61.8147405668089, "learning_rate": 9.787530715326786e-09, "loss": -7.6039, "num_tokens": 383904046.0, "residual_var": 0.04473963379859924, "reward": 0.705078125, "reward_std": 0.2671964168548584, "rewards/drgrpo_math_reward/mean": 0.705078125, "rewards/drgrpo_math_reward/std": 0.4564536213874817, "rho2": 0.6249996423721313, "step": 837 }, { "advantages/mean": -1.7462298274040222e-09, "advantages/snr": 5.285138213587997e-09, "advantages/std": 0.33040380477905273, "advantages/var": 0.10916667421247439, "completions/clipped_ratio": 0.1484375, "epoch": 4.8143369175627235, "grad_norm": 67.71882875493003, "learning_rate": 9.292809878251096e-09, "loss": -6.6656, "num_tokens": 384438594.0, "residual_var": 0.05117190629243851, "reward": 0.70703125, "reward_std": 0.24398653209209442, "rewards/drgrpo_math_reward/mean": 0.70703125, "rewards/drgrpo_math_reward/std": 0.455569326877594, "rho2": 0.5312498211860657, "step": 838 }, { "advantages/mean": -2.561137080192566e-09, "advantages/snr": 7.756471945995178e-09, "advantages/std": 0.33019354939460754, "advantages/var": 0.10902778006180913, "completions/clipped_ratio": 0.0234375, "epoch": 4.8200716845878135, "grad_norm": 60.68996055558624, "learning_rate": 8.81086014653365e-09, "loss": -7.9096, "num_tokens": 384949021.0, "residual_var": 0.03407122567296028, "reward": 0.787109375, "reward_std": 0.25422054529190063, "rewards/drgrpo_math_reward/mean": 0.787109375, "rewards/drgrpo_math_reward/std": 0.409751296043396, "rho2": 0.6874996423721313, "step": 839 }, { "advantages/mean": -8.149072527885437e-10, "advantages/snr": 2.3402771619782346e-09, "advantages/std": 0.3482097089290619, "advantages/var": 0.121250001392462, "completions/clipped_ratio": 0.2734375, "epoch": 4.825806451612904, "grad_norm": 61.32921441995904, "learning_rate": 8.341687732939418e-09, "loss": -10.0389, "num_tokens": 385481263.0, "residual_var": 0.04546881839632988, "reward": 0.673828125, "reward_std": 0.2648661732673645, "rewards/drgrpo_math_reward/mean": 0.673828125, "rewards/drgrpo_math_reward/std": 0.4692695140838623, "rho2": 0.6249995231628418, "step": 840 }, { "advantages/mean": -2.3283064365386963e-10, "advantages/snr": 7.060337317482382e-10, "advantages/std": 0.3297726809978485, "advantages/var": 0.10875002113250876, "completions/clipped_ratio": 0.1171875, "epoch": 4.831541218637993, "grad_norm": 64.75640246575324, "learning_rate": 7.885298685522235e-09, "loss": -11.4112, "num_tokens": 385999027.0, "residual_var": 0.05097661167383194, "reward": 0.791015625, "reward_std": 0.23940080404281616, "rewards/drgrpo_math_reward/mean": 0.791015625, "rewards/drgrpo_math_reward/std": 0.40698084235191345, "rho2": 0.5312496423721313, "step": 841 }, { "advantages/mean": 1.0477378964424133e-09, "advantages/snr": 3.442357405244684e-09, "advantages/std": 0.3043663799762726, "advantages/var": 0.09263889325986074, "completions/clipped_ratio": 0.375, "epoch": 4.837275985663083, "grad_norm": 60.55335625525376, "learning_rate": 7.4416988875465325e-09, "loss": -4.4783, "num_tokens": 386549149.0, "residual_var": 0.043424516916275024, "reward": 0.666015625, "reward_std": 0.20800518989562988, "rewards/drgrpo_math_reward/mean": 0.666015625, "rewards/drgrpo_math_reward/std": 0.47209542989730835, "rho2": 0.5312497019767761, "step": 842 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 5.81808632564541e-09, "advantages/std": 0.28012895584106445, "advantages/var": 0.07847223190060504, "completions/clipped_ratio": 0.0859375, "epoch": 4.843010752688172, "grad_norm": 53.96834697165782, "learning_rate": 7.010894057412287e-09, "loss": -2.5612, "num_tokens": 387074176.0, "residual_var": 0.044140659272670746, "reward": 0.701171875, "reward_std": 0.17844422161579132, "rewards/drgrpo_math_reward/mean": 0.701171875, "rewards/drgrpo_math_reward/std": 0.45819199085235596, "rho2": 0.4374997019767761, "step": 843 }, { "advantages/mean": -4.656612873077393e-10, "advantages/snr": 1.6336477554033255e-09, "advantages/std": 0.285043865442276, "advantages/var": 0.08125000522627435, "completions/clipped_ratio": -0.0078125, "epoch": 4.848745519713262, "grad_norm": 59.44206636627981, "learning_rate": 6.592889748580521e-09, "loss": -5.3375, "num_tokens": 387591504.0, "residual_var": 0.030468782410025597, "reward": 0.830078125, "reward_std": 0.20779120922088623, "rewards/drgrpo_math_reward/mean": 0.830078125, "rewards/drgrpo_math_reward/std": 0.3759314715862274, "rho2": 0.6249997019767761, "step": 844 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 4.266717978514093e-10, "advantages/std": 0.2728451192378998, "advantages/var": 0.07444445909194375, "completions/clipped_ratio": 0.1015625, "epoch": 4.854480286738351, "grad_norm": 51.853120163190646, "learning_rate": 6.1876913495021446e-09, "loss": -6.5493, "num_tokens": 388111251.0, "residual_var": 0.04420141130685806, "reward": 0.73828125, "reward_std": 0.17110469937324524, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "rho2": 0.40624988079071045, "step": 845 }, { "advantages/mean": 5.820766091346741e-10, "advantages/snr": 2.3154780128487777e-09, "advantages/std": 0.25138506293296814, "advantages/var": 0.06319444986581235, "completions/clipped_ratio": 0.0859375, "epoch": 4.860215053763441, "grad_norm": 54.217196555926876, "learning_rate": 5.795304083548558e-09, "loss": -5.968, "num_tokens": 388616019.0, "residual_var": 0.03752173110842705, "reward": 0.689453125, "reward_std": 0.15422707796096802, "rewards/drgrpo_math_reward/mean": 0.689453125, "rewards/drgrpo_math_reward/std": 0.46317005157470703, "rho2": 0.4062497615814209, "step": 846 }, { "advantages/mean": -2.735760062932968e-09, "advantages/snr": 8.968230669068548e-09, "advantages/std": 0.3050501346588135, "advantages/var": 0.09305558465536024, "completions/clipped_ratio": 0.21875, "epoch": 4.86594982078853, "grad_norm": 61.0950760916386, "learning_rate": 5.415733008943713e-09, "loss": -6.3448, "num_tokens": 389157399.0, "residual_var": 0.03198787569999695, "reward": 0.6875, "reward_std": 0.2312389314174652, "rewards/drgrpo_math_reward/mean": 0.6875, "rewards/drgrpo_math_reward/std": 0.4639657139778137, "rho2": 0.6562497615814209, "step": 847 }, { "advantages/mean": -2.6775524020195007e-09, "advantages/snr": 8.437915877440932e-09, "advantages/std": 0.3173238933086395, "advantages/var": 0.10069445326455284, "completions/clipped_ratio": 0.140625, "epoch": 4.87168458781362, "grad_norm": 57.28398782619433, "learning_rate": 5.048983018699826e-09, "loss": -4.8473, "num_tokens": 389672913.0, "residual_var": 0.037760451436042786, "reward": 0.732421875, "reward_std": 0.23777979612350464, "rewards/drgrpo_math_reward/mean": 0.732421875, "rewards/drgrpo_math_reward/std": 0.4431293308734894, "rho2": 0.6249997615814209, "step": 848 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.603634514843846e-09, "advantages/std": 0.30345237255096436, "advantages/var": 0.09208334240680927, "completions/clipped_ratio": 0.328125, "epoch": 4.877419354838709, "grad_norm": 62.37322114897474, "learning_rate": 4.695058840553545e-09, "loss": -8.5867, "num_tokens": 390214715.0, "residual_var": 0.048919305205345154, "reward": 0.708984375, "reward_std": 0.20087936520576477, "rewards/drgrpo_math_reward/mean": 0.708984375, "rewards/drgrpo_math_reward/std": 0.45467492938041687, "rho2": 0.4687497615814209, "step": 849 }, { "advantages/mean": -1.979060471057892e-09, "advantages/snr": 7.465347016609195e-09, "advantages/std": 0.26509958505630493, "advantages/var": 0.07027778999702505, "completions/clipped_ratio": 0.203125, "epoch": 4.883154121863799, "grad_norm": 51.77382901909082, "learning_rate": 4.353965036905549e-09, "loss": -6.2416, "num_tokens": 390743552.0, "residual_var": 0.0417274534702301, "reward": 0.75, "reward_std": 0.16031214594841003, "rewards/drgrpo_math_reward/mean": 0.75, "rewards/drgrpo_math_reward/std": 0.43343618512153625, "rho2": 0.40624988079071045, "step": 850 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 3.736892640576727e-09, "advantages/std": 0.28037676215171814, "advantages/var": 0.07861112875468113, "completions/clipped_ratio": -0.0859375, "epoch": 4.888888888888889, "grad_norm": 54.45715459077204, "learning_rate": 4.025706004760931e-09, "loss": -5.284, "num_tokens": 391250255.0, "residual_var": 0.04667538031935692, "reward": 0.86328125, "reward_std": 0.17525018751621246, "rewards/drgrpo_math_reward/mean": 0.86328125, "rewards/drgrpo_math_reward/std": 0.3438861668109894, "rho2": 0.4062497615814209, "step": 851 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.5200680289906245e-09, "advantages/std": 0.26457515358924866, "advantages/var": 0.07000001189677452, "completions/clipped_ratio": 0.125, "epoch": 4.894623655913978, "grad_norm": 46.398448250196935, "learning_rate": 3.710285975673688e-09, "loss": -1.8984, "num_tokens": 391775564.0, "residual_var": 0.039375025779008865, "reward": 0.80859375, "reward_std": 0.1672700196504593, "rewards/drgrpo_math_reward/mean": 0.80859375, "rewards/drgrpo_math_reward/std": 0.3937928080558777, "rho2": 0.4374998211860657, "step": 852 }, { "advantages/mean": 6.984919309616089e-10, "advantages/snr": 2.8515811469894048e-09, "advantages/std": 0.24494898319244385, "advantages/var": 0.06000000436701214, "completions/clipped_ratio": 0.1796875, "epoch": 4.900358422939068, "grad_norm": 46.012492537839115, "learning_rate": 3.407709015691096e-09, "loss": -3.8225, "num_tokens": 392301044.0, "residual_var": 0.0337500236928463, "reward": 0.74609375, "reward_std": 0.15583032369613647, "rewards/drgrpo_math_reward/mean": 0.74609375, "rewards/drgrpo_math_reward/std": 0.43567025661468506, "rho2": 0.4374997615814209, "step": 853 }, { "advantages/mean": -1.3969838619232178e-09, "advantages/snr": 4.867772434216179e-09, "advantages/std": 0.28698626160621643, "advantages/var": 0.0823611143507117, "completions/clipped_ratio": 0.234375, "epoch": 4.9060931899641576, "grad_norm": 53.02585237002029, "learning_rate": 3.1179790253019756e-09, "loss": -5.6879, "num_tokens": 392822873.0, "residual_var": 0.02573787420988083, "reward": 0.681640625, "reward_std": 0.22182540595531464, "rewards/drgrpo_math_reward/mean": 0.681640625, "rewards/drgrpo_math_reward/std": 0.46629536151885986, "rho2": 0.6874997615814209, "step": 854 }, { "advantages/mean": 1.7462298274040222e-09, "advantages/snr": 5.794002810080234e-09, "advantages/std": 0.3013857305049896, "advantages/var": 0.09083335855202623, "completions/clipped_ratio": 0.1953125, "epoch": 4.911827956989248, "grad_norm": 59.24860046722861, "learning_rate": 2.841099739386066e-09, "loss": -3.1254, "num_tokens": 393346568.0, "residual_var": 0.03973962739109993, "reward": 0.75390625, "reward_std": 0.2160109579563141, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "rho2": 0.5624996423721313, "step": 855 }, { "advantages/mean": 9.313225746154785e-10, "advantages/snr": 3.080728290726034e-09, "advantages/std": 0.3023059666156769, "advantages/var": 0.09138889745143874, "completions/clipped_ratio": 0.3125, "epoch": 4.917562724014337, "grad_norm": 58.39371049228968, "learning_rate": 2.577074727165951e-09, "loss": -4.9656, "num_tokens": 393886541.0, "residual_var": 0.04283857345581055, "reward": 0.69921875, "reward_std": 0.20636117458343506, "rewards/drgrpo_math_reward/mean": 0.69921875, "rewards/drgrpo_math_reward/std": 0.45904624462127686, "rho2": 0.5312497615814209, "step": 856 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.633271182071961e-09, "advantages/std": 0.2763853967189789, "advantages/var": 0.07638888751950734, "completions/clipped_ratio": 0.0625, "epoch": 4.923297491039427, "grad_norm": 54.07081938335018, "learning_rate": 2.3259073921612083e-09, "loss": -5.7226, "num_tokens": 394392887.0, "residual_var": 0.03580731526017189, "reward": 0.796875, "reward_std": 0.1889735758304596, "rewards/drgrpo_math_reward/mean": 0.796875, "rewards/drgrpo_math_reward/std": 0.4027182459831238, "rho2": 0.5312498211860657, "step": 857 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.180666795677408e-09, "advantages/std": 0.2958039939403534, "advantages/var": 0.08750000283106463, "completions/clipped_ratio": 0.171875, "epoch": 4.929032258064516, "grad_norm": 53.978649504548926, "learning_rate": 2.0876009721443322e-09, "loss": -6.4403, "num_tokens": 394926262.0, "residual_var": 0.04921878129243851, "reward": 0.75390625, "reward_std": 0.19567903876304626, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "rho2": 0.4374997913837433, "step": 858 }, { "advantages/mean": -1.1641532182693481e-09, "advantages/snr": 3.398192866840047e-09, "advantages/std": 0.34258008003234863, "advantages/var": 0.1173611112349704, "completions/clipped_ratio": 0.1328125, "epoch": 4.934767025089606, "grad_norm": 67.43233538577508, "learning_rate": 1.8621585390989902e-09, "loss": -9.1274, "num_tokens": 395442867.0, "residual_var": 0.04767800495028496, "reward": 0.654296875, "reward_std": 0.2648409605026245, "rewards/drgrpo_math_reward/mean": 0.654296875, "rewards/drgrpo_math_reward/std": 0.4760620892047882, "rho2": 0.5937496423721313, "step": 859 }, { "advantages/mean": -1.1641532182693481e-10, "advantages/snr": 4.091118936097299e-10, "advantages/std": 0.28455618023872375, "advantages/var": 0.08097221971205304, "completions/clipped_ratio": 0.28125, "epoch": 4.940501792114695, "grad_norm": 55.02555512261928, "learning_rate": 1.649582999180721e-09, "loss": -4.3966, "num_tokens": 395981069.0, "residual_var": 0.043016523122787476, "reward": 0.701171875, "reward_std": 0.1826976239681244, "rewards/drgrpo_math_reward/mean": 0.701171875, "rewards/drgrpo_math_reward/std": 0.45819199085235596, "rho2": 0.4687497615814209, "step": 860 }, { "advantages/mean": 1.1641532182693481e-09, "advantages/snr": 4.039482917686628e-09, "advantages/std": 0.28819361329078674, "advantages/var": 0.08305555874159953, "completions/clipped_ratio": 0.1875, "epoch": 4.946236559139785, "grad_norm": 49.00997309560758, "learning_rate": 1.4498770926790749e-09, "loss": -7.1103, "num_tokens": 396490102.0, "residual_var": 0.04152781888842583, "reward": 0.75390625, "reward_std": 0.18848571181297302, "rewards/drgrpo_math_reward/mean": 0.75390625, "rewards/drgrpo_math_reward/std": 0.4311550557613373, "rho2": 0.4999995827674866, "step": 861 }, { "advantages/mean": 1.862645149230957e-09, "advantages/snr": 7.44231520823163e-09, "advantages/std": 0.25027763843536377, "advantages/var": 0.06263889630078268, "completions/clipped_ratio": -0.015625, "epoch": 4.951971326164875, "grad_norm": 45.128223807103865, "learning_rate": 1.2630433939825324e-09, "loss": -5.4277, "num_tokens": 396985464.0, "residual_var": 0.04110679402947426, "reward": 0.865234375, "reward_std": 0.1433878242969513, "rewards/drgrpo_math_reward/mean": 0.865234375, "rewards/drgrpo_math_reward/std": 0.3418070077896118, "rho2": 0.3437498211860657, "step": 862 }, { "advantages/mean": 1.6298145055770874e-09, "advantages/snr": 5.81808632564541e-09, "advantages/std": 0.28012895584106445, "advantages/var": 0.07847223190060504, "completions/clipped_ratio": 0.2890625, "epoch": 4.957706093189964, "grad_norm": 53.99281089551898, "learning_rate": 1.0890843115451964e-09, "loss": -2.6726, "num_tokens": 397512934.0, "residual_var": 0.03678388148546219, "reward": 0.701171875, "reward_std": 0.19177931547164917, "rewards/drgrpo_math_reward/mean": 0.701171875, "rewards/drgrpo_math_reward/std": 0.45819199085235596, "rho2": 0.5312497615814209, "step": 863 }, { "advantages/mean": -1.0477378964424133e-09, "advantages/snr": 3.2164158754149877e-09, "advantages/std": 0.325747013092041, "advantages/var": 0.10611111653838634, "completions/clipped_ratio": 0.2734375, "epoch": 4.963440860215054, "grad_norm": 66.01425066128654, "learning_rate": 9.28002087855928e-10, "loss": -3.8377, "num_tokens": 398057063.0, "residual_var": 0.036475736647844315, "reward": 0.66796875, "reward_std": 0.25084084272384644, "rewards/drgrpo_math_reward/mean": 0.66796875, "rewards/drgrpo_math_reward/std": 0.47140273451805115, "rho2": 0.6562497019767761, "step": 864 }, { "advantages/mean": 3.4924596548080444e-10, "advantages/snr": 1.1067206576667867e-09, "advantages/std": 0.31556829810142517, "advantages/var": 0.09958335076662994, "completions/clipped_ratio": -0.0078125, "epoch": 4.969175627240143, "grad_norm": 61.03115032721547, "learning_rate": 7.797987994092592e-10, "loss": -7.9463, "num_tokens": 398567038.0, "residual_var": 0.03423181548714638, "reward": 0.740234375, "reward_std": 0.2372840940952301, "rewards/drgrpo_math_reward/mean": 0.740234375, "rewards/drgrpo_math_reward/std": 0.4389347732067108, "rho2": 0.6562496423721313, "step": 865 }, { "advantages/mean": -3.4924596548080444e-10, "advantages/snr": 1.079941190156914e-09, "advantages/std": 0.32339349389076233, "advantages/var": 0.10458335189087453, "completions/clipped_ratio": 0.203125, "epoch": 4.974910394265233, "grad_norm": 63.584764838775996, "learning_rate": 6.44476356678636e-10, "loss": -3.8833, "num_tokens": 399091416.0, "residual_var": 0.04575524106621742, "reward": 0.787109375, "reward_std": 0.23581431806087494, "rewards/drgrpo_math_reward/mean": 0.787109375, "rewards/drgrpo_math_reward/std": 0.409751296043396, "rho2": 0.5624997615814209, "step": 866 }, { "advantages/mean": -6.984919309616089e-10, "advantages/snr": 2.699628737729565e-09, "advantages/std": 0.2587362825870514, "advantages/var": 0.06694446392696651, "completions/clipped_ratio": 0.28125, "epoch": 4.980645161290322, "grad_norm": 44.923947691221066, "learning_rate": 5.220365040918828e-10, "loss": -1.0632, "num_tokens": 399625311.0, "residual_var": 0.025104191154241562, "reward": 0.73828125, "reward_std": 0.19272387027740479, "rewards/drgrpo_math_reward/mean": 0.73828125, "rewards/drgrpo_math_reward/std": 0.44000017642974854, "rho2": 0.6249997615814209, "step": 867 }, { "advantages/mean": -1.280568540096283e-09, "advantages/snr": 4.954461709503794e-09, "advantages/std": 0.25846773386001587, "advantages/var": 0.066805569446732, "completions/clipped_ratio": 0.21875, "epoch": 4.986379928315412, "grad_norm": 47.93847064478741, "learning_rate": 4.124808200086649e-10, "loss": -6.1608, "num_tokens": 400143605.0, "residual_var": 0.03757815808057785, "reward": 0.701171875, "reward_std": 0.15784214437007904, "rewards/drgrpo_math_reward/mean": 0.701171875, "rewards/drgrpo_math_reward/std": 0.45819199085235596, "rho2": 0.4374997019767761, "step": 868 }, { "advantages/mean": -6.402842700481415e-10, "advantages/snr": 2.4010655290752638e-09, "advantages/std": 0.2666667103767395, "advantages/var": 0.07111113442315187, "completions/clipped_ratio": 0.125, "epoch": 4.992114695340502, "grad_norm": 49.30429179364696, "learning_rate": 3.158107167000601e-10, "loss": -2.6353, "num_tokens": 400665825.0, "residual_var": 0.03777780011296272, "reward": 0.7890625, "reward_std": 0.17815792560577393, "rewards/drgrpo_math_reward/mean": 0.7890625, "rewards/drgrpo_math_reward/std": 0.4083731174468994, "rho2": 0.46874988079071045, "step": 869 }, { "advantages/mean": 4.656612873077393e-10, "advantages/snr": 1.828428178517005e-09, "advantages/std": 0.2546784579753876, "advantages/var": 0.06486111695672125, "completions/clipped_ratio": 0.2578125, "epoch": 4.997849462365592, "grad_norm": 47.969831523711086, "learning_rate": 2.3202744033057332e-10, "loss": -2.9819, "num_tokens": 401198324.0, "residual_var": 0.03648440167307854, "reward": 0.669921875, "reward_std": 0.15756280720233917, "rewards/drgrpo_math_reward/mean": 0.669921875, "rewards/drgrpo_math_reward/std": 0.47070086002349854, "rho2": 0.4374997615814209, "step": 870 }, { "epoch": 4.997849462365592, "step": 870, "total_flos": 0.0, "train_loss": -2.5739649318329905, "train_runtime": 79546.8581, "train_samples_per_second": 0.351, "train_steps_per_second": 0.011 } ], "logging_steps": 1, "max_steps": 875, "num_input_tokens_seen": 401198324, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }