{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.2816, "eval_steps": 10, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "accuracy_reward": 0.6653646230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.22294485569000244, "action_level_variance/metric": 6196.60693359375, "action_level_variance_full_gradient/metric": 19524.71484375, "adam_stats/lr_effective_max": 4.743407316709636e-06, "adam_stats/lr_effective_mean": -4.354325233713041e-11, "adam_stats/lr_effective_min": -4.743390036310302e-06, "adam_stats/m_t_max": 0.0036865242291241884, "adam_stats/m_t_mean": -2.0897693298049802e-11, "adam_stats/m_t_min": -0.002563477260991931, "adam_stats/v_t_max": 1.3590280332209659e-06, "adam_stats/v_t_mean": 6.798479558174989e-14, "adam_stats/v_t_min": 0.0, "advantages": -0.0008502702112309635, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 5.749965667724609, "all_logprobs": -0.1603996902704239, "all_logprobs/max": 0.0, "all_logprobs/median": -2.5987625122070312e-05, "all_logprobs/min": -14.5625, "all_logprobs/p1": -2.703125, "all_logprobs/p10": -0.4296875, "all_logprobs/p25": -0.01611328125, "all_logprobs/p5": -1.0234375, "all_logprobs/p75": -4.76837158203125e-07, "all_logprobs/var": 0.28558453917503357, "clip_ratio": 0.0, "completion_length": 589.1328125, "completion_length/correct": 562.884521484375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 500.0, "completion_length/correct/min": 11.0, "completion_length/correct/p25": 369.5, "completion_length/correct/p75": 728.5, "completion_length/correct/var": 65035.29296875, "completion_length/incorrect": 641.3229370117188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 678.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 312.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 142496.234375, "completion_length/max": 1024.0, "completion_length/median": 526.0, "completion_length/min": 2.0, "completion_length/p25": 358.5, "completion_length/p75": 906.25, "completion_length/var": 92176.1640625, "epoch": 0.0128, "feature_vector_variance/max_squared_error": 104059.1484375, "feature_vector_variance/metric": 25271.5390625, "generated_tokens/total": 452454.0, "grad_norm": 0.4092431366443634, "grouped_std_rewards": 0.3503825068473816, "learning_rate": 1.5e-06, "loss": 0.0009, "mean_logprobs": -0.197265625, "mean_logprobs/var": 0.07080078125, "num_completions/total": 768, "per_sentence_gradient_norm": 15.214804649353027, "per_sentence_gradient_norm/max": 1426.8887939453125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 92.5633773803711, "per_sentence_gradient_norm/p99": 287.51751708984375, "per_sentence_gradient_norm/var": 5972.89453125, "per_token_feature_norm": 163.50311279296875, "per_token_feature_norm/max": 336.0, "per_token_feature_norm/median": 154.0, "per_token_feature_norm/min": 59.5, "per_token_feature_norm/p25": 124.5, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 2426.85400390625, "per_token_full_gradient_variance/max_squared_error": 603.6284790039062, "per_token_full_gradient_variance/variance": 0.1514168083667755, "per_token_gradient_norm": 13.61304759979248, "per_token_gradient_norm/max": 6187.60009765625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 20058.421875, "per_token_policy_error_norm": 0.08151844143867493, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06838791817426682, "policy_entropy": 0.17772728204727173, "policy_entropy/max": 3.765625, "policy_entropy/median": 0.0003185272216796875, "policy_entropy/min": 1.7780915628762273e-17, "policy_entropy/p25": 8.225440979003906e-06, "policy_entropy/p75": 0.08935546875, "policy_entropy/var": 0.1572306752204895, "policy_error_vector_variance/max_squared_error": 2.02032470703125, "policy_error_vector_variance/metric": 0.08098457753658295, "policy_loss": 0.0008502751588821411, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.79339599609375, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 5.749965667724609, "policy_sharpness": 7.136724472045898, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.986328125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.343321800231934, "reward": 0.6653646230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.22294485569000244, "rewards/accuracy_reward": 0.6653646230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.22294485569000244, "sentence_full_gradient_variance/max_squared_error": 5348322.5, "sentence_full_gradient_variance/metric": 22074.630859375, "sentence_full_gradient_variance/p75": 413.1585388183594, "sentence_full_gradient_variance/p90": 550.2266845703125, "sentence_full_gradient_variance/p95": 52317.12109375, "sentence_full_gradient_variance/p99": 428061.21875, "state_level_variance/metric": 548.80224609375, "state_level_variance_full_gradient/metric": 2549.9169921875, "step": 1 }, { "accuracy_reward": 0.6419271230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.23015639185905457, "action_level_variance/metric": 7497.59375, "action_level_variance_full_gradient/metric": 24835.443359375, "adam_stats/lr_effective_max": 1.2765900464728475e-05, "adam_stats/lr_effective_mean": -2.457679337819485e-10, "adam_stats/lr_effective_min": -1.2765957762894686e-05, "adam_stats/m_t_max": 0.008452415466308594, "adam_stats/m_t_mean": -8.006255706148602e-14, "adam_stats/m_t_min": -0.008832968771457672, "adam_stats/v_t_max": 4.9150762606586795e-06, "adam_stats/v_t_mean": 5.726400256755859e-13, "adam_stats/v_t_min": 0.0, "advantages": -0.12729516625404358, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 6.530160903930664, "all_logprobs": -0.14862093329429626, "all_logprobs/max": 0.0, "all_logprobs/median": -2.2530555725097656e-05, "all_logprobs/min": -12.5, "all_logprobs/p1": -2.578125, "all_logprobs/p10": -0.38671875, "all_logprobs/p25": -0.01226806640625, "all_logprobs/p5": -0.9609375, "all_logprobs/p75": -4.76837158203125e-07, "all_logprobs/var": 0.25659748911857605, "clip_ratio": 0.0, "completion_length": 631.92578125, "completion_length/correct": 558.2129516601562, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 496.0, "completion_length/correct/min": 13.0, "completion_length/correct/p25": 338.0, "completion_length/correct/p75": 758.0, "completion_length/correct/var": 73397.6796875, "completion_length/incorrect": 764.07275390625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 494.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 120071.984375, "completion_length/max": 1024.0, "completion_length/median": 588.0, "completion_length/min": 2.0, "completion_length/p25": 351.5, "completion_length/p75": 1024.0, "completion_length/var": 99729.34375, "epoch": 0.0256, "feature_vector_variance/max_squared_error": 98630.421875, "feature_vector_variance/metric": 24641.796875, "generated_tokens/total": 937773.0, "grad_norm": 1.4740076065063477, "grouped_std_rewards": 0.35126861929893494, "learning_rate": 3e-06, "loss": 0.1273, "mean_logprobs": -0.173828125, "mean_logprobs/var": 0.052001953125, "num_completions/total": 1536, "per_sentence_gradient_norm": 15.62912368774414, "per_sentence_gradient_norm/max": 1639.5552978515625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 83.30549621582031, "per_sentence_gradient_norm/p99": 317.38726806640625, "per_sentence_gradient_norm/var": 7262.7822265625, "per_token_feature_norm": 161.0347900390625, "per_token_feature_norm/max": 336.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 60.25, "per_token_feature_norm/p25": 123.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 2323.55029296875, "per_token_full_gradient_variance/max_squared_error": 9170681.0, "per_token_full_gradient_variance/variance": 19.063501358032227, "per_token_gradient_norm": 12.972761154174805, "per_token_gradient_norm/max": 6875.111328125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 22768.701171875, "per_token_policy_error_norm": 0.07640184462070465, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06426867097616196, "policy_entropy": 0.1655479520559311, "policy_entropy/max": 3.765625, "policy_entropy/median": 0.0002803802490234375, "policy_entropy/min": 2.2724877535296173e-16, "policy_entropy/p25": 8.940696716308594e-06, "policy_entropy/p75": 0.0712890625, "policy_entropy/var": 0.14113962650299072, "policy_error_vector_variance/max_squared_error": 2.0196685791015625, "policy_error_vector_variance/metric": 0.07598566263914108, "policy_loss": 0.1272951364517212, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659052848815918, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 6.530160903930664, "policy_sharpness": 7.2322916984558105, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.255737066268921, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.847545623779297, "reward": 0.6419271230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.23015639185905457, "rewards/accuracy_reward": 0.6419271230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.23015639185905457, "sentence_full_gradient_variance/max_squared_error": 4556851.0, "sentence_full_gradient_variance/metric": 28154.9453125, "sentence_full_gradient_variance/p75": 564.3089599609375, "sentence_full_gradient_variance/p90": 566.53564453125, "sentence_full_gradient_variance/p95": 46456.5859375, "sentence_full_gradient_variance/p99": 522213.71875, "state_level_variance/metric": 700.2237548828125, "state_level_variance_full_gradient/metric": 3319.50341796875, "step": 2 }, { "accuracy_reward": 0.5677083730697632, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2457355558872223, "action_level_variance/metric": 12992.404296875, "action_level_variance_full_gradient/metric": 19049.337890625, "adam_stats/lr_effective_max": 2.2351310690282844e-05, "adam_stats/lr_effective_mean": -3.572668272333601e-10, "adam_stats/lr_effective_min": -2.2355003238772042e-05, "adam_stats/m_t_max": 0.012392330914735794, "adam_stats/m_t_mean": 6.157730575440112e-12, "adam_stats/m_t_min": -0.01385787595063448, "adam_stats/v_t_max": 8.400802471442148e-06, "adam_stats/v_t_mean": 9.466941119917749e-13, "adam_stats/v_t_min": 0.0, "advantages": 0.0017977431416511536, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 6.191328048706055, "all_logprobs": -0.16575887799263, "all_logprobs/max": 0.0, "all_logprobs/median": -3.457069396972656e-05, "all_logprobs/min": -13.5625, "all_logprobs/p1": -2.75, "all_logprobs/p10": -0.4609375, "all_logprobs/p25": -0.0206298828125, "all_logprobs/p5": -1.0546875, "all_logprobs/p75": -5.960464477539062e-07, "all_logprobs/var": 0.2917652428150177, "clip_ratio": 0.0, "completion_length": 595.6588745117188, "completion_length/correct": 530.8485717773438, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 475.0, "completion_length/correct/min": 54.0, "completion_length/correct/p25": 308.5, "completion_length/correct/p75": 698.0, "completion_length/correct/var": 70159.7734375, "completion_length/incorrect": 680.7710571289062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 809.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 330.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 132190.828125, "completion_length/max": 1024.0, "completion_length/median": 564.0, "completion_length/min": 2.0, "completion_length/p25": 315.0, "completion_length/p75": 969.75, "completion_length/var": 102361.21875, "epoch": 0.0384, "feature_vector_variance/max_squared_error": 108285.1640625, "feature_vector_variance/metric": 25229.453125, "generated_tokens/total": 1395239.0, "grad_norm": 0.8769099116325378, "grouped_std_rewards": 0.3746882379055023, "learning_rate": 4.5e-06, "loss": -0.0018, "mean_logprobs": -0.2080078125, "mean_logprobs/var": 0.060546875, "num_completions/total": 2304, "per_sentence_gradient_norm": 20.555194854736328, "per_sentence_gradient_norm/max": 1812.75390625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 24.876224517822266, "per_sentence_gradient_norm/p95": 90.32449340820312, "per_sentence_gradient_norm/p99": 508.539794921875, "per_sentence_gradient_norm/var": 12586.27734375, "per_token_feature_norm": 164.00477600097656, "per_token_feature_norm/max": 334.0, "per_token_feature_norm/median": 154.0, "per_token_feature_norm/min": 61.75, "per_token_feature_norm/p25": 124.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 2465.822265625, "per_token_full_gradient_variance/max_squared_error": 791.2626342773438, "per_token_full_gradient_variance/variance": 0.22652985155582428, "per_token_gradient_norm": 17.0058536529541, "per_token_gradient_norm/max": 7766.43212890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 32373.51171875, "per_token_policy_error_norm": 0.08448130637407303, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.07061532884836197, "policy_entropy": 0.18417267501354218, "policy_entropy/max": 3.78125, "policy_entropy/median": 0.000408172607421875, "policy_entropy/min": 8.109832250191573e-17, "policy_entropy/p25": 9.5367431640625e-06, "policy_entropy/p75": 0.1103515625, "policy_entropy/var": 0.1607799082994461, "policy_error_vector_variance/max_squared_error": 2.0216410160064697, "policy_error_vector_variance/metric": 0.08396802097558975, "policy_loss": -0.0017977531533688307, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 6.191328525543213, "policy_sharpness": 7.0374650955200195, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.8369140625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.556217193603516, "reward": 0.5677083730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2457355558872223, "rewards/accuracy_reward": 0.5677083730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2457355558872223, "sentence_full_gradient_variance/max_squared_error": 3825870.5, "sentence_full_gradient_variance/metric": 21542.107421875, "sentence_full_gradient_variance/p75": 695.1344604492188, "sentence_full_gradient_variance/p90": 788.0768432617188, "sentence_full_gradient_variance/p95": 59467.6640625, "sentence_full_gradient_variance/p99": 449702.375, "state_level_variance/metric": 1214.182373046875, "state_level_variance_full_gradient/metric": 2492.771240234375, "step": 3 }, { "accuracy_reward": 0.6236979365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.23500484228134155, "action_level_variance/metric": 8270.7958984375, "action_level_variance_full_gradient/metric": 21296.2734375, "adam_stats/lr_effective_max": 3.281698809587397e-05, "adam_stats/lr_effective_mean": -5.810988912280379e-10, "adam_stats/lr_effective_min": -3.2824831578182057e-05, "adam_stats/m_t_max": 0.014595480635762215, "adam_stats/m_t_mean": -2.0217267096556135e-11, "adam_stats/m_t_min": -0.01231339666992426, "adam_stats/v_t_max": 8.39491985971108e-06, "adam_stats/v_t_mean": 1.159802831031398e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.013158611953258514, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 7.65601110458374, "all_logprobs": -0.15565545856952667, "all_logprobs/max": 0.0, "all_logprobs/median": -2.0503997802734375e-05, "all_logprobs/min": -13.25, "all_logprobs/p1": -2.703125, "all_logprobs/p10": -0.408203125, "all_logprobs/p25": -0.01312255859375, "all_logprobs/p5": -0.984375, "all_logprobs/p75": -4.76837158203125e-07, "all_logprobs/var": 0.276152104139328, "clip_ratio": 0.0, "completion_length": 635.2044677734375, "completion_length/correct": 558.9457397460938, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 494.0, "completion_length/correct/min": 97.0, "completion_length/correct/p25": 351.0, "completion_length/correct/p75": 737.0, "completion_length/correct/var": 67233.421875, "completion_length/incorrect": 761.5986328125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 492.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 122931.234375, "completion_length/max": 1024.0, "completion_length/median": 600.0, "completion_length/min": 2.0, "completion_length/p25": 369.0, "completion_length/p75": 1024.0, "completion_length/var": 97710.90625, "epoch": 0.0512, "feature_vector_variance/max_squared_error": 94651.03125, "feature_vector_variance/metric": 24887.92578125, "generated_tokens/total": 1883076.0, "grad_norm": 0.7486812472343445, "grouped_std_rewards": 0.3407518267631531, "learning_rate": 6e-06, "loss": -0.0132, "mean_logprobs": -0.189453125, "mean_logprobs/var": 0.07470703125, "num_completions/total": 3072, "per_sentence_gradient_norm": 17.53493881225586, "per_sentence_gradient_norm/max": 1362.913330078125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 92.78929138183594, "per_sentence_gradient_norm/p99": 389.5683898925781, "per_sentence_gradient_norm/var": 7973.70458984375, "per_token_feature_norm": 161.4739990234375, "per_token_feature_norm/max": 338.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 59.5, "per_token_feature_norm/p25": 123.5, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 2352.6279296875, "per_token_full_gradient_variance/max_squared_error": 1194.3201904296875, "per_token_full_gradient_variance/variance": 0.22050072252750397, "per_token_gradient_norm": 15.548802375793457, "per_token_gradient_norm/max": 7735.5048828125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 29809.916015625, "per_token_policy_error_norm": 0.07906274497509003, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0662817731499672, "policy_entropy": 0.17295314371585846, "policy_entropy/max": 3.765625, "policy_entropy/median": 0.000255584716796875, "policy_entropy/min": 2.1649348980190553e-15, "policy_entropy/p25": 8.046627044677734e-06, "policy_entropy/p75": 0.076171875, "policy_entropy/var": 0.1536097526550293, "policy_error_vector_variance/max_squared_error": 2.019747257232666, "policy_error_vector_variance/metric": 0.07858069986104965, "policy_loss": -0.013158610090613365, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 7.65601110458374, "policy_sharpness": 7.213962078094482, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.165942430496216, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.09175968170166, "reward": 0.6236979365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.23500484228134155, "rewards/accuracy_reward": 0.6236979365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.23500484228134155, "sentence_full_gradient_variance/max_squared_error": 4327947.0, "sentence_full_gradient_variance/metric": 24176.80859375, "sentence_full_gradient_variance/p75": 308.04498291015625, "sentence_full_gradient_variance/p90": 779.8616943359375, "sentence_full_gradient_variance/p95": 24482.453125, "sentence_full_gradient_variance/p99": 617786.75, "state_level_variance/metric": 734.021484375, "state_level_variance_full_gradient/metric": 2880.53759765625, "step": 4 }, { "accuracy_reward": 0.6705729365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2211928516626358, "action_level_variance/metric": 4664.10888671875, "action_level_variance_full_gradient/metric": 14477.701171875, "adam_stats/lr_effective_max": 4.382766564958729e-05, "adam_stats/lr_effective_mean": -7.422260028810967e-10, "adam_stats/lr_effective_min": -4.381819599075243e-05, "adam_stats/m_t_max": 0.022803902626037598, "adam_stats/m_t_mean": -1.2387153802695394e-10, "adam_stats/m_t_min": -0.01902216300368309, "adam_stats/v_t_max": 1.6797825082903728e-05, "adam_stats/v_t_mean": 1.6277185762442192e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.06040404736995697, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 6.315463542938232, "all_logprobs": -0.15320497751235962, "all_logprobs/max": 0.0, "all_logprobs/median": -1.9550323486328125e-05, "all_logprobs/min": -11.4375, "all_logprobs/p1": -2.59375, "all_logprobs/p10": -0.39453125, "all_logprobs/p25": -0.01251220703125, "all_logprobs/p5": -0.984375, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.2661299407482147, "clip_ratio": 0.0, "completion_length": 593.9544677734375, "completion_length/correct": 549.4310913085938, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 482.0, "completion_length/correct/min": 21.0, "completion_length/correct/p25": 347.5, "completion_length/correct/p75": 747.0, "completion_length/correct/var": 70803.671875, "completion_length/incorrect": 684.5850219726562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 825.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 353.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 138939.828125, "completion_length/max": 1024.0, "completion_length/median": 529.0, "completion_length/min": 2.0, "completion_length/p25": 347.75, "completion_length/p75": 940.5, "completion_length/var": 97138.125, "epoch": 0.064, "feature_vector_variance/max_squared_error": 100448.875, "feature_vector_variance/metric": 24864.0234375, "generated_tokens/total": 2339233.0, "grad_norm": 0.9894917011260986, "grouped_std_rewards": 0.3248804807662964, "learning_rate": 7.5e-06, "loss": -0.0604, "mean_logprobs": -0.1865234375, "mean_logprobs/var": 0.05859375, "num_completions/total": 3840, "per_sentence_gradient_norm": 14.62324333190918, "per_sentence_gradient_norm/max": 802.4064331054688, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 84.46424865722656, "per_sentence_gradient_norm/p99": 329.5955810546875, "per_sentence_gradient_norm/var": 4456.0712890625, "per_token_feature_norm": 161.1188507080078, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 46.75, "per_token_feature_norm/p25": 123.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 2294.62353515625, "per_token_full_gradient_variance/max_squared_error": 983.540771484375, "per_token_full_gradient_variance/variance": 0.17852063477039337, "per_token_gradient_norm": 13.79763412475586, "per_token_gradient_norm/max": 7309.63818359375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 24727.513671875, "per_token_policy_error_norm": 0.07832171767950058, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06577557325363159, "policy_entropy": 0.1702422946691513, "policy_entropy/max": 3.828125, "policy_entropy/median": 0.000244140625, "policy_entropy/min": 6.772360450213455e-15, "policy_entropy/p25": 7.3015689849853516e-06, "policy_entropy/p75": 0.07275390625, "policy_entropy/var": 0.15160319209098816, "policy_error_vector_variance/max_squared_error": 2.0251073837280273, "policy_error_vector_variance/metric": 0.07785650342702866, "policy_loss": -0.06040404364466667, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 6.315463542938232, "policy_sharpness": 7.244889259338379, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.241455078125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.95591926574707, "reward": 0.6705729365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2211928516626358, "rewards/accuracy_reward": 0.6705729365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2211928516626358, "sentence_full_gradient_variance/max_squared_error": 2835807.0, "sentence_full_gradient_variance/metric": 16343.9296875, "sentence_full_gradient_variance/p75": 342.371826171875, "sentence_full_gradient_variance/p90": 831.1527099609375, "sentence_full_gradient_variance/p95": 39521.15625, "sentence_full_gradient_variance/p99": 313519.375, "state_level_variance/metric": 373.06036376953125, "state_level_variance_full_gradient/metric": 1866.2279052734375, "step": 5 }, { "accuracy_reward": 0.6549479365348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.22628577053546906, "action_level_variance/metric": 14343.6904296875, "action_level_variance_full_gradient/metric": 31259.74609375, "adam_stats/lr_effective_max": 5.523108848137781e-05, "adam_stats/lr_effective_mean": -7.163381554597947e-10, "adam_stats/lr_effective_min": -5.5146796512417495e-05, "adam_stats/m_t_max": 0.028921952471137047, "adam_stats/m_t_mean": -1.3860311509628076e-10, "adam_stats/m_t_min": -0.02117268182337284, "adam_stats/v_t_max": 2.3834310923120938e-05, "adam_stats/v_t_mean": 2.015036791938596e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.06477002799510956, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 8.012545585632324, "all_logprobs": -0.14822980761528015, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1086463928222656e-05, "all_logprobs/min": -13.625, "all_logprobs/p1": -2.578125, "all_logprobs/p10": -0.388671875, "all_logprobs/p25": -0.01165771484375, "all_logprobs/p5": -0.94921875, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.2528113126754761, "clip_ratio": 0.0, "completion_length": 642.7044677734375, "completion_length/correct": 562.0397338867188, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 513.0, "completion_length/correct/min": 48.0, "completion_length/correct/p25": 364.0, "completion_length/correct/p75": 729.5, "completion_length/correct/var": 57716.125, "completion_length/incorrect": 795.8151245117188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 4.0, "completion_length/incorrect/p25": 608.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 104024.2109375, "completion_length/max": 1024.0, "completion_length/median": 616.0, "completion_length/min": 4.0, "completion_length/p25": 380.0, "completion_length/p75": 1012.0, "completion_length/var": 85946.7578125, "epoch": 0.0768, "feature_vector_variance/max_squared_error": 101432.4921875, "feature_vector_variance/metric": 24827.298828125, "generated_tokens/total": 2832830.0, "grad_norm": 0.9244729280471802, "grouped_std_rewards": 0.3382497727870941, "learning_rate": 9e-06, "loss": 0.0648, "mean_logprobs": -0.1689453125, "mean_logprobs/var": 0.0230712890625, "num_completions/total": 4608, "per_sentence_gradient_norm": 19.328536987304688, "per_sentence_gradient_norm/max": 2215.25927734375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 93.5407485961914, "per_sentence_gradient_norm/p99": 389.1475524902344, "per_sentence_gradient_norm/var": 13988.3115234375, "per_token_feature_norm": 160.2632293701172, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 61.0, "per_token_feature_norm/p25": 123.0, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 2237.521484375, "per_token_full_gradient_variance/max_squared_error": 1216644.875, "per_token_full_gradient_variance/variance": 2.6382675170898438, "per_token_gradient_norm": 13.2578763961792, "per_token_gradient_norm/max": 7695.609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 23347.09375, "per_token_policy_error_norm": 0.07676897943019867, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0645255371928215, "policy_entropy": 0.16423478722572327, "policy_entropy/max": 3.765625, "policy_entropy/median": 0.00014495849609375, "policy_entropy/min": 1.1709383462843448e-16, "policy_entropy/p25": 5.155801773071289e-06, "policy_entropy/p75": 0.06884765625, "policy_entropy/var": 0.13878443837165833, "policy_error_vector_variance/max_squared_error": 2.01908016204834, "policy_error_vector_variance/metric": 0.07641255110502243, "policy_loss": 0.06477002054452896, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 8.012545585632324, "policy_sharpness": 7.337482929229736, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.4990234375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.672513008117676, "reward": 0.6549479365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.22628577053546906, "rewards/accuracy_reward": 0.6549479365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.22628577053546906, "sentence_full_gradient_variance/max_squared_error": 5480490.0, "sentence_full_gradient_variance/metric": 35564.1875, "sentence_full_gradient_variance/p75": 301.4278259277344, "sentence_full_gradient_variance/p90": 498.97607421875, "sentence_full_gradient_variance/p95": 41043.1640625, "sentence_full_gradient_variance/p99": 449544.5625, "state_level_variance/metric": 1434.309814453125, "state_level_variance_full_gradient/metric": 4304.4365234375, "step": 6 }, { "accuracy_reward": 0.6966146230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21161821484565735, "action_level_variance/metric": 3800.544677734375, "action_level_variance_full_gradient/metric": 6581.7490234375, "adam_stats/lr_effective_max": 6.533075065817684e-05, "adam_stats/lr_effective_mean": -6.87559564838125e-10, "adam_stats/lr_effective_min": -6.579050386790186e-05, "adam_stats/m_t_max": 0.025951936841011047, "adam_stats/m_t_mean": -1.165099683397841e-10, "adam_stats/m_t_min": -0.018847893923521042, "adam_stats/v_t_max": 2.3811082428437658e-05, "adam_stats/v_t_mean": 2.0167086316885685e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.09855322539806366, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 4.764775276184082, "all_logprobs": -0.13252811133861542, "all_logprobs/max": 0.0, "all_logprobs/median": -6.4373016357421875e-06, "all_logprobs/min": -11.75, "all_logprobs/p1": -2.359375, "all_logprobs/p10": -0.330078125, "all_logprobs/p25": -0.00762939453125, "all_logprobs/p5": -0.83984375, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.21197357773780823, "clip_ratio": 0.0, "completion_length": 648.9427490234375, "completion_length/correct": 550.8673095703125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 489.0, "completion_length/correct/min": 144.0, "completion_length/correct/p25": 349.0, "completion_length/correct/p75": 727.0, "completion_length/correct/var": 63081.8046875, "completion_length/incorrect": 874.1373291015625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 3.0, "completion_length/incorrect/p25": 748.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 55119.70703125, "completion_length/max": 1024.0, "completion_length/median": 616.0, "completion_length/min": 3.0, "completion_length/p25": 403.75, "completion_length/p75": 1005.25, "completion_length/var": 82706.0625, "epoch": 0.0896, "feature_vector_variance/max_squared_error": 104028.1015625, "feature_vector_variance/metric": 24658.009765625, "generated_tokens/total": 3331218.0, "grad_norm": 0.14526310563087463, "grouped_std_rewards": 0.25260740518569946, "learning_rate": 1.05e-05, "loss": -0.0986, "mean_logprobs": -0.1396484375, "mean_logprobs/var": 0.0218505859375, "num_completions/total": 5376, "per_sentence_gradient_norm": 9.354843139648438, "per_sentence_gradient_norm/max": 1193.882080078125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 46.24067306518555, "per_sentence_gradient_norm/p99": 155.9207763671875, "per_sentence_gradient_norm/var": 3717.872802734375, "per_token_feature_norm": 158.9490509033203, "per_token_feature_norm/max": 334.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 62.75, "per_token_feature_norm/p25": 123.5, "per_token_feature_norm/p75": 188.0, "per_token_feature_norm/var": 2054.96728515625, "per_token_full_gradient_variance/max_squared_error": 1158.88818359375, "per_token_full_gradient_variance/variance": 0.15814895927906036, "per_token_gradient_norm": 9.846894264221191, "per_token_gradient_norm/max": 7901.8935546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 21717.39453125, "per_token_policy_error_norm": 0.07044288516044617, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05935288593173027, "policy_entropy": 0.1468379646539688, "policy_entropy/max": 3.78125, "policy_entropy/median": 8.869171142578125e-05, "policy_entropy/min": 1.196959198423997e-16, "policy_entropy/p25": 3.680586814880371e-06, "policy_entropy/p75": 0.048583984375, "policy_entropy/var": 0.11373115330934525, "policy_error_vector_variance/max_squared_error": 2.017518997192383, "policy_error_vector_variance/metric": 0.07031398266553879, "policy_loss": -0.09855322539806366, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.79339599609375, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 4.76477575302124, "policy_sharpness": 7.520288944244385, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.998046636581421, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.901018142700195, "reward": 0.6966146230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21161821484565735, "rewards/accuracy_reward": 0.6966146230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21161821484565735, "sentence_full_gradient_variance/max_squared_error": 1568204.375, "sentence_full_gradient_variance/metric": 7375.01513671875, "sentence_full_gradient_variance/p75": 296.7035217285156, "sentence_full_gradient_variance/p90": 443.679931640625, "sentence_full_gradient_variance/p95": 8276.4306640625, "sentence_full_gradient_variance/p99": 88104.890625, "state_level_variance/metric": 391.6345520019531, "state_level_variance_full_gradient/metric": 793.266357421875, "step": 7 }, { "accuracy_reward": 0.7447916865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19032487273216248, "action_level_variance/metric": 1245.063720703125, "action_level_variance_full_gradient/metric": 3458.496826171875, "adam_stats/lr_effective_max": 7.406729855574667e-05, "adam_stats/lr_effective_mean": -6.905994665018511e-10, "adam_stats/lr_effective_min": -7.440715125994757e-05, "adam_stats/m_t_max": 0.023355979472398758, "adam_stats/m_t_mean": -1.0543325934531111e-10, "adam_stats/m_t_min": -0.016841797158122063, "adam_stats/v_t_max": 2.3787271857145242e-05, "adam_stats/v_t_mean": 2.015894395857032e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.02373921498656273, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.3041985034942627, "all_logprobs": -0.1333945095539093, "all_logprobs/max": 0.0, "all_logprobs/median": -5.245208740234375e-06, "all_logprobs/min": -12.5, "all_logprobs/p1": -2.34375, "all_logprobs/p10": -0.34765625, "all_logprobs/p25": -0.00982666015625, "all_logprobs/p5": -0.8515625, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.20633172988891602, "clip_ratio": 0.0, "completion_length": 605.6654052734375, "completion_length/correct": 520.4650268554688, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 473.0, "completion_length/correct/min": 161.0, "completion_length/correct/p25": 348.75, "completion_length/correct/p75": 653.75, "completion_length/correct/var": 50880.703125, "completion_length/incorrect": 854.3112182617188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 205.0, "completion_length/incorrect/p25": 670.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 59887.85546875, "completion_length/max": 1024.0, "completion_length/median": 532.0, "completion_length/min": 161.0, "completion_length/p25": 371.75, "completion_length/p75": 854.0, "completion_length/var": 74316.6484375, "epoch": 0.1024, "feature_vector_variance/max_squared_error": 103228.6640625, "feature_vector_variance/metric": 24882.6640625, "generated_tokens/total": 3796369.0, "grad_norm": 0.08890534937381744, "grouped_std_rewards": 0.22309470176696777, "learning_rate": 1.2e-05, "loss": -0.0237, "mean_logprobs": -0.1318359375, "mean_logprobs/var": 0.0029449462890625, "num_completions/total": 6144, "per_sentence_gradient_norm": 6.836091995239258, "per_sentence_gradient_norm/max": 442.8410949707031, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 39.559024810791016, "per_sentence_gradient_norm/p99": 167.49209594726562, "per_sentence_gradient_norm/var": 1199.893798828125, "per_token_feature_norm": 159.6527099609375, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 62.5, "per_token_feature_norm/p25": 124.0, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 2055.686767578125, "per_token_full_gradient_variance/max_squared_error": 541824.125, "per_token_full_gradient_variance/variance": 1.2611173391342163, "per_token_gradient_norm": 8.481917381286621, "per_token_gradient_norm/max": 7901.8935546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 11951.8544921875, "per_token_policy_error_norm": 0.07163325697183609, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.060003675520420074, "policy_entropy": 0.1486014723777771, "policy_entropy/max": 3.765625, "policy_entropy/median": 7.2479248046875e-05, "policy_entropy/min": 3.946495907847236e-17, "policy_entropy/p25": 2.562999725341797e-06, "policy_entropy/p75": 0.060302734375, "policy_entropy/var": 0.11067015677690506, "policy_error_vector_variance/max_squared_error": 2.020155191421509, "policy_error_vector_variance/metric": 0.07153350114822388, "policy_loss": -0.02373921498656273, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.30419921875, "policy_sharpness": 7.483667850494385, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.8707275390625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.981922149658203, "reward": 0.7447916865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19032487273216248, "rewards/accuracy_reward": 0.7447916865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19032487273216248, "sentence_full_gradient_variance/max_squared_error": 675386.125, "sentence_full_gradient_variance/metric": 3804.422607421875, "sentence_full_gradient_variance/p75": 268.3050537109375, "sentence_full_gradient_variance/p90": 784.7901611328125, "sentence_full_gradient_variance/p95": 785.0985717773438, "sentence_full_gradient_variance/p99": 72932.78125, "state_level_variance/metric": 110.047119140625, "state_level_variance_full_gradient/metric": 345.92584228515625, "step": 8 }, { "accuracy_reward": 0.75390625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18577350676059723, "action_level_variance/metric": 1161.982421875, "action_level_variance_full_gradient/metric": 2226.94189453125, "adam_stats/lr_effective_max": 8.334703306900337e-05, "adam_stats/lr_effective_mean": -6.45231867935081e-10, "adam_stats/lr_effective_min": -8.329474076163024e-05, "adam_stats/m_t_max": 0.021039454266428947, "adam_stats/m_t_mean": -9.496069208747571e-11, "adam_stats/m_t_min": -0.014994347468018532, "adam_stats/v_t_max": 2.3763521312503144e-05, "adam_stats/v_t_mean": 2.015049585524231e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.012236535549163818, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.1071815490722656, "all_logprobs": -0.13020022213459015, "all_logprobs/max": 0.0, "all_logprobs/median": -4.291534423828125e-06, "all_logprobs/min": -15.0, "all_logprobs/p1": -2.3125, "all_logprobs/p10": -0.32421875, "all_logprobs/p25": -0.007659912109375, "all_logprobs/p5": -0.83203125, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.20339982211589813, "clip_ratio": 0.0, "completion_length": 559.6198120117188, "completion_length/correct": 462.49395751953125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 411.0, "completion_length/correct/min": 98.0, "completion_length/correct/p25": 312.0, "completion_length/correct/p75": 550.0, "completion_length/correct/var": 44658.19140625, "completion_length/incorrect": 857.1640014648438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 297.0, "completion_length/incorrect/p25": 681.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 48666.17578125, "completion_length/max": 1024.0, "completion_length/median": 478.0, "completion_length/min": 98.0, "completion_length/p25": 341.0, "completion_length/p75": 767.75, "completion_length/var": 74519.2734375, "epoch": 0.1152, "feature_vector_variance/max_squared_error": 97914.046875, "feature_vector_variance/metric": 25344.25, "generated_tokens/total": 4226157.0, "grad_norm": 0.11989860236644745, "grouped_std_rewards": 0.18346789479255676, "learning_rate": 1.3500000000000001e-05, "loss": -0.0122, "mean_logprobs": -0.1298828125, "mean_logprobs/var": 0.0032196044921875, "num_completions/total": 6912, "per_sentence_gradient_norm": 5.571715354919434, "per_sentence_gradient_norm/max": 523.77880859375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 30.25502586364746, "per_sentence_gradient_norm/p99": 136.6603240966797, "per_sentence_gradient_norm/var": 1132.412841796875, "per_token_feature_norm": 160.7666473388672, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 61.75, "per_token_feature_norm/p25": 126.0, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 1980.152587890625, "per_token_full_gradient_variance/max_squared_error": 1383.2880859375, "per_token_full_gradient_variance/variance": 0.08354294300079346, "per_token_gradient_norm": 7.824136257171631, "per_token_gradient_norm/max": 6695.11474609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 12304.8388671875, "per_token_policy_error_norm": 0.06998489052057266, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.059042274951934814, "policy_entropy": 0.14385303854942322, "policy_entropy/max": 3.78125, "policy_entropy/median": 6.0558319091796875e-05, "policy_entropy/min": 4.4853010194856324e-14, "policy_entropy/p25": 2.2351741790771484e-06, "policy_entropy/p75": 0.048828125, "policy_entropy/var": 0.10754459351301193, "policy_error_vector_variance/max_squared_error": 2.0235843658447266, "policy_error_vector_variance/metric": 0.06992102414369583, "policy_loss": -0.012236535549163818, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.1071817874908447, "policy_sharpness": 7.548709869384766, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.011474609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.769142150878906, "reward": 0.75390625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18577350676059723, "rewards/accuracy_reward": 0.75390625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18577350676059723, "sentence_full_gradient_variance/max_squared_error": 549070.4375, "sentence_full_gradient_variance/metric": 2482.029052734375, "sentence_full_gradient_variance/p75": 147.37109375, "sentence_full_gradient_variance/p90": 240.37054443359375, "sentence_full_gradient_variance/p95": 240.37054443359375, "sentence_full_gradient_variance/p99": 50116.0078125, "state_level_variance/metric": 115.40592193603516, "state_level_variance_full_gradient/metric": 255.0870819091797, "step": 9 }, { "accuracy_reward": 0.75390625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18577350676059723, "action_level_variance/metric": 1156.6395263671875, "action_level_variance_full_gradient/metric": 2784.24365234375, "adam_stats/lr_effective_max": 9.388537728227675e-05, "adam_stats/lr_effective_mean": -8.192826972397427e-10, "adam_stats/lr_effective_min": -9.272230818169191e-05, "adam_stats/m_t_max": 0.019226951524615288, "adam_stats/m_t_mean": -8.784314942111848e-11, "adam_stats/m_t_min": -0.013674203306436539, "adam_stats/v_t_max": 2.374825271544978e-05, "adam_stats/v_t_mean": 2.0139679854369597e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.02686399780213833, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.1907553672790527, "all_logprobs": -0.12101192772388458, "all_logprobs/max": 0.0, "all_logprobs/median": -2.9802322387695312e-06, "all_logprobs/min": -12.8125, "all_logprobs/p1": -2.21875, "all_logprobs/p10": -0.28515625, "all_logprobs/p25": -0.004638671875, "all_logprobs/p5": -0.765625, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.188117116689682, "clip_ratio": 0.0, "completion_length": 596.83203125, "completion_length/correct": 524.67529296875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 490.0, "completion_length/correct/min": 117.0, "completion_length/correct/p25": 362.5, "completion_length/correct/p75": 667.5, "completion_length/correct/var": 48534.57421875, "completion_length/incorrect": 817.883544921875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 953.0, "completion_length/incorrect/min": 128.0, "completion_length/incorrect/p25": 625.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 57667.203125, "completion_length/max": 1024.0, "completion_length/median": 550.0, "completion_length/min": 117.0, "completion_length/p25": 396.75, "completion_length/p75": 789.0, "completion_length/var": 66680.953125, "epoch": 0.128, "feature_vector_variance/max_squared_error": 110468.046875, "feature_vector_variance/metric": 25245.806640625, "generated_tokens/total": 4684524.0, "grad_norm": 0.0965781956911087, "grouped_std_rewards": 0.18845713138580322, "learning_rate": 1.5e-05, "loss": 0.0269, "mean_logprobs": -0.12158203125, "mean_logprobs/var": 0.003143310546875, "num_completions/total": 7680, "per_sentence_gradient_norm": 5.539313316345215, "per_sentence_gradient_norm/max": 537.1190185546875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 29.37350845336914, "per_sentence_gradient_norm/p99": 116.3531723022461, "per_sentence_gradient_norm/var": 1127.4234619140625, "per_token_feature_norm": 159.8961181640625, "per_token_feature_norm/max": 336.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 62.25, "per_token_feature_norm/p25": 125.5, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 1930.337158203125, "per_token_full_gradient_variance/max_squared_error": 441.34765625, "per_token_full_gradient_variance/variance": 0.09079119563102722, "per_token_gradient_norm": 7.153838634490967, "per_token_gradient_norm/max": 7198.30029296875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 11632.6669921875, "per_token_policy_error_norm": 0.0651954710483551, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05512375012040138, "policy_entropy": 0.1347092241048813, "policy_entropy/max": 3.765625, "policy_entropy/median": 4.315376281738281e-05, "policy_entropy/min": 2.2768245622195593e-17, "policy_entropy/p25": 1.7061829566955566e-06, "policy_entropy/p75": 0.03125, "policy_entropy/var": 0.10110935568809509, "policy_error_vector_variance/max_squared_error": 2.0128798484802246, "policy_error_vector_variance/metric": 0.06512432545423508, "policy_loss": 0.026863988488912582, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.1907553672790527, "policy_sharpness": 7.682409286499023, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.310546875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.357694625854492, "reward": 0.75390625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18577350676059723, "rewards/accuracy_reward": 0.75390625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18577350676059723, "sentence_full_gradient_variance/max_squared_error": 670482.0, "sentence_full_gradient_variance/metric": 3154.75390625, "sentence_full_gradient_variance/p75": 37.575286865234375, "sentence_full_gradient_variance/p90": 133.68075561523438, "sentence_full_gradient_variance/p95": 133.68075561523438, "sentence_full_gradient_variance/p99": 55058.94921875, "state_level_variance/metric": 115.09486389160156, "state_level_variance_full_gradient/metric": 370.510498046875, "step": 10 }, { "accuracy_reward": 0.7578125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1837719976902008, "action_level_variance/metric": 1204.6650390625, "action_level_variance_full_gradient/metric": 2891.651123046875, "adam_stats/lr_effective_max": 9.12754621822387e-05, "adam_stats/lr_effective_mean": -8.078445135062395e-10, "adam_stats/lr_effective_min": -9.180847700918093e-05, "adam_stats/m_t_max": 0.0172165185213089, "adam_stats/m_t_mean": -7.892855813373956e-11, "adam_stats/m_t_min": -0.012287899851799011, "adam_stats/v_t_max": 2.3725273422314785e-05, "adam_stats/v_t_mean": 2.0126016738591934e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.027092119678854942, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.1074132919311523, "all_logprobs": -0.11948475241661072, "all_logprobs/max": 0.0, "all_logprobs/median": -2.9802322387695312e-06, "all_logprobs/min": -13.625, "all_logprobs/p1": -2.203125, "all_logprobs/p10": -0.28125, "all_logprobs/p25": -0.004852294921875, "all_logprobs/p5": -0.76171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.18487319350242615, "clip_ratio": 0.0, "completion_length": 580.0130615234375, "completion_length/correct": 508.87115478515625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 454.0, "completion_length/correct/min": 149.0, "completion_length/correct/p25": 347.5, "completion_length/correct/p75": 654.0, "completion_length/correct/var": 46805.8125, "completion_length/incorrect": 802.6182861328125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 877.0, "completion_length/incorrect/min": 230.0, "completion_length/incorrect/p25": 567.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 59041.79296875, "completion_length/max": 1024.0, "completion_length/median": 507.0, "completion_length/min": 149.0, "completion_length/p25": 375.5, "completion_length/p75": 778.25, "completion_length/var": 65553.3046875, "epoch": 0.1408, "feature_vector_variance/max_squared_error": 102627.2578125, "feature_vector_variance/metric": 25844.466796875, "generated_tokens/total": 5129974.0, "grad_norm": 0.07926759868860245, "grouped_std_rewards": 0.18276016414165497, "learning_rate": 1.4995431202643219e-05, "loss": 0.0271, "mean_logprobs": -0.11767578125, "mean_logprobs/var": 0.0021514892578125, "num_completions/total": 8448, "per_sentence_gradient_norm": 6.086989879608154, "per_sentence_gradient_norm/max": 555.5958251953125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 26.35565948486328, "per_sentence_gradient_norm/p99": 165.42633056640625, "per_sentence_gradient_norm/var": 1169.1358642578125, "per_token_feature_norm": 161.8898162841797, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 156.0, "per_token_feature_norm/min": 62.5, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 191.0, "per_token_feature_norm/var": 1868.6014404296875, "per_token_full_gradient_variance/max_squared_error": 955.20751953125, "per_token_full_gradient_variance/variance": 0.1070122942328453, "per_token_gradient_norm": 8.36080265045166, "per_token_gradient_norm/max": 7046.7568359375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 13864.2822265625, "per_token_policy_error_norm": 0.06479126960039139, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05532575771212578, "policy_entropy": 0.13203738629817963, "policy_entropy/max": 3.796875, "policy_entropy/median": 4.291534423828125e-05, "policy_entropy/min": 3.8510861166685117e-16, "policy_entropy/p25": 1.4826655387878418e-06, "policy_entropy/p75": 0.032958984375, "policy_entropy/var": 0.09570395946502686, "policy_error_vector_variance/max_squared_error": 2.01432466506958, "policy_error_vector_variance/metric": 0.06474002450704575, "policy_loss": 0.02709212526679039, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.1074135303497314, "policy_sharpness": 7.67315149307251, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.3026123046875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.283329010009766, "reward": 0.7578125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1837719976902008, "rewards/accuracy_reward": 0.7578125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1837719976902008, "sentence_full_gradient_variance/max_squared_error": 694876.6875, "sentence_full_gradient_variance/metric": 3260.3583984375, "sentence_full_gradient_variance/p75": 44.30412292480469, "sentence_full_gradient_variance/p90": 188.73745727539062, "sentence_full_gradient_variance/p95": 188.73745727539062, "sentence_full_gradient_variance/p99": 93028.328125, "state_level_variance/metric": 114.72674560546875, "state_level_variance_full_gradient/metric": 368.70703125, "step": 11 }, { "accuracy_reward": 0.75390625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18577350676059723, "action_level_variance/metric": 1066.7802734375, "action_level_variance_full_gradient/metric": 5217.85498046875, "adam_stats/lr_effective_max": 9.308178414357826e-05, "adam_stats/lr_effective_mean": -6.79698741734569e-10, "adam_stats/lr_effective_min": -9.196380415232852e-05, "adam_stats/m_t_max": 0.015453286468982697, "adam_stats/m_t_mean": -6.698702009755308e-11, "adam_stats/m_t_min": -0.01069289818406105, "adam_stats/v_t_max": 2.3701721147517674e-05, "adam_stats/v_t_mean": 2.0124201784155193e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0038020811043679714, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 2.71541166305542, "all_logprobs": -0.11327296495437622, "all_logprobs/max": 0.0, "all_logprobs/median": -2.1457672119140625e-06, "all_logprobs/min": -14.125, "all_logprobs/p1": -2.1875, "all_logprobs/p10": -0.251953125, "all_logprobs/p25": -0.00299072265625, "all_logprobs/p5": -0.7109375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.17799121141433716, "clip_ratio": 0.0, "completion_length": 605.0638427734375, "completion_length/correct": 523.6373291015625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 476.0, "completion_length/correct/min": 164.0, "completion_length/correct/p25": 348.5, "completion_length/correct/p75": 675.5, "completion_length/correct/var": 47470.5625, "completion_length/incorrect": 854.51318359375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 14.0, "completion_length/incorrect/p25": 691.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 53953.62109375, "completion_length/max": 1024.0, "completion_length/median": 546.0, "completion_length/min": 14.0, "completion_length/p25": 381.75, "completion_length/p75": 813.25, "completion_length/var": 69336.0078125, "epoch": 0.1536, "feature_vector_variance/max_squared_error": 109907.3046875, "feature_vector_variance/metric": 25949.99609375, "generated_tokens/total": 5594663.0, "grad_norm": 0.14389654994010925, "grouped_std_rewards": 0.1865815818309784, "learning_rate": 1.4981730376948682e-05, "loss": -0.0038, "mean_logprobs": -0.11474609375, "mean_logprobs/var": 0.003387451171875, "num_completions/total": 9216, "per_sentence_gradient_norm": 4.837114334106445, "per_sentence_gradient_norm/max": 638.1419677734375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 24.82615852355957, "per_sentence_gradient_norm/p99": 113.32223510742188, "per_sentence_gradient_norm/var": 1044.7427978515625, "per_token_feature_norm": 161.88775634765625, "per_token_feature_norm/max": 334.0, "per_token_feature_norm/median": 156.0, "per_token_feature_norm/min": 63.25, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 191.0, "per_token_feature_norm/var": 1827.1846923828125, "per_token_full_gradient_variance/max_squared_error": 427.00592041015625, "per_token_full_gradient_variance/variance": 0.06017252802848816, "per_token_gradient_norm": 5.3139519691467285, "per_token_gradient_norm/max": 6911.60546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 8087.54931640625, "per_token_policy_error_norm": 0.061047039926052094, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05237724632024765, "policy_entropy": 0.12490945309400558, "policy_entropy/max": 3.8125, "policy_entropy/median": 3.218650817871094e-05, "policy_entropy/min": 3.397282455352979e-14, "policy_entropy/p25": 1.0952353477478027e-06, "policy_entropy/p75": 0.021484375, "policy_entropy/var": 0.0928342193365097, "policy_error_vector_variance/max_squared_error": 2.0159239768981934, "policy_error_vector_variance/metric": 0.06099292263388634, "policy_loss": -0.003802075982093811, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.71541166305542, "policy_sharpness": 7.793125629425049, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.560546398162842, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.887595176696777, "reward": 0.75390625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18577350676059723, "rewards/accuracy_reward": 0.75390625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18577350676059723, "sentence_full_gradient_variance/max_squared_error": 1583259.375, "sentence_full_gradient_variance/metric": 5919.7626953125, "sentence_full_gradient_variance/p75": 88.82563018798828, "sentence_full_gradient_variance/p90": 144.91859436035156, "sentence_full_gradient_variance/p95": 144.91859436035156, "sentence_full_gradient_variance/p99": 64679.171875, "state_level_variance/metric": 111.10720825195312, "state_level_variance_full_gradient/metric": 701.908447265625, "step": 12 }, { "accuracy_reward": 0.796875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16207626461982727, "action_level_variance/metric": 699.522216796875, "action_level_variance_full_gradient/metric": 2017.7669677734375, "adam_stats/lr_effective_max": 9.075838897842914e-05, "adam_stats/lr_effective_mean": -6.68512634138807e-10, "adam_stats/lr_effective_min": -8.969300688477233e-05, "adam_stats/m_t_max": 0.013983869925141335, "adam_stats/m_t_mean": -6.0890882613851e-11, "adam_stats/m_t_min": -0.00979069247841835, "adam_stats/v_t_max": 2.3678596335230395e-05, "adam_stats/v_t_mean": 2.011147108224587e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.016126057133078575, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 3.1105682849884033, "all_logprobs": -0.1058788001537323, "all_logprobs/max": 0.0, "all_logprobs/median": -1.0728836059570312e-06, "all_logprobs/min": -11.1875, "all_logprobs/p1": -2.09375, "all_logprobs/p10": -0.2177734375, "all_logprobs/p25": -0.0021820068359375, "all_logprobs/p5": -0.66796875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.16389133036136627, "clip_ratio": 0.0, "completion_length": 584.4557495117188, "completion_length/correct": 515.9036254882812, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 491.0, "completion_length/correct/min": 173.0, "completion_length/correct/p25": 350.75, "completion_length/correct/p75": 635.25, "completion_length/correct/var": 41894.74609375, "completion_length/incorrect": 853.3910522460938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 190.0, "completion_length/incorrect/p25": 701.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 46119.51171875, "completion_length/max": 1024.0, "completion_length/median": 540.0, "completion_length/min": 173.0, "completion_length/p25": 387.0, "completion_length/p75": 737.0, "completion_length/var": 61154.01171875, "epoch": 0.1664, "feature_vector_variance/max_squared_error": 118869.5859375, "feature_vector_variance/metric": 26582.017578125, "generated_tokens/total": 6043525.0, "grad_norm": 0.09969515353441238, "grouped_std_rewards": 0.1753583401441574, "learning_rate": 1.495891421526205e-05, "loss": -0.0161, "mean_logprobs": -0.10400390625, "mean_logprobs/var": 0.0024871826171875, "num_completions/total": 9984, "per_sentence_gradient_norm": 4.426055431365967, "per_sentence_gradient_norm/max": 389.520263671875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 19.453481674194336, "per_sentence_gradient_norm/p99": 98.67133331298828, "per_sentence_gradient_norm/var": 680.8187255859375, "per_token_feature_norm": 163.85055541992188, "per_token_feature_norm/max": 338.0, "per_token_feature_norm/median": 159.0, "per_token_feature_norm/min": 52.25, "per_token_feature_norm/p25": 131.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 1732.6785888671875, "per_token_full_gradient_variance/max_squared_error": 518.6856689453125, "per_token_full_gradient_variance/variance": 0.07436901330947876, "per_token_gradient_norm": 5.881560325622559, "per_token_gradient_norm/max": 6866.14208984375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 10013.4814453125, "per_token_policy_error_norm": 0.05757332220673561, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04960298165678978, "policy_entropy": 0.11704720556735992, "policy_entropy/max": 3.765625, "policy_entropy/median": 1.704692840576172e-05, "policy_entropy/min": 5.169475958410885e-16, "policy_entropy/p25": 5.62518835067749e-07, "policy_entropy/p75": 0.0162353515625, "policy_entropy/var": 0.08455108851194382, "policy_error_vector_variance/max_squared_error": 2.01389479637146, "policy_error_vector_variance/metric": 0.05753079056739807, "policy_loss": -0.016126064583659172, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.1105682849884033, "policy_sharpness": 7.884143352508545, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.24609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.530618667602539, "reward": 0.796875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16207626461982727, "rewards/accuracy_reward": 0.796875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16207626461982727, "sentence_full_gradient_variance/max_squared_error": 352182.34375, "sentence_full_gradient_variance/metric": 2281.514892578125, "sentence_full_gradient_variance/p75": 47.46660614013672, "sentence_full_gradient_variance/p90": 93.60413360595703, "sentence_full_gradient_variance/p95": 93.60413360595703, "sentence_full_gradient_variance/p99": 55065.6796875, "state_level_variance/metric": 68.56452178955078, "state_level_variance_full_gradient/metric": 263.7478332519531, "step": 13 }, { "accuracy_reward": 0.7252604365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19951751828193665, "action_level_variance/metric": 823.8746337890625, "action_level_variance_full_gradient/metric": 4354.2880859375, "adam_stats/lr_effective_max": 8.831995364744216e-05, "adam_stats/lr_effective_mean": -5.74468028702313e-10, "adam_stats/lr_effective_min": -9.03138643479906e-05, "adam_stats/m_t_max": 0.012522540055215359, "adam_stats/m_t_mean": -5.269975775212821e-11, "adam_stats/m_t_min": -0.008812385611236095, "adam_stats/v_t_max": 2.3655315089854412e-05, "adam_stats/v_t_mean": 2.0096127453100854e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.08895274996757507, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.768115997314453, "all_logprobs": -0.10019408911466599, "all_logprobs/max": 0.0, "all_logprobs/median": -1.3113021850585938e-06, "all_logprobs/min": -11.875, "all_logprobs/p1": -2.0625, "all_logprobs/p10": -0.1904296875, "all_logprobs/p25": -0.00145721435546875, "all_logprobs/p5": -0.60546875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.15709102153778076, "clip_ratio": 0.0, "completion_length": 606.4818115234375, "completion_length/correct": 526.111328125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 500.0, "completion_length/correct/min": 139.0, "completion_length/correct/p25": 346.0, "completion_length/correct/p75": 658.0, "completion_length/correct/var": 50524.11328125, "completion_length/incorrect": 818.6445922851562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 900.0, "completion_length/incorrect/min": 223.0, "completion_length/incorrect/p25": 614.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 53108.8671875, "completion_length/max": 1024.0, "completion_length/median": 562.0, "completion_length/min": 139.0, "completion_length/p25": 402.0, "completion_length/p75": 814.0, "completion_length/var": 68239.78125, "epoch": 0.1792, "feature_vector_variance/max_squared_error": 111056.4375, "feature_vector_variance/metric": 26385.220703125, "generated_tokens/total": 6509303.0, "grad_norm": 0.07543423771858215, "grouped_std_rewards": 0.1953006386756897, "learning_rate": 1.4927010515561777e-05, "loss": -0.089, "mean_logprobs": -0.10107421875, "mean_logprobs/var": 0.0021209716796875, "num_completions/total": 10752, "per_sentence_gradient_norm": 5.043100357055664, "per_sentence_gradient_norm/max": 383.2004089355469, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 23.273883819580078, "per_sentence_gradient_norm/p99": 129.681884765625, "per_sentence_gradient_norm/var": 799.4827270507812, "per_token_feature_norm": 163.64968872070312, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 159.0, "per_token_feature_norm/min": 58.25, "per_token_feature_norm/p25": 131.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 1696.91796875, "per_token_full_gradient_variance/max_squared_error": 218.62326049804688, "per_token_full_gradient_variance/variance": 0.06954318284988403, "per_token_gradient_norm": 5.967596530914307, "per_token_gradient_norm/max": 6808.9267578125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 9222.5830078125, "per_token_policy_error_norm": 0.05436229333281517, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.047549158334732056, "policy_entropy": 0.11038199812173843, "policy_entropy/max": 3.8125, "policy_entropy/median": 1.9550323486328125e-05, "policy_entropy/min": 3.7990444123892075e-16, "policy_entropy/p25": 6.668269634246826e-07, "policy_entropy/p75": 0.01129150390625, "policy_entropy/var": 0.07953877002000809, "policy_error_vector_variance/max_squared_error": 2.0122151374816895, "policy_error_vector_variance/metric": 0.05433080345392227, "policy_loss": -0.08895276486873627, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.768115997314453, "policy_sharpness": 7.9626569747924805, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.99609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.168499946594238, "reward": 0.7252604365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19951751828193665, "rewards/accuracy_reward": 0.7252604365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19951751828193665, "sentence_full_gradient_variance/max_squared_error": 1057184.125, "sentence_full_gradient_variance/metric": 4883.28076171875, "sentence_full_gradient_variance/p75": 251.51962280273438, "sentence_full_gradient_variance/p90": 371.07244873046875, "sentence_full_gradient_variance/p95": 371.07244873046875, "sentence_full_gradient_variance/p99": 86913.5, "state_level_variance/metric": 78.3677978515625, "state_level_variance_full_gradient/metric": 528.99267578125, "step": 14 }, { "accuracy_reward": 0.7604166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18242068588733673, "action_level_variance/metric": 583.481201171875, "action_level_variance_full_gradient/metric": 4734.80126953125, "adam_stats/lr_effective_max": 8.438485383521765e-05, "adam_stats/lr_effective_mean": -5.20344767274139e-10, "adam_stats/lr_effective_min": -8.426836575381458e-05, "adam_stats/m_t_max": 0.01125254761427641, "adam_stats/m_t_mean": -4.526507987878681e-11, "adam_stats/m_t_min": -0.00791836716234684, "adam_stats/v_t_max": 2.3631691874470562e-05, "adam_stats/v_t_mean": 2.0078229443637463e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.08757911622524261, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.447150945663452, "all_logprobs": -0.10012871026992798, "all_logprobs/max": 0.0, "all_logprobs/median": -8.344650268554688e-07, "all_logprobs/min": -10.0, "all_logprobs/p1": -2.015625, "all_logprobs/p10": -0.201171875, "all_logprobs/p25": -0.00185394287109375, "all_logprobs/p5": -0.61328125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.15287333726882935, "clip_ratio": 0.0, "completion_length": 579.45703125, "completion_length/correct": 505.3133544921875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 445.0, "completion_length/correct/min": 173.0, "completion_length/correct/p25": 340.25, "completion_length/correct/p75": 636.0, "completion_length/correct/var": 49894.5703125, "completion_length/incorrect": 814.7826538085938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 903.0, "completion_length/incorrect/min": 177.0, "completion_length/incorrect/p25": 605.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 55023.796875, "completion_length/max": 1024.0, "completion_length/median": 518.0, "completion_length/min": 173.0, "completion_length/p25": 368.25, "completion_length/p75": 797.0, "completion_length/var": 68523.9609375, "epoch": 0.192, "feature_vector_variance/max_squared_error": 118373.796875, "feature_vector_variance/metric": 27252.33984375, "generated_tokens/total": 6954326.0, "grad_norm": 0.048049360513687134, "grouped_std_rewards": 0.14111101627349854, "learning_rate": 1.488605814759156e-05, "loss": 0.0876, "mean_logprobs": -0.0986328125, "mean_logprobs/var": 0.00213623046875, "num_completions/total": 11520, "per_sentence_gradient_norm": 3.748539686203003, "per_sentence_gradient_norm/max": 350.6317138671875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 114.1982421875, "per_sentence_gradient_norm/var": 570.1720581054688, "per_token_feature_norm": 166.44015502929688, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 163.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 134.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 1670.252685546875, "per_token_full_gradient_variance/max_squared_error": 518.7827758789062, "per_token_full_gradient_variance/variance": 0.06638536602258682, "per_token_gradient_norm": 5.062168598175049, "per_token_gradient_norm/max": 7415.099609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 9133.9423828125, "per_token_policy_error_norm": 0.054796814918518066, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04749459773302078, "policy_entropy": 0.11103343963623047, "policy_entropy/max": 3.671875, "policy_entropy/median": 1.3470649719238281e-05, "policy_entropy/min": 7.4593109467002705e-16, "policy_entropy/p25": 3.8743019104003906e-07, "policy_entropy/p75": 0.0140380859375, "policy_entropy/var": 0.07685557007789612, "policy_error_vector_variance/max_squared_error": 2.0167651176452637, "policy_error_vector_variance/metric": 0.05476226285099983, "policy_loss": 0.08757912367582321, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.447150945663452, "policy_sharpness": 7.929097652435303, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.49609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.284255981445312, "reward": 0.7604166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18242068588733673, "rewards/accuracy_reward": 0.7604166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18242068588733673, "sentence_full_gradient_variance/max_squared_error": 2809693.75, "sentence_full_gradient_variance/metric": 5373.28955078125, "sentence_full_gradient_variance/p75": 43.592288970947266, "sentence_full_gradient_variance/p90": 169.14645385742188, "sentence_full_gradient_variance/p95": 169.14645385742188, "sentence_full_gradient_variance/p99": 74978.6484375, "state_level_variance/metric": 59.5034294128418, "state_level_variance_full_gradient/metric": 638.4878540039062, "step": 15 }, { "accuracy_reward": 0.7395833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19285091757774353, "action_level_variance/metric": 176.15576171875, "action_level_variance_full_gradient/metric": 1242.4713134765625, "adam_stats/lr_effective_max": 8.243147749453783e-05, "adam_stats/lr_effective_mean": -4.3946019046003926e-10, "adam_stats/lr_effective_min": -8.347561379196122e-05, "adam_stats/m_t_max": 0.009967838414013386, "adam_stats/m_t_mean": -3.9065438384167095e-11, "adam_stats/m_t_min": -0.0069739422760903835, "adam_stats/v_t_max": 2.3610604330315255e-05, "adam_stats/v_t_mean": 2.0060667536847543e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.016179844737052917, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 1.4982815980911255, "all_logprobs": -0.08514981716871262, "all_logprobs/max": 0.0, "all_logprobs/median": -5.960464477539062e-07, "all_logprobs/min": -11.3125, "all_logprobs/p1": -1.828125, "all_logprobs/p10": -0.1318359375, "all_logprobs/p25": -0.00058746337890625, "all_logprobs/p5": -0.50390625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1289002150297165, "clip_ratio": 0.0, "completion_length": 609.2083740234375, "completion_length/correct": 509.7992858886719, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 469.0, "completion_length/correct/min": 176.0, "completion_length/correct/p25": 361.0, "completion_length/correct/p75": 625.5, "completion_length/correct/var": 40383.30078125, "completion_length/incorrect": 891.5299682617188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 203.0, "completion_length/incorrect/p25": 817.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 51017.76953125, "completion_length/max": 1024.0, "completion_length/median": 530.0, "completion_length/min": 176.0, "completion_length/p25": 396.0, "completion_length/p75": 856.5, "completion_length/var": 71191.703125, "epoch": 0.2048, "feature_vector_variance/max_squared_error": 114040.578125, "feature_vector_variance/metric": 27197.904296875, "generated_tokens/total": 7422198.0, "grad_norm": 0.047904353588819504, "grouped_std_rewards": 0.1651805341243744, "learning_rate": 1.4836107005503543e-05, "loss": 0.0162, "mean_logprobs": -0.08642578125, "mean_logprobs/var": 0.0016937255859375, "num_completions/total": 12288, "per_sentence_gradient_norm": 2.407014846801758, "per_sentence_gradient_norm/max": 161.01687622070312, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 72.53736877441406, "per_sentence_gradient_norm/var": 170.5841522216797, "per_token_feature_norm": 166.38795471191406, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 164.0, "per_token_feature_norm/min": 63.5, "per_token_feature_norm/p25": 135.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 1559.0777587890625, "per_token_full_gradient_variance/max_squared_error": 84.52457427978516, "per_token_full_gradient_variance/variance": 0.024670211598277092, "per_token_gradient_norm": 2.9917502403259277, "per_token_gradient_norm/max": 4854.44384765625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2885.8720703125, "per_token_policy_error_norm": 0.046766653656959534, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04078027978539467, "policy_entropy": 0.09530621021986008, "policy_entropy/max": 3.828125, "policy_entropy/median": 9.47713851928711e-06, "policy_entropy/min": 2.2343238370581275e-15, "policy_entropy/p25": 3.110617399215698e-07, "policy_entropy/p75": 0.0052490234375, "policy_entropy/var": 0.06581579148769379, "policy_error_vector_variance/max_squared_error": 2.014193534851074, "policy_error_vector_variance/metric": 0.04672999307513237, "policy_loss": 0.016179833561182022, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -7.481915473937988, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.4982815980911255, "policy_sharpness": 8.14655590057373, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.25, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.328056335449219, "reward": 0.7395833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19285091757774353, "rewards/accuracy_reward": 0.7395833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19285091757774353, "sentence_full_gradient_variance/max_squared_error": 107911.2890625, "sentence_full_gradient_variance/metric": 1392.139404296875, "sentence_full_gradient_variance/p75": 54.43021774291992, "sentence_full_gradient_variance/p90": 116.62945556640625, "sentence_full_gradient_variance/p95": 116.62945556640625, "sentence_full_gradient_variance/p99": 48362.63671875, "state_level_variance/metric": 16.39654541015625, "state_level_variance_full_gradient/metric": 149.66812133789062, "step": 16 }, { "accuracy_reward": 0.71875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2024119794368744, "action_level_variance/metric": 595.6953735351562, "action_level_variance_full_gradient/metric": 2985.0654296875, "adam_stats/lr_effective_max": 8.258270827354863e-05, "adam_stats/lr_effective_mean": -3.4274968974123965e-10, "adam_stats/lr_effective_min": -8.27403346193023e-05, "adam_stats/m_t_max": 0.008716232143342495, "adam_stats/m_t_mean": -3.453908686834595e-11, "adam_stats/m_t_min": -0.005983578972518444, "adam_stats/v_t_max": 2.3593487640027888e-05, "adam_stats/v_t_mean": 2.0077672163720806e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.12135414779186249, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 3.852947235107422, "all_logprobs": -0.08578430861234665, "all_logprobs/max": 0.0, "all_logprobs/median": -5.960464477539062e-07, "all_logprobs/min": -13.3125, "all_logprobs/p1": -1.90625, "all_logprobs/p10": -0.130859375, "all_logprobs/p25": -0.00070953369140625, "all_logprobs/p5": -0.49609375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.13207365572452545, "clip_ratio": 0.0, "completion_length": 589.5560302734375, "completion_length/correct": 508.02899169921875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 465.0, "completion_length/correct/min": 71.0, "completion_length/correct/p25": 360.75, "completion_length/correct/p75": 631.25, "completion_length/correct/var": 41430.734375, "completion_length/incorrect": 797.9027709960938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 969.0, "completion_length/incorrect/min": 209.0, "completion_length/incorrect/p25": 572.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 75282.796875, "completion_length/max": 1024.0, "completion_length/median": 534.0, "completion_length/min": 71.0, "completion_length/p25": 375.0, "completion_length/p75": 791.5, "completion_length/var": 67873.921875, "epoch": 0.2176, "feature_vector_variance/max_squared_error": 108346.7734375, "feature_vector_variance/metric": 28140.853515625, "generated_tokens/total": 7874977.0, "grad_norm": 0.14969202876091003, "grouped_std_rewards": 0.21515902876853943, "learning_rate": 1.4777217947069972e-05, "loss": -0.1214, "mean_logprobs": -0.08642578125, "mean_logprobs/var": 0.00156402587890625, "num_completions/total": 13056, "per_sentence_gradient_norm": 4.301090240478516, "per_sentence_gradient_norm/max": 336.28033447265625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 30.413414001464844, "per_sentence_gradient_norm/p99": 109.37117004394531, "per_sentence_gradient_norm/var": 577.948486328125, "per_token_feature_norm": 169.26673889160156, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 169.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 139.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 1487.6099853515625, "per_token_full_gradient_variance/max_squared_error": 674.1310424804688, "per_token_full_gradient_variance/variance": 0.08773934096097946, "per_token_gradient_norm": 5.734098434448242, "per_token_gradient_norm/max": 6745.83544921875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 10053.6416015625, "per_token_policy_error_norm": 0.04709576815366745, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04148322343826294, "policy_entropy": 0.09498720616102219, "policy_entropy/max": 3.8125, "policy_entropy/median": 1.0311603546142578e-05, "policy_entropy/min": 3.0357660829594124e-17, "policy_entropy/p25": 2.980232238769531e-07, "policy_entropy/p75": 0.0059814453125, "policy_entropy/var": 0.06486104428768158, "policy_error_vector_variance/max_squared_error": 2.014714479446411, "policy_error_vector_variance/metric": 0.047067586332559586, "policy_loss": -0.12135414779186249, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.852947473526001, "policy_sharpness": 8.12345027923584, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.36760425567627, "reward": 0.71875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2024119794368744, "rewards/accuracy_reward": 0.71875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2024119794368744, "sentence_full_gradient_variance/max_squared_error": 254874.046875, "sentence_full_gradient_variance/metric": 3316.70263671875, "sentence_full_gradient_variance/p75": 154.8989715576172, "sentence_full_gradient_variance/p90": 229.9839630126953, "sentence_full_gradient_variance/p95": 256.4110412597656, "sentence_full_gradient_variance/p99": 97775.3984375, "state_level_variance/metric": 56.55162048339844, "state_level_variance_full_gradient/metric": 331.6369323730469, "step": 17 }, { "accuracy_reward": 0.7760416865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17402760684490204, "action_level_variance/metric": 293.38232421875, "action_level_variance_full_gradient/metric": 2423.04248046875, "adam_stats/lr_effective_max": 8.456713840132579e-05, "adam_stats/lr_effective_mean": -2.2698547208488407e-10, "adam_stats/lr_effective_min": -8.55737816891633e-05, "adam_stats/m_t_max": 0.007827633060514927, "adam_stats/m_t_mean": -3.310686794155693e-11, "adam_stats/m_t_min": -0.005418218206614256, "adam_stats/v_t_max": 2.3569924451294355e-05, "adam_stats/v_t_mean": 2.0075325950219547e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.06611000746488571, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.608152985572815, "all_logprobs": -0.07816479355096817, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -13.8125, "all_logprobs/p1": -1.78125, "all_logprobs/p10": -0.1005859375, "all_logprobs/p25": -0.0002574920654296875, "all_logprobs/p5": -0.447265625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11999425292015076, "clip_ratio": 0.0, "completion_length": 593.9388427734375, "completion_length/correct": 513.132568359375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 479.0, "completion_length/correct/min": 128.0, "completion_length/correct/p25": 336.5, "completion_length/correct/p75": 665.5, "completion_length/correct/var": 46883.82421875, "completion_length/incorrect": 873.9418334960938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 227.0, "completion_length/incorrect/p25": 765.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 53245.33984375, "completion_length/max": 1024.0, "completion_length/median": 547.0, "completion_length/min": 128.0, "completion_length/p25": 363.75, "completion_length/p75": 803.25, "completion_length/var": 70896.4765625, "epoch": 0.2304, "feature_vector_variance/max_squared_error": 112614.9921875, "feature_vector_variance/metric": 28395.048828125, "generated_tokens/total": 8331122.0, "grad_norm": 0.11139713227748871, "grouped_std_rewards": 0.1804819107055664, "learning_rate": 1.4709462719537392e-05, "loss": -0.0661, "mean_logprobs": -0.07958984375, "mean_logprobs/var": 0.001617431640625, "num_completions/total": 13824, "per_sentence_gradient_norm": 2.9368185997009277, "per_sentence_gradient_norm/max": 253.14385986328125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 16.73573875427246, "per_sentence_gradient_norm/p99": 71.66694641113281, "per_sentence_gradient_norm/var": 285.1286926269531, "per_token_feature_norm": 170.87054443359375, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 171.0, "per_token_feature_norm/min": 64.0, "per_token_feature_norm/p25": 142.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 1425.358642578125, "per_token_full_gradient_variance/max_squared_error": 66.93811798095703, "per_token_full_gradient_variance/variance": 0.025626685470342636, "per_token_gradient_norm": 3.233633279800415, "per_token_gradient_norm/max": 4824.8818359375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3173.974365234375, "per_token_policy_error_norm": 0.043006591498851776, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03795214369893074, "policy_entropy": 0.08666309714317322, "policy_entropy/max": 3.734375, "policy_entropy/median": 4.351139068603516e-06, "policy_entropy/min": 1.1310397063368782e-15, "policy_entropy/p25": 1.1688098311424255e-07, "policy_entropy/p75": 0.002410888671875, "policy_entropy/var": 0.0603259839117527, "policy_error_vector_variance/max_squared_error": 2.011942148208618, "policy_error_vector_variance/metric": 0.042978573590517044, "policy_loss": -0.06611000001430511, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.608152985572815, "policy_sharpness": 8.292327880859375, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.721845626831055, "reward": 0.7760416865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17402760684490204, "rewards/accuracy_reward": 0.7760416865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17402760684490204, "sentence_full_gradient_variance/max_squared_error": 757042.625, "sentence_full_gradient_variance/metric": 2709.935791015625, "sentence_full_gradient_variance/p75": 106.51365661621094, "sentence_full_gradient_variance/p90": 246.1415557861328, "sentence_full_gradient_variance/p95": 246.1415557861328, "sentence_full_gradient_variance/p99": 59871.53125, "state_level_variance/metric": 28.343130111694336, "state_level_variance_full_gradient/metric": 286.8932189941406, "step": 18 }, { "accuracy_reward": 0.7565104365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18444257974624634, "action_level_variance/metric": 359.87451171875, "action_level_variance_full_gradient/metric": 2623.647216796875, "adam_stats/lr_effective_max": 8.582664304412901e-05, "adam_stats/lr_effective_mean": -2.1972983155205128e-10, "adam_stats/lr_effective_min": -8.559576235711575e-05, "adam_stats/m_t_max": 0.006998330354690552, "adam_stats/m_t_mean": -2.97126073744991e-11, "adam_stats/m_t_min": -0.004769584629684687, "adam_stats/v_t_max": 2.354657044634223e-05, "adam_stats/v_t_mean": 2.0089496472613932e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.004097240976989269, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.545274019241333, "all_logprobs": -0.07423635572195053, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -10.25, "all_logprobs/p1": -1.703125, "all_logprobs/p10": -0.099609375, "all_logprobs/p25": -0.00020503997802734375, "all_logprobs/p5": -0.408203125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11062562465667725, "clip_ratio": 0.0, "completion_length": 598.4518432617188, "completion_length/correct": 514.0946655273438, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 484.0, "completion_length/correct/min": 136.0, "completion_length/correct/p25": 371.0, "completion_length/correct/p75": 634.0, "completion_length/correct/var": 39056.671875, "completion_length/incorrect": 860.5454711914062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 271.0, "completion_length/incorrect/p25": 698.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 49050.9609375, "completion_length/max": 1024.0, "completion_length/median": 554.0, "completion_length/min": 136.0, "completion_length/p25": 399.5, "completion_length/p75": 780.0, "completion_length/var": 63567.70703125, "epoch": 0.2432, "feature_vector_variance/max_squared_error": 110114.34375, "feature_vector_variance/metric": 28827.134765625, "generated_tokens/total": 8790733.0, "grad_norm": 0.1072976365685463, "grouped_std_rewards": 0.20191740989685059, "learning_rate": 1.4632923872213653e-05, "loss": 0.0041, "mean_logprobs": -0.0751953125, "mean_logprobs/var": 0.00115966796875, "num_completions/total": 14592, "per_sentence_gradient_norm": 3.6681487560272217, "per_sentence_gradient_norm/max": 269.6708068847656, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 23.022926330566406, "per_sentence_gradient_norm/p99": 93.23739624023438, "per_sentence_gradient_norm/var": 346.8708801269531, "per_token_feature_norm": 172.7771759033203, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 175.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 144.0, "per_token_feature_norm/p75": 198.0, "per_token_feature_norm/var": 1369.295654296875, "per_token_full_gradient_variance/max_squared_error": 239.25009155273438, "per_token_full_gradient_variance/variance": 0.0478772409260273, "per_token_gradient_norm": 4.109480381011963, "per_token_gradient_norm/max": 6161.1572265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5166.8212890625, "per_token_policy_error_norm": 0.041365452110767365, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03676501289010048, "policy_entropy": 0.08220147341489792, "policy_entropy/max": 3.609375, "policy_entropy/median": 3.5017728805541992e-06, "policy_entropy/min": 2.3765711620882257e-16, "policy_entropy/p25": 8.614733815193176e-08, "policy_entropy/p75": 0.0020294189453125, "policy_entropy/var": 0.05352798104286194, "policy_error_vector_variance/max_squared_error": 2.012420892715454, "policy_error_vector_variance/metric": 0.04134412482380867, "policy_loss": 0.004097248427569866, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.545274257659912, "policy_sharpness": 8.326349258422852, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.499486923217773, "reward": 0.7565104365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18444257974624634, "rewards/accuracy_reward": 0.7565104365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18444257974624634, "sentence_full_gradient_variance/max_squared_error": 406459.5625, "sentence_full_gradient_variance/metric": 2949.576904296875, "sentence_full_gradient_variance/p75": 63.86090850830078, "sentence_full_gradient_variance/p90": 146.5404510498047, "sentence_full_gradient_variance/p95": 163.15283203125, "sentence_full_gradient_variance/p99": 63959.6640625, "state_level_variance/metric": 31.860877990722656, "state_level_variance_full_gradient/metric": 325.9295654296875, "step": 19 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.165145605802536, "action_level_variance/metric": 335.4866943359375, "action_level_variance_full_gradient/metric": 6087.81640625, "adam_stats/lr_effective_max": 8.431102469330654e-05, "adam_stats/lr_effective_mean": -1.5900340766261678e-10, "adam_stats/lr_effective_min": -8.607812924310565e-05, "adam_stats/m_t_max": 0.006364873144775629, "adam_stats/m_t_mean": -2.776971187723465e-11, "adam_stats/m_t_min": -0.0038687805645167828, "adam_stats/v_t_max": 2.3523463823948987e-05, "adam_stats/v_t_mean": 2.011485596142837e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.04279039427638054, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.557288646697998, "all_logprobs": -0.06813911348581314, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -9.3125, "all_logprobs/p1": -1.625, "all_logprobs/p10": -0.06982421875, "all_logprobs/p25": -9.870529174804688e-05, "all_logprobs/p5": -0.361328125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.10338587313890457, "clip_ratio": 0.0, "completion_length": 567.72265625, "completion_length/correct": 511.2401428222656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 463.0, "completion_length/correct/min": 121.0, "completion_length/correct/p25": 362.75, "completion_length/correct/p75": 649.25, "completion_length/correct/var": 39389.4765625, "completion_length/incorrect": 782.3562622070312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 915.0, "completion_length/incorrect/min": 185.0, "completion_length/incorrect/p25": 528.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 72333.6953125, "completion_length/max": 1024.0, "completion_length/median": 507.0, "completion_length/min": 121.0, "completion_length/p25": 387.75, "completion_length/p75": 724.25, "completion_length/var": 58306.3515625, "epoch": 0.256, "feature_vector_variance/max_squared_error": 122593.734375, "feature_vector_variance/metric": 30053.06640625, "generated_tokens/total": 9226744.0, "grad_norm": 0.2337981015443802, "grouped_std_rewards": 0.18201421201229095, "learning_rate": 1.4547694655894313e-05, "loss": -0.0428, "mean_logprobs": -0.06884765625, "mean_logprobs/var": 0.0017242431640625, "num_completions/total": 15360, "per_sentence_gradient_norm": 3.036625862121582, "per_sentence_gradient_norm/max": 294.93902587890625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 9.904302597045898, "per_sentence_gradient_norm/p99": 84.44203186035156, "per_sentence_gradient_norm/var": 326.69091796875, "per_token_feature_norm": 177.7682342529297, "per_token_feature_norm/max": 334.0, "per_token_feature_norm/median": 182.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 152.0, "per_token_feature_norm/p75": 201.0, "per_token_feature_norm/var": 1245.702392578125, "per_token_full_gradient_variance/max_squared_error": 266.2983703613281, "per_token_full_gradient_variance/variance": 0.04248768463730812, "per_token_gradient_norm": 3.60383677482605, "per_token_gradient_norm/max": 6593.67333984375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4899.861328125, "per_token_policy_error_norm": 0.03781307861208916, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03362934663891792, "policy_entropy": 0.07563821226358414, "policy_entropy/max": 3.828125, "policy_entropy/median": 2.0116567611694336e-06, "policy_entropy/min": 2.3245294578089215e-16, "policy_entropy/p25": 4.190951585769653e-08, "policy_entropy/p75": 0.001068115234375, "policy_entropy/var": 0.05153372511267662, "policy_error_vector_variance/max_squared_error": 2.0117123126983643, "policy_error_vector_variance/metric": 0.03777534142136574, "policy_loss": -0.042790405452251434, "policy_loss/max": 19.79339027404785, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.55728816986084, "policy_sharpness": 8.442899703979492, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.967748641967773, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.165145605802536, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.165145605802536, "sentence_full_gradient_variance/max_squared_error": 2298685.25, "sentence_full_gradient_variance/metric": 6882.18896484375, "sentence_full_gradient_variance/p75": 104.74055480957031, "sentence_full_gradient_variance/p90": 328.9883728027344, "sentence_full_gradient_variance/p95": 328.9883728027344, "sentence_full_gradient_variance/p99": 146878.328125, "state_level_variance/metric": 33.05910110473633, "state_level_variance_full_gradient/metric": 794.3729248046875, "step": 20 }, { "accuracy_reward": 0.7096354365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20632164180278778, "action_level_variance/metric": 248.80575561523438, "action_level_variance_full_gradient/metric": 2197.10009765625, "adam_stats/lr_effective_max": 8.752653957344592e-05, "adam_stats/lr_effective_mean": -2.380957930814276e-10, "adam_stats/lr_effective_min": -8.441347745247185e-05, "adam_stats/m_t_max": 0.005722044035792351, "adam_stats/m_t_mean": -2.9413870644701134e-11, "adam_stats/m_t_min": -0.0034358499106019735, "adam_stats/v_t_max": 2.349994429096114e-05, "adam_stats/v_t_mean": 2.0122026874597188e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.013536013662815094, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.1420092582702637, "all_logprobs": -0.062226761132478714, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.0, "all_logprobs/p1": -1.5, "all_logprobs/p10": -0.05659198760986328, "all_logprobs/p25": -6.246566772460938e-05, "all_logprobs/p5": -0.3125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.09256140887737274, "clip_ratio": 0.0, "completion_length": 620.6549682617188, "completion_length/correct": 542.8734130859375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 520.0, "completion_length/correct/min": 98.0, "completion_length/correct/p25": 375.0, "completion_length/correct/p75": 697.0, "completion_length/correct/var": 49263.03515625, "completion_length/incorrect": 810.7489013671875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 921.0, "completion_length/incorrect/min": 255.0, "completion_length/incorrect/p25": 588.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 60043.9453125, "completion_length/max": 1024.0, "completion_length/median": 590.0, "completion_length/min": 98.0, "completion_length/p25": 409.5, "completion_length/p75": 841.0, "completion_length/var": 67124.3046875, "epoch": 0.2688, "feature_vector_variance/max_squared_error": 121400.1015625, "feature_vector_variance/metric": 29996.111328125, "generated_tokens/total": 9703407.0, "grad_norm": 0.10497893393039703, "grouped_std_rewards": 0.2254960983991623, "learning_rate": 1.4453878909250906e-05, "loss": 0.0135, "mean_logprobs": -0.0634765625, "mean_logprobs/var": 0.000782012939453125, "num_completions/total": 16128, "per_sentence_gradient_norm": 3.1510791778564453, "per_sentence_gradient_norm/max": 208.90853881835938, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 21.8005428314209, "per_sentence_gradient_norm/p99": 75.5765151977539, "per_sentence_gradient_norm/var": 239.18785095214844, "per_token_feature_norm": 178.77125549316406, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 183.0, "per_token_feature_norm/min": 70.0, "per_token_feature_norm/p25": 155.0, "per_token_feature_norm/p75": 202.0, "per_token_feature_norm/var": 1193.6923828125, "per_token_full_gradient_variance/max_squared_error": 92.48310852050781, "per_token_full_gradient_variance/variance": 0.03520674630999565, "per_token_gradient_norm": 3.950279951095581, "per_token_gradient_norm/max": 3679.9619140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3978.528076171875, "per_token_policy_error_norm": 0.034897103905677795, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03126499429345131, "policy_entropy": 0.0690290704369545, "policy_entropy/max": 3.484375, "policy_entropy/median": 1.2665987014770508e-06, "policy_entropy/min": 2.177077962350893e-16, "policy_entropy/p25": 2.6775524020195007e-08, "policy_entropy/p75": 0.000701904296875, "policy_entropy/var": 0.04338257387280464, "policy_error_vector_variance/max_squared_error": 2.0128629207611084, "policy_error_vector_variance/metric": 0.03487563878297806, "policy_loss": 0.013536008074879646, "policy_loss/max": 9.659050941467285, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.1420092582702637, "policy_sharpness": 8.515932083129883, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.547282218933105, "reward": 0.7096354365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20632164180278778, "rewards/accuracy_reward": 0.7096354365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20632164180278778, "sentence_full_gradient_variance/max_squared_error": 226724.65625, "sentence_full_gradient_variance/metric": 2473.8515625, "sentence_full_gradient_variance/p75": 79.74275970458984, "sentence_full_gradient_variance/p90": 87.48410034179688, "sentence_full_gradient_variance/p95": 87.48410034179688, "sentence_full_gradient_variance/p99": 97435.6484375, "state_level_variance/metric": 21.394271850585938, "state_level_variance_full_gradient/metric": 276.75115966796875, "step": 21 }, { "accuracy_reward": 0.7747396230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17474570870399475, "action_level_variance/metric": 194.9176483154297, "action_level_variance_full_gradient/metric": 4110.30224609375, "adam_stats/lr_effective_max": 8.13883962109685e-05, "adam_stats/lr_effective_mean": -2.459666359477808e-10, "adam_stats/lr_effective_min": -8.401979721384123e-05, "adam_stats/m_t_max": 0.005035398527979851, "adam_stats/m_t_mean": -2.9985181004832384e-11, "adam_stats/m_t_min": -0.0030513908714056015, "adam_stats/v_t_max": 2.3477754439227283e-05, "adam_stats/v_t_mean": 2.01176011613291e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.02869875729084015, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.1468827724456787, "all_logprobs": -0.058130133897066116, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.375, "all_logprobs/p1": -1.4609375, "all_logprobs/p10": -0.04345703125, "all_logprobs/p25": -3.170967102050781e-05, "all_logprobs/p5": -0.28125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.08636359870433807, "clip_ratio": 0.0, "completion_length": 602.5325927734375, "completion_length/correct": 533.3529663085938, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 504.0, "completion_length/correct/min": 86.0, "completion_length/correct/p25": 381.0, "completion_length/correct/p75": 665.0, "completion_length/correct/var": 43414.1328125, "completion_length/incorrect": 840.46240234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 221.0, "completion_length/incorrect/p25": 657.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 54991.12109375, "completion_length/max": 1024.0, "completion_length/median": 556.0, "completion_length/min": 86.0, "completion_length/p25": 407.75, "completion_length/p75": 793.0, "completion_length/var": 62435.03125, "epoch": 0.2816, "feature_vector_variance/max_squared_error": 131143.1875, "feature_vector_variance/metric": 30820.306640625, "generated_tokens/total": 10166152.0, "grad_norm": 0.1201900988817215, "grouped_std_rewards": 0.15987887978553772, "learning_rate": 1.4351590932319506e-05, "loss": -0.0287, "mean_logprobs": -0.058349609375, "mean_logprobs/var": 0.000873565673828125, "num_completions/total": 16896, "per_sentence_gradient_norm": 2.3478026390075684, "per_sentence_gradient_norm/max": 159.3299560546875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 81.52803802490234, "per_sentence_gradient_norm/var": 189.65243530273438, "per_token_feature_norm": 182.52804565429688, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 186.0, "per_token_feature_norm/min": 64.0, "per_token_feature_norm/p25": 161.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 1113.1978759765625, "per_token_full_gradient_variance/max_squared_error": 159.42901611328125, "per_token_full_gradient_variance/variance": 0.0300369281321764, "per_token_gradient_norm": 2.718871831893921, "per_token_gradient_norm/max": 4431.869140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3366.48779296875, "per_token_policy_error_norm": 0.032651908695697784, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.02934856154024601, "policy_entropy": 0.06434521079063416, "policy_entropy/max": 3.796875, "policy_entropy/median": 8.121132850646973e-07, "policy_entropy/min": 2.3071822230491534e-16, "policy_entropy/p25": 1.594889909029007e-08, "policy_entropy/p75": 0.0003795623779296875, "policy_entropy/var": 0.040442273020744324, "policy_error_vector_variance/max_squared_error": 2.0122382640838623, "policy_error_vector_variance/metric": 0.03263555467128754, "policy_loss": -0.02869875729084015, "policy_loss/max": 12.958683013916016, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.1468827724456787, "policy_sharpness": 8.603139877319336, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.148808479309082, "reward": 0.7747396230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17474570870399475, "rewards/accuracy_reward": 0.7747396230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17474570870399475, "sentence_full_gradient_variance/max_squared_error": 1459098.875, "sentence_full_gradient_variance/metric": 4635.4033203125, "sentence_full_gradient_variance/p75": 140.75216674804688, "sentence_full_gradient_variance/p90": 191.7021942138672, "sentence_full_gradient_variance/p95": 191.7021942138672, "sentence_full_gradient_variance/p99": 85042.90625, "state_level_variance/metric": 19.050975799560547, "state_level_variance_full_gradient/metric": 525.100341796875, "step": 22 }, { "accuracy_reward": 0.76171875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.181739941239357, "action_level_variance/metric": 123.83958435058594, "action_level_variance_full_gradient/metric": 3464.56640625, "adam_stats/lr_effective_max": 8.285562944365665e-05, "adam_stats/lr_effective_mean": -2.3654855851873435e-10, "adam_stats/lr_effective_min": -8.392452582484111e-05, "adam_stats/m_t_max": 0.004620359279215336, "adam_stats/m_t_mean": -2.6310923925687568e-11, "adam_stats/m_t_min": -0.0026917015202343464, "adam_stats/v_t_max": 2.3455060727428645e-05, "adam_stats/v_t_mean": 2.0104454125785542e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.09468890726566315, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.1582295894622803, "all_logprobs": -0.05507490038871765, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -12.1875, "all_logprobs/p1": -1.4140625, "all_logprobs/p10": -0.033935546875, "all_logprobs/p25": -1.3232231140136719e-05, "all_logprobs/p5": -0.251953125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.0823437049984932, "clip_ratio": 0.0, "completion_length": 647.71484375, "completion_length/correct": 574.4307861328125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 545.0, "completion_length/correct/min": 143.0, "completion_length/correct/p25": 390.0, "completion_length/correct/p75": 749.0, "completion_length/correct/var": 54225.9453125, "completion_length/incorrect": 881.9835815429688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 292.0, "completion_length/incorrect/p25": 725.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 40750.66015625, "completion_length/max": 1024.0, "completion_length/median": 619.0, "completion_length/min": 143.0, "completion_length/p25": 429.0, "completion_length/p75": 882.0, "completion_length/var": 68148.28125, "epoch": 0.2944, "feature_vector_variance/max_squared_error": 128935.0390625, "feature_vector_variance/metric": 29842.107421875, "generated_tokens/total": 10663597.0, "grad_norm": 0.0950595885515213, "grouped_std_rewards": 0.20336155593395233, "learning_rate": 1.4240955347243754e-05, "loss": -0.0947, "mean_logprobs": -0.056640625, "mean_logprobs/var": 0.000728607177734375, "num_completions/total": 17664, "per_sentence_gradient_norm": 2.2643518447875977, "per_sentence_gradient_norm/max": 101.33927154541016, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 13.411271095275879, "per_sentence_gradient_norm/p99": 65.42533874511719, "per_sentence_gradient_norm/var": 118.86705780029297, "per_token_feature_norm": 182.41265869140625, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 186.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 160.0, "per_token_feature_norm/p75": 205.0, "per_token_feature_norm/var": 1162.211181640625, "per_token_full_gradient_variance/max_squared_error": 67.15441131591797, "per_token_full_gradient_variance/variance": 0.024471908807754517, "per_token_gradient_norm": 2.689148187637329, "per_token_gradient_norm/max": 3715.09228515625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2478.06201171875, "per_token_policy_error_norm": 0.03089742548763752, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.027768809348344803, "policy_entropy": 0.06065451726317406, "policy_entropy/max": 3.84375, "policy_entropy/median": 5.066394805908203e-07, "policy_entropy/min": 5.177065373618284e-18, "policy_entropy/p25": 1.2689270079135895e-08, "policy_entropy/p75": 0.0001678466796875, "policy_entropy/var": 0.03942551836371422, "policy_error_vector_variance/max_squared_error": 2.0064611434936523, "policy_error_vector_variance/metric": 0.030879858881235123, "policy_loss": -0.09468890726566315, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.158229351043701, "policy_sharpness": 8.69601058959961, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 7.703694820404053, "reward": 0.76171875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.181739941239357, "rewards/accuracy_reward": 0.76171875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.181739941239357, "sentence_full_gradient_variance/max_squared_error": 955950.5625, "sentence_full_gradient_variance/metric": 3918.64404296875, "sentence_full_gradient_variance/p75": 41.347190856933594, "sentence_full_gradient_variance/p90": 140.12741088867188, "sentence_full_gradient_variance/p95": 140.12741088867188, "sentence_full_gradient_variance/p99": 82479.796875, "state_level_variance/metric": 10.461633682250977, "state_level_variance_full_gradient/metric": 454.07708740234375, "step": 23 }, { "accuracy_reward": 0.7838541865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1696476936340332, "action_level_variance/metric": 341.2176818847656, "action_level_variance_full_gradient/metric": 2152.75439453125, "adam_stats/lr_effective_max": 7.894250302342698e-05, "adam_stats/lr_effective_mean": -1.612284750152071e-10, "adam_stats/lr_effective_min": -7.999356603249907e-05, "adam_stats/m_t_max": 0.004146306775510311, "adam_stats/m_t_mean": -2.539855305239147e-11, "adam_stats/m_t_min": -0.0024032711517065763, "adam_stats/v_t_max": 2.3431621229974553e-05, "adam_stats/v_t_mean": 2.011019389208668e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.04162019118666649, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.037606954574585, "all_logprobs": -0.05434150993824005, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.6875, "all_logprobs/p1": -1.40625, "all_logprobs/p10": -0.033447265625, "all_logprobs/p25": -1.2993812561035156e-05, "all_logprobs/p5": -0.251953125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.08020903915166855, "clip_ratio": 0.0, "completion_length": 619.8776245117188, "completion_length/correct": 554.117919921875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 515.0, "completion_length/correct/min": 152.0, "completion_length/correct/p25": 383.25, "completion_length/correct/p75": 697.75, "completion_length/correct/var": 44353.1484375, "completion_length/incorrect": 858.3554077148438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 255.0, "completion_length/incorrect/p25": 722.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 55998.3515625, "completion_length/max": 1024.0, "completion_length/median": 573.0, "completion_length/min": 152.0, "completion_length/p25": 414.0, "completion_length/p75": 825.25, "completion_length/var": 62503.15625, "epoch": 0.3072, "feature_vector_variance/max_squared_error": 127023.1328125, "feature_vector_variance/metric": 30126.02734375, "generated_tokens/total": 11139663.0, "grad_norm": 0.09002792090177536, "grouped_std_rewards": 0.16000495851039886, "learning_rate": 1.4122106946441953e-05, "loss": -0.0416, "mean_logprobs": -0.0546875, "mean_logprobs/var": 0.0009613037109375, "num_completions/total": 18432, "per_sentence_gradient_norm": 2.7971978187561035, "per_sentence_gradient_norm/max": 301.3292236328125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 77.01371002197266, "per_sentence_gradient_norm/var": 333.8280029296875, "per_token_feature_norm": 184.17555236816406, "per_token_feature_norm/max": 312.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 70.0, "per_token_feature_norm/p25": 164.0, "per_token_feature_norm/p75": 206.0, "per_token_feature_norm/var": 1087.61572265625, "per_token_full_gradient_variance/max_squared_error": 1171.452880859375, "per_token_full_gradient_variance/variance": 0.05064942687749863, "per_token_gradient_norm": 3.79321551322937, "per_token_gradient_norm/max": 6457.59423828125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6136.78759765625, "per_token_policy_error_norm": 0.030565300956368446, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.02741033397614956, "policy_entropy": 0.06022173911333084, "policy_entropy/max": 3.8125, "policy_entropy/median": 3.9674341678619385e-07, "policy_entropy/min": 1.2685165418080402e-17, "policy_entropy/p25": 8.905772119760513e-09, "policy_entropy/p75": 0.00015926361083984375, "policy_entropy/var": 0.03866369277238846, "policy_error_vector_variance/max_squared_error": 2.0077152252197266, "policy_error_vector_variance/metric": 0.030547868460416794, "policy_loss": -0.04162019491195679, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.037606954574585, "policy_sharpness": 8.701085090637207, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 7.6884942054748535, "reward": 0.7838541865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1696476936340332, "rewards/accuracy_reward": 0.7838541865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1696476936340332, "sentence_full_gradient_variance/max_squared_error": 741033.375, "sentence_full_gradient_variance/metric": 2436.63330078125, "sentence_full_gradient_variance/p75": 8.197161674499512, "sentence_full_gradient_variance/p90": 171.9821014404297, "sentence_full_gradient_variance/p95": 171.9821014404297, "sentence_full_gradient_variance/p99": 46949.15625, "state_level_variance/metric": 35.19450378417969, "state_level_variance_full_gradient/metric": 283.8790283203125, "step": 24 }, { "accuracy_reward": 0.7369791865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19409358501434326, "action_level_variance/metric": 204.87100219726562, "action_level_variance_full_gradient/metric": 2883.0185546875, "adam_stats/lr_effective_max": 8.116290700854734e-05, "adam_stats/lr_effective_mean": 3.5579979934530215e-11, "adam_stats/lr_effective_min": -8.393145253648981e-05, "adam_stats/m_t_max": 0.004013963509351015, "adam_stats/m_t_mean": -1.995613743699387e-11, "adam_stats/m_t_min": -0.002539836335927248, "adam_stats/v_t_max": 2.341615800105501e-05, "adam_stats/v_t_mean": 2.0154461866789264e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.029222454875707626, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.5345723628997803, "all_logprobs": -0.058992065489292145, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -12.25, "all_logprobs/p1": -1.5, "all_logprobs/p10": -0.04638671875, "all_logprobs/p25": -2.753734588623047e-05, "all_logprobs/p5": -0.28125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.08736497908830643, "clip_ratio": 0.0, "completion_length": 594.6627807617188, "completion_length/correct": 522.8392333984375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 487.0, "completion_length/correct/min": 118.0, "completion_length/correct/p25": 360.25, "completion_length/correct/p75": 662.25, "completion_length/correct/var": 45078.546875, "completion_length/incorrect": 795.910888671875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 946.0, "completion_length/incorrect/min": 283.0, "completion_length/incorrect/p25": 542.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 66527.1640625, "completion_length/max": 1024.0, "completion_length/median": 537.0, "completion_length/min": 118.0, "completion_length/p25": 385.0, "completion_length/p75": 793.0, "completion_length/var": 65113.79296875, "epoch": 0.32, "feature_vector_variance/max_squared_error": 127897.625, "feature_vector_variance/metric": 30814.390625, "generated_tokens/total": 11596364.0, "grad_norm": 0.18209975957870483, "grouped_std_rewards": 0.19314977526664734, "learning_rate": 1.3995190528383292e-05, "loss": -0.0292, "mean_logprobs": -0.058837890625, "mean_logprobs/var": 0.000782012939453125, "num_completions/total": 19200, "per_sentence_gradient_norm": 2.6019697189331055, "per_sentence_gradient_norm/max": 203.5817108154297, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 15.255487442016602, "per_sentence_gradient_norm/p99": 62.515193939208984, "per_sentence_gradient_norm/var": 198.3590087890625, "per_token_feature_norm": 185.75038146972656, "per_token_feature_norm/max": 312.0, "per_token_feature_norm/median": 189.0, "per_token_feature_norm/min": 48.75, "per_token_feature_norm/p25": 165.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 1133.3934326171875, "per_token_full_gradient_variance/max_squared_error": 400.1419982910156, "per_token_full_gradient_variance/variance": 0.03767146170139313, "per_token_gradient_norm": 3.0917716026306152, "per_token_gradient_norm/max": 5967.7080078125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3898.267822265625, "per_token_policy_error_norm": 0.03320375084877014, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.02995077148079872, "policy_entropy": 0.06476139277219772, "policy_entropy/max": 3.59375, "policy_entropy/median": 4.4330954551696777e-07, "policy_entropy/min": 5.692061405548898e-18, "policy_entropy/p25": 9.313225746154785e-09, "policy_entropy/p75": 0.000324249267578125, "policy_entropy/var": 0.041394222527742386, "policy_error_vector_variance/max_squared_error": 2.008458375930786, "policy_error_vector_variance/metric": 0.033185333013534546, "policy_loss": -0.02922244742512703, "policy_loss/max": 19.79339599609375, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.5345723628997803, "policy_sharpness": 8.607564926147461, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.111315727233887, "reward": 0.7369791865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19409358501434326, "rewards/accuracy_reward": 0.7369791865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19409358501434326, "sentence_full_gradient_variance/max_squared_error": 882258.875, "sentence_full_gradient_variance/metric": 3236.47265625, "sentence_full_gradient_variance/p75": 79.54875946044922, "sentence_full_gradient_variance/p90": 195.15744018554688, "sentence_full_gradient_variance/p95": 195.15744018554688, "sentence_full_gradient_variance/p99": 55430.74609375, "state_level_variance/metric": 19.036930084228516, "state_level_variance_full_gradient/metric": 353.45404052734375, "step": 25 }, { "accuracy_reward": 0.7760416865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17402760684490204, "action_level_variance/metric": 270.4691467285156, "action_level_variance_full_gradient/metric": 5040.73828125, "adam_stats/lr_effective_max": 7.900271884864196e-05, "adam_stats/lr_effective_mean": 1.1109196895731088e-10, "adam_stats/lr_effective_min": -7.904339145170525e-05, "adam_stats/m_t_max": 0.0035473357420414686, "adam_stats/m_t_mean": -1.6436933411578813e-11, "adam_stats/m_t_min": -0.0023156071547418833, "adam_stats/v_t_max": 2.3393167793983594e-05, "adam_stats/v_t_mean": 2.0138745272096914e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.022298555821180344, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.496692419052124, "all_logprobs": -0.05115116015076637, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -12.125, "all_logprobs/p1": -1.3125, "all_logprobs/p10": -0.02978515625, "all_logprobs/p25": -1.1205673217773438e-05, "all_logprobs/p5": -0.2255859375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.07412664592266083, "clip_ratio": 0.0, "completion_length": 592.5208740234375, "completion_length/correct": 532.2265014648438, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 487.0, "completion_length/correct/min": 128.0, "completion_length/correct/p25": 380.75, "completion_length/correct/p75": 662.0, "completion_length/correct/var": 44341.7109375, "completion_length/incorrect": 801.4476928710938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 908.0, "completion_length/incorrect/min": 172.0, "completion_length/incorrect/p25": 586.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 63078.09375, "completion_length/max": 1024.0, "completion_length/median": 533.0, "completion_length/min": 128.0, "completion_length/p25": 399.25, "completion_length/p75": 779.25, "completion_length/var": 61074.6328125, "epoch": 0.3328, "feature_vector_variance/max_squared_error": 131695.609375, "feature_vector_variance/metric": 31241.58203125, "generated_tokens/total": 12051420.0, "grad_norm": 0.10115554928779602, "grouped_std_rewards": 0.14242695271968842, "learning_rate": 1.3860360721173195e-05, "loss": 0.0223, "mean_logprobs": -0.05322265625, "mean_logprobs/var": 0.000751495361328125, "num_completions/total": 19968, "per_sentence_gradient_norm": 2.172482967376709, "per_sentence_gradient_norm/max": 279.22320556640625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 43.42034912109375, "per_sentence_gradient_norm/var": 266.0959167480469, "per_token_feature_norm": 188.92681884765625, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 191.0, "per_token_feature_norm/min": 68.5, "per_token_feature_norm/p25": 172.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 978.9617919921875, "per_token_full_gradient_variance/max_squared_error": 229.86683654785156, "per_token_full_gradient_variance/variance": 0.03724852576851845, "per_token_gradient_norm": 2.618279218673706, "per_token_gradient_norm/max": 5888.22509765625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4109.66162109375, "per_token_policy_error_norm": 0.028984250500798225, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.026021648198366165, "policy_entropy": 0.05702501907944679, "policy_entropy/max": 3.6875, "policy_entropy/median": 2.3748725652694702e-07, "policy_entropy/min": 1.1058862159352145e-17, "policy_entropy/p25": 3.863533493131399e-09, "policy_entropy/p75": 0.00014209747314453125, "policy_entropy/var": 0.03529631718993187, "policy_error_vector_variance/max_squared_error": 2.0091559886932373, "policy_error_vector_variance/metric": 0.02896212972700596, "policy_loss": 0.022298557683825493, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.496692419052124, "policy_sharpness": 8.728800773620605, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 7.498842239379883, "reward": 0.7760416865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17402760684490204, "rewards/accuracy_reward": 0.7760416865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17402760684490204, "sentence_full_gradient_variance/max_squared_error": 2780579.0, "sentence_full_gradient_variance/metric": 5685.19482421875, "sentence_full_gradient_variance/p75": 94.4447250366211, "sentence_full_gradient_variance/p90": 450.0309143066406, "sentence_full_gradient_variance/p95": 450.0309143066406, "sentence_full_gradient_variance/p99": 70458.5625, "state_level_variance/metric": 29.395158767700195, "state_level_variance_full_gradient/metric": 644.4552001953125, "step": 26 }, { "accuracy_reward": 0.7981771230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1613004505634308, "action_level_variance/metric": 555.8073120117188, "action_level_variance_full_gradient/metric": 7073.71630859375, "adam_stats/lr_effective_max": 7.561212260043249e-05, "adam_stats/lr_effective_mean": 2.6074538361786637e-11, "adam_stats/lr_effective_min": -7.612861372763291e-05, "adam_stats/m_t_max": 0.003179060062393546, "adam_stats/m_t_mean": -1.7370046373477166e-11, "adam_stats/m_t_min": -0.002014618832617998, "adam_stats/v_t_max": 2.336979378014803e-05, "adam_stats/v_t_mean": 2.0126526313613002e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.015690624713897705, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.401768207550049, "all_logprobs": -0.044822175055742264, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.875, "all_logprobs/p1": -1.2265625, "all_logprobs/p10": -0.01611328125, "all_logprobs/p25": -3.6954879760742188e-06, "all_logprobs/p5": -0.162109375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.06612465530633926, "clip_ratio": 0.0, "completion_length": 601.1732177734375, "completion_length/correct": 544.7716064453125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 515.0, "completion_length/correct/min": 120.0, "completion_length/correct/p25": 380.0, "completion_length/correct/p75": 719.0, "completion_length/correct/var": 48177.90234375, "completion_length/incorrect": 824.2322387695312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 990.0, "completion_length/incorrect/min": 208.0, "completion_length/incorrect/p25": 640.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 66029.2109375, "completion_length/max": 1024.0, "completion_length/median": 557.0, "completion_length/min": 120.0, "completion_length/p25": 392.0, "completion_length/p75": 810.0, "completion_length/var": 64296.59375, "epoch": 0.3456, "feature_vector_variance/max_squared_error": 142667.46875, "feature_vector_variance/metric": 31284.986328125, "generated_tokens/total": 12513121.0, "grad_norm": 0.10853929817676544, "grouped_std_rewards": 0.17298850417137146, "learning_rate": 1.3717781794162813e-05, "loss": 0.0157, "mean_logprobs": -0.046630859375, "mean_logprobs/var": 0.000629425048828125, "num_completions/total": 20736, "per_sentence_gradient_norm": 2.5654685497283936, "per_sentence_gradient_norm/max": 591.6436767578125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 5.081022262573242, "per_sentence_gradient_norm/p99": 50.48749923706055, "per_sentence_gradient_norm/var": 549.9417114257812, "per_token_feature_norm": 192.26229858398438, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 194.0, "per_token_feature_norm/min": 71.5, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 211.0, "per_token_feature_norm/var": 874.5756225585938, "per_token_full_gradient_variance/max_squared_error": 495.43536376953125, "per_token_full_gradient_variance/variance": 0.03575017675757408, "per_token_gradient_norm": 2.40854811668396, "per_token_gradient_norm/max": 7265.1025390625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3504.3037109375, "per_token_policy_error_norm": 0.025446675717830658, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.023289885371923447, "policy_entropy": 0.04936100170016289, "policy_entropy/max": 3.796875, "policy_entropy/median": 1.3504177331924438e-07, "policy_entropy/min": 7.589415207398531e-18, "policy_entropy/p25": 2.240994945168495e-09, "policy_entropy/p75": 5.14984130859375e-05, "policy_entropy/var": 0.029972316697239876, "policy_error_vector_variance/max_squared_error": 2.007141351699829, "policy_error_vector_variance/metric": 0.02543007582426071, "policy_loss": 0.015690622851252556, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.401768207550049, "policy_sharpness": 8.86609172821045, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.775149345397949, "reward": 0.7981771230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1613004505634308, "rewards/accuracy_reward": 0.7981771230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1613004505634308, "sentence_full_gradient_variance/max_squared_error": 2632134.0, "sentence_full_gradient_variance/metric": 8022.77880859375, "sentence_full_gradient_variance/p75": 122.93804168701172, "sentence_full_gradient_variance/p90": 173.72796630859375, "sentence_full_gradient_variance/p95": 173.72796630859375, "sentence_full_gradient_variance/p99": 57893.046875, "state_level_variance/metric": 63.556331634521484, "state_level_variance_full_gradient/metric": 949.0626220703125, "step": 27 }, { "accuracy_reward": 0.7565104365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18444257974624634, "action_level_variance/metric": 117.63603973388672, "action_level_variance_full_gradient/metric": 2635.786865234375, "adam_stats/lr_effective_max": 7.553917384939268e-05, "adam_stats/lr_effective_mean": -9.018290454687605e-12, "adam_stats/lr_effective_min": -7.650044426554814e-05, "adam_stats/m_t_max": 0.002763497643172741, "adam_stats/m_t_mean": -1.5325015562117628e-11, "adam_stats/m_t_min": -0.0017574622761458158, "adam_stats/v_t_max": 2.334737655473873e-05, "adam_stats/v_t_mean": 2.011067310944692e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.04134686663746834, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.561608076095581, "all_logprobs": -0.04399473965167999, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.875, "all_logprobs/p1": -1.1796875, "all_logprobs/p10": -0.01416015625, "all_logprobs/p25": -2.86102294921875e-06, "all_logprobs/p5": -0.16015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.06465312838554382, "clip_ratio": 0.0, "completion_length": 613.6614990234375, "completion_length/correct": 547.208251953125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 505.0, "completion_length/correct/min": 128.0, "completion_length/correct/p25": 369.0, "completion_length/correct/p75": 724.0, "completion_length/correct/var": 54187.78125, "completion_length/incorrect": 820.1283569335938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 999.0, "completion_length/incorrect/min": 214.0, "completion_length/incorrect/p25": 588.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 60067.28515625, "completion_length/max": 1024.0, "completion_length/median": 574.0, "completion_length/min": 128.0, "completion_length/p25": 393.75, "completion_length/p75": 831.5, "completion_length/var": 69281.2109375, "epoch": 0.3584, "feature_vector_variance/max_squared_error": 135980.078125, "feature_vector_variance/metric": 31154.90234375, "generated_tokens/total": 12984413.0, "grad_norm": 0.06621233373880386, "grouped_std_rewards": 0.17692431807518005, "learning_rate": 1.3567627457812107e-05, "loss": -0.0413, "mean_logprobs": -0.0458984375, "mean_logprobs/var": 0.000614166259765625, "num_completions/total": 21504, "per_sentence_gradient_norm": 1.9339284896850586, "per_sentence_gradient_norm/max": 118.72734832763672, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 52.6716194152832, "per_sentence_gradient_norm/var": 114.04447174072266, "per_token_feature_norm": 192.3899688720703, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 69.0, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 827.9651489257812, "per_token_full_gradient_variance/max_squared_error": 127.5349349975586, "per_token_full_gradient_variance/variance": 0.028987370431423187, "per_token_gradient_norm": 2.281130313873291, "per_token_gradient_norm/max": 7002.99462890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2872.919677734375, "per_token_policy_error_norm": 0.024962956085801125, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.022740643471479416, "policy_entropy": 0.04850926995277405, "policy_entropy/max": 3.59375, "policy_entropy/median": 1.0291114449501038e-07, "policy_entropy/min": 4.30970363562988e-18, "policy_entropy/p25": 1.7971615307033062e-09, "policy_entropy/p75": 4.100799560546875e-05, "policy_entropy/var": 0.02990233339369297, "policy_error_vector_variance/max_squared_error": 2.0092551708221436, "policy_error_vector_variance/metric": 0.02494991570711136, "policy_loss": -0.04134687781333923, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -19.79339599609375, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.56160831451416, "policy_sharpness": 8.893101692199707, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.62767219543457, "reward": 0.7565104365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18444257974624634, "rewards/accuracy_reward": 0.7565104365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18444257974624634, "sentence_full_gradient_variance/max_squared_error": 457010.5, "sentence_full_gradient_variance/metric": 2969.99951171875, "sentence_full_gradient_variance/p75": 106.25582122802734, "sentence_full_gradient_variance/p90": 113.02397155761719, "sentence_full_gradient_variance/p95": 113.02397155761719, "sentence_full_gradient_variance/p99": 76626.40625, "state_level_variance/metric": 11.079840660095215, "state_level_variance_full_gradient/metric": 334.21295166015625, "step": 28 }, { "accuracy_reward": 0.7734375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17546038329601288, "action_level_variance/metric": 469.310546875, "action_level_variance_full_gradient/metric": 4596.3876953125, "adam_stats/lr_effective_max": 7.609654130646959e-05, "adam_stats/lr_effective_mean": -1.823355268854021e-12, "adam_stats/lr_effective_min": -7.722468581050634e-05, "adam_stats/m_t_max": 0.0029143940191715956, "adam_stats/m_t_mean": -1.1430297680581347e-11, "adam_stats/m_t_min": -0.002066945657134056, "adam_stats/v_t_max": 2.3342283384408802e-05, "adam_stats/v_t_mean": 2.0134375937341797e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.13348805904388428, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.851332664489746, "all_logprobs": -0.043122392147779465, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.8125, "all_logprobs/p1": -1.140625, "all_logprobs/p10": -0.01416015625, "all_logprobs/p25": -3.2186508178710938e-06, "all_logprobs/p5": -0.16015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.06372671574354172, "clip_ratio": 0.0, "completion_length": 580.4583740234375, "completion_length/correct": 508.1615905761719, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 445.0, "completion_length/correct/min": 129.0, "completion_length/correct/p25": 342.25, "completion_length/correct/p75": 632.0, "completion_length/correct/var": 45723.6171875, "completion_length/incorrect": 827.2643432617188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 957.0, "completion_length/incorrect/min": 189.0, "completion_length/incorrect/p25": 630.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 58073.23828125, "completion_length/max": 1024.0, "completion_length/median": 517.0, "completion_length/min": 129.0, "completion_length/p25": 367.75, "completion_length/p75": 788.5, "completion_length/var": 66316.0390625, "epoch": 0.3712, "feature_vector_variance/max_squared_error": 137696.546875, "feature_vector_variance/metric": 31474.318359375, "generated_tokens/total": 13430205.0, "grad_norm": 0.18523186445236206, "grouped_std_rewards": 0.16338279843330383, "learning_rate": 1.3410080652050414e-05, "loss": 0.1335, "mean_logprobs": -0.044189453125, "mean_logprobs/var": 0.00060272216796875, "num_completions/total": 22272, "per_sentence_gradient_norm": 3.047481060028076, "per_sentence_gradient_norm/max": 369.3745422363281, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 83.4144058227539, "per_sentence_gradient_norm/var": 460.6231994628906, "per_token_feature_norm": 193.59825134277344, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 194.0, "per_token_feature_norm/min": 69.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 739.39208984375, "per_token_full_gradient_variance/max_squared_error": 392.7717590332031, "per_token_full_gradient_variance/variance": 0.06582201272249222, "per_token_gradient_norm": 3.691267490386963, "per_token_gradient_norm/max": 6241.56787109375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6975.90771484375, "per_token_policy_error_norm": 0.0244819987565279, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.022165367379784584, "policy_entropy": 0.04761054366827011, "policy_entropy/max": 3.6875, "policy_entropy/median": 9.778887033462524e-08, "policy_entropy/min": 1.0842021724855044e-17, "policy_entropy/p25": 1.6443664208054543e-09, "policy_entropy/p75": 4.482269287109375e-05, "policy_entropy/var": 0.02855951152741909, "policy_error_vector_variance/max_squared_error": 2.0082175731658936, "policy_error_vector_variance/metric": 0.02446558326482773, "policy_loss": 0.13348805904388428, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.851332426071167, "policy_sharpness": 8.896344184875488, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.59547758102417, "reward": 0.7734375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17546038329601288, "rewards/accuracy_reward": 0.7734375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17546038329601288, "sentence_full_gradient_variance/max_squared_error": 1682161.5, "sentence_full_gradient_variance/metric": 5216.984375, "sentence_full_gradient_variance/p75": 82.94164276123047, "sentence_full_gradient_variance/p90": 126.44129180908203, "sentence_full_gradient_variance/p95": 126.44129180908203, "sentence_full_gradient_variance/p99": 126408.734375, "state_level_variance/metric": 49.89643859863281, "state_level_variance_full_gradient/metric": 620.59716796875, "step": 29 }, { "accuracy_reward": 0.78125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17112122476100922, "action_level_variance/metric": 49.70798873901367, "action_level_variance_full_gradient/metric": 2687.92431640625, "adam_stats/lr_effective_max": 6.900432344991714e-05, "adam_stats/lr_effective_mean": 3.8565199644846615e-11, "adam_stats/lr_effective_min": -7.173093763412908e-05, "adam_stats/m_t_max": 0.0025207207072526217, "adam_stats/m_t_mean": -1.1621321960308961e-11, "adam_stats/m_t_min": -0.0018182893982157111, "adam_stats/v_t_max": 2.331998803128954e-05, "adam_stats/v_t_mean": 2.012333225401286e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.055770885199308395, "advantages/max": 5.795430660247803, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.0026134252548218, "all_logprobs": -0.04506939649581909, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -12.4375, "all_logprobs/p1": -1.2265625, "all_logprobs/p10": -0.0159912109375, "all_logprobs/p25": -3.6954879760742188e-06, "all_logprobs/p5": -0.1630859375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.06637486815452576, "clip_ratio": 0.0, "completion_length": 588.1185302734375, "completion_length/correct": 532.4283447265625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 492.0, "completion_length/correct/min": 164.0, "completion_length/correct/p25": 384.0, "completion_length/correct/p75": 631.0, "completion_length/correct/var": 40316.015625, "completion_length/incorrect": 787.0119018554688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 858.0, "completion_length/incorrect/min": 166.0, "completion_length/incorrect/p25": 581.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 66057.2109375, "completion_length/max": 1024.0, "completion_length/median": 529.0, "completion_length/min": 164.0, "completion_length/p25": 400.75, "completion_length/p75": 743.75, "completion_length/var": 56958.97265625, "epoch": 0.384, "feature_vector_variance/max_squared_error": 141495.046875, "feature_vector_variance/metric": 31458.509765625, "generated_tokens/total": 13881880.0, "grad_norm": 0.06205524504184723, "grouped_std_rewards": 0.15071754157543182, "learning_rate": 1.3245333323392335e-05, "loss": -0.0558, "mean_logprobs": -0.045654296875, "mean_logprobs/var": 0.00060272216796875, "num_completions/total": 23040, "per_sentence_gradient_norm": 1.1897928714752197, "per_sentence_gradient_norm/max": 91.12174987792969, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 30.349485397338867, "per_sentence_gradient_norm/var": 48.3553466796875, "per_token_feature_norm": 192.4054718017578, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 69.5, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 739.32568359375, "per_token_full_gradient_variance/max_squared_error": 48.9753303527832, "per_token_full_gradient_variance/variance": 0.00989789329469204, "per_token_gradient_norm": 1.203742265701294, "per_token_gradient_norm/max": 2958.52783203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 910.8646240234375, "per_token_policy_error_norm": 0.02563052624464035, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.023374132812023163, "policy_entropy": 0.04957231506705284, "policy_entropy/max": 3.8125, "policy_entropy/median": 1.1874362826347351e-07, "policy_entropy/min": 3.40168431617327e-18, "policy_entropy/p25": 1.9936123862862587e-09, "policy_entropy/p75": 5.054473876953125e-05, "policy_entropy/var": 0.030649440363049507, "policy_error_vector_variance/max_squared_error": 2.0076005458831787, "policy_error_vector_variance/metric": 0.025608334690332413, "policy_loss": -0.055770885199308395, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -5.795430660247803, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.0026134252548218, "policy_sharpness": 8.873383522033691, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.750101089477539, "reward": 0.78125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17112122476100922, "rewards/accuracy_reward": 0.78125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17112122476100922, "sentence_full_gradient_variance/max_squared_error": 633661.0625, "sentence_full_gradient_variance/metric": 3041.95654296875, "sentence_full_gradient_variance/p75": 56.66798400878906, "sentence_full_gradient_variance/p90": 73.9383544921875, "sentence_full_gradient_variance/p95": 73.9383544921875, "sentence_full_gradient_variance/p99": 59636.88671875, "state_level_variance/metric": 4.848395824432373, "state_level_variance_full_gradient/metric": 354.03216552734375, "step": 30 }, { "accuracy_reward": 0.8111979365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15335553884506226, "action_level_variance/metric": 99.26239013671875, "action_level_variance_full_gradient/metric": 3898.35107421875, "adam_stats/lr_effective_max": 6.805868906667456e-05, "adam_stats/lr_effective_mean": -2.0026478305540385e-12, "adam_stats/lr_effective_min": -6.859746645204723e-05, "adam_stats/m_t_max": 0.0018169882241636515, "adam_stats/m_t_mean": -1.2337294380548869e-11, "adam_stats/m_t_min": -0.0011603861348703504, "adam_stats/v_t_max": 2.331706855329685e-05, "adam_stats/v_t_mean": 2.015013806852539e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.06911969184875488, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.1373002529144287, "all_logprobs": -0.044115375727415085, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.625, "all_logprobs/p1": -1.2265625, "all_logprobs/p10": -0.01416015625, "all_logprobs/p25": -3.6954879760742188e-06, "all_logprobs/p5": -0.16015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.06576365232467651, "clip_ratio": 0.0, "completion_length": 598.9779052734375, "completion_length/correct": 548.0465698242188, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 502.0, "completion_length/correct/min": 135.0, "completion_length/correct/p25": 404.0, "completion_length/correct/p75": 680.5, "completion_length/correct/var": 40307.75390625, "completion_length/incorrect": 817.806884765625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 937.0, "completion_length/incorrect/min": 242.0, "completion_length/incorrect/p25": 653.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 64257.08984375, "completion_length/max": 1024.0, "completion_length/median": 545.0, "completion_length/min": 135.0, "completion_length/p25": 416.75, "completion_length/p75": 767.25, "completion_length/var": 55911.34765625, "epoch": 0.3968, "feature_vector_variance/max_squared_error": 145570.546875, "feature_vector_variance/metric": 31794.189453125, "generated_tokens/total": 14341895.0, "grad_norm": 0.13012337684631348, "grouped_std_rewards": 0.15987887978553772, "learning_rate": 1.3073586191080456e-05, "loss": -0.0691, "mean_logprobs": -0.04443359375, "mean_logprobs/var": 0.000659942626953125, "num_completions/total": 23808, "per_sentence_gradient_norm": 1.7537882328033447, "per_sentence_gradient_norm/max": 118.08425903320312, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 51.23747253417969, "per_sentence_gradient_norm/var": 96.31201934814453, "per_token_feature_norm": 193.45054626464844, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 68.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 658.3953857421875, "per_token_full_gradient_variance/max_squared_error": 164.88426208496094, "per_token_full_gradient_variance/variance": 0.024971265345811844, "per_token_gradient_norm": 1.9851999282836914, "per_token_gradient_norm/max": 4235.05908203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2267.970947265625, "per_token_policy_error_norm": 0.02512810379266739, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.023062683641910553, "policy_entropy": 0.048032213002443314, "policy_entropy/max": 3.640625, "policy_entropy/median": 1.019798219203949e-07, "policy_entropy/min": 6.261267546103788e-18, "policy_entropy/p25": 1.5061232261359692e-09, "policy_entropy/p75": 5.030632019042969e-05, "policy_entropy/var": 0.029990723356604576, "policy_error_vector_variance/max_squared_error": 2.0048983097076416, "policy_error_vector_variance/metric": 0.0251107607036829, "policy_loss": -0.06911969184875488, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.1373002529144287, "policy_sharpness": 8.895594596862793, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.597801208496094, "reward": 0.8111979365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15335553884506226, "rewards/accuracy_reward": 0.8111979365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15335553884506226, "sentence_full_gradient_variance/max_squared_error": 1468600.125, "sentence_full_gradient_variance/metric": 4390.10693359375, "sentence_full_gradient_variance/p75": 132.0411834716797, "sentence_full_gradient_variance/p90": 195.68121337890625, "sentence_full_gradient_variance/p95": 195.68121337890625, "sentence_full_gradient_variance/p99": 79290.375, "state_level_variance/metric": 9.430258750915527, "state_level_variance_full_gradient/metric": 491.7555847167969, "step": 31 }, { "accuracy_reward": 0.8020833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15895263850688934, "action_level_variance/metric": 152.47537231445312, "action_level_variance_full_gradient/metric": 2808.545654296875, "adam_stats/lr_effective_max": 6.73000467941165e-05, "adam_stats/lr_effective_mean": 6.944617103599171e-11, "adam_stats/lr_effective_min": -6.778166425647214e-05, "adam_stats/m_t_max": 0.0012538195587694645, "adam_stats/m_t_mean": -6.157838995657361e-12, "adam_stats/m_t_min": -0.0006997081800363958, "adam_stats/v_t_max": 2.330830284336116e-05, "adam_stats/v_t_mean": 2.014400365263347e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.04998057708144188, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.389108419418335, "all_logprobs": -0.041585683822631836, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.0625, "all_logprobs/p1": -1.140625, "all_logprobs/p10": -0.01104736328125, "all_logprobs/p25": -1.9073486328125e-06, "all_logprobs/p5": -0.1328125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.062216099351644516, "clip_ratio": 0.0, "completion_length": 570.7005615234375, "completion_length/correct": 503.6396179199219, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 472.0, "completion_length/correct/min": 139.0, "completion_length/correct/p25": 359.0, "completion_length/correct/p75": 606.0, "completion_length/correct/var": 39561.86328125, "completion_length/incorrect": 842.4736938476562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 294.0, "completion_length/incorrect/p25": 610.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 51967.4140625, "completion_length/max": 1024.0, "completion_length/median": 517.0, "completion_length/min": 139.0, "completion_length/p25": 385.0, "completion_length/p75": 718.5, "completion_length/var": 60201.6953125, "epoch": 0.4096, "feature_vector_variance/max_squared_error": 141096.484375, "feature_vector_variance/metric": 31250.419921875, "generated_tokens/total": 14780193.0, "grad_norm": 0.1067969799041748, "grouped_std_rewards": 0.1306823492050171, "learning_rate": 1.2895048502539883e-05, "loss": 0.05, "mean_logprobs": -0.041259765625, "mean_logprobs/var": 0.000621795654296875, "num_completions/total": 24576, "per_sentence_gradient_norm": 1.8002322912216187, "per_sentence_gradient_norm/max": 168.8680419921875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 57.338436126708984, "per_sentence_gradient_norm/var": 149.42909240722656, "per_token_feature_norm": 192.91500854492188, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 669.7153930664062, "per_token_full_gradient_variance/max_squared_error": 145.47047424316406, "per_token_full_gradient_variance/variance": 0.033506400883197784, "per_token_gradient_norm": 2.4671711921691895, "per_token_gradient_norm/max": 5216.64111328125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3588.93701171875, "per_token_policy_error_norm": 0.02358189783990383, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.021748164668679237, "policy_entropy": 0.045591916888952255, "policy_entropy/max": 3.65625, "policy_entropy/median": 7.962808012962341e-08, "policy_entropy/min": 9.75781955236954e-18, "policy_entropy/p25": 1.2878444977104664e-09, "policy_entropy/p75": 2.8252601623535156e-05, "policy_entropy/var": 0.028726518154144287, "policy_error_vector_variance/max_squared_error": 2.0067648887634277, "policy_error_vector_variance/metric": 0.023563051596283913, "policy_loss": 0.04998057335615158, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.389108180999756, "policy_sharpness": 8.958560943603516, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.313472270965576, "reward": 0.8020833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15895263850688934, "rewards/accuracy_reward": 0.8020833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15895263850688934, "sentence_full_gradient_variance/max_squared_error": 837229.125, "sentence_full_gradient_variance/metric": 3178.14306640625, "sentence_full_gradient_variance/p75": 85.9072036743164, "sentence_full_gradient_variance/p90": 104.34104919433594, "sentence_full_gradient_variance/p95": 104.34104919433594, "sentence_full_gradient_variance/p99": 62150.85546875, "state_level_variance/metric": 15.985095024108887, "state_level_variance_full_gradient/metric": 369.5975036621094, "step": 32 }, { "accuracy_reward": 0.8333333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.13906997442245483, "action_level_variance/metric": 148.2180938720703, "action_level_variance_full_gradient/metric": 4416.1015625, "adam_stats/lr_effective_max": 7.12929613655433e-05, "adam_stats/lr_effective_mean": 4.40653555311421e-11, "adam_stats/lr_effective_min": -6.970299000386149e-05, "adam_stats/m_t_max": 0.0007588520529679954, "adam_stats/m_t_mean": -1.0656364679562103e-11, "adam_stats/m_t_min": -0.0008852159953676164, "adam_stats/v_t_max": 2.3298742235056125e-05, "adam_stats/v_t_mean": 2.0169371815065285e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.02748286724090576, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.756375312805176, "all_logprobs": -0.04105871170759201, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.875, "all_logprobs/p1": -1.140625, "all_logprobs/p10": -0.01104736328125, "all_logprobs/p25": -2.86102294921875e-06, "all_logprobs/p5": -0.1337890625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.06015074998140335, "clip_ratio": 0.0, "completion_length": 566.3828125, "completion_length/correct": 522.1828002929688, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 505.0, "completion_length/correct/min": 138.0, "completion_length/correct/p25": 367.5, "completion_length/correct/p75": 659.25, "completion_length/correct/var": 38014.2890625, "completion_length/incorrect": 787.3828125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 789.0, "completion_length/incorrect/min": 273.0, "completion_length/incorrect/p25": 606.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 48041.94140625, "completion_length/max": 1024.0, "completion_length/median": 534.0, "completion_length/min": 138.0, "completion_length/p25": 393.0, "completion_length/p75": 707.25, "completion_length/var": 49406.05078125, "epoch": 0.4224, "feature_vector_variance/max_squared_error": 134339.671875, "feature_vector_variance/metric": 31435.201171875, "generated_tokens/total": 15215175.0, "grad_norm": 0.1803266406059265, "grouped_std_rewards": 0.15698234736919403, "learning_rate": 1.270993777844248e-05, "loss": 0.0275, "mean_logprobs": -0.042724609375, "mean_logprobs/var": 0.00054168701171875, "num_completions/total": 25344, "per_sentence_gradient_norm": 1.7975658178329468, "per_sentence_gradient_norm/max": 182.15664672851562, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 43.187564849853516, "per_sentence_gradient_norm/var": 145.17587280273438, "per_token_feature_norm": 191.92263793945312, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 73.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 205.0, "per_token_feature_norm/var": 580.05419921875, "per_token_full_gradient_variance/max_squared_error": 914.8514404296875, "per_token_full_gradient_variance/variance": 0.03573988005518913, "per_token_gradient_norm": 2.118265390396118, "per_token_gradient_norm/max": 6420.4814453125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3470.263427734375, "per_token_policy_error_norm": 0.02335253730416298, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.02141089364886284, "policy_entropy": 0.04530317336320877, "policy_entropy/max": 3.640625, "policy_entropy/median": 9.499490261077881e-08, "policy_entropy/min": 4.933119884809045e-18, "policy_entropy/p25": 1.3969838619232178e-09, "policy_entropy/p75": 4.124641418457031e-05, "policy_entropy/var": 0.0272214412689209, "policy_error_vector_variance/max_squared_error": 2.0061142444610596, "policy_error_vector_variance/metric": 0.023336902260780334, "policy_loss": 0.02748287096619606, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.756375551223755, "policy_sharpness": 8.935091018676758, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.380286693572998, "reward": 0.8333333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.13906997442245483, "rewards/accuracy_reward": 0.8333333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.13906997442245483, "sentence_full_gradient_variance/max_squared_error": 2261826.5, "sentence_full_gradient_variance/metric": 5012.10546875, "sentence_full_gradient_variance/p75": 46.460548400878906, "sentence_full_gradient_variance/p90": 102.88133239746094, "sentence_full_gradient_variance/p95": 102.88133239746094, "sentence_full_gradient_variance/p99": 64034.61328125, "state_level_variance/metric": 15.457030296325684, "state_level_variance_full_gradient/metric": 596.002685546875, "step": 33 }, { "accuracy_reward": 0.8151041865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15090587735176086, "action_level_variance/metric": 204.6455535888672, "action_level_variance_full_gradient/metric": 2840.15576171875, "adam_stats/lr_effective_max": 6.984705396462232e-05, "adam_stats/lr_effective_mean": 1.1323550430653029e-10, "adam_stats/lr_effective_min": -7.190587348304689e-05, "adam_stats/m_t_max": 0.0007719108834862709, "adam_stats/m_t_mean": -7.019167365396317e-12, "adam_stats/m_t_min": -0.0007279603159986436, "adam_stats/v_t_max": 2.327625588804949e-05, "adam_stats/v_t_mean": 2.019279058199097e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.08400075882673264, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.3309719562530518, "all_logprobs": -0.04326211288571358, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.0, "all_logprobs/p1": -1.171875, "all_logprobs/p10": -0.01373291015625, "all_logprobs/p25": -3.4570693969726562e-06, "all_logprobs/p5": -0.16015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.06396450847387314, "clip_ratio": 0.0, "completion_length": 554.6497802734375, "completion_length/correct": 484.54791259765625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 441.0, "completion_length/correct/min": 166.0, "completion_length/correct/p25": 337.0, "completion_length/correct/p75": 601.5, "completion_length/correct/var": 37973.85546875, "completion_length/incorrect": 863.6901245117188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 320.0, "completion_length/incorrect/p25": 651.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 52529.06640625, "completion_length/max": 1024.0, "completion_length/median": 496.0, "completion_length/min": 166.0, "completion_length/p25": 361.5, "completion_length/p75": 716.0, "completion_length/var": 62292.609375, "epoch": 0.4352, "feature_vector_variance/max_squared_error": 150137.6875, "feature_vector_variance/metric": 31556.392578125, "generated_tokens/total": 15641146.0, "grad_norm": 0.15316176414489746, "grouped_std_rewards": 0.1468161940574646, "learning_rate": 1.2518479547691437e-05, "loss": -0.084, "mean_logprobs": -0.04296875, "mean_logprobs/var": 0.000507354736328125, "num_completions/total": 26112, "per_sentence_gradient_norm": 2.0914244651794434, "per_sentence_gradient_norm/max": 221.73414611816406, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 61.837493896484375, "per_sentence_gradient_norm/var": 200.5325927734375, "per_token_feature_norm": 192.3076934814453, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 75.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 206.0, "per_token_feature_norm/var": 566.3348388671875, "per_token_full_gradient_variance/max_squared_error": 556.6282958984375, "per_token_full_gradient_variance/variance": 0.04294357821345329, "per_token_gradient_norm": 2.9033045768737793, "per_token_gradient_norm/max": 6678.4140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4620.10400390625, "per_token_policy_error_norm": 0.024560876190662384, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.022407054901123047, "policy_entropy": 0.047587499022483826, "policy_entropy/max": 3.703125, "policy_entropy/median": 9.778887033462524e-08, "policy_entropy/min": 7.101524229780054e-18, "policy_entropy/p25": 1.4115357771515846e-09, "policy_entropy/p75": 4.863739013671875e-05, "policy_entropy/var": 0.02954692766070366, "policy_error_vector_variance/max_squared_error": 2.0037682056427, "policy_error_vector_variance/metric": 0.02454371564090252, "policy_loss": -0.08400075882673264, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.330972194671631, "policy_sharpness": 8.905707359313965, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.5573811531066895, "reward": 0.8151041865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15090587735176086, "rewards/accuracy_reward": 0.8151041865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15090587735176086, "sentence_full_gradient_variance/max_squared_error": 752987.3125, "sentence_full_gradient_variance/metric": 3176.317138671875, "sentence_full_gradient_variance/p75": 105.46109771728516, "sentence_full_gradient_variance/p90": 136.4373779296875, "sentence_full_gradient_variance/p95": 136.4373779296875, "sentence_full_gradient_variance/p99": 75094.0, "state_level_variance/metric": 21.42986297607422, "state_level_variance_full_gradient/metric": 336.1618347167969, "step": 34 }, { "accuracy_reward": 0.84375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.13200783729553223, "action_level_variance/metric": 116.86759948730469, "action_level_variance_full_gradient/metric": 2770.581787109375, "adam_stats/lr_effective_max": 6.627934635616839e-05, "adam_stats/lr_effective_mean": -4.457101007715458e-11, "adam_stats/lr_effective_min": -7.062542135827243e-05, "adam_stats/m_t_max": 0.0007901767385192215, "adam_stats/m_t_mean": 1.3467661231017503e-12, "adam_stats/m_t_min": -0.0009795126970857382, "adam_stats/v_t_max": 2.3253451217897236e-05, "adam_stats/v_t_mean": 2.0198183403596914e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.11119665205478668, "advantages/max": 5.795430660247803, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.5216469764709473, "all_logprobs": -0.04029601067304611, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.875, "all_logprobs/p1": -1.140625, "all_logprobs/p10": -0.01031494140625, "all_logprobs/p25": -2.2649765014648438e-06, "all_logprobs/p5": -0.1279296875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.057659488171339035, "clip_ratio": 0.0, "completion_length": 545.1315307617188, "completion_length/correct": 486.0848693847656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 435.0, "completion_length/correct/min": 159.0, "completion_length/correct/p25": 336.75, "completion_length/correct/p75": 594.0, "completion_length/correct/var": 39873.0703125, "completion_length/incorrect": 863.9833984375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 977.0, "completion_length/incorrect/min": 257.0, "completion_length/incorrect/p25": 730.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 42416.5390625, "completion_length/max": 1024.0, "completion_length/median": 472.0, "completion_length/min": 159.0, "completion_length/p25": 357.0, "completion_length/p75": 703.25, "completion_length/var": 59067.37890625, "epoch": 0.448, "feature_vector_variance/max_squared_error": 142327.328125, "feature_vector_variance/metric": 31442.595703125, "generated_tokens/total": 16059807.0, "grad_norm": 0.14281632006168365, "grouped_std_rewards": 0.1122685968875885, "learning_rate": 1.2320907072649045e-05, "loss": 0.1112, "mean_logprobs": -0.040283203125, "mean_logprobs/var": 0.000507354736328125, "num_completions/total": 26880, "per_sentence_gradient_norm": 1.478590726852417, "per_sentence_gradient_norm/max": 172.93504333496094, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 53.806583404541016, "per_sentence_gradient_norm/var": 114.83088684082031, "per_token_feature_norm": 194.00802612304688, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 71.5, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 605.685791015625, "per_token_full_gradient_variance/max_squared_error": 189.35369873046875, "per_token_full_gradient_variance/variance": 0.03254447132349014, "per_token_gradient_norm": 2.013408899307251, "per_token_gradient_norm/max": 5332.4638671875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3140.93408203125, "per_token_policy_error_norm": 0.023119907826185226, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.021138358861207962, "policy_entropy": 0.044540341943502426, "policy_entropy/max": 3.703125, "policy_entropy/median": 6.845220923423767e-08, "policy_entropy/min": 7.860465750519907e-18, "policy_entropy/p25": 1.0695657692849636e-09, "policy_entropy/p75": 3.170967102050781e-05, "policy_entropy/var": 0.02664046175777912, "policy_error_vector_variance/max_squared_error": 2.0051167011260986, "policy_error_vector_variance/metric": 0.02310647815465927, "policy_loss": 0.11119665205478668, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -5.795430660247803, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.5216472148895264, "policy_sharpness": 8.955942153930664, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.274688720703125, "reward": 0.84375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.13200783729553223, "rewards/accuracy_reward": 0.84375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.13200783729553223, "sentence_full_gradient_variance/max_squared_error": 704321.125, "sentence_full_gradient_variance/metric": 3147.349609375, "sentence_full_gradient_variance/p75": 50.21564483642578, "sentence_full_gradient_variance/p90": 71.72528076171875, "sentence_full_gradient_variance/p95": 71.72528076171875, "sentence_full_gradient_variance/p99": 41718.54296875, "state_level_variance/metric": 12.552979469299316, "state_level_variance_full_gradient/metric": 376.76776123046875, "step": 35 }, { "accuracy_reward": 0.7213541865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2012643963098526, "action_level_variance/metric": 315.62054443359375, "action_level_variance_full_gradient/metric": 6395.0693359375, "adam_stats/lr_effective_max": 6.887973722768947e-05, "adam_stats/lr_effective_mean": -9.277622420311715e-11, "adam_stats/lr_effective_min": -6.706400017719716e-05, "adam_stats/m_t_max": 0.0009189795819111168, "adam_stats/m_t_mean": -8.148352652337376e-12, "adam_stats/m_t_min": -0.0005635552806779742, "adam_stats/v_t_max": 2.3242837414727546e-05, "adam_stats/v_t_mean": 2.0226101609538416e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.0008342564105987549, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.607116937637329, "all_logprobs": -0.04126504808664322, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.0, "all_logprobs/p1": -1.140625, "all_logprobs/p10": -0.0101318359375, "all_logprobs/p25": -2.0265579223632812e-06, "all_logprobs/p5": -0.1357421875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.06114766746759415, "clip_ratio": 0.0, "completion_length": 607.2994995117188, "completion_length/correct": 517.972900390625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 476.0, "completion_length/correct/min": 119.0, "completion_length/correct/p25": 348.25, "completion_length/correct/p75": 643.0, "completion_length/correct/var": 46800.34375, "completion_length/incorrect": 838.5466918945312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 260.0, "completion_length/incorrect/p25": 642.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 60167.2109375, "completion_length/max": 1024.0, "completion_length/median": 550.0, "completion_length/min": 119.0, "completion_length/p25": 372.0, "completion_length/p75": 862.25, "completion_length/var": 71134.828125, "epoch": 0.4608, "feature_vector_variance/max_squared_error": 157878.484375, "feature_vector_variance/metric": 31209.1328125, "generated_tokens/total": 16526213.0, "grad_norm": 0.19246214628219604, "grouped_std_rewards": 0.16272792220115662, "learning_rate": 1.2117461064942437e-05, "loss": 0.0008, "mean_logprobs": -0.0419921875, "mean_logprobs/var": 0.000522613525390625, "num_completions/total": 27648, "per_sentence_gradient_norm": 2.487760543823242, "per_sentence_gradient_norm/max": 332.04833984375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 68.48002624511719, "per_sentence_gradient_norm/var": 309.8349609375, "per_token_feature_norm": 194.98875427246094, "per_token_feature_norm/max": 338.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 69.0, "per_token_feature_norm/p25": 184.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 581.9280395507812, "per_token_full_gradient_variance/max_squared_error": 330.3479309082031, "per_token_full_gradient_variance/variance": 0.04866699129343033, "per_token_gradient_norm": 2.866089344024658, "per_token_gradient_norm/max": 6815.26708984375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5273.6279296875, "per_token_policy_error_norm": 0.023429084569215775, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.021330101415514946, "policy_entropy": 0.04534284770488739, "policy_entropy/max": 3.828125, "policy_entropy/median": 6.100162863731384e-08, "policy_entropy/min": 2.1033522146218786e-17, "policy_entropy/p25": 9.094947017729282e-10, "policy_entropy/p75": 2.8967857360839844e-05, "policy_entropy/var": 0.0280263964086771, "policy_error_vector_variance/max_squared_error": 2.0092573165893555, "policy_error_vector_variance/metric": 0.02340969815850258, "policy_loss": 0.0008342365617863834, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.60711669921875, "policy_sharpness": 8.95325756072998, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.315624237060547, "reward": 0.7213541865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2012643963098526, "rewards/accuracy_reward": 0.7213541865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2012643963098526, "sentence_full_gradient_variance/max_squared_error": 2360171.25, "sentence_full_gradient_variance/metric": 7174.43115234375, "sentence_full_gradient_variance/p75": 163.96006774902344, "sentence_full_gradient_variance/p90": 694.3523559570312, "sentence_full_gradient_variance/p95": 694.3523559570312, "sentence_full_gradient_variance/p99": 77999.2109375, "state_level_variance/metric": 33.6137580871582, "state_level_variance_full_gradient/metric": 779.3629150390625, "step": 36 }, { "accuracy_reward": 0.76171875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.181739941239357, "action_level_variance/metric": 124.39512634277344, "action_level_variance_full_gradient/metric": 3334.807373046875, "adam_stats/lr_effective_max": 6.492968532256782e-05, "adam_stats/lr_effective_mean": -8.374905630104124e-11, "adam_stats/lr_effective_min": -6.866594048915431e-05, "adam_stats/m_t_max": 0.0009811954805627465, "adam_stats/m_t_mean": -1.515645768612739e-11, "adam_stats/m_t_min": -0.0007414642022922635, "adam_stats/v_t_max": 2.3221969968290068e-05, "adam_stats/v_t_mean": 2.0226394344124987e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.027765151113271713, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.802931785583496, "all_logprobs": -0.03831014409661293, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.5625, "all_logprobs/p1": -1.1015625, "all_logprobs/p10": -0.007598876953125, "all_logprobs/p25": -1.1920928955078125e-06, "all_logprobs/p5": -0.11865234375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.05622991546988487, "clip_ratio": 0.0, "completion_length": 597.2200927734375, "completion_length/correct": 510.5384521484375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 497.0, "completion_length/correct/min": 106.0, "completion_length/correct/p25": 340.0, "completion_length/correct/p75": 636.0, "completion_length/correct/var": 45198.84375, "completion_length/incorrect": 874.31689453125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 306.0, "completion_length/incorrect/p25": 749.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 43501.7265625, "completion_length/max": 1024.0, "completion_length/median": 540.0, "completion_length/min": 106.0, "completion_length/p25": 383.75, "completion_length/p75": 793.5, "completion_length/var": 68787.71875, "epoch": 0.4736, "feature_vector_variance/max_squared_error": 144431.25, "feature_vector_variance/metric": 30872.5859375, "generated_tokens/total": 16984878.0, "grad_norm": 0.15058563649654388, "grouped_std_rewards": 0.1771048754453659, "learning_rate": 1.1908389392193549e-05, "loss": -0.0278, "mean_logprobs": -0.038818359375, "mean_logprobs/var": 0.0004520416259765625, "num_completions/total": 28416, "per_sentence_gradient_norm": 1.8070939779281616, "per_sentence_gradient_norm/max": 160.3523406982422, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 6.783420562744141, "per_sentence_gradient_norm/p99": 43.945770263671875, "per_sentence_gradient_norm/var": 121.28744506835938, "per_token_feature_norm": 195.69349670410156, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 71.0, "per_token_feature_norm/p25": 184.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 558.9096069335938, "per_token_full_gradient_variance/max_squared_error": 222.33123779296875, "per_token_full_gradient_variance/variance": 0.032043177634477615, "per_token_gradient_norm": 2.1194090843200684, "per_token_gradient_norm/max": 6131.00341796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3000.044189453125, "per_token_policy_error_norm": 0.02186855673789978, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.02003447152674198, "policy_entropy": 0.04216942936182022, "policy_entropy/max": 3.625, "policy_entropy/median": 4.6798959374427795e-08, "policy_entropy/min": 6.179952383167375e-18, "policy_entropy/p25": 7.639755494892597e-10, "policy_entropy/p75": 1.7642974853515625e-05, "policy_entropy/var": 0.025359565392136574, "policy_error_vector_variance/max_squared_error": 2.0059146881103516, "policy_error_vector_variance/metric": 0.021855169907212257, "policy_loss": -0.027765151113271713, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.802931785583496, "policy_sharpness": 9.006235122680664, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.014491558074951, "reward": 0.76171875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.181739941239357, "rewards/accuracy_reward": 0.76171875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.181739941239357, "sentence_full_gradient_variance/max_squared_error": 494077.96875, "sentence_full_gradient_variance/metric": 3757.037841796875, "sentence_full_gradient_variance/p75": 107.87855529785156, "sentence_full_gradient_variance/p90": 120.10149383544922, "sentence_full_gradient_variance/p95": 120.10149383544922, "sentence_full_gradient_variance/p99": 123628.078125, "state_level_variance/metric": 12.413104057312012, "state_level_variance_full_gradient/metric": 422.23052978515625, "step": 37 }, { "accuracy_reward": 0.8138021230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15172582864761353, "action_level_variance/metric": 95.09317016601562, "action_level_variance_full_gradient/metric": 1911.507568359375, "adam_stats/lr_effective_max": 6.196885806275532e-05, "adam_stats/lr_effective_mean": -5.7605604314447945e-11, "adam_stats/lr_effective_min": -6.43204984953627e-05, "adam_stats/m_t_max": 0.0006831857608631253, "adam_stats/m_t_mean": -7.732032028529012e-12, "adam_stats/m_t_min": -0.0004963051760569215, "adam_stats/v_t_max": 2.3202745069283992e-05, "adam_stats/v_t_mean": 2.021806333463161e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.04975999519228935, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.1916580200195312, "all_logprobs": -0.03722362965345383, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.875, "all_logprobs/p1": -1.0546875, "all_logprobs/p10": -0.006866455078125, "all_logprobs/p25": -1.6689300537109375e-06, "all_logprobs/p5": -0.1064453125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.05341827869415283, "clip_ratio": 0.0, "completion_length": 548.7604370117188, "completion_length/correct": 469.43359375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 423.0, "completion_length/correct/min": 127.0, "completion_length/correct/p25": 337.0, "completion_length/correct/p75": 566.0, "completion_length/correct/var": 36583.6875, "completion_length/incorrect": 895.468505859375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 316.0, "completion_length/incorrect/p25": 793.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 39765.04296875, "completion_length/max": 1024.0, "completion_length/median": 470.0, "completion_length/min": 127.0, "completion_length/p25": 356.5, "completion_length/p75": 701.5, "completion_length/var": 64664.078125, "epoch": 0.4864, "feature_vector_variance/max_squared_error": 136775.09375, "feature_vector_variance/metric": 31189.42578125, "generated_tokens/total": 17406326.0, "grad_norm": 0.07547604292631149, "grouped_std_rewards": 0.135592982172966, "learning_rate": 1.1693946776030601e-05, "loss": 0.0498, "mean_logprobs": -0.03759765625, "mean_logprobs/var": 0.00045013427734375, "num_completions/total": 29184, "per_sentence_gradient_norm": 1.3960317373275757, "per_sentence_gradient_norm/max": 171.75881958007812, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 43.80222702026367, "per_sentence_gradient_norm/var": 93.26570892333984, "per_token_feature_norm": 193.9283905029297, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 74.0, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 206.0, "per_token_feature_norm/var": 482.2408142089844, "per_token_full_gradient_variance/max_squared_error": 167.61570739746094, "per_token_full_gradient_variance/variance": 0.027334725484251976, "per_token_gradient_norm": 1.7266746759414673, "per_token_gradient_norm/max": 5210.6103515625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2607.12255859375, "per_token_policy_error_norm": 0.02130526304244995, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01940010115504265, "policy_entropy": 0.0413241945207119, "policy_entropy/max": 3.296875, "policy_entropy/median": 5.960464477539063e-08, "policy_entropy/min": 1.9651164376299768e-18, "policy_entropy/p25": 8.803908713161945e-10, "policy_entropy/p75": 2.5033950805664062e-05, "policy_entropy/var": 0.024592895060777664, "policy_error_vector_variance/max_squared_error": 2.0062150955200195, "policy_error_vector_variance/metric": 0.021290745586156845, "policy_loss": 0.04975999891757965, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.1916580200195312, "policy_sharpness": 9.008657455444336, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.949803829193115, "reward": 0.8138021230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15172582864761353, "rewards/accuracy_reward": 0.8138021230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15172582864761353, "sentence_full_gradient_variance/max_squared_error": 289735.46875, "sentence_full_gradient_variance/metric": 2170.113525390625, "sentence_full_gradient_variance/p75": 33.95999526977539, "sentence_full_gradient_variance/p90": 41.820533752441406, "sentence_full_gradient_variance/p95": 41.820533752441406, "sentence_full_gradient_variance/p99": 84294.5390625, "state_level_variance/metric": 10.042349815368652, "state_level_variance_full_gradient/metric": 258.60595703125, "step": 38 }, { "accuracy_reward": 0.8072916865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15577466785907745, "action_level_variance/metric": 112.92486572265625, "action_level_variance_full_gradient/metric": 1043.979736328125, "adam_stats/lr_effective_max": 6.240702350623906e-05, "adam_stats/lr_effective_mean": -1.8104240329108734e-10, "adam_stats/lr_effective_min": -6.1461039877031e-05, "adam_stats/m_t_max": 0.0007827139343135059, "adam_stats/m_t_mean": -1.1411678893513688e-11, "adam_stats/m_t_min": -0.0004995727795176208, "adam_stats/v_t_max": 2.3182359655038454e-05, "adam_stats/v_t_mean": 2.0210434888146e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.018438002094626427, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.81501042842865, "all_logprobs": -0.039383988827466965, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.875, "all_logprobs/p1": -1.140625, "all_logprobs/p10": -0.00860595703125, "all_logprobs/p25": -1.6689300537109375e-06, "all_logprobs/p5": -0.126953125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.05629914999008179, "clip_ratio": 0.0, "completion_length": 536.7565307617188, "completion_length/correct": 476.57257080078125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 429.0, "completion_length/correct/min": 132.0, "completion_length/correct/p25": 319.0, "completion_length/correct/p75": 612.5, "completion_length/correct/var": 44437.8671875, "completion_length/incorrect": 788.87841796875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 887.0, "completion_length/incorrect/min": 196.0, "completion_length/incorrect/p25": 621.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 74082.4140625, "completion_length/max": 1024.0, "completion_length/median": 466.0, "completion_length/min": 132.0, "completion_length/p25": 335.75, "completion_length/p75": 724.0, "completion_length/var": 65254.95703125, "epoch": 0.4992, "feature_vector_variance/max_squared_error": 138431.546875, "feature_vector_variance/metric": 31011.16015625, "generated_tokens/total": 17818556.0, "grad_norm": 0.14399485290050507, "grouped_std_rewards": 0.11998659372329712, "learning_rate": 1.1474394481749037e-05, "loss": 0.0184, "mean_logprobs": -0.041015625, "mean_logprobs/var": 0.000614166259765625, "num_completions/total": 29952, "per_sentence_gradient_norm": 1.3838227987289429, "per_sentence_gradient_norm/max": 184.66949462890625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 30.532245635986328, "per_sentence_gradient_norm/var": 111.15461730957031, "per_token_feature_norm": 193.5987548828125, "per_token_feature_norm/max": 308.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 68.0, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 206.0, "per_token_feature_norm/var": 503.48675537109375, "per_token_full_gradient_variance/max_squared_error": 252.7789306640625, "per_token_full_gradient_variance/variance": 0.03213347867131233, "per_token_gradient_norm": 2.030604124069214, "per_token_gradient_norm/max": 6571.40625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3538.38134765625, "per_token_policy_error_norm": 0.022565795108675957, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.020615611225366592, "policy_entropy": 0.043571874499320984, "policy_entropy/max": 3.71875, "policy_entropy/median": 5.2386894822120667e-08, "policy_entropy/min": 8.456776945386935e-18, "policy_entropy/p25": 7.239577826112509e-10, "policy_entropy/p75": 2.4318695068359375e-05, "policy_entropy/var": 0.026394469663500786, "policy_error_vector_variance/max_squared_error": 2.0075371265411377, "policy_error_vector_variance/metric": 0.02254803478717804, "policy_loss": 0.018438009545207024, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -7.481914520263672, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.815010666847229, "policy_sharpness": 8.980938911437988, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.142625331878662, "reward": 0.8072916865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15577466785907745, "rewards/accuracy_reward": 0.8072916865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15577466785907745, "sentence_full_gradient_variance/max_squared_error": 206918.15625, "sentence_full_gradient_variance/metric": 1183.2894287109375, "sentence_full_gradient_variance/p75": 18.737743377685547, "sentence_full_gradient_variance/p90": 42.06778335571289, "sentence_full_gradient_variance/p95": 42.06778335571289, "sentence_full_gradient_variance/p99": 37780.8515625, "state_level_variance/metric": 12.329071044921875, "state_level_variance_full_gradient/metric": 139.30972290039062, "step": 39 }, { "accuracy_reward": 0.8294271230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14166225492954254, "action_level_variance/metric": 117.41943359375, "action_level_variance_full_gradient/metric": 7463.3603515625, "adam_stats/lr_effective_max": 6.053034303477034e-05, "adam_stats/lr_effective_mean": 2.6397464075733623e-11, "adam_stats/lr_effective_min": -5.958958718110807e-05, "adam_stats/m_t_max": 0.0012598625617101789, "adam_stats/m_t_mean": -2.7229149490715088e-11, "adam_stats/m_t_min": -0.0012247621780261397, "adam_stats/v_t_max": 2.31900266953744e-05, "adam_stats/v_t_mean": 2.0271479807265624e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.09595510363578796, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.404600143432617, "all_logprobs": -0.03486574441194534, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.375, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.005218505859375, "all_logprobs/p25": -8.344650268554688e-07, "all_logprobs/p5": -0.09716796875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.051493436098098755, "clip_ratio": 0.0, "completion_length": 547.9310302734375, "completion_length/correct": 492.6452331542969, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 458.0, "completion_length/correct/min": 144.0, "completion_length/correct/p25": 342.0, "completion_length/correct/p75": 609.0, "completion_length/correct/var": 38616.57421875, "completion_length/incorrect": 816.7633666992188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 956.0, "completion_length/incorrect/min": 231.0, "completion_length/incorrect/p25": 632.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 62298.4765625, "completion_length/max": 1024.0, "completion_length/median": 491.0, "completion_length/min": 144.0, "completion_length/p25": 354.0, "completion_length/p75": 703.0, "completion_length/var": 57462.0859375, "epoch": 0.512, "feature_vector_variance/max_squared_error": 135908.53125, "feature_vector_variance/metric": 30495.9609375, "generated_tokens/total": 18239366.0, "grad_norm": 0.17151588201522827, "grouped_std_rewards": 0.1315169632434845, "learning_rate": 1.125e-05, "loss": 0.096, "mean_logprobs": -0.03515625, "mean_logprobs/var": 0.00040435791015625, "num_completions/total": 30720, "per_sentence_gradient_norm": 1.518926978111267, "per_sentence_gradient_norm/max": 196.4217987060547, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 46.58340072631836, "per_sentence_gradient_norm/var": 115.26237487792969, "per_token_feature_norm": 193.2788543701172, "per_token_feature_norm/max": 304.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 77.0, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 205.0, "per_token_feature_norm/var": 456.2031555175781, "per_token_full_gradient_variance/max_squared_error": 143.100341796875, "per_token_full_gradient_variance/variance": 0.03266024962067604, "per_token_gradient_norm": 1.9979770183563232, "per_token_gradient_norm/max": 6440.58447265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3401.709228515625, "per_token_policy_error_norm": 0.01987069845199585, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.018291283398866653, "policy_entropy": 0.038443341851234436, "policy_entropy/max": 3.796875, "policy_entropy/median": 4.307366907596588e-08, "policy_entropy/min": 7.914675859144182e-18, "policy_entropy/p25": 6.257323548197746e-10, "policy_entropy/p75": 1.3947486877441406e-05, "policy_entropy/var": 0.022931912913918495, "policy_error_vector_variance/max_squared_error": 2.0059494972229004, "policy_error_vector_variance/metric": 0.019850879907608032, "policy_loss": 0.09595510363578796, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.4046006202697754, "policy_sharpness": 9.072715759277344, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.623265266418457, "reward": 0.8294271230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14166225492954254, "rewards/accuracy_reward": 0.8294271230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14166225492954254, "sentence_full_gradient_variance/max_squared_error": 1654022.0, "sentence_full_gradient_variance/metric": 8491.576171875, "sentence_full_gradient_variance/p75": 66.2085189819336, "sentence_full_gradient_variance/p90": 183.2705841064453, "sentence_full_gradient_variance/p95": 183.2705841064453, "sentence_full_gradient_variance/p99": 64807.4921875, "state_level_variance/metric": 12.500505447387695, "state_level_variance_full_gradient/metric": 1028.2158203125, "step": 40 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.165145605802536, "action_level_variance/metric": 74.48561096191406, "action_level_variance_full_gradient/metric": 6349.0, "adam_stats/lr_effective_max": 6.1659055063501e-05, "adam_stats/lr_effective_mean": -1.1214676409743163e-10, "adam_stats/lr_effective_min": -6.0563786973943934e-05, "adam_stats/m_t_max": 0.0011178544955328107, "adam_stats/m_t_mean": -2.8086812389749305e-11, "adam_stats/m_t_min": -0.0010809235973283648, "adam_stats/v_t_max": 2.3166861865320243e-05, "adam_stats/v_t_mean": 2.0273717600549634e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.003520200727507472, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.0678908824920654, "all_logprobs": -0.03500819206237793, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.875, "all_logprobs/p1": -0.9765625, "all_logprobs/p10": -0.005218505859375, "all_logprobs/p25": -1.1920928955078125e-06, "all_logprobs/p5": -0.0966796875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.0517352856695652, "clip_ratio": 0.0, "completion_length": 548.6510620117188, "completion_length/correct": 479.1036071777344, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 427.0, "completion_length/correct/min": 110.0, "completion_length/correct/p25": 335.75, "completion_length/correct/p75": 609.25, "completion_length/correct/var": 39479.8359375, "completion_length/incorrect": 812.9312744140625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 866.0, "completion_length/incorrect/min": 341.0, "completion_length/incorrect/p25": 611.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 50001.4765625, "completion_length/max": 1024.0, "completion_length/median": 490.0, "completion_length/min": 110.0, "completion_length/p25": 361.0, "completion_length/p75": 706.25, "completion_length/var": 60013.4765625, "epoch": 0.5248, "feature_vector_variance/max_squared_error": 134069.78125, "feature_vector_variance/metric": 30861.83203125, "generated_tokens/total": 18660730.0, "grad_norm": 0.19132395088672638, "grouped_std_rewards": 0.1571076214313507, "learning_rate": 1.1021036720894182e-05, "loss": -0.0035, "mean_logprobs": -0.035888671875, "mean_logprobs/var": 0.000423431396484375, "num_completions/total": 31488, "per_sentence_gradient_norm": 1.5435466766357422, "per_sentence_gradient_norm/max": 82.02959442138672, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 49.62910461425781, "per_sentence_gradient_norm/var": 72.19708251953125, "per_token_feature_norm": 192.0729522705078, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 191.0, "per_token_feature_norm/min": 68.0, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 433.3264465332031, "per_token_full_gradient_variance/max_squared_error": 99.58460998535156, "per_token_full_gradient_variance/variance": 0.025254346430301666, "per_token_gradient_norm": 1.7302356958389282, "per_token_gradient_norm/max": 5031.5419921875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2170.547607421875, "per_token_policy_error_norm": 0.019986504688858986, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.018371127545833588, "policy_entropy": 0.038583915680646896, "policy_entropy/max": 3.671875, "policy_entropy/median": 5.3783878684043884e-08, "policy_entropy/min": 3.4558944247975454e-18, "policy_entropy/p25": 7.421476766467094e-10, "policy_entropy/p75": 1.895427703857422e-05, "policy_entropy/var": 0.0229208804666996, "policy_error_vector_variance/max_squared_error": 2.006499767303467, "policy_error_vector_variance/metric": 0.01996574178338051, "policy_loss": -0.0035202051512897015, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.0678904056549072, "policy_sharpness": 9.059624671936035, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.676592826843262, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.165145605802536, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.165145605802536, "sentence_full_gradient_variance/max_squared_error": 1893377.375, "sentence_full_gradient_variance/metric": 7193.13232421875, "sentence_full_gradient_variance/p75": 107.6878433227539, "sentence_full_gradient_variance/p90": 231.03819274902344, "sentence_full_gradient_variance/p95": 231.03819274902344, "sentence_full_gradient_variance/p99": 124928.6328125, "state_level_variance/metric": 7.001092910766602, "state_level_variance_full_gradient/metric": 844.130615234375, "step": 41 }, { "accuracy_reward": 0.77734375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17330610752105713, "action_level_variance/metric": 245.04525756835938, "action_level_variance_full_gradient/metric": 6272.30224609375, "adam_stats/lr_effective_max": 6.282651156652719e-05, "adam_stats/lr_effective_mean": 2.7387772605358407e-11, "adam_stats/lr_effective_min": -6.269363075261936e-05, "adam_stats/m_t_max": 0.0010430716210976243, "adam_stats/m_t_mean": -2.3830942427749413e-11, "adam_stats/m_t_min": -0.0008644937770441175, "adam_stats/v_t_max": 2.314383164048195e-05, "adam_stats/v_t_mean": 2.0285797781155468e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.012054115533828735, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 4.168517589569092, "all_logprobs": -0.03494515269994736, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.25, "all_logprobs/p1": -0.98046875, "all_logprobs/p10": -0.005279541015625, "all_logprobs/p25": -1.0728836059570312e-06, "all_logprobs/p5": -0.10009765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04912850633263588, "clip_ratio": 0.0, "completion_length": 545.1419677734375, "completion_length/correct": 488.3115539550781, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 473.0, "completion_length/correct/min": 81.0, "completion_length/correct/p25": 340.0, "completion_length/correct/p75": 602.0, "completion_length/correct/var": 37757.4375, "completion_length/incorrect": 743.5497436523438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 784.0, "completion_length/incorrect/min": 214.0, "completion_length/incorrect/p25": 493.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 72759.8984375, "completion_length/max": 1024.0, "completion_length/median": 507.0, "completion_length/min": 81.0, "completion_length/p25": 366.75, "completion_length/p75": 672.25, "completion_length/var": 56756.54296875, "epoch": 0.5376, "feature_vector_variance/max_squared_error": 128973.96875, "feature_vector_variance/metric": 30310.306640625, "generated_tokens/total": 19079400.0, "grad_norm": 0.22911414504051208, "grouped_std_rewards": 0.18144342303276062, "learning_rate": 1.078778360091808e-05, "loss": 0.0121, "mean_logprobs": -0.0361328125, "mean_logprobs/var": 0.00042724609375, "num_completions/total": 32256, "per_sentence_gradient_norm": 2.3829054832458496, "per_sentence_gradient_norm/max": 225.32861328125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 5.6948652267456055, "per_sentence_gradient_norm/p99": 50.07498550415039, "per_sentence_gradient_norm/var": 239.6791229248047, "per_token_feature_norm": 193.54234313964844, "per_token_feature_norm/max": 300.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 70.0, "per_token_feature_norm/p25": 184.0, "per_token_feature_norm/p75": 205.0, "per_token_feature_norm/var": 444.15570068359375, "per_token_full_gradient_variance/max_squared_error": 204.80081176757812, "per_token_full_gradient_variance/variance": 0.05395572632551193, "per_token_gradient_norm": 2.9167697429656982, "per_token_gradient_norm/max": 6545.73681640625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5367.11181640625, "per_token_policy_error_norm": 0.02017315663397312, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.018342675641179085, "policy_entropy": 0.038923587650060654, "policy_entropy/max": 3.8125, "policy_entropy/median": 4.44706529378891e-08, "policy_entropy/min": 3.06287113727155e-18, "policy_entropy/p25": 6.621121428906918e-10, "policy_entropy/p75": 1.5854835510253906e-05, "policy_entropy/var": 0.02263936586678028, "policy_error_vector_variance/max_squared_error": 2.003831148147583, "policy_error_vector_variance/metric": 0.020153669640421867, "policy_loss": 0.01205411832779646, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 4.168517589569092, "policy_sharpness": 9.057123184204102, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.692917823791504, "reward": 0.77734375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17330610752105713, "rewards/accuracy_reward": 0.77734375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17330610752105713, "sentence_full_gradient_variance/max_squared_error": 1984811.5, "sentence_full_gradient_variance/metric": 7113.94287109375, "sentence_full_gradient_variance/p75": 71.03702545166016, "sentence_full_gradient_variance/p90": 206.68603515625, "sentence_full_gradient_variance/p95": 206.68603515625, "sentence_full_gradient_variance/p99": 100466.078125, "state_level_variance/metric": 25.215072631835938, "state_level_variance_full_gradient/metric": 841.6408081054688, "step": 42 }, { "accuracy_reward": 0.7513021230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18709087371826172, "action_level_variance/metric": 65.77885437011719, "action_level_variance_full_gradient/metric": 3357.94921875, "adam_stats/lr_effective_max": 6.18001286056824e-05, "adam_stats/lr_effective_mean": 4.4473532495592494e-11, "adam_stats/lr_effective_min": -5.87914910283871e-05, "adam_stats/m_t_max": 0.0009082468459382653, "adam_stats/m_t_mean": -2.2543474031966326e-11, "adam_stats/m_t_min": -0.0007345568155869842, "adam_stats/v_t_max": 2.3120781406760216e-05, "adam_stats/v_t_mean": 2.028265793166395e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.023804832249879837, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.8725358247756958, "all_logprobs": -0.035643719136714935, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.625, "all_logprobs/p1": -0.9921875, "all_logprobs/p10": -0.00592041015625, "all_logprobs/p25": -1.0728836059570312e-06, "all_logprobs/p5": -0.10009765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.052759069949388504, "clip_ratio": 0.0, "completion_length": 516.8854370117188, "completion_length/correct": 432.1074523925781, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 403.0, "completion_length/correct/min": 113.0, "completion_length/correct/p25": 307.0, "completion_length/correct/p75": 534.0, "completion_length/correct/var": 27836.3046875, "completion_length/incorrect": 772.9947509765625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 822.0, "completion_length/incorrect/min": 140.0, "completion_length/incorrect/p25": 590.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 65045.73046875, "completion_length/max": 1024.0, "completion_length/median": 453.0, "completion_length/min": 113.0, "completion_length/p25": 329.0, "completion_length/p75": 644.25, "completion_length/var": 58758.20703125, "epoch": 0.5504, "feature_vector_variance/max_squared_error": 137698.1875, "feature_vector_variance/metric": 30262.140625, "generated_tokens/total": 19476368.0, "grad_norm": 0.15582023561000824, "grouped_std_rewards": 0.14843714237213135, "learning_rate": 1.0550524823068504e-05, "loss": 0.0238, "mean_logprobs": -0.0361328125, "mean_logprobs/var": 0.0005340576171875, "num_completions/total": 33024, "per_sentence_gradient_norm": 1.3436330556869507, "per_sentence_gradient_norm/max": 99.11660766601562, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 45.57320785522461, "per_sentence_gradient_norm/var": 64.0569076538086, "per_token_feature_norm": 194.41531372070312, "per_token_feature_norm/max": 308.0, "per_token_feature_norm/median": 194.0, "per_token_feature_norm/min": 74.5, "per_token_feature_norm/p25": 184.0, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 482.8536071777344, "per_token_full_gradient_variance/max_squared_error": 84.5648422241211, "per_token_full_gradient_variance/variance": 0.01872672513127327, "per_token_gradient_norm": 1.506286382675171, "per_token_gradient_norm/max": 5381.32861328125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1844.8843994140625, "per_token_policy_error_norm": 0.020388446748256683, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01889931596815586, "policy_entropy": 0.039044756442308426, "policy_entropy/max": 3.796875, "policy_entropy/median": 4.1676685214042664e-08, "policy_entropy/min": 2.3310346708438345e-17, "policy_entropy/p25": 5.602487362921238e-10, "policy_entropy/p75": 1.5974044799804688e-05, "policy_entropy/var": 0.0228660237044096, "policy_error_vector_variance/max_squared_error": 2.00411319732666, "policy_error_vector_variance/metric": 0.02037467062473297, "policy_loss": 0.023804832249879837, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.8725357055664062, "policy_sharpness": 9.048075675964355, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.734081745147705, "reward": 0.7513021230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18709087371826172, "rewards/accuracy_reward": 0.7513021230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18709087371826172, "sentence_full_gradient_variance/max_squared_error": 733556.9375, "sentence_full_gradient_variance/metric": 3828.05078125, "sentence_full_gradient_variance/p75": 28.77268409729004, "sentence_full_gradient_variance/p90": 31.79567527770996, "sentence_full_gradient_variance/p95": 31.79567527770996, "sentence_full_gradient_variance/p99": 124061.921875, "state_level_variance/metric": 6.484554290771484, "state_level_variance_full_gradient/metric": 470.1015625, "step": 43 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18774443864822388, "action_level_variance/metric": 102.9247055053711, "action_level_variance_full_gradient/metric": 1975.125732421875, "adam_stats/lr_effective_max": 5.6169970775954425e-05, "adam_stats/lr_effective_mean": 2.0607278036766097e-11, "adam_stats/lr_effective_min": -5.783597953268327e-05, "adam_stats/m_t_max": 0.0007835664437152445, "adam_stats/m_t_mean": -2.1302390407207383e-11, "adam_stats/m_t_min": -0.0006084583001211286, "adam_stats/v_t_max": 2.309783667442389e-05, "adam_stats/v_t_mean": 2.026594603937726e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.009737893007695675, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.343479633331299, "all_logprobs": -0.03378142789006233, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.75, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.004119873046875, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.0859375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04818956181406975, "clip_ratio": 0.0, "completion_length": 552.39453125, "completion_length/correct": 474.5052185058594, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 438.0, "completion_length/correct/min": 83.0, "completion_length/correct/p25": 309.5, "completion_length/correct/p75": 600.25, "completion_length/correct/var": 43520.12109375, "completion_length/incorrect": 786.0625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 957.0, "completion_length/incorrect/min": 156.0, "completion_length/incorrect/p25": 510.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 80064.5, "completion_length/max": 1024.0, "completion_length/median": 473.0, "completion_length/min": 83.0, "completion_length/p25": 336.75, "completion_length/p75": 735.0, "completion_length/var": 70787.7109375, "epoch": 0.5632, "feature_vector_variance/max_squared_error": 143048.296875, "feature_vector_variance/metric": 29759.13671875, "generated_tokens/total": 19900606.0, "grad_norm": 0.08180695027112961, "grouped_std_rewards": 0.13322606682777405, "learning_rate": 1.0309549450619342e-05, "loss": 0.0097, "mean_logprobs": -0.03515625, "mean_logprobs/var": 0.000537872314453125, "num_completions/total": 33792, "per_sentence_gradient_norm": 1.3586299419403076, "per_sentence_gradient_norm/max": 201.0535888671875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 42.92353820800781, "per_sentence_gradient_norm/var": 101.21062469482422, "per_token_feature_norm": 194.95654296875, "per_token_feature_norm/max": 308.0, "per_token_feature_norm/median": 194.0, "per_token_feature_norm/min": 70.0, "per_token_feature_norm/p25": 184.0, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 442.8105773925781, "per_token_full_gradient_variance/max_squared_error": 297.53985595703125, "per_token_full_gradient_variance/variance": 0.025760388001799583, "per_token_gradient_norm": 1.8174773454666138, "per_token_gradient_norm/max": 6640.68310546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2953.57421875, "per_token_policy_error_norm": 0.019283683970570564, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.017416127026081085, "policy_entropy": 0.03795276954770088, "policy_entropy/max": 3.8125, "policy_entropy/median": 2.584420144557953e-08, "policy_entropy/min": 2.6020852139652106e-18, "policy_entropy/p25": 3.838067641481757e-10, "policy_entropy/p75": 8.761882781982422e-06, "policy_entropy/var": 0.02311268448829651, "policy_error_vector_variance/max_squared_error": 2.0062549114227295, "policy_error_vector_variance/metric": 0.019265901297330856, "policy_loss": 0.009737896732985973, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.343479871749878, "policy_sharpness": 9.093695640563965, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.510985374450684, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.18774443864822388, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18774443864822388, "sentence_full_gradient_variance/max_squared_error": 370172.1875, "sentence_full_gradient_variance/metric": 2227.178466796875, "sentence_full_gradient_variance/p75": 41.163856506347656, "sentence_full_gradient_variance/p90": 121.43755340576172, "sentence_full_gradient_variance/p95": 121.43755340576172, "sentence_full_gradient_variance/p99": 34646.5078125, "state_level_variance/metric": 11.135709762573242, "state_level_variance_full_gradient/metric": 252.05288696289062, "step": 44 }, { "accuracy_reward": 0.8268229365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14337347447872162, "action_level_variance/metric": 118.03304290771484, "action_level_variance_full_gradient/metric": 1055.9698486328125, "adam_stats/lr_effective_max": 5.056912050349638e-05, "adam_stats/lr_effective_mean": 3.454313918238583e-11, "adam_stats/lr_effective_min": -5.251653783489019e-05, "adam_stats/m_t_max": 0.0006963883060961962, "adam_stats/m_t_mean": -1.8719560623825515e-11, "adam_stats/m_t_min": -0.0005507453461177647, "adam_stats/v_t_max": 2.30747573368717e-05, "adam_stats/v_t_mean": 2.024642606346383e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.01815570890903473, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.2055171728134155, "all_logprobs": -0.03323700278997421, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.625, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.0037384033203125, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.0791015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04883535951375961, "clip_ratio": 0.0, "completion_length": 521.4049682617188, "completion_length/correct": 451.84881591796875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 413.0, "completion_length/correct/min": 59.0, "completion_length/correct/p25": 328.0, "completion_length/correct/p75": 546.5, "completion_length/correct/var": 35384.671875, "completion_length/incorrect": 853.4962768554688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 327.0, "completion_length/incorrect/p25": 670.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 58740.078125, "completion_length/max": 1024.0, "completion_length/median": 444.0, "completion_length/min": 59.0, "completion_length/p25": 345.5, "completion_length/p75": 648.75, "completion_length/var": 62487.0859375, "epoch": 0.576, "feature_vector_variance/max_squared_error": 136356.78125, "feature_vector_variance/metric": 29601.431640625, "generated_tokens/total": 20301044.0, "grad_norm": 0.03602856770157814, "grouped_std_rewards": 0.1193292886018753, "learning_rate": 1.0065151074942516e-05, "loss": -0.0182, "mean_logprobs": -0.033935546875, "mean_logprobs/var": 0.0004711151123046875, "num_completions/total": 34560, "per_sentence_gradient_norm": 1.123779535293579, "per_sentence_gradient_norm/max": 252.8164520263672, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 24.09684181213379, "per_sentence_gradient_norm/var": 116.92240142822266, "per_token_feature_norm": 195.54562377929688, "per_token_feature_norm/max": 298.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 71.0, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 416.9029846191406, "per_token_full_gradient_variance/max_squared_error": 654.6804809570312, "per_token_full_gradient_variance/variance": 0.02585640177130699, "per_token_gradient_norm": 1.6043552160263062, "per_token_gradient_norm/max": 6335.43212890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2612.432373046875, "per_token_policy_error_norm": 0.01899118721485138, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.017409780994057655, "policy_entropy": 0.03655411675572395, "policy_entropy/max": 3.671875, "policy_entropy/median": 2.293381839990616e-08, "policy_entropy/min": 6.342582709040201e-18, "policy_entropy/p25": 3.1468516681343317e-10, "policy_entropy/p75": 7.927417755126953e-06, "policy_entropy/var": 0.021978776901960373, "policy_error_vector_variance/max_squared_error": 2.00571346282959, "policy_error_vector_variance/metric": 0.01898103766143322, "policy_loss": -0.01815570890903473, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -7.481915473937988, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.2055171728134155, "policy_sharpness": 9.113073348999023, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.398587703704834, "reward": 0.8268229365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14337347447872162, "rewards/accuracy_reward": 0.8268229365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14337347447872162, "sentence_full_gradient_variance/max_squared_error": 181970.6875, "sentence_full_gradient_variance/metric": 1186.803466796875, "sentence_full_gradient_variance/p75": 28.5696964263916, "sentence_full_gradient_variance/p90": 109.54070281982422, "sentence_full_gradient_variance/p95": 109.54070281982422, "sentence_full_gradient_variance/p99": 38175.01171875, "state_level_variance/metric": 13.633260726928711, "state_level_variance_full_gradient/metric": 130.8337860107422, "step": 45 }, { "accuracy_reward": 0.7734375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17546039819717407, "action_level_variance/metric": 58.410430908203125, "action_level_variance_full_gradient/metric": 3840.615966796875, "adam_stats/lr_effective_max": 4.943772728438489e-05, "adam_stats/lr_effective_mean": 2.4792876085477644e-11, "adam_stats/lr_effective_min": -4.785925557371229e-05, "adam_stats/m_t_max": 0.0006106323562562466, "adam_stats/m_t_mean": -1.7109549887983633e-11, "adam_stats/m_t_min": -0.0004868923278991133, "adam_stats/v_t_max": 2.3051839889376424e-05, "adam_stats/v_t_mean": 2.022759997694079e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.055107321590185165, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.0042169094085693, "all_logprobs": -0.03376556932926178, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.75, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.00421142578125, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.08261680603027344, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04906366392970085, "clip_ratio": 0.0, "completion_length": 525.4388427734375, "completion_length/correct": 445.4040222167969, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 397.0, "completion_length/correct/min": 119.0, "completion_length/correct/p25": 300.25, "completion_length/correct/p75": 534.0, "completion_length/correct/var": 39881.55859375, "completion_length/incorrect": 798.660888671875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 863.0, "completion_length/incorrect/min": 311.0, "completion_length/incorrect/p25": 584.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 60408.625, "completion_length/max": 1024.0, "completion_length/median": 443.0, "completion_length/min": 119.0, "completion_length/p25": 332.75, "completion_length/p75": 693.0, "completion_length/var": 66355.296875, "epoch": 0.5888, "feature_vector_variance/max_squared_error": 150071.96875, "feature_vector_variance/metric": 29534.19140625, "generated_tokens/total": 20704582.0, "grad_norm": 0.05207895115017891, "grouped_std_rewards": 0.12625271081924438, "learning_rate": 9.817627457812105e-06, "loss": -0.0551, "mean_logprobs": -0.03369140625, "mean_logprobs/var": 0.000278472900390625, "num_completions/total": 35328, "per_sentence_gradient_norm": 1.1203776597976685, "per_sentence_gradient_norm/max": 128.16534423828125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 39.913822174072266, "per_sentence_gradient_norm/var": 57.22970962524414, "per_token_feature_norm": 197.18057250976562, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 75.5, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 460.425048828125, "per_token_full_gradient_variance/max_squared_error": 137.1051788330078, "per_token_full_gradient_variance/variance": 0.01984739676117897, "per_token_gradient_norm": 1.3231490850448608, "per_token_gradient_norm/max": 5578.025390625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1859.4757080078125, "per_token_policy_error_norm": 0.019429638981819153, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.017834261059761047, "policy_entropy": 0.03706920146942139, "policy_entropy/max": 3.6875, "policy_entropy/median": 2.4097971618175507e-08, "policy_entropy/min": 6.071532165918825e-18, "policy_entropy/p25": 3.54702933691442e-10, "policy_entropy/p75": 8.761882781982422e-06, "policy_entropy/var": 0.021504230797290802, "policy_error_vector_variance/max_squared_error": 2.004088878631592, "policy_error_vector_variance/metric": 0.019415684044361115, "policy_loss": -0.05510731786489487, "policy_loss/max": 9.659051895141602, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.0042169094085693, "policy_sharpness": 9.094863891601562, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.479730606079102, "reward": 0.7734375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17546039819717407, "rewards/accuracy_reward": 0.7734375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17546039819717407, "sentence_full_gradient_variance/max_squared_error": 1247486.125, "sentence_full_gradient_variance/metric": 4322.82958984375, "sentence_full_gradient_variance/p75": 129.61703491210938, "sentence_full_gradient_variance/p90": 319.27777099609375, "sentence_full_gradient_variance/p95": 319.27777099609375, "sentence_full_gradient_variance/p99": 97317.015625, "state_level_variance/metric": 6.109700679779053, "state_level_variance_full_gradient/metric": 482.213134765625, "step": 46 }, { "accuracy_reward": 0.8046875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1573704332113266, "action_level_variance/metric": 40.4998779296875, "action_level_variance_full_gradient/metric": 1735.895263671875, "adam_stats/lr_effective_max": 5.200730811338872e-05, "adam_stats/lr_effective_mean": 4.709118164858417e-11, "adam_stats/lr_effective_min": -5.0812075642170385e-05, "adam_stats/m_t_max": 0.0004534386971499771, "adam_stats/m_t_mean": -1.3822322626755312e-11, "adam_stats/m_t_min": -0.0004314319812692702, "adam_stats/v_t_max": 2.3029564545140602e-05, "adam_stats/v_t_mean": 2.021078400124554e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.021774768829345703, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.9878952503204346, "all_logprobs": -0.031210923567414284, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.9375, "all_logprobs/p1": -0.90625, "all_logprobs/p10": -0.002838134765625, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.06884765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04565038904547691, "clip_ratio": 0.0, "completion_length": 499.9349060058594, "completion_length/correct": 443.6650695800781, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 408.0, "completion_length/correct/min": 114.0, "completion_length/correct/p25": 310.25, "completion_length/correct/p75": 530.5, "completion_length/correct/var": 37517.953125, "completion_length/incorrect": 731.7666625976562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 755.0, "completion_length/incorrect/min": 177.0, "completion_length/incorrect/p25": 481.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 81028.0859375, "completion_length/max": 1024.0, "completion_length/median": 439.0, "completion_length/min": 114.0, "completion_length/p25": 321.0, "completion_length/p75": 623.0, "completion_length/var": 58983.61328125, "epoch": 0.6016, "feature_vector_variance/max_squared_error": 145266.3125, "feature_vector_variance/metric": 29552.98828125, "generated_tokens/total": 21088532.0, "grad_norm": 0.07694974541664124, "grouped_std_rewards": 0.09184105694293976, "learning_rate": 9.567280168627493e-06, "loss": -0.0218, "mean_logprobs": -0.033447265625, "mean_logprobs/var": 0.0005950927734375, "num_completions/total": 36096, "per_sentence_gradient_norm": 0.7840067148208618, "per_sentence_gradient_norm/max": 97.67859649658203, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 27.071613311767578, "per_sentence_gradient_norm/var": 39.93721389770508, "per_token_feature_norm": 195.99102783203125, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 75.5, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 445.1524658203125, "per_token_full_gradient_variance/max_squared_error": 71.63107299804688, "per_token_full_gradient_variance/variance": 0.010630982927978039, "per_token_gradient_norm": 0.9286863803863525, "per_token_gradient_norm/max": 4174.720703125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 979.6394653320312, "per_token_policy_error_norm": 0.017818884924054146, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.016407743096351624, "policy_entropy": 0.03469349443912506, "policy_entropy/max": 3.671875, "policy_entropy/median": 2.2351741790771484e-08, "policy_entropy/min": 5.21772295508649e-19, "policy_entropy/p25": 3.601599019020796e-10, "policy_entropy/p75": 6.020069122314453e-06, "policy_entropy/var": 0.02064458839595318, "policy_error_vector_variance/max_squared_error": 2.0055227279663086, "policy_error_vector_variance/metric": 0.017806105315685272, "policy_loss": -0.021774768829345703, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -7.481914520263672, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.9878950715065002, "policy_sharpness": 9.150473594665527, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.203485488891602, "reward": 0.8046875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1573704332113266, "rewards/accuracy_reward": 0.8046875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1573704332113266, "sentence_full_gradient_variance/max_squared_error": 689198.625, "sentence_full_gradient_variance/metric": 1961.32763671875, "sentence_full_gradient_variance/p75": 42.69268035888672, "sentence_full_gradient_variance/p90": 85.9217529296875, "sentence_full_gradient_variance/p95": 85.9217529296875, "sentence_full_gradient_variance/p99": 31360.66015625, "state_level_variance/metric": 4.494637489318848, "state_level_variance_full_gradient/metric": 225.43212890625, "step": 47 }, { "accuracy_reward": 0.8450521230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.13110977411270142, "action_level_variance/metric": 106.8581771850586, "action_level_variance_full_gradient/metric": 4061.373779296875, "adam_stats/lr_effective_max": 5.308616528054699e-05, "adam_stats/lr_effective_mean": -4.085723079305659e-11, "adam_stats/lr_effective_min": -4.927960617351346e-05, "adam_stats/m_t_max": 0.0003899750008713454, "adam_stats/m_t_mean": -1.165307433881324e-11, "adam_stats/m_t_min": -0.00037980108754709363, "adam_stats/v_t_max": 2.300668893440161e-05, "adam_stats/v_t_mean": 2.019430412822376e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0951085239648819, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.8890841007232666, "all_logprobs": -0.032387759536504745, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.125, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.003173828125, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.0791015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.0473591648042202, "clip_ratio": 0.0, "completion_length": 503.7421875, "completion_length/correct": 450.8582458496094, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 399.0, "completion_length/correct/min": 148.0, "completion_length/correct/p25": 298.0, "completion_length/correct/p75": 566.0, "completion_length/correct/var": 39274.96484375, "completion_length/incorrect": 792.1597290039062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 831.0, "completion_length/incorrect/min": 185.0, "completion_length/incorrect/p25": 618.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 59364.390625, "completion_length/max": 1024.0, "completion_length/median": 432.0, "completion_length/min": 148.0, "completion_length/p25": 316.75, "completion_length/p75": 648.25, "completion_length/var": 57586.9765625, "epoch": 0.6144, "feature_vector_variance/max_squared_error": 138593.6875, "feature_vector_variance/metric": 29246.958984375, "generated_tokens/total": 21475406.0, "grad_norm": 0.09510370343923569, "grouped_std_rewards": 0.12822434306144714, "learning_rate": 9.314414216997507e-06, "loss": -0.0951, "mean_logprobs": -0.03271484375, "mean_logprobs/var": 0.0003337860107421875, "num_completions/total": 36864, "per_sentence_gradient_norm": 1.316901445388794, "per_sentence_gradient_norm/max": 211.8434295654297, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 36.23813247680664, "per_sentence_gradient_norm/var": 105.26100158691406, "per_token_feature_norm": 196.69497680664062, "per_token_feature_norm/max": 302.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 75.5, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 473.39837646484375, "per_token_full_gradient_variance/max_squared_error": 223.14218139648438, "per_token_full_gradient_variance/variance": 0.02306556887924671, "per_token_gradient_norm": 1.6076312065124512, "per_token_gradient_norm/max": 4576.603515625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2010.8182373046875, "per_token_policy_error_norm": 0.018569258973002434, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.017111098393797874, "policy_entropy": 0.03553563356399536, "policy_entropy/max": 3.765625, "policy_entropy/median": 2.0372681319713593e-08, "policy_entropy/min": 2.358139725155972e-18, "policy_entropy/p25": 2.874003257602453e-10, "policy_entropy/p75": 6.16908073425293e-06, "policy_entropy/var": 0.020886432379484177, "policy_error_vector_variance/max_squared_error": 2.0043816566467285, "policy_error_vector_variance/metric": 0.018551481887698174, "policy_loss": -0.0951085239648819, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.8890841007232666, "policy_sharpness": 9.136265754699707, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.272744655609131, "reward": 0.8450521230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.13110977411270142, "rewards/accuracy_reward": 0.8450521230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.13110977411270142, "sentence_full_gradient_variance/max_squared_error": 2022395.375, "sentence_full_gradient_variance/metric": 4561.904296875, "sentence_full_gradient_variance/p75": 95.05009460449219, "sentence_full_gradient_variance/p90": 393.90447998046875, "sentence_full_gradient_variance/p95": 393.90447998046875, "sentence_full_gradient_variance/p99": 49014.71484375, "state_level_variance/metric": 11.745390892028809, "state_level_variance_full_gradient/metric": 500.5309753417969, "step": 48 }, { "accuracy_reward": 0.7578125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1837720274925232, "action_level_variance/metric": 147.3351287841797, "action_level_variance_full_gradient/metric": 3472.041748046875, "adam_stats/lr_effective_max": 5.138541746418923e-05, "adam_stats/lr_effective_mean": 1.0378743003913726e-11, "adam_stats/lr_effective_min": -4.760421870741993e-05, "adam_stats/m_t_max": 0.0004066721012350172, "adam_stats/m_t_mean": -1.0314169657243966e-11, "adam_stats/m_t_min": -0.00036966826883144677, "adam_stats/v_t_max": 2.2983997041592374e-05, "adam_stats/v_t_mean": 2.017879570034853e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.008044309914112091, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.413133144378662, "all_logprobs": -0.03311945125460625, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.875, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.00408935546875, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.0791015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04905127361416817, "clip_ratio": 0.0, "completion_length": 531.0234375, "completion_length/correct": 469.9570617675781, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 448.0, "completion_length/correct/min": 113.0, "completion_length/correct/p25": 345.25, "completion_length/correct/p75": 589.0, "completion_length/correct/var": 32239.3046875, "completion_length/incorrect": 722.1021728515625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 702.0, "completion_length/incorrect/min": 211.0, "completion_length/incorrect/p25": 511.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 66390.515625, "completion_length/max": 1024.0, "completion_length/median": 485.0, "completion_length/min": 113.0, "completion_length/p25": 368.0, "completion_length/p75": 661.25, "completion_length/var": 52118.2265625, "epoch": 0.6272, "feature_vector_variance/max_squared_error": 149468.265625, "feature_vector_variance/metric": 29174.70703125, "generated_tokens/total": 21883232.0, "grad_norm": 0.10142666846513748, "grouped_std_rewards": 0.15571126341819763, "learning_rate": 9.059337681133194e-06, "loss": -0.008, "mean_logprobs": -0.032470703125, "mean_logprobs/var": 0.000293731689453125, "num_completions/total": 37632, "per_sentence_gradient_norm": 1.670867919921875, "per_sentence_gradient_norm/max": 255.09194946289062, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 43.18729782104492, "per_sentence_gradient_norm/var": 144.73178100585938, "per_token_feature_norm": 196.9171600341797, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 75.0, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 446.54168701171875, "per_token_full_gradient_variance/max_squared_error": 327.3265686035156, "per_token_full_gradient_variance/variance": 0.03617500141263008, "per_token_gradient_norm": 2.2073769569396973, "per_token_gradient_norm/max": 6521.7685546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3715.06640625, "per_token_policy_error_norm": 0.01901685632765293, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.017720358446240425, "policy_entropy": 0.035962898284196854, "policy_entropy/max": 3.796875, "policy_entropy/median": 1.955777406692505e-08, "policy_entropy/min": 6.884683795282953e-18, "policy_entropy/p25": 2.6557245291769505e-10, "policy_entropy/p75": 7.569789886474609e-06, "policy_entropy/var": 0.020753178745508194, "policy_error_vector_variance/max_squared_error": 2.0054874420166016, "policy_error_vector_variance/metric": 0.01900297962129116, "policy_loss": -0.008044309914112091, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.413133144378662, "policy_sharpness": 9.11295223236084, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.380565166473389, "reward": 0.7578125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1837720274925232, "rewards/accuracy_reward": 0.7578125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1837720274925232, "sentence_full_gradient_variance/max_squared_error": 529754.6875, "sentence_full_gradient_variance/metric": 3912.191162109375, "sentence_full_gradient_variance/p75": 170.26890563964844, "sentence_full_gradient_variance/p90": 179.17755126953125, "sentence_full_gradient_variance/p95": 179.17755126953125, "sentence_full_gradient_variance/p99": 94566.375, "state_level_variance/metric": 15.789567947387695, "state_level_variance_full_gradient/metric": 440.1494140625, "step": 49 }, { "accuracy_reward": 0.78515625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1689058393239975, "action_level_variance/metric": 73.52007293701172, "action_level_variance_full_gradient/metric": 3748.089599609375, "adam_stats/lr_effective_max": 4.4207667087903246e-05, "adam_stats/lr_effective_mean": 6.281068720692673e-11, "adam_stats/lr_effective_min": -4.406464358908124e-05, "adam_stats/m_t_max": 0.0003426398616284132, "adam_stats/m_t_mean": -7.402564672354117e-12, "adam_stats/m_t_min": -0.0003462436143308878, "adam_stats/v_t_max": 2.296101774845738e-05, "adam_stats/v_t_mean": 2.0164460379223925e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.04781309887766838, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.8485857248306274, "all_logprobs": -0.03410637006163597, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.125, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.00408935546875, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.0888671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04843709245324135, "clip_ratio": 0.0, "completion_length": 517.7578125, "completion_length/correct": 451.5373229980469, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 401.0, "completion_length/correct/min": 93.0, "completion_length/correct/p25": 317.0, "completion_length/correct/p75": 568.5, "completion_length/correct/var": 34473.34375, "completion_length/incorrect": 759.7636108398438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 746.0, "completion_length/incorrect/min": 225.0, "completion_length/incorrect/p25": 573.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 58084.42578125, "completion_length/max": 1024.0, "completion_length/median": 453.0, "completion_length/min": 93.0, "completion_length/p25": 341.25, "completion_length/p75": 650.25, "completion_length/var": 55523.56640625, "epoch": 0.64, "feature_vector_variance/max_squared_error": 148070.671875, "feature_vector_variance/metric": 28963.728515625, "generated_tokens/total": 22280870.0, "grad_norm": 0.09934690594673157, "grouped_std_rewards": 0.15413199365139008, "learning_rate": 8.80236133250198e-06, "loss": 0.0478, "mean_logprobs": -0.034912109375, "mean_logprobs/var": 0.0004863739013671875, "num_completions/total": 38400, "per_sentence_gradient_norm": 1.286210536956787, "per_sentence_gradient_norm/max": 141.21702575683594, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 33.48344039916992, "per_sentence_gradient_norm/var": 71.95943450927734, "per_token_feature_norm": 197.64486694335938, "per_token_feature_norm/max": 302.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 73.0, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 211.0, "per_token_feature_norm/var": 474.4320373535156, "per_token_full_gradient_variance/max_squared_error": 348.6483154296875, "per_token_full_gradient_variance/variance": 0.021286670118570328, "per_token_gradient_norm": 1.6039996147155762, "per_token_gradient_norm/max": 4906.5966796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2026.625244140625, "per_token_policy_error_norm": 0.019746137782931328, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.018166176974773407, "policy_entropy": 0.037356771528720856, "policy_entropy/max": 3.65625, "policy_entropy/median": 1.9674189388751984e-08, "policy_entropy/min": 2.2632720350634905e-18, "policy_entropy/p25": 2.8558133635669947e-10, "policy_entropy/p75": 6.616115570068359e-06, "policy_entropy/var": 0.022034483030438423, "policy_error_vector_variance/max_squared_error": 2.0028328895568848, "policy_error_vector_variance/metric": 0.019730431959033012, "policy_loss": 0.04781309887766838, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -7.481915473937988, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.8485854864120483, "policy_sharpness": 9.101045608520508, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.491157531738281, "reward": 0.78515625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1689058393239975, "rewards/accuracy_reward": 0.78515625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1689058393239975, "sentence_full_gradient_variance/max_squared_error": 1791069.625, "sentence_full_gradient_variance/metric": 4233.4130859375, "sentence_full_gradient_variance/p75": 125.62458801269531, "sentence_full_gradient_variance/p90": 205.76876831054688, "sentence_full_gradient_variance/p95": 205.76876831054688, "sentence_full_gradient_variance/p99": 50407.0390625, "state_level_variance/metric": 7.614994525909424, "state_level_variance_full_gradient/metric": 485.32244873046875, "step": 50 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18774448335170746, "action_level_variance/metric": 100.82513427734375, "action_level_variance_full_gradient/metric": 2917.42919921875, "adam_stats/lr_effective_max": 4.297833947930485e-05, "adam_stats/lr_effective_mean": 4.079753548880127e-11, "adam_stats/lr_effective_min": -4.115154661121778e-05, "adam_stats/m_t_max": 0.000364023755537346, "adam_stats/m_t_mean": -6.575140171910521e-12, "adam_stats/m_t_min": -0.00028902123449370265, "adam_stats/v_t_max": 2.2938402253203094e-05, "adam_stats/v_t_mean": 2.0148591996227427e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0805596262216568, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -7.48191499710083, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.9069126844406128, "all_logprobs": -0.0305546335875988, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.4375, "all_logprobs/p1": -0.86328125, "all_logprobs/p10": -0.0021820068359375, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04418036341667175, "clip_ratio": 0.0, "completion_length": 572.7396240234375, "completion_length/correct": 481.54339599609375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 431.0, "completion_length/correct/min": 94.0, "completion_length/correct/p25": 328.0, "completion_length/correct/p75": 599.5, "completion_length/correct/var": 44493.99609375, "completion_length/incorrect": 846.328125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 249.0, "completion_length/incorrect/p25": 688.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 60618.56640625, "completion_length/max": 1024.0, "completion_length/median": 491.0, "completion_length/min": 94.0, "completion_length/p25": 358.0, "completion_length/p75": 793.5, "completion_length/var": 73434.1171875, "epoch": 0.6528, "feature_vector_variance/max_squared_error": 152953.875, "feature_vector_variance/metric": 28567.419921875, "generated_tokens/total": 22720734.0, "grad_norm": 0.0981600433588028, "grouped_std_rewards": 0.16527000069618225, "learning_rate": 8.543798257200491e-06, "loss": -0.0806, "mean_logprobs": -0.03173828125, "mean_logprobs/var": 0.000377655029296875, "num_completions/total": 39168, "per_sentence_gradient_norm": 1.5859431028366089, "per_sentence_gradient_norm/max": 136.18182373046875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 39.236595153808594, "per_sentence_gradient_norm/var": 98.43810272216797, "per_token_feature_norm": 199.0283660888672, "per_token_feature_norm/max": 306.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 75.5, "per_token_feature_norm/p25": 187.0, "per_token_feature_norm/p75": 212.0, "per_token_feature_norm/var": 462.53912353515625, "per_token_full_gradient_variance/max_squared_error": 105.11579895019531, "per_token_full_gradient_variance/variance": 0.020424295216798782, "per_token_gradient_norm": 1.7260948419570923, "per_token_gradient_norm/max": 5325.6591796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2093.141845703125, "per_token_policy_error_norm": 0.01753012090921402, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01612759940326214, "policy_entropy": 0.03365642949938774, "policy_entropy/max": 3.78125, "policy_entropy/median": 1.1117663234472275e-08, "policy_entropy/min": 3.6253010142484055e-19, "policy_entropy/p25": 1.9099388737231493e-10, "policy_entropy/p75": 2.86102294921875e-06, "policy_entropy/var": 0.019920576363801956, "policy_error_vector_variance/max_squared_error": 2.005519151687622, "policy_error_vector_variance/metric": 0.01751779019832611, "policy_loss": -0.0805596262216568, "policy_loss/max": 7.48191499710083, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.906912922859192, "policy_sharpness": 9.181854248046875, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.038948059082031, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.18774448335170746, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18774448335170746, "sentence_full_gradient_variance/max_squared_error": 502116.4375, "sentence_full_gradient_variance/metric": 3298.8564453125, "sentence_full_gradient_variance/p75": 95.15219116210938, "sentence_full_gradient_variance/p90": 138.5269775390625, "sentence_full_gradient_variance/p95": 138.5269775390625, "sentence_full_gradient_variance/p99": 73763.1953125, "state_level_variance/metric": 10.194113731384277, "state_level_variance_full_gradient/metric": 381.42742919921875, "step": 51 }, { "accuracy_reward": 0.7864583730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1681605875492096, "action_level_variance/metric": 76.58629608154297, "action_level_variance_full_gradient/metric": 3228.76318359375, "adam_stats/lr_effective_max": 4.201960837235674e-05, "adam_stats/lr_effective_mean": 1.768483623432182e-11, "adam_stats/lr_effective_min": -4.158788578934036e-05, "adam_stats/m_t_max": 0.00033617016742937267, "adam_stats/m_t_mean": -9.931798439222206e-12, "adam_stats/m_t_min": -0.00032171266502700746, "adam_stats/v_t_max": 2.2915990484762006e-05, "adam_stats/v_t_mean": 2.013743555587255e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.06627629697322845, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.087021827697754, "all_logprobs": -0.03429828956723213, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.125, "all_logprobs/p1": -0.9765625, "all_logprobs/p10": -0.00408935546875, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.0830078125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.049588315188884735, "clip_ratio": 0.0, "completion_length": 530.09375, "completion_length/correct": 452.30462646484375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 403.0, "completion_length/correct/min": 119.0, "completion_length/correct/p25": 306.75, "completion_length/correct/p75": 564.25, "completion_length/correct/var": 37774.37109375, "completion_length/incorrect": 816.5853271484375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 987.0, "completion_length/incorrect/min": 235.0, "completion_length/incorrect/p25": 617.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 66937.3359375, "completion_length/max": 1024.0, "completion_length/median": 447.0, "completion_length/min": 119.0, "completion_length/p25": 324.0, "completion_length/p75": 704.75, "completion_length/var": 66237.7109375, "epoch": 0.6656, "feature_vector_variance/max_squared_error": 156580.359375, "feature_vector_variance/metric": 29093.5078125, "generated_tokens/total": 23127846.0, "grad_norm": 0.12619774043560028, "grouped_std_rewards": 0.18206331133842468, "learning_rate": 8.283963474507402e-06, "loss": -0.0663, "mean_logprobs": -0.033935546875, "mean_logprobs/var": 0.00060272216796875, "num_completions/total": 39936, "per_sentence_gradient_norm": 1.5089685916900635, "per_sentence_gradient_norm/max": 125.33257293701172, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 8.370205879211426, "per_sentence_gradient_norm/p99": 38.3498420715332, "per_sentence_gradient_norm/var": 74.40618133544922, "per_token_feature_norm": 198.57864379882812, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 69.5, "per_token_feature_norm/p25": 187.0, "per_token_feature_norm/p75": 211.0, "per_token_feature_norm/var": 457.8731994628906, "per_token_full_gradient_variance/max_squared_error": 117.8856430053711, "per_token_full_gradient_variance/variance": 0.023324280977249146, "per_token_gradient_norm": 1.8870456218719482, "per_token_gradient_norm/max": 3536.50537109375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2175.454345703125, "per_token_policy_error_norm": 0.019592292606830597, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01790560409426689, "policy_entropy": 0.0377926230430603, "policy_entropy/max": 3.671875, "policy_entropy/median": 2.0372681319713593e-08, "policy_entropy/min": 2.168404344971009e-18, "policy_entropy/p25": 2.9467628337442875e-10, "policy_entropy/p75": 7.361173629760742e-06, "policy_entropy/var": 0.02365075796842575, "policy_error_vector_variance/max_squared_error": 2.002969741821289, "policy_error_vector_variance/metric": 0.019578633829951286, "policy_loss": -0.06627629697322845, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.087021827697754, "policy_sharpness": 9.104477882385254, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.446413040161133, "reward": 0.7864583730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1681605875492096, "rewards/accuracy_reward": 0.7864583730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1681605875492096, "sentence_full_gradient_variance/max_squared_error": 750319.5625, "sentence_full_gradient_variance/metric": 3601.8642578125, "sentence_full_gradient_variance/p75": 137.072265625, "sentence_full_gradient_variance/p90": 322.02813720703125, "sentence_full_gradient_variance/p95": 322.02813720703125, "sentence_full_gradient_variance/p99": 66571.75, "state_level_variance/metric": 7.373103618621826, "state_level_variance_full_gradient/metric": 373.10162353515625, "step": 52 }, { "accuracy_reward": 0.7942708730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16361773014068604, "action_level_variance/metric": 92.91302490234375, "action_level_variance_full_gradient/metric": 2642.766357421875, "adam_stats/lr_effective_max": 3.841586294583976e-05, "adam_stats/lr_effective_mean": 5.276691583677717e-11, "adam_stats/lr_effective_min": -3.7595993489958346e-05, "adam_stats/m_t_max": 0.0002787112898658961, "adam_stats/m_t_mean": -8.732570222491631e-12, "adam_stats/m_t_min": -0.00026760686887428164, "adam_stats/v_t_max": 2.289307667524554e-05, "adam_stats/v_t_mean": 2.0119383589700668e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.09123950451612473, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.0024046897888184, "all_logprobs": -0.03439578786492348, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.6875, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.00520932674407959, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.0908203125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.049719974398612976, "clip_ratio": 0.0, "completion_length": 500.7083435058594, "completion_length/correct": 439.97540283203125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 393.0, "completion_length/correct/min": 122.0, "completion_length/correct/p25": 291.25, "completion_length/correct/p75": 530.5, "completion_length/correct/var": 40783.60546875, "completion_length/incorrect": 735.1835327148438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 740.0, "completion_length/incorrect/min": 168.0, "completion_length/incorrect/p25": 502.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 64628.12890625, "completion_length/max": 1024.0, "completion_length/median": 442.0, "completion_length/min": 122.0, "completion_length/p25": 318.75, "completion_length/p75": 656.5, "completion_length/var": 59870.19140625, "epoch": 0.6784, "feature_vector_variance/max_squared_error": 147014.765625, "feature_vector_variance/metric": 28976.57421875, "generated_tokens/total": 23512390.0, "grad_norm": 0.06166474521160126, "grouped_std_rewards": 0.12598565220832825, "learning_rate": 8.02317355308094e-06, "loss": -0.0912, "mean_logprobs": -0.034423828125, "mean_logprobs/var": 0.0005645751953125, "num_completions/total": 40704, "per_sentence_gradient_norm": 1.4301173686981201, "per_sentence_gradient_norm/max": 131.34500122070312, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 47.63914108276367, "per_sentence_gradient_norm/var": 90.98626708984375, "per_token_feature_norm": 198.8873748779297, "per_token_feature_norm/max": 304.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 70.5, "per_token_feature_norm/p25": 187.0, "per_token_feature_norm/p75": 212.0, "per_token_feature_norm/var": 476.52911376953125, "per_token_full_gradient_variance/max_squared_error": 4485364.5, "per_token_full_gradient_variance/variance": 11.692843437194824, "per_token_gradient_norm": 1.9290094375610352, "per_token_gradient_norm/max": 4566.24267578125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2441.977294921875, "per_token_policy_error_norm": 0.019670924171805382, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0181269608438015, "policy_entropy": 0.03795613721013069, "policy_entropy/max": 3.671875, "policy_entropy/median": 1.6880221664905548e-08, "policy_entropy/min": 1.7753810574450135e-18, "policy_entropy/p25": 2.419255906715989e-10, "policy_entropy/p75": 6.794929504394531e-06, "policy_entropy/var": 0.022865379229187965, "policy_error_vector_variance/max_squared_error": 2.0064597129821777, "policy_error_vector_variance/metric": 0.019653309136629105, "policy_loss": -0.09123949706554413, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.0024046897888184, "policy_sharpness": 9.086730003356934, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.535361289978027, "reward": 0.7942708730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16361773014068604, "rewards/accuracy_reward": 0.7942708730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16361773014068604, "sentence_full_gradient_variance/max_squared_error": 457620.71875, "sentence_full_gradient_variance/metric": 2998.07373046875, "sentence_full_gradient_variance/p75": 61.0734748840332, "sentence_full_gradient_variance/p90": 61.62657165527344, "sentence_full_gradient_variance/p95": 61.62657165527344, "sentence_full_gradient_variance/p99": 83135.671875, "state_level_variance/metric": 9.669617652893066, "state_level_variance_full_gradient/metric": 355.30694580078125, "step": 53 }, { "accuracy_reward": 0.828125, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14251956343650818, "action_level_variance/metric": 91.13426208496094, "action_level_variance_full_gradient/metric": 4356.75, "adam_stats/lr_effective_max": 3.684087278088555e-05, "adam_stats/lr_effective_mean": -6.796015139531875e-12, "adam_stats/lr_effective_min": -3.865855251206085e-05, "adam_stats/m_t_max": 0.00028555389144457877, "adam_stats/m_t_mean": -1.1106398786764338e-11, "adam_stats/m_t_min": -0.0002827185089699924, "adam_stats/v_t_max": 2.2870584871270694e-05, "adam_stats/v_t_mean": 2.0104926837932746e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.09443986415863037, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.039379835128784, "all_logprobs": -0.03288775682449341, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.625, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.0033660531044006348, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.0791015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.047355737537145615, "clip_ratio": 0.0, "completion_length": 477.71484375, "completion_length/correct": 427.0660095214844, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 370.0, "completion_length/correct/min": 86.0, "completion_length/correct/p25": 291.0, "completion_length/correct/p75": 524.0, "completion_length/correct/var": 38632.3984375, "completion_length/incorrect": 721.75, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 707.0, "completion_length/incorrect/min": 143.0, "completion_length/incorrect/p25": 515.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 69973.4453125, "completion_length/max": 1024.0, "completion_length/median": 410.0, "completion_length/min": 86.0, "completion_length/p25": 302.75, "completion_length/p75": 601.25, "completion_length/var": 56311.1328125, "epoch": 0.6912, "feature_vector_variance/max_squared_error": 160594.109375, "feature_vector_variance/metric": 29089.890625, "generated_tokens/total": 23879276.0, "grad_norm": 0.10627609491348267, "grouped_std_rewards": 0.13511034846305847, "learning_rate": 7.76174622526876e-06, "loss": 0.0944, "mean_logprobs": -0.035400390625, "mean_logprobs/var": 0.000553131103515625, "num_completions/total": 41472, "per_sentence_gradient_norm": 1.418466329574585, "per_sentence_gradient_norm/max": 139.7432861328125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 47.153621673583984, "per_sentence_gradient_norm/var": 89.23841857910156, "per_token_feature_norm": 197.52976989746094, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 73.0, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 475.0284118652344, "per_token_full_gradient_variance/max_squared_error": 116.26152801513672, "per_token_full_gradient_variance/variance": 0.031036539003252983, "per_token_gradient_norm": 2.0812482833862305, "per_token_gradient_norm/max": 5751.681640625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3308.881103515625, "per_token_policy_error_norm": 0.018953587859869003, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01734403893351555, "policy_entropy": 0.03613477945327759, "policy_entropy/max": 3.59375, "policy_entropy/median": 1.594889909029007e-08, "policy_entropy/min": 6.437450399132683e-19, "policy_entropy/p25": 2.419255906715989e-10, "policy_entropy/p75": 5.155801773071289e-06, "policy_entropy/var": 0.02117035537958145, "policy_error_vector_variance/max_squared_error": 2.0040383338928223, "policy_error_vector_variance/metric": 0.018939951434731483, "policy_loss": 0.09443986415863037, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.0393800735473633, "policy_sharpness": 9.126534461975098, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.324803352355957, "reward": 0.828125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14251956343650818, "rewards/accuracy_reward": 0.828125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14251956343650818, "sentence_full_gradient_variance/max_squared_error": 1777809.5, "sentence_full_gradient_variance/metric": 4876.525390625, "sentence_full_gradient_variance/p75": 111.3424301147461, "sentence_full_gradient_variance/p90": 611.5135498046875, "sentence_full_gradient_variance/p95": 611.5135498046875, "sentence_full_gradient_variance/p99": 59630.83984375, "state_level_variance/metric": 9.478470802307129, "state_level_variance_full_gradient/metric": 519.7754516601562, "step": 54 }, { "accuracy_reward": 0.86328125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.11818061769008636, "action_level_variance/metric": 74.89562225341797, "action_level_variance_full_gradient/metric": 3349.5439453125, "adam_stats/lr_effective_max": 3.693896360346116e-05, "adam_stats/lr_effective_mean": -8.872808043847158e-11, "adam_stats/lr_effective_min": -3.903305696439929e-05, "adam_stats/m_t_max": 0.00037481277831830084, "adam_stats/m_t_mean": -1.1322279051817485e-11, "adam_stats/m_t_min": -0.0003282611141912639, "adam_stats/v_t_max": 2.284776564920321e-05, "adam_stats/v_t_mean": 2.0104154885985936e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.007940651848912239, "advantages/max": 12.9586820602417, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.757588267326355, "all_logprobs": -0.030832352116703987, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.0, "all_logprobs/p1": -0.8984375, "all_logprobs/p10": -0.0028076171875, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.06982421875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04383036866784096, "clip_ratio": 0.0, "completion_length": 453.99090576171875, "completion_length/correct": 406.6229248046875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 370.0, "completion_length/correct/min": 124.0, "completion_length/correct/p25": 281.0, "completion_length/correct/p75": 515.0, "completion_length/correct/var": 28819.419921875, "completion_length/incorrect": 753.0857543945312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 762.0, "completion_length/incorrect/min": 262.0, "completion_length/incorrect/p25": 494.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 69413.203125, "completion_length/max": 1024.0, "completion_length/median": 391.0, "completion_length/min": 124.0, "completion_length/p25": 289.0, "completion_length/p75": 560.25, "completion_length/var": 48472.06640625, "epoch": 0.704, "feature_vector_variance/max_squared_error": 150330.53125, "feature_vector_variance/metric": 28991.5234375, "generated_tokens/total": 24227940.0, "grad_norm": 0.18597164750099182, "grouped_std_rewards": 0.0828557163476944, "learning_rate": 7.5e-06, "loss": 0.0079, "mean_logprobs": -0.031982421875, "mean_logprobs/var": 0.000308990478515625, "num_completions/total": 42240, "per_sentence_gradient_norm": 0.9769837260246277, "per_sentence_gradient_norm/max": 138.31759643554688, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 33.5466423034668, "per_sentence_gradient_norm/var": 74.03752136230469, "per_token_feature_norm": 198.4821319580078, "per_token_feature_norm/max": 312.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 74.0, "per_token_feature_norm/p25": 187.0, "per_token_feature_norm/p75": 211.0, "per_token_feature_norm/var": 480.3672180175781, "per_token_full_gradient_variance/max_squared_error": 262.4056091308594, "per_token_full_gradient_variance/variance": 0.0224364772439003, "per_token_gradient_norm": 1.3227635622024536, "per_token_gradient_norm/max": 5205.04345703125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2426.238037109375, "per_token_policy_error_norm": 0.01786472462117672, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.016375094652175903, "policy_entropy": 0.033988241106271744, "policy_entropy/max": 3.34375, "policy_entropy/median": 1.7462298274040222e-08, "policy_entropy/min": 2.290377089375628e-18, "policy_entropy/p25": 2.5283952709287405e-10, "policy_entropy/p75": 4.9173831939697266e-06, "policy_entropy/var": 0.01909100078046322, "policy_error_vector_variance/max_squared_error": 2.003871440887451, "policy_error_vector_variance/metric": 0.017854273319244385, "policy_loss": 0.007940655574202538, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.757588267326355, "policy_sharpness": 9.159157752990723, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.121413230895996, "reward": 0.86328125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.11818061769008636, "rewards/accuracy_reward": 0.86328125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.11818061769008636, "sentence_full_gradient_variance/max_squared_error": 1744011.0, "sentence_full_gradient_variance/metric": 3797.333740234375, "sentence_full_gradient_variance/p75": 63.74579620361328, "sentence_full_gradient_variance/p90": 73.93132019042969, "sentence_full_gradient_variance/p95": 73.93132019042969, "sentence_full_gradient_variance/p99": 54157.29296875, "state_level_variance/metric": 8.495954513549805, "state_level_variance_full_gradient/metric": 447.7898864746094, "step": 55 }, { "accuracy_reward": 0.859375, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.12100717425346375, "action_level_variance/metric": 167.65768432617188, "action_level_variance_full_gradient/metric": 4577.97021484375, "adam_stats/lr_effective_max": 3.7552050343947485e-05, "adam_stats/lr_effective_mean": -4.0795249123259936e-11, "adam_stats/lr_effective_min": -3.685969204525463e-05, "adam_stats/m_t_max": 0.00032111903419718146, "adam_stats/m_t_mean": -7.067716637637611e-12, "adam_stats/m_t_min": -0.0005240729078650475, "adam_stats/v_t_max": 2.2825097403256223e-05, "adam_stats/v_t_mean": 2.0102732412735635e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.11526009440422058, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.1072144508361816, "all_logprobs": -0.03103545308113098, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.0, "all_logprobs/p1": -0.90625, "all_logprobs/p10": -0.0024871826171875, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.0654296875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.0452352911233902, "clip_ratio": 0.0, "completion_length": 477.2005310058594, "completion_length/correct": 421.99847412109375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 375.0, "completion_length/correct/min": 118.0, "completion_length/correct/p25": 285.0, "completion_length/correct/p75": 521.0, "completion_length/correct/var": 35370.30859375, "completion_length/incorrect": 814.5463256835938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 895.0, "completion_length/incorrect/min": 266.0, "completion_length/incorrect/p25": 637.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 58324.62890625, "completion_length/max": 1024.0, "completion_length/median": 401.0, "completion_length/min": 118.0, "completion_length/p25": 295.0, "completion_length/p75": 620.25, "completion_length/var": 57172.8828125, "epoch": 0.7168, "feature_vector_variance/max_squared_error": 146482.28125, "feature_vector_variance/metric": 28701.0234375, "generated_tokens/total": 24594430.0, "grad_norm": 0.17667049169540405, "grouped_std_rewards": 0.12436680495738983, "learning_rate": 7.238253774731245e-06, "loss": 0.1153, "mean_logprobs": -0.03125, "mean_logprobs/var": 0.00040435791015625, "num_completions/total": 43008, "per_sentence_gradient_norm": 1.570401668548584, "per_sentence_gradient_norm/max": 195.431396484375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 37.330142974853516, "per_sentence_gradient_norm/var": 165.4069061279297, "per_token_feature_norm": 197.76171875, "per_token_feature_norm/max": 308.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 68.5, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 211.0, "per_token_feature_norm/var": 475.9422912597656, "per_token_full_gradient_variance/max_squared_error": 513.2669677734375, "per_token_full_gradient_variance/variance": 0.05204614996910095, "per_token_gradient_norm": 2.440117359161377, "per_token_gradient_norm/max": 6817.5869140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5717.08740234375, "per_token_policy_error_norm": 0.01779615506529808, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.016448743641376495, "policy_entropy": 0.03411203250288963, "policy_entropy/max": 3.609375, "policy_entropy/median": 1.5366822481155396e-08, "policy_entropy/min": 1.0842021724855044e-19, "policy_entropy/p25": 2.15550244320184e-10, "policy_entropy/p75": 4.6193599700927734e-06, "policy_entropy/var": 0.020174281671643257, "policy_error_vector_variance/max_squared_error": 2.005399465560913, "policy_error_vector_variance/metric": 0.01777823641896248, "policy_loss": 0.11526010930538177, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.1072146892547607, "policy_sharpness": 9.16269302368164, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.129635334014893, "reward": 0.859375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.12100717425346375, "rewards/accuracy_reward": 0.859375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.12100717425346375, "sentence_full_gradient_variance/max_squared_error": 2206607.0, "sentence_full_gradient_variance/metric": 5182.361328125, "sentence_full_gradient_variance/p75": 65.25640869140625, "sentence_full_gradient_variance/p90": 244.49940490722656, "sentence_full_gradient_variance/p95": 244.49940490722656, "sentence_full_gradient_variance/p99": 42298.88671875, "state_level_variance/metric": 18.68568992614746, "state_level_variance_full_gradient/metric": 604.3917236328125, "step": 56 }, { "accuracy_reward": 0.76171875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.181739941239357, "action_level_variance/metric": 62.58573532104492, "action_level_variance_full_gradient/metric": 2966.14794921875, "adam_stats/lr_effective_max": 3.599392221076414e-05, "adam_stats/lr_effective_mean": -2.094430011367887e-11, "adam_stats/lr_effective_min": -3.746163201867603e-05, "adam_stats/m_t_max": 0.0003038844442926347, "adam_stats/m_t_mean": -6.6399320937382544e-12, "adam_stats/m_t_min": -0.0004893086152151227, "adam_stats/v_t_max": 2.2802445528213866e-05, "adam_stats/v_t_mean": 2.0088394923206687e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.02320738695561886, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.2339353561401367, "all_logprobs": -0.030547086149454117, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.25, "all_logprobs/p1": -0.87109375, "all_logprobs/p10": -0.002471923828125, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.0634765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.043376624584198, "clip_ratio": 0.0, "completion_length": 519.9935302734375, "completion_length/correct": 448.6068420410156, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 416.0, "completion_length/correct/min": 87.0, "completion_length/correct/p25": 303.0, "completion_length/correct/p75": 573.0, "completion_length/correct/var": 37081.74609375, "completion_length/incorrect": 748.1967163085938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 782.0, "completion_length/incorrect/min": 153.0, "completion_length/incorrect/p25": 505.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 73133.984375, "completion_length/max": 1024.0, "completion_length/median": 458.0, "completion_length/min": 87.0, "completion_length/p25": 326.0, "completion_length/p75": 687.5, "completion_length/var": 61900.06640625, "epoch": 0.7296, "feature_vector_variance/max_squared_error": 156747.359375, "feature_vector_variance/metric": 28328.919921875, "generated_tokens/total": 24993784.0, "grad_norm": 0.12535105645656586, "grouped_std_rewards": 0.12445415556430817, "learning_rate": 6.976826446919061e-06, "loss": -0.0232, "mean_logprobs": -0.0322265625, "mean_logprobs/var": 0.0002956390380859375, "num_completions/total": 43776, "per_sentence_gradient_norm": 1.2095239162445068, "per_sentence_gradient_norm/max": 114.3813705444336, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 42.9117546081543, "per_sentence_gradient_norm/var": 61.20247268676758, "per_token_feature_norm": 198.76364135742188, "per_token_feature_norm/max": 308.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 73.5, "per_token_feature_norm/p25": 187.0, "per_token_feature_norm/p75": 212.0, "per_token_feature_norm/var": 461.090576171875, "per_token_full_gradient_variance/max_squared_error": 234.0971221923828, "per_token_full_gradient_variance/variance": 0.024927884340286255, "per_token_gradient_norm": 1.4863746166229248, "per_token_gradient_norm/max": 5198.85791015625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2294.5908203125, "per_token_policy_error_norm": 0.017758846282958984, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01650310307741165, "policy_entropy": 0.03330428898334503, "policy_entropy/max": 3.328125, "policy_entropy/median": 1.461012288928032e-08, "policy_entropy/min": 4.2690460541616737e-19, "policy_entropy/p25": 2.28283170145005e-10, "policy_entropy/p75": 3.874301910400391e-06, "policy_entropy/var": 0.018441492691636086, "policy_error_vector_variance/max_squared_error": 2.0026357173919678, "policy_error_vector_variance/metric": 0.0177511777728796, "policy_loss": -0.023207377642393112, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.2339353561401367, "policy_sharpness": 9.169955253601074, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.070244789123535, "reward": 0.76171875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.181739941239357, "rewards/accuracy_reward": 0.76171875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.181739941239357, "sentence_full_gradient_variance/max_squared_error": 629589.625, "sentence_full_gradient_variance/metric": 3364.813232421875, "sentence_full_gradient_variance/p75": 34.02701187133789, "sentence_full_gradient_variance/p90": 98.0513916015625, "sentence_full_gradient_variance/p95": 98.0513916015625, "sentence_full_gradient_variance/p99": 128297.28125, "state_level_variance/metric": 6.427218914031982, "state_level_variance_full_gradient/metric": 398.66546630859375, "step": 57 }, { "accuracy_reward": 0.82421875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1450711041688919, "action_level_variance/metric": 48.33625411987305, "action_level_variance_full_gradient/metric": 1966.900390625, "adam_stats/lr_effective_max": 3.321220356156118e-05, "adam_stats/lr_effective_mean": -2.7837371296968172e-11, "adam_stats/lr_effective_min": -3.303511766716838e-05, "adam_stats/m_t_max": 0.00031549367122352123, "adam_stats/m_t_mean": -6.466860033582655e-12, "adam_stats/m_t_min": -0.0003986067895311862, "adam_stats/v_t_max": 2.278002466482576e-05, "adam_stats/v_t_mean": 2.007570325257557e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.04450061544775963, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.848191499710083, "all_logprobs": -0.02971254289150238, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.25, "all_logprobs/p1": -0.83203125, "all_logprobs/p10": -0.00225830078125, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04311804100871086, "clip_ratio": 0.0, "completion_length": 504.53515625, "completion_length/correct": 448.6698303222656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 400.0, "completion_length/correct/min": 95.0, "completion_length/correct/p25": 329.0, "completion_length/correct/p75": 538.0, "completion_length/correct/var": 31502.404296875, "completion_length/incorrect": 766.4814453125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 783.0, "completion_length/incorrect/min": 222.0, "completion_length/incorrect/p25": 585.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 57236.58203125, "completion_length/max": 1024.0, "completion_length/median": 445.0, "completion_length/min": 95.0, "completion_length/p25": 337.0, "completion_length/p75": 611.0, "completion_length/var": 50610.06640625, "epoch": 0.7424, "feature_vector_variance/max_squared_error": 152204.109375, "feature_vector_variance/metric": 28595.966796875, "generated_tokens/total": 25381268.0, "grad_norm": 0.10345062613487244, "grouped_std_rewards": 0.11945618689060211, "learning_rate": 6.7160365254926005e-06, "loss": 0.0445, "mean_logprobs": -0.0296630859375, "mean_logprobs/var": 0.000308990478515625, "num_completions/total": 44544, "per_sentence_gradient_norm": 1.0675749778747559, "per_sentence_gradient_norm/max": 80.78334045410156, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 44.27606964111328, "per_sentence_gradient_norm/var": 47.2580680847168, "per_token_feature_norm": 198.78561401367188, "per_token_feature_norm/max": 304.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 73.0, "per_token_feature_norm/p25": 187.0, "per_token_feature_norm/p75": 212.0, "per_token_feature_norm/var": 468.93414306640625, "per_token_full_gradient_variance/max_squared_error": 175.96226501464844, "per_token_full_gradient_variance/variance": 0.020851528272032738, "per_token_gradient_norm": 1.3879337310791016, "per_token_gradient_norm/max": 3807.521728515625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1603.3121337890625, "per_token_policy_error_norm": 0.01704481989145279, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015604302287101746, "policy_entropy": 0.03291086480021477, "policy_entropy/max": 3.796875, "policy_entropy/median": 1.2223608791828156e-08, "policy_entropy/min": 3.211948935988307e-18, "policy_entropy/p25": 1.9190338207408786e-10, "policy_entropy/p75": 3.382563591003418e-06, "policy_entropy/var": 0.01902870088815689, "policy_error_vector_variance/max_squared_error": 2.004206418991089, "policy_error_vector_variance/metric": 0.017027778550982475, "policy_loss": 0.04450061544775963, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.8481913805007935, "policy_sharpness": 9.182394981384277, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.000380992889404, "reward": 0.82421875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1450711041688919, "rewards/accuracy_reward": 0.82421875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1450711041688919, "sentence_full_gradient_variance/max_squared_error": 576978.0, "sentence_full_gradient_variance/metric": 2225.607177734375, "sentence_full_gradient_variance/p75": 54.27877426147461, "sentence_full_gradient_variance/p90": 89.70731353759766, "sentence_full_gradient_variance/p95": 89.70731353759766, "sentence_full_gradient_variance/p99": 59392.4140625, "state_level_variance/metric": 4.953918933868408, "state_level_variance_full_gradient/metric": 258.70660400390625, "step": 58 }, { "accuracy_reward": 0.7786458730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17258122563362122, "action_level_variance/metric": 22.493541717529297, "action_level_variance_full_gradient/metric": 1635.240234375, "adam_stats/lr_effective_max": 3.113594721071422e-05, "adam_stats/lr_effective_mean": 2.2798415932889782e-11, "adam_stats/lr_effective_min": -3.172350989188999e-05, "adam_stats/m_t_max": 0.00045420590322464705, "adam_stats/m_t_mean": -6.030655575611776e-12, "adam_stats/m_t_min": -0.00031306486926041543, "adam_stats/v_t_max": 2.276366103615146e-05, "adam_stats/v_t_mean": 2.0064434055194758e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.011224164627492428, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.7929405570030212, "all_logprobs": -0.03203682228922844, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.6875, "all_logprobs/p1": -0.9708595275878906, "all_logprobs/p10": -0.00372314453125, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.0791015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04565040022134781, "clip_ratio": 0.0, "completion_length": 514.25390625, "completion_length/correct": 425.1922912597656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 384.0, "completion_length/correct/min": 127.0, "completion_length/correct/p25": 285.25, "completion_length/correct/p75": 518.0, "completion_length/correct/var": 34719.02734375, "completion_length/incorrect": 827.5411987304688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 182.0, "completion_length/incorrect/p25": 604.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 63288.78125, "completion_length/max": 1024.0, "completion_length/median": 447.0, "completion_length/min": 127.0, "completion_length/p25": 301.0, "completion_length/p75": 645.0, "completion_length/var": 68907.0390625, "epoch": 0.7552, "feature_vector_variance/max_squared_error": 140100.28125, "feature_vector_variance/metric": 28699.111328125, "generated_tokens/total": 25776216.0, "grad_norm": 0.14637437462806702, "grouped_std_rewards": 0.09687940776348114, "learning_rate": 6.456201742799511e-06, "loss": -0.0112, "mean_logprobs": -0.03271484375, "mean_logprobs/var": 0.000370025634765625, "num_completions/total": 45312, "per_sentence_gradient_norm": 0.6732929944992065, "per_sentence_gradient_norm/max": 69.64200592041016, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 27.039106369018555, "per_sentence_gradient_norm/var": 22.068952560424805, "per_token_feature_norm": 199.28192138671875, "per_token_feature_norm/max": 290.0, "per_token_feature_norm/median": 199.0, "per_token_feature_norm/min": 73.5, "per_token_feature_norm/p25": 188.0, "per_token_feature_norm/p75": 212.0, "per_token_feature_norm/var": 448.6182861328125, "per_token_full_gradient_variance/max_squared_error": 31.91147232055664, "per_token_full_gradient_variance/variance": 0.007784368470311165, "per_token_gradient_norm": 0.8054730296134949, "per_token_gradient_norm/max": 2726.72021484375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 736.70751953125, "per_token_policy_error_norm": 0.0185533594340086, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.017202885821461678, "policy_entropy": 0.03508191555738449, "policy_entropy/max": 3.6875, "policy_entropy/median": 1.548323780298233e-08, "policy_entropy/min": 8.029872339970767e-19, "policy_entropy/p25": 2.219167072325945e-10, "policy_entropy/p75": 5.617737770080566e-06, "policy_entropy/var": 0.019646944478154182, "policy_error_vector_variance/max_squared_error": 2.004002571105957, "policy_error_vector_variance/metric": 0.018542801961302757, "policy_loss": -0.011224163696169853, "policy_loss/max": 9.659051895141602, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.7929405570030212, "policy_sharpness": 9.124229431152344, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.298397064208984, "reward": 0.7786458730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17258122563362122, "rewards/accuracy_reward": 0.7786458730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17258122563362122, "sentence_full_gradient_variance/max_squared_error": 571879.1875, "sentence_full_gradient_variance/metric": 1832.658935546875, "sentence_full_gradient_variance/p75": 57.7735595703125, "sentence_full_gradient_variance/p90": 118.46334075927734, "sentence_full_gradient_variance/p95": 118.46334075927734, "sentence_full_gradient_variance/p99": 55177.28125, "state_level_variance/metric": 2.3831939697265625, "state_level_variance_full_gradient/metric": 197.4185791015625, "step": 59 }, { "accuracy_reward": 0.8059896230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15657426416873932, "action_level_variance/metric": 24.12972640991211, "action_level_variance_full_gradient/metric": 881.6996459960938, "adam_stats/lr_effective_max": 2.77076233032858e-05, "adam_stats/lr_effective_mean": 2.0767255970444154e-11, "adam_stats/lr_effective_min": -2.7734926334233023e-05, "adam_stats/m_t_max": 0.0004189896280877292, "adam_stats/m_t_mean": -4.990655458336768e-12, "adam_stats/m_t_min": -0.00027822976699098945, "adam_stats/v_t_max": 2.274090911669191e-05, "adam_stats/v_t_mean": 2.0044467387986264e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.061644356697797775, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -7.48191499710083, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.837215006351471, "all_logprobs": -0.029458213597536087, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.25, "all_logprobs/p1": -0.8359375, "all_logprobs/p10": -0.00193023681640625, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04101580008864403, "clip_ratio": 0.0, "completion_length": 532.3815307617188, "completion_length/correct": 462.9046936035156, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 420.0, "completion_length/correct/min": 132.0, "completion_length/correct/p25": 314.5, "completion_length/correct/p75": 590.0, "completion_length/correct/var": 37125.6171875, "completion_length/incorrect": 821.013427734375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 954.0, "completion_length/incorrect/min": 131.0, "completion_length/incorrect/p25": 667.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 63065.3515625, "completion_length/max": 1024.0, "completion_length/median": 484.0, "completion_length/min": 131.0, "completion_length/p25": 328.0, "completion_length/p75": 699.25, "completion_length/var": 62161.91015625, "epoch": 0.768, "feature_vector_variance/max_squared_error": 146141.328125, "feature_vector_variance/metric": 28187.91015625, "generated_tokens/total": 26185084.0, "grad_norm": 0.013867736794054508, "grouped_std_rewards": 0.10358648002147675, "learning_rate": 6.197638667498023e-06, "loss": -0.0616, "mean_logprobs": -0.029296875, "mean_logprobs/var": 0.0003070831298828125, "num_completions/total": 46080, "per_sentence_gradient_norm": 0.6645392179489136, "per_sentence_gradient_norm/max": 93.26346588134766, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 22.39567756652832, "per_sentence_gradient_norm/var": 23.718994140625, "per_token_feature_norm": 198.76817321777344, "per_token_feature_norm/max": 294.0, "per_token_feature_norm/median": 199.0, "per_token_feature_norm/min": 69.5, "per_token_feature_norm/p25": 187.0, "per_token_feature_norm/p75": 212.0, "per_token_feature_norm/var": 487.89593505859375, "per_token_full_gradient_variance/max_squared_error": 43.82465744018555, "per_token_full_gradient_variance/variance": 0.008881805464625359, "per_token_gradient_norm": 0.9160621166229248, "per_token_gradient_norm/max": 3684.619140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 866.1608276367188, "per_token_policy_error_norm": 0.017029937356710434, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015580808743834496, "policy_entropy": 0.03276904299855232, "policy_entropy/max": 3.703125, "policy_entropy/median": 9.487848728895187e-09, "policy_entropy/min": 8.063753657860939e-19, "policy_entropy/p25": 1.3915268937125802e-10, "policy_entropy/p75": 2.294778823852539e-06, "policy_entropy/var": 0.01881386525928974, "policy_error_vector_variance/max_squared_error": 2.0057880878448486, "policy_error_vector_variance/metric": 0.017021171748638153, "policy_loss": -0.061644356697797775, "policy_loss/max": 7.48191499710083, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.8372149467468262, "policy_sharpness": 9.20041561126709, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.930693626403809, "reward": 0.8059896230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15657426416873932, "rewards/accuracy_reward": 0.8059896230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15657426416873932, "sentence_full_gradient_variance/max_squared_error": 119943.9296875, "sentence_full_gradient_variance/metric": 976.2362060546875, "sentence_full_gradient_variance/p75": 48.314674377441406, "sentence_full_gradient_variance/p90": 157.1576385498047, "sentence_full_gradient_variance/p95": 157.1576385498047, "sentence_full_gradient_variance/p99": 39136.234375, "state_level_variance/metric": 2.6017043590545654, "state_level_variance_full_gradient/metric": 94.5367431640625, "step": 60 }, { "accuracy_reward": 0.77734375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17330607771873474, "action_level_variance/metric": 87.12081909179688, "action_level_variance_full_gradient/metric": 5249.35693359375, "adam_stats/lr_effective_max": 2.9335518775042146e-05, "adam_stats/lr_effective_mean": 2.081377986629107e-10, "adam_stats/lr_effective_min": -2.8503827707027085e-05, "adam_stats/m_t_max": 0.0007079060305841267, "adam_stats/m_t_mean": -6.680047140439349e-13, "adam_stats/m_t_min": -0.0003668092249426991, "adam_stats/v_t_max": 2.2719734261045232e-05, "adam_stats/v_t_mean": 2.005951828254471e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.08134337514638901, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.146554946899414, "all_logprobs": -0.030241969972848892, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.25, "all_logprobs/p1": -0.88671875, "all_logprobs/p10": -0.00225830078125, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04260196164250374, "clip_ratio": 0.0, "completion_length": 509.66278076171875, "completion_length/correct": 439.9045104980469, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 413.0, "completion_length/correct/min": 120.0, "completion_length/correct/p25": 310.0, "completion_length/correct/p75": 546.0, "completion_length/correct/var": 31535.3984375, "completion_length/incorrect": 753.2047119140625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 810.0, "completion_length/incorrect/min": 234.0, "completion_length/incorrect/p25": 541.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 73782.640625, "completion_length/max": 1024.0, "completion_length/median": 448.0, "completion_length/min": 120.0, "completion_length/p25": 324.0, "completion_length/p75": 628.0, "completion_length/var": 57869.2890625, "epoch": 0.7808, "feature_vector_variance/max_squared_error": 154867.25, "feature_vector_variance/metric": 28340.283203125, "generated_tokens/total": 26576504.0, "grad_norm": 0.2853810787200928, "grouped_std_rewards": 0.16206899285316467, "learning_rate": 5.9406623188668065e-06, "loss": 0.0813, "mean_logprobs": -0.03173828125, "mean_logprobs/var": 0.0003757476806640625, "num_completions/total": 46848, "per_sentence_gradient_norm": 1.520817518234253, "per_sentence_gradient_norm/max": 140.54713439941406, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 38.733951568603516, "per_sentence_gradient_norm/var": 84.91851043701172, "per_token_feature_norm": 197.7097625732422, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 74.0, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 211.0, "per_token_feature_norm/var": 507.16632080078125, "per_token_full_gradient_variance/max_squared_error": 151.95358276367188, "per_token_full_gradient_variance/variance": 0.02933243289589882, "per_token_gradient_norm": 1.7914764881134033, "per_token_gradient_norm/max": 5556.2216796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2567.790771484375, "per_token_policy_error_norm": 0.017482111230492592, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015897998586297035, "policy_entropy": 0.033604420721530914, "policy_entropy/max": 3.328125, "policy_entropy/median": 1.2631062418222427e-08, "policy_entropy/min": 8.436448154652831e-19, "policy_entropy/p25": 1.8553691916167736e-10, "policy_entropy/p75": 4.1425228118896484e-06, "policy_entropy/var": 0.01954706758260727, "policy_error_vector_variance/max_squared_error": 2.0038795471191406, "policy_error_vector_variance/metric": 0.01747215911746025, "policy_loss": 0.08134336769580841, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.146554946899414, "policy_sharpness": 9.18327522277832, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.029162406921387, "reward": 0.77734375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17330607771873474, "rewards/accuracy_reward": 0.77734375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17330607771873474, "sentence_full_gradient_variance/max_squared_error": 1460981.25, "sentence_full_gradient_variance/metric": 5941.9296875, "sentence_full_gradient_variance/p75": 134.30975341796875, "sentence_full_gradient_variance/p90": 281.8516845703125, "sentence_full_gradient_variance/p95": 281.8516845703125, "sentence_full_gradient_variance/p99": 112109.8359375, "state_level_variance/metric": 8.667505264282227, "state_level_variance_full_gradient/metric": 692.5729370117188, "step": 61 }, { "accuracy_reward": 0.8346354365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1381990909576416, "action_level_variance/metric": 120.71189880371094, "action_level_variance_full_gradient/metric": 2221.350341796875, "adam_stats/lr_effective_max": 2.7299691282678396e-05, "adam_stats/lr_effective_mean": 1.5919901508176793e-10, "adam_stats/lr_effective_min": -2.7752990717999637e-05, "adam_stats/m_t_max": 0.0006372763309627771, "adam_stats/m_t_mean": -1.6521335081259858e-12, "adam_stats/m_t_min": -0.0003717085055541247, "adam_stats/v_t_max": 2.2697133317706175e-05, "adam_stats/v_t_mean": 2.0044603997459998e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.030483968555927277, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.19111967086792, "all_logprobs": -0.03249012678861618, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.75, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.0032196044921875, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.0791015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04690080136060715, "clip_ratio": 0.0, "completion_length": 508.60418701171875, "completion_length/correct": 445.71917724609375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 399.0, "completion_length/correct/min": 101.0, "completion_length/correct/p25": 292.0, "completion_length/correct/p75": 571.0, "completion_length/correct/var": 41070.16796875, "completion_length/incorrect": 826.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 992.0, "completion_length/incorrect/min": 221.0, "completion_length/incorrect/p25": 628.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 63551.94140625, "completion_length/max": 1024.0, "completion_length/median": 454.0, "completion_length/min": 101.0, "completion_length/p25": 310.5, "completion_length/p75": 662.25, "completion_length/var": 64695.30078125, "epoch": 0.7936, "feature_vector_variance/max_squared_error": 145361.5625, "feature_vector_variance/metric": 28598.41015625, "generated_tokens/total": 26967112.0, "grad_norm": 0.09535453468561172, "grouped_std_rewards": 0.10394139587879181, "learning_rate": 5.685585783002493e-06, "loss": 0.0305, "mean_logprobs": -0.0341796875, "mean_logprobs/var": 0.0004253387451171875, "num_completions/total": 47616, "per_sentence_gradient_norm": 1.3751893043518066, "per_sentence_gradient_norm/max": 173.7648468017578, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 47.81253433227539, "per_sentence_gradient_norm/var": 118.97566986083984, "per_token_feature_norm": 198.26649475097656, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 187.0, "per_token_feature_norm/p75": 211.0, "per_token_feature_norm/var": 474.9961853027344, "per_token_full_gradient_variance/max_squared_error": 529.7218017578125, "per_token_full_gradient_variance/variance": 0.03350324183702469, "per_token_gradient_norm": 1.9513764381408691, "per_token_gradient_norm/max": 6705.78466796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3634.388671875, "per_token_policy_error_norm": 0.01879623346030712, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.017369449138641357, "policy_entropy": 0.035398781299591064, "policy_entropy/max": 3.796875, "policy_entropy/median": 1.4784745872020721e-08, "policy_entropy/min": 1.6855955650360577e-19, "policy_entropy/p25": 1.8735590856522322e-10, "policy_entropy/p75": 4.976987838745117e-06, "policy_entropy/var": 0.02063114568591118, "policy_error_vector_variance/max_squared_error": 2.00423002243042, "policy_error_vector_variance/metric": 0.018785715103149414, "policy_loss": 0.030483976006507874, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.19111967086792, "policy_sharpness": 9.133715629577637, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.279264450073242, "reward": 0.8346354365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1381990909576416, "rewards/accuracy_reward": 0.8346354365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1381990909576416, "sentence_full_gradient_variance/max_squared_error": 717864.625, "sentence_full_gradient_variance/metric": 2506.14892578125, "sentence_full_gradient_variance/p75": 51.10885238647461, "sentence_full_gradient_variance/p90": 110.65367889404297, "sentence_full_gradient_variance/p95": 110.65367889404297, "sentence_full_gradient_variance/p99": 46384.63671875, "state_level_variance/metric": 13.336767196655273, "state_level_variance_full_gradient/metric": 284.79864501953125, "step": 62 }, { "accuracy_reward": 0.6783854365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21846309304237366, "action_level_variance/metric": 30.393966674804688, "action_level_variance_full_gradient/metric": 1489.401611328125, "adam_stats/lr_effective_max": 2.5135852411040105e-05, "adam_stats/lr_effective_mean": 1.5232187733360547e-10, "adam_stats/lr_effective_min": -2.585727270343341e-05, "adam_stats/m_t_max": 0.0005728572723455727, "adam_stats/m_t_mean": -2.506233501980981e-12, "adam_stats/m_t_min": -0.00035036864574067295, "adam_stats/v_t_max": 2.267475429107435e-05, "adam_stats/v_t_mean": 2.0027595033778045e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.010929925367236137, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.1981618404388428, "all_logprobs": -0.030479006469249725, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.625, "all_logprobs/p1": -0.8828125, "all_logprobs/p10": -0.0019378662109375, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.044893741607666016, "clip_ratio": 0.0, "completion_length": 581.8046875, "completion_length/correct": 466.4203186035156, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 407.0, "completion_length/correct/min": 62.0, "completion_length/correct/p25": 301.0, "completion_length/correct/p75": 599.0, "completion_length/correct/var": 49282.55859375, "completion_length/incorrect": 825.186279296875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1005.0, "completion_length/incorrect/min": 131.0, "completion_length/incorrect/p25": 640.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 67111.140625, "completion_length/max": 1024.0, "completion_length/median": 512.0, "completion_length/min": 62.0, "completion_length/p25": 332.75, "completion_length/p75": 853.0, "completion_length/var": 83055.5, "epoch": 0.8064, "feature_vector_variance/max_squared_error": 164795.65625, "feature_vector_variance/metric": 28205.43359375, "generated_tokens/total": 27413940.0, "grad_norm": 0.08634080737829208, "grouped_std_rewards": 0.1827593445777893, "learning_rate": 5.432719831372507e-06, "loss": -0.0109, "mean_logprobs": -0.03271484375, "mean_logprobs/var": 0.000415802001953125, "num_completions/total": 48384, "per_sentence_gradient_norm": 1.0732150077819824, "per_sentence_gradient_norm/max": 54.84892654418945, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 5.690332412719727, "per_sentence_gradient_norm/p99": 30.097131729125977, "per_sentence_gradient_norm/var": 29.280302047729492, "per_token_feature_norm": 198.24984741210938, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 74.0, "per_token_feature_norm/p25": 187.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 442.3800354003906, "per_token_full_gradient_variance/max_squared_error": 43.037899017333984, "per_token_full_gradient_variance/variance": 0.011121705174446106, "per_token_gradient_norm": 1.277139663696289, "per_token_gradient_norm/max": 2688.008544921875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 958.36376953125, "per_token_policy_error_norm": 0.017572885379195213, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.016331452876329422, "policy_entropy": 0.03305112197995186, "policy_entropy/max": 3.78125, "policy_entropy/median": 1.2514647096395493e-08, "policy_entropy/min": 1.212951180468158e-18, "policy_entropy/p25": 1.8917489796876907e-10, "policy_entropy/p75": 2.9206275939941406e-06, "policy_entropy/var": 0.01934397779405117, "policy_error_vector_variance/max_squared_error": 2.002983808517456, "policy_error_vector_variance/metric": 0.01756543107330799, "policy_loss": -0.01092993002384901, "policy_loss/max": 9.659050941467285, "policy_loss/median": 0.0, "policy_loss/min": -7.48191499710083, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.1981618404388428, "policy_sharpness": 9.190502166748047, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.985070705413818, "reward": 0.6783854365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21846309304237366, "rewards/accuracy_reward": 0.6783854365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21846309304237366, "sentence_full_gradient_variance/max_squared_error": 266340.59375, "sentence_full_gradient_variance/metric": 1654.36669921875, "sentence_full_gradient_variance/p75": 149.85572814941406, "sentence_full_gradient_variance/p90": 181.9696807861328, "sentence_full_gradient_variance/p95": 181.9696807861328, "sentence_full_gradient_variance/p99": 48629.55859375, "state_level_variance/metric": 2.675323247909546, "state_level_variance_full_gradient/metric": 164.96536254882812, "step": 63 }, { "accuracy_reward": 0.80859375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15497168898582458, "action_level_variance/metric": 26.021581649780273, "action_level_variance_full_gradient/metric": 1279.1500244140625, "adam_stats/lr_effective_max": 2.4036982722464018e-05, "adam_stats/lr_effective_mean": 1.454978221238079e-10, "adam_stats/lr_effective_min": -2.3562542992294766e-05, "adam_stats/m_t_max": 0.0005150469951331615, "adam_stats/m_t_mean": -1.5348738447752697e-13, "adam_stats/m_t_min": -0.00032153064967133105, "adam_stats/v_t_max": 2.265214061480947e-05, "adam_stats/v_t_mean": 2.0011770018868447e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.015478725545108318, "advantages/max": 7.48191499710083, "advantages/median": -0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.0611200332641602, "all_logprobs": -0.02895362116396427, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.96875, "all_logprobs/p1": -0.83203125, "all_logprobs/p10": -0.00193023681640625, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.05908203125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.041344135999679565, "clip_ratio": 0.0, "completion_length": 524.703125, "completion_length/correct": 460.1159362792969, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 407.0, "completion_length/correct/min": 91.0, "completion_length/correct/p25": 297.0, "completion_length/correct/p75": 573.0, "completion_length/correct/var": 43100.73046875, "completion_length/incorrect": 797.551025390625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 912.0, "completion_length/incorrect/min": 203.0, "completion_length/incorrect/p25": 564.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 63090.63671875, "completion_length/max": 1024.0, "completion_length/median": 445.0, "completion_length/min": 91.0, "completion_length/p25": 319.0, "completion_length/p75": 677.5, "completion_length/var": 64495.109375, "epoch": 0.8192, "feature_vector_variance/max_squared_error": 156934.71875, "feature_vector_variance/metric": 28243.41015625, "generated_tokens/total": 27816912.0, "grad_norm": 0.09491833299398422, "grouped_std_rewards": 0.13953915238380432, "learning_rate": 5.182372542187895e-06, "loss": -0.0155, "mean_logprobs": -0.029541015625, "mean_logprobs/var": 0.00026702880859375, "num_completions/total": 49152, "per_sentence_gradient_norm": 0.8747681975364685, "per_sentence_gradient_norm/max": 59.15887451171875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 28.06096076965332, "per_sentence_gradient_norm/var": 25.289291381835938, "per_token_feature_norm": 196.72677612304688, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 462.384033203125, "per_token_full_gradient_variance/max_squared_error": 49.42164611816406, "per_token_full_gradient_variance/variance": 0.010164138861000538, "per_token_gradient_norm": 1.1107206344604492, "per_token_gradient_norm/max": 2324.20947265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 881.0913696289062, "per_token_policy_error_norm": 0.01673133485019207, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015475708059966564, "policy_entropy": 0.03181483969092369, "policy_entropy/max": 3.59375, "policy_entropy/median": 1.0477378964424133e-08, "policy_entropy/min": 5.454892180317694e-19, "policy_entropy/p25": 1.4006218407303095e-10, "policy_entropy/p75": 3.069639205932617e-06, "policy_entropy/var": 0.017987782135605812, "policy_error_vector_variance/max_squared_error": 2.002182722091675, "policy_error_vector_variance/metric": 0.016722004860639572, "policy_loss": -0.015478729270398617, "policy_loss/max": 9.659050941467285, "policy_loss/median": 0.0, "policy_loss/min": -7.48191499710083, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.0611200332641602, "policy_sharpness": 9.20672607421875, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.861709117889404, "reward": 0.80859375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15497168898582458, "rewards/accuracy_reward": 0.80859375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15497168898582458, "sentence_full_gradient_variance/max_squared_error": 260048.765625, "sentence_full_gradient_variance/metric": 1432.35498046875, "sentence_full_gradient_variance/p75": 65.35176086425781, "sentence_full_gradient_variance/p90": 65.39765167236328, "sentence_full_gradient_variance/p95": 65.39765167236328, "sentence_full_gradient_variance/p99": 33027.27734375, "state_level_variance/metric": 2.513662099838257, "state_level_variance_full_gradient/metric": 153.20513916015625, "step": 64 }, { "accuracy_reward": 0.7018229365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20954032242298126, "action_level_variance/metric": 80.65078735351562, "action_level_variance_full_gradient/metric": 4059.60107421875, "adam_stats/lr_effective_max": 2.328149821551051e-05, "adam_stats/lr_effective_mean": 1.428374779566255e-10, "adam_stats/lr_effective_min": -2.3745747967041098e-05, "adam_stats/m_t_max": 0.0004640817060135305, "adam_stats/m_t_mean": 2.1088024989429632e-12, "adam_stats/m_t_min": -0.00032815293525345623, "adam_stats/v_t_max": 2.2630105377174914e-05, "adam_stats/v_t_mean": 2.0005230111364014e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.039291612803936005, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.205132484436035, "all_logprobs": -0.034004323184490204, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.8125, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.0036468505859375, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.0791015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.051043327897787094, "clip_ratio": 0.0, "completion_length": 551.34375, "completion_length/correct": 450.7272644042969, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 420.0, "completion_length/correct/min": 113.0, "completion_length/correct/p25": 309.5, "completion_length/correct/p75": 551.5, "completion_length/correct/var": 35163.01953125, "completion_length/incorrect": 788.1659545898438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 894.0, "completion_length/incorrect/min": 107.0, "completion_length/incorrect/p25": 570.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 68452.1640625, "completion_length/max": 1024.0, "completion_length/median": 477.0, "completion_length/min": 107.0, "completion_length/p25": 340.0, "completion_length/p75": 728.0, "completion_length/var": 68872.0546875, "epoch": 0.832, "feature_vector_variance/max_squared_error": 156698.171875, "feature_vector_variance/metric": 28549.9609375, "generated_tokens/total": 28240344.0, "grad_norm": 0.13256819546222687, "grouped_std_rewards": 0.12068265676498413, "learning_rate": 4.934848925057485e-06, "loss": 0.0393, "mean_logprobs": -0.035400390625, "mean_logprobs/var": 0.001007080078125, "num_completions/total": 49920, "per_sentence_gradient_norm": 1.2433826923370361, "per_sentence_gradient_norm/max": 131.6454620361328, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 37.5911979675293, "per_sentence_gradient_norm/var": 79.20793151855469, "per_token_feature_norm": 197.94955444335938, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 187.0, "per_token_feature_norm/p75": 211.0, "per_token_feature_norm/var": 493.9382019042969, "per_token_full_gradient_variance/max_squared_error": 186.54244995117188, "per_token_full_gradient_variance/variance": 0.024517063051462173, "per_token_gradient_norm": 1.4548143148422241, "per_token_gradient_norm/max": 5257.61962890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2076.1240234375, "per_token_policy_error_norm": 0.019341643899679184, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01787254586815834, "policy_entropy": 0.037191104143857956, "policy_entropy/max": 3.609375, "policy_entropy/median": 1.4319084584712982e-08, "policy_entropy/min": 1.5924219408380846e-18, "policy_entropy/p25": 1.9281287677586079e-10, "policy_entropy/p75": 5.8710575103759766e-06, "policy_entropy/var": 0.023238984867930412, "policy_error_vector_variance/max_squared_error": 2.0055131912231445, "policy_error_vector_variance/metric": 0.01932169683277607, "policy_loss": 0.039291612803936005, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.205132484436035, "policy_sharpness": 9.115180969238281, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.4184770584106445, "reward": 0.7018229365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20954032242298126, "rewards/accuracy_reward": 0.7018229365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20954032242298126, "sentence_full_gradient_variance/max_squared_error": 1490144.125, "sentence_full_gradient_variance/metric": 4593.79296875, "sentence_full_gradient_variance/p75": 78.05760955810547, "sentence_full_gradient_variance/p90": 144.3651123046875, "sentence_full_gradient_variance/p95": 144.3651123046875, "sentence_full_gradient_variance/p99": 65136.73046875, "state_level_variance/metric": 8.62519359588623, "state_level_variance_full_gradient/metric": 534.19140625, "step": 65 }, { "accuracy_reward": 0.84375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.13200783729553223, "action_level_variance/metric": 120.19524383544922, "action_level_variance_full_gradient/metric": 4376.1015625, "adam_stats/lr_effective_max": 2.2484271539724432e-05, "adam_stats/lr_effective_mean": 1.1034054919756286e-10, "adam_stats/lr_effective_min": -2.3755155780236237e-05, "adam_stats/m_t_max": 0.00041538471123203635, "adam_stats/m_t_mean": 1.8729672760647853e-12, "adam_stats/m_t_min": -0.00030330082518048584, "adam_stats/v_t_max": 2.2607564460486174e-05, "adam_stats/v_t_mean": 1.998822982129944e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.04211072623729706, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.8428776264190674, "all_logprobs": -0.029837552458047867, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.71875, "all_logprobs/p1": -0.86328125, "all_logprobs/p10": -0.0020751953125, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04230443015694618, "clip_ratio": 0.0, "completion_length": 478.1015625, "completion_length/correct": 434.0601806640625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 387.0, "completion_length/correct/min": 90.0, "completion_length/correct/p25": 293.0, "completion_length/correct/p75": 536.0, "completion_length/correct/var": 37463.03125, "completion_length/incorrect": 715.925048828125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 648.0, "completion_length/incorrect/min": 198.0, "completion_length/incorrect/p25": 491.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 74027.4609375, "completion_length/max": 1024.0, "completion_length/median": 419.0, "completion_length/min": 90.0, "completion_length/p25": 304.75, "completion_length/p75": 589.5, "completion_length/var": 53574.890625, "epoch": 0.8448, "feature_vector_variance/max_squared_error": 164510.671875, "feature_vector_variance/metric": 28229.884765625, "generated_tokens/total": 28607524.0, "grad_norm": 0.07209963351488113, "grouped_std_rewards": 0.08627311885356903, "learning_rate": 4.6904505493806595e-06, "loss": -0.0421, "mean_logprobs": -0.031005859375, "mean_logprobs/var": 0.0003299713134765625, "num_completions/total": 50688, "per_sentence_gradient_norm": 1.2163022756576538, "per_sentence_gradient_norm/max": 213.443359375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 35.952152252197266, "per_sentence_gradient_norm/var": 118.8706283569336, "per_token_feature_norm": 197.2969970703125, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 66.5, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 499.4388732910156, "per_token_full_gradient_variance/max_squared_error": 335.6304931640625, "per_token_full_gradient_variance/variance": 0.03648115321993828, "per_token_gradient_norm": 1.7342239618301392, "per_token_gradient_norm/max": 6587.79736328125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3943.906494140625, "per_token_policy_error_norm": 0.017151618376374245, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015738438814878464, "policy_entropy": 0.03316846862435341, "policy_entropy/max": 3.8125, "policy_entropy/median": 1.257285475730896e-08, "policy_entropy/min": 7.894347068410079e-19, "policy_entropy/p25": 1.7917045624926686e-10, "policy_entropy/p75": 3.11434268951416e-06, "policy_entropy/var": 0.019633518531918526, "policy_error_vector_variance/max_squared_error": 2.005810260772705, "policy_error_vector_variance/metric": 0.01713927648961544, "policy_loss": -0.04211072251200676, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.79339027404785, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.8428776264190674, "policy_sharpness": 9.188093185424805, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.982117652893066, "reward": 0.84375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.13200783729553223, "rewards/accuracy_reward": 0.84375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.13200783729553223, "sentence_full_gradient_variance/max_squared_error": 904210.75, "sentence_full_gradient_variance/metric": 4925.8212890625, "sentence_full_gradient_variance/p75": 89.44700622558594, "sentence_full_gradient_variance/p90": 387.7218933105469, "sentence_full_gradient_variance/p95": 387.7218933105469, "sentence_full_gradient_variance/p99": 59515.03125, "state_level_variance/metric": 13.687592506408691, "state_level_variance_full_gradient/metric": 549.7200927734375, "step": 66 }, { "accuracy_reward": 0.8046875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1573704332113266, "action_level_variance/metric": 41.90507507324219, "action_level_variance_full_gradient/metric": 2370.865234375, "adam_stats/lr_effective_max": 2.1644338630721904e-05, "adam_stats/lr_effective_mean": 5.768888491908264e-11, "adam_stats/lr_effective_min": -2.1139165255590342e-05, "adam_stats/m_t_max": 0.000555498234461993, "adam_stats/m_t_mean": -1.9932985383802615e-12, "adam_stats/m_t_min": -0.0005617958959192038, "adam_stats/v_t_max": 2.25892636080971e-05, "adam_stats/v_t_mean": 1.9981828691673087e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.04191534221172333, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.4251978397369385, "all_logprobs": -0.028953995555639267, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.125, "all_logprobs/p1": -0.8281650543212891, "all_logprobs/p10": -0.00151824951171875, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.052490234375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.041319381445646286, "clip_ratio": 0.0, "completion_length": 544.140625, "completion_length/correct": 491.67962646484375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 455.0, "completion_length/correct/min": 154.0, "completion_length/correct/p25": 311.5, "completion_length/correct/p75": 637.0, "completion_length/correct/var": 45333.6328125, "completion_length/incorrect": 760.280029296875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 774.0, "completion_length/incorrect/min": 168.0, "completion_length/incorrect/p25": 555.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 63179.73046875, "completion_length/max": 1024.0, "completion_length/median": 494.0, "completion_length/min": 154.0, "completion_length/p25": 349.0, "completion_length/p75": 719.0, "completion_length/var": 60095.0390625, "epoch": 0.8576, "feature_vector_variance/max_squared_error": 150691.53125, "feature_vector_variance/metric": 28121.6328125, "generated_tokens/total": 29025424.0, "grad_norm": 0.12519775331020355, "grouped_std_rewards": 0.13545361161231995, "learning_rate": 4.4494751769315e-06, "loss": -0.0419, "mean_logprobs": -0.0299072265625, "mean_logprobs/var": 0.000301361083984375, "num_completions/total": 51456, "per_sentence_gradient_norm": 1.0257089138031006, "per_sentence_gradient_norm/max": 92.29805755615234, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 28.61589241027832, "per_sentence_gradient_norm/var": 40.9062614440918, "per_token_feature_norm": 196.9345703125, "per_token_feature_norm/max": 308.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 77.0, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 478.8979797363281, "per_token_full_gradient_variance/max_squared_error": 73.79145812988281, "per_token_full_gradient_variance/variance": 0.013655542396008968, "per_token_gradient_norm": 1.2226098775863647, "per_token_gradient_norm/max": 3532.455810546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1295.494384765625, "per_token_policy_error_norm": 0.016734687611460686, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015513152815401554, "policy_entropy": 0.03172674775123596, "policy_entropy/max": 3.8125, "policy_entropy/median": 7.916241884231567e-09, "policy_entropy/min": 1.2807138162485021e-18, "policy_entropy/p25": 1.0732037480920553e-10, "policy_entropy/p75": 2.3990869522094727e-06, "policy_entropy/var": 0.018474506214261055, "policy_error_vector_variance/max_squared_error": 2.002358913421631, "policy_error_vector_variance/metric": 0.016722297295928, "policy_loss": -0.041915349662303925, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.4251976013183594, "policy_sharpness": 9.215909957885742, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.834933280944824, "reward": 0.8046875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1573704332113266, "rewards/accuracy_reward": 0.8046875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1573704332113266, "sentence_full_gradient_variance/max_squared_error": 694820.25, "sentence_full_gradient_variance/metric": 2663.78759765625, "sentence_full_gradient_variance/p75": 64.46240997314453, "sentence_full_gradient_variance/p90": 216.88229370117188, "sentence_full_gradient_variance/p95": 216.88229370117188, "sentence_full_gradient_variance/p99": 75089.5, "state_level_variance/metric": 4.230119705200195, "state_level_variance_full_gradient/metric": 292.9221496582031, "step": 67 }, { "accuracy_reward": 0.7434896230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19096145033836365, "action_level_variance/metric": 31.58795738220215, "action_level_variance_full_gradient/metric": 2699.3662109375, "adam_stats/lr_effective_max": 1.9035564037039876e-05, "adam_stats/lr_effective_mean": 6.785411815757314e-11, "adam_stats/lr_effective_min": -2.0378807676024735e-05, "adam_stats/m_t_max": 0.0005354250897653401, "adam_stats/m_t_mean": -5.424261300540634e-13, "adam_stats/m_t_min": -0.0005264063947834074, "adam_stats/v_t_max": 2.2567101041204296e-05, "adam_stats/v_t_mean": 1.996692091180141e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0878528505563736, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.3183600902557373, "all_logprobs": -0.03310413286089897, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.25, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.0031890869140625, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.0791015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.049286894500255585, "clip_ratio": 0.0, "completion_length": 511.28387451171875, "completion_length/correct": 423.4395751953125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 386.0, "completion_length/correct/min": 135.0, "completion_length/correct/p25": 299.5, "completion_length/correct/p75": 542.0, "completion_length/correct/var": 29910.462890625, "completion_length/incorrect": 765.8984375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 780.0, "completion_length/incorrect/min": 300.0, "completion_length/incorrect/p25": 557.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 55419.6484375, "completion_length/max": 1024.0, "completion_length/median": 449.0, "completion_length/min": 135.0, "completion_length/p25": 317.0, "completion_length/p75": 646.25, "completion_length/var": 58785.71484375, "epoch": 0.8704, "feature_vector_variance/max_squared_error": 143313.640625, "feature_vector_variance/metric": 28457.8125, "generated_tokens/total": 29418092.0, "grad_norm": 0.11197062581777573, "grouped_std_rewards": 0.09486740827560425, "learning_rate": 4.212216399081919e-06, "loss": -0.0879, "mean_logprobs": -0.032470703125, "mean_logprobs/var": 0.000499725341796875, "num_completions/total": 52224, "per_sentence_gradient_norm": 0.7553305625915527, "per_sentence_gradient_norm/max": 95.94491577148438, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 20.65300178527832, "per_sentence_gradient_norm/var": 31.057872772216797, "per_token_feature_norm": 196.98316955566406, "per_token_feature_norm/max": 298.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 73.5, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 459.2901916503906, "per_token_full_gradient_variance/max_squared_error": 241.05300903320312, "per_token_full_gradient_variance/variance": 0.011239239014685154, "per_token_gradient_norm": 0.8604358434677124, "per_token_gradient_norm/max": 4202.69384765625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 892.9505615234375, "per_token_policy_error_norm": 0.018811555579304695, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01732444576919079, "policy_entropy": 0.03620779141783714, "policy_entropy/max": 3.640625, "policy_entropy/median": 1.367880031466484e-08, "policy_entropy/min": 4.845028458294598e-19, "policy_entropy/p25": 1.864464138634503e-10, "policy_entropy/p75": 4.559755325317383e-06, "policy_entropy/var": 0.022646505385637283, "policy_error_vector_variance/max_squared_error": 2.0044965744018555, "policy_error_vector_variance/metric": 0.018805406987667084, "policy_loss": -0.0878528505563736, "policy_loss/max": 9.659049987792969, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.3183600902557373, "policy_sharpness": 9.132457733154297, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.294945240020752, "reward": 0.7434896230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19096145033836365, "rewards/accuracy_reward": 0.7434896230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19096145033836365, "sentence_full_gradient_variance/max_squared_error": 1125372.125, "sentence_full_gradient_variance/metric": 3026.306640625, "sentence_full_gradient_variance/p75": 168.09207153320312, "sentence_full_gradient_variance/p90": 228.00730895996094, "sentence_full_gradient_variance/p95": 228.00730895996094, "sentence_full_gradient_variance/p99": 48687.5703125, "state_level_variance/metric": 3.4135282039642334, "state_level_variance_full_gradient/metric": 326.939453125, "step": 68 }, { "accuracy_reward": 0.8203125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14759209752082825, "action_level_variance/metric": 19.466705322265625, "action_level_variance_full_gradient/metric": 1468.863037109375, "adam_stats/lr_effective_max": 1.818881901272107e-05, "adam_stats/lr_effective_mean": 5.244425033135158e-11, "adam_stats/lr_effective_min": -1.8033144442597404e-05, "adam_stats/m_t_max": 0.0004456429451238364, "adam_stats/m_t_mean": -9.427038143161592e-13, "adam_stats/m_t_min": -0.00039976061088964343, "adam_stats/v_t_max": 2.254462924611289e-05, "adam_stats/v_t_mean": 1.994906193361623e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0747973695397377, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -7.48191499710083, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.7693952322006226, "all_logprobs": -0.031048698350787163, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -12.0, "all_logprobs/p1": -0.9296875, "all_logprobs/p10": -0.00250244140625, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.06591796875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04414278268814087, "clip_ratio": 0.0, "completion_length": 526.81640625, "completion_length/correct": 461.3412780761719, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 415.0, "completion_length/correct/min": 100.0, "completion_length/correct/p25": 317.0, "completion_length/correct/p75": 575.5, "completion_length/correct/var": 41265.421875, "completion_length/incorrect": 825.7246704101562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 977.0, "completion_length/incorrect/min": 216.0, "completion_length/incorrect/p25": 611.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 59086.56640625, "completion_length/max": 1024.0, "completion_length/median": 460.0, "completion_length/min": 100.0, "completion_length/p25": 332.0, "completion_length/p75": 687.5, "completion_length/var": 63991.3671875, "epoch": 0.8832, "feature_vector_variance/max_squared_error": 158473.65625, "feature_vector_variance/metric": 28412.677734375, "generated_tokens/total": 29822686.0, "grad_norm": 0.06811962276697159, "grouped_std_rewards": 0.09648542106151581, "learning_rate": 3.978963279105821e-06, "loss": -0.0748, "mean_logprobs": -0.03271484375, "mean_logprobs/var": 0.0003490447998046875, "num_completions/total": 52992, "per_sentence_gradient_norm": 0.6201035976409912, "per_sentence_gradient_norm/max": 56.78536605834961, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 22.293546676635742, "per_sentence_gradient_norm/var": 19.1070556640625, "per_token_feature_norm": 196.5001983642578, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 74.5, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 453.8005676269531, "per_token_full_gradient_variance/max_squared_error": 35.95389938354492, "per_token_full_gradient_variance/variance": 0.007031221408396959, "per_token_gradient_norm": 0.7697384357452393, "per_token_gradient_norm/max": 3015.435302734375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 615.1395874023438, "per_token_policy_error_norm": 0.017928913235664368, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01628784090280533, "policy_entropy": 0.034477896988391876, "policy_entropy/max": 3.59375, "policy_entropy/median": 1.2281816452741623e-08, "policy_entropy/min": 8.605854744103691e-19, "policy_entropy/p25": 1.509761204943061e-10, "policy_entropy/p75": 3.904104232788086e-06, "policy_entropy/var": 0.020203683525323868, "policy_error_vector_variance/max_squared_error": 2.006105422973633, "policy_error_vector_variance/metric": 0.017918990924954414, "policy_loss": -0.0747973769903183, "policy_loss/max": 7.48191499710083, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.7693952322006226, "policy_sharpness": 9.164867401123047, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.117000579833984, "reward": 0.8203125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14759209752082825, "rewards/accuracy_reward": 0.8203125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14759209752082825, "sentence_full_gradient_variance/max_squared_error": 407498.46875, "sentence_full_gradient_variance/metric": 1660.77490234375, "sentence_full_gradient_variance/p75": 32.06037521362305, "sentence_full_gradient_variance/p90": 39.051998138427734, "sentence_full_gradient_variance/p95": 39.051998138427734, "sentence_full_gradient_variance/p99": 59195.07421875, "state_level_variance/metric": 2.07037615776062, "state_level_variance_full_gradient/metric": 191.91207885742188, "step": 69 }, { "accuracy_reward": 0.7513021230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18709087371826172, "action_level_variance/metric": 87.20832824707031, "action_level_variance_full_gradient/metric": 2808.435546875, "adam_stats/lr_effective_max": 1.796493052097503e-05, "adam_stats/lr_effective_mean": 7.678585545178862e-11, "adam_stats/lr_effective_min": -1.8210414054919966e-05, "adam_stats/m_t_max": 0.0003248638822697103, "adam_stats/m_t_mean": -1.7449507550698629e-12, "adam_stats/m_t_min": -0.0003182983200531453, "adam_stats/v_t_max": 2.252760714327451e-05, "adam_stats/v_t_mean": 1.9950766299431377e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.06116240471601486, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.2482051849365234, "all_logprobs": -0.02935897186398506, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.75, "all_logprobs/p1": -0.8359375, "all_logprobs/p10": -0.001800537109375, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.0422806479036808, "clip_ratio": 0.0, "completion_length": 560.0794677734375, "completion_length/correct": 462.5892639160156, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 428.0, "completion_length/correct/min": 159.0, "completion_length/correct/p25": 293.0, "completion_length/correct/p75": 592.0, "completion_length/correct/var": 43092.859375, "completion_length/incorrect": 854.5916137695312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1005.0, "completion_length/incorrect/min": 169.0, "completion_length/incorrect/p25": 689.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 47023.828125, "completion_length/max": 1024.0, "completion_length/median": 505.0, "completion_length/min": 159.0, "completion_length/p25": 323.0, "completion_length/p75": 782.5, "completion_length/var": 72759.9296875, "epoch": 0.896, "feature_vector_variance/max_squared_error": 157606.796875, "feature_vector_variance/metric": 28175.26953125, "generated_tokens/total": 30252828.0, "grad_norm": 0.24837234616279602, "grouped_std_rewards": 0.14067910611629486, "learning_rate": 3.750000000000002e-06, "loss": -0.0612, "mean_logprobs": -0.030517578125, "mean_logprobs/var": 0.0003910064697265625, "num_completions/total": 53760, "per_sentence_gradient_norm": 1.4554443359375, "per_sentence_gradient_norm/max": 123.27796936035156, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 49.9632568359375, "per_sentence_gradient_norm/var": 85.20093536376953, "per_token_feature_norm": 197.04782104492188, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 67.5, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 438.11956787109375, "per_token_full_gradient_variance/max_squared_error": 151.5633544921875, "per_token_full_gradient_variance/variance": 0.03267936035990715, "per_token_gradient_norm": 1.8013437986373901, "per_token_gradient_norm/max": 5327.51513671875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2674.44482421875, "per_token_policy_error_norm": 0.01689092628657818, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015561547130346298, "policy_entropy": 0.0324753113090992, "policy_entropy/max": 3.703125, "policy_entropy/median": 9.89530235528946e-09, "policy_entropy/min": 9.825582188149884e-20, "policy_entropy/p25": 1.3096723705530167e-10, "policy_entropy/p75": 2.3990869522094727e-06, "policy_entropy/var": 0.0191891398280859, "policy_error_vector_variance/max_squared_error": 2.0053176879882812, "policy_error_vector_variance/metric": 0.016878148540854454, "policy_loss": -0.06116240471601486, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.2482054233551025, "policy_sharpness": 9.207836151123047, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.881613731384277, "reward": 0.7513021230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18709087371826172, "rewards/accuracy_reward": 0.7513021230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18709087371826172, "sentence_full_gradient_variance/max_squared_error": 523478.03125, "sentence_full_gradient_variance/metric": 3136.93408203125, "sentence_full_gradient_variance/p75": 124.94074249267578, "sentence_full_gradient_variance/p90": 240.1490478515625, "sentence_full_gradient_variance/p95": 240.1490478515625, "sentence_full_gradient_variance/p99": 82273.203125, "state_level_variance/metric": 8.875170707702637, "state_level_variance_full_gradient/metric": 328.4986267089844, "step": 70 }, { "accuracy_reward": 0.8424479365348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.13290248811244965, "action_level_variance/metric": 74.15987396240234, "action_level_variance_full_gradient/metric": 4146.39697265625, "adam_stats/lr_effective_max": 1.7118518371717073e-05, "adam_stats/lr_effective_mean": 7.697878445789286e-11, "adam_stats/lr_effective_min": -1.8010343410423957e-05, "adam_stats/m_t_max": 0.000344126601703465, "adam_stats/m_t_mean": -6.133373431534139e-13, "adam_stats/m_t_min": -0.0003380086272954941, "adam_stats/v_t_max": 2.2505109882331453e-05, "adam_stats/v_t_mean": 1.9936680344806446e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.008940785191953182, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.0919206142425537, "all_logprobs": -0.028561843559145927, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.9375, "all_logprobs/p1": -0.828125, "all_logprobs/p10": -0.0016021728515625, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.0498046875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04115244373679161, "clip_ratio": 0.0, "completion_length": 492.5481872558594, "completion_length/correct": 451.584228515625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 410.0, "completion_length/correct/min": 127.0, "completion_length/correct/p25": 301.5, "completion_length/correct/p75": 554.0, "completion_length/correct/var": 39524.32421875, "completion_length/incorrect": 711.5867309570312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 657.0, "completion_length/incorrect/min": 236.0, "completion_length/incorrect/p25": 474.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 74052.0234375, "completion_length/max": 1024.0, "completion_length/median": 439.0, "completion_length/min": 127.0, "completion_length/p25": 320.0, "completion_length/p75": 616.0, "completion_length/var": 53859.1640625, "epoch": 0.9088, "feature_vector_variance/max_squared_error": 160567.953125, "feature_vector_variance/metric": 28022.7109375, "generated_tokens/total": 30631104.0, "grad_norm": 0.113491490483284, "grouped_std_rewards": 0.1137584000825882, "learning_rate": 3.525605518250964e-06, "loss": 0.0089, "mean_logprobs": -0.029296875, "mean_logprobs/var": 0.00026702880859375, "num_completions/total": 54528, "per_sentence_gradient_norm": 1.0991017818450928, "per_sentence_gradient_norm/max": 144.62965393066406, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 39.94755935668945, "per_sentence_gradient_norm/var": 73.04696655273438, "per_token_feature_norm": 196.73287963867188, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 75.5, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 444.85223388671875, "per_token_full_gradient_variance/max_squared_error": 362.4612731933594, "per_token_full_gradient_variance/variance": 0.0249414574354887, "per_token_gradient_norm": 1.4230079650878906, "per_token_gradient_norm/max": 6602.9521484375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2641.5185546875, "per_token_policy_error_norm": 0.01644056662917137, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015174778178334236, "policy_entropy": 0.03143911063671112, "policy_entropy/max": 3.578125, "policy_entropy/median": 8.905772119760513e-09, "policy_entropy/min": 8.74138001566438e-19, "policy_entropy/p25": 1.2005330063402653e-10, "policy_entropy/p75": 2.682209014892578e-06, "policy_entropy/var": 0.01840771920979023, "policy_error_vector_variance/max_squared_error": 2.0059406757354736, "policy_error_vector_variance/metric": 0.016429558396339417, "policy_loss": 0.008940786123275757, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.0919206142425537, "policy_sharpness": 9.217010498046875, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.809564590454102, "reward": 0.8424479365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.13290248811244965, "rewards/accuracy_reward": 0.8424479365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.13290248811244965, "sentence_full_gradient_variance/max_squared_error": 1564164.625, "sentence_full_gradient_variance/metric": 4719.59912109375, "sentence_full_gradient_variance/p75": 33.27257537841797, "sentence_full_gradient_variance/p90": 56.906795501708984, "sentence_full_gradient_variance/p95": 56.906795501708984, "sentence_full_gradient_variance/p99": 108618.3515625, "state_level_variance/metric": 8.146822929382324, "state_level_variance_full_gradient/metric": 573.20166015625, "step": 71 }, { "accuracy_reward": 0.8216146230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1467551589012146, "action_level_variance/metric": 63.59937286376953, "action_level_variance_full_gradient/metric": 2796.47509765625, "adam_stats/lr_effective_max": 1.6027219317038544e-05, "adam_stats/lr_effective_mean": 3.3161012130689116e-11, "adam_stats/lr_effective_min": -1.6248053725576028e-05, "adam_stats/m_t_max": 0.00029763946076855063, "adam_stats/m_t_mean": -2.8118390862225118e-12, "adam_stats/m_t_min": -0.00030984397744759917, "adam_stats/v_t_max": 2.2483329303213395e-05, "adam_stats/v_t_mean": 1.9927143702497263e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.07836031913757324, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.2456815242767334, "all_logprobs": -0.029414139688014984, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.375, "all_logprobs/p1": -0.8359375, "all_logprobs/p10": -0.001953125, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04214321821928024, "clip_ratio": 0.0, "completion_length": 511.90106201171875, "completion_length/correct": 448.4437561035156, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 405.0, "completion_length/correct/min": 101.0, "completion_length/correct/p25": 310.0, "completion_length/correct/p75": 557.0, "completion_length/correct/var": 37926.59765625, "completion_length/incorrect": 804.1751708984375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 863.0, "completion_length/incorrect/min": 274.0, "completion_length/incorrect/p25": 601.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 57165.90234375, "completion_length/max": 1024.0, "completion_length/median": 448.0, "completion_length/min": 101.0, "completion_length/p25": 326.0, "completion_length/p75": 644.25, "completion_length/var": 59859.65625, "epoch": 0.9216, "feature_vector_variance/max_squared_error": 152790.5, "feature_vector_variance/metric": 28194.806640625, "generated_tokens/total": 31024244.0, "grad_norm": 0.14679649472236633, "grouped_std_rewards": 0.1421232521533966, "learning_rate": 3.3060532239694e-06, "loss": 0.0784, "mean_logprobs": -0.030517578125, "mean_logprobs/var": 0.00030517578125, "num_completions/total": 55296, "per_sentence_gradient_norm": 1.233432650566101, "per_sentence_gradient_norm/max": 98.7427749633789, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 40.88481521606445, "per_sentence_gradient_norm/var": 62.15895462036133, "per_token_feature_norm": 196.98634338378906, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 75.0, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 458.9866943359375, "per_token_full_gradient_variance/max_squared_error": 95.08433532714844, "per_token_full_gradient_variance/variance": 0.020467285066843033, "per_token_gradient_norm": 1.4417471885681152, "per_token_gradient_norm/max": 5399.884765625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1660.6058349609375, "per_token_policy_error_norm": 0.01698196306824684, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015712877735495567, "policy_entropy": 0.0323992595076561, "policy_entropy/max": 3.6875, "policy_entropy/median": 1.1408701539039612e-08, "policy_entropy/min": 7.487771253728015e-19, "policy_entropy/p25": 1.5643308870494366e-10, "policy_entropy/p75": 3.129243850708008e-06, "policy_entropy/var": 0.018431466072797775, "policy_error_vector_variance/max_squared_error": 2.0049753189086914, "policy_error_vector_variance/metric": 0.016970068216323853, "policy_loss": 0.07836031913757324, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.2456815242767334, "policy_sharpness": 9.193276405334473, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.928056716918945, "reward": 0.8216146230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1467551589012146, "rewards/accuracy_reward": 0.8216146230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1467551589012146, "sentence_full_gradient_variance/max_squared_error": 501152.78125, "sentence_full_gradient_variance/metric": 3179.41064453125, "sentence_full_gradient_variance/p75": 40.250492095947266, "sentence_full_gradient_variance/p90": 77.8276596069336, "sentence_full_gradient_variance/p95": 77.8276596069336, "sentence_full_gradient_variance/p99": 75548.125, "state_level_variance/metric": 6.496234893798828, "state_level_variance_full_gradient/metric": 382.93548583984375, "step": 72 }, { "accuracy_reward": 0.8190104365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1484256386756897, "action_level_variance/metric": 35.29652404785156, "action_level_variance_full_gradient/metric": 1020.9635620117188, "adam_stats/lr_effective_max": 1.4305639524536673e-05, "adam_stats/lr_effective_mean": 3.410788665392239e-11, "adam_stats/lr_effective_min": -1.4421684682019986e-05, "adam_stats/m_t_max": 0.00027416975353844464, "adam_stats/m_t_mean": -1.1928289302479134e-12, "adam_stats/m_t_min": -0.0002764634555205703, "adam_stats/v_t_max": 2.2460952095570974e-05, "adam_stats/v_t_mean": 1.9908373994487194e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.05748865753412247, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.298467755317688, "all_logprobs": -0.03141755983233452, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.625, "all_logprobs/p1": -0.953125, "all_logprobs/p10": -0.0025634765625, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.07080078125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04446621611714363, "clip_ratio": 0.0, "completion_length": 477.6184997558594, "completion_length/correct": 410.82830810546875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 378.0, "completion_length/correct/min": 118.0, "completion_length/correct/p25": 286.0, "completion_length/correct/p75": 488.0, "completion_length/correct/var": 30958.8671875, "completion_length/incorrect": 779.8561401367188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 873.0, "completion_length/incorrect/min": 230.0, "completion_length/incorrect/p25": 553.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 63650.53515625, "completion_length/max": 1024.0, "completion_length/median": 418.0, "completion_length/min": 118.0, "completion_length/p25": 298.75, "completion_length/p75": 571.5, "completion_length/var": 57013.2734375, "epoch": 0.9344, "feature_vector_variance/max_squared_error": 156939.65625, "feature_vector_variance/metric": 28324.001953125, "generated_tokens/total": 31391056.0, "grad_norm": 0.052821263670921326, "grouped_std_rewards": 0.08228574693202972, "learning_rate": 3.0916106078064522e-06, "loss": 0.0575, "mean_logprobs": -0.032958984375, "mean_logprobs/var": 0.0004520416259765625, "num_completions/total": 56064, "per_sentence_gradient_norm": 0.7144171595573425, "per_sentence_gradient_norm/max": 111.55003356933594, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 25.443138122558594, "per_sentence_gradient_norm/var": 34.83148193359375, "per_token_feature_norm": 196.07925415039062, "per_token_feature_norm/max": 334.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 72.0, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 453.4371032714844, "per_token_full_gradient_variance/max_squared_error": 113.7215576171875, "per_token_full_gradient_variance/variance": 0.014245152473449707, "per_token_gradient_norm": 1.064963459968567, "per_token_gradient_norm/max": 5546.7890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1361.6671142578125, "per_token_policy_error_norm": 0.01814311370253563, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01659420318901539, "policy_entropy": 0.03472897782921791, "policy_entropy/max": 3.5625, "policy_entropy/median": 1.0826624929904938e-08, "policy_entropy/min": 4.9339669177562995e-20, "policy_entropy/p25": 1.3369572116062045e-10, "policy_entropy/p75": 3.5017728805541992e-06, "policy_entropy/var": 0.020552730187773705, "policy_error_vector_variance/max_squared_error": 2.004732608795166, "policy_error_vector_variance/metric": 0.01813090406358242, "policy_loss": 0.05748865008354187, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -7.48191499710083, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.2984676361083984, "policy_sharpness": 9.15960693359375, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.165884494781494, "reward": 0.8190104365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1484256386756897, "rewards/accuracy_reward": 0.8190104365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1484256386756897, "sentence_full_gradient_variance/max_squared_error": 349348.96875, "sentence_full_gradient_variance/metric": 1153.7738037109375, "sentence_full_gradient_variance/p75": 35.94941329956055, "sentence_full_gradient_variance/p90": 38.70903015136719, "sentence_full_gradient_variance/p95": 38.70903015136719, "sentence_full_gradient_variance/p99": 16488.46484375, "state_level_variance/metric": 3.9427433013916016, "state_level_variance_full_gradient/metric": 132.8102264404297, "step": 73 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.165145605802536, "action_level_variance/metric": 65.7888412475586, "action_level_variance_full_gradient/metric": 2450.63037109375, "adam_stats/lr_effective_max": 1.3021013728575781e-05, "adam_stats/lr_effective_mean": 1.521045789321107e-11, "adam_stats/lr_effective_min": -1.2719275218842085e-05, "adam_stats/m_t_max": 0.0002584352914709598, "adam_stats/m_t_mean": -9.632629980119156e-13, "adam_stats/m_t_min": -0.00025263181305490434, "adam_stats/v_t_max": 2.243853305117227e-05, "adam_stats/v_t_mean": 1.988970186467265e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.08082908391952515, "advantages/max": 9.659051895141602, "advantages/median": -0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.5420572757720947, "all_logprobs": -0.0306257177144289, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -12.3125, "all_logprobs/p1": -0.8828125, "all_logprobs/p10": -0.002471923828125, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.06396484375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.044029492884874344, "clip_ratio": 0.0, "completion_length": 531.0221557617188, "completion_length/correct": 463.31414794921875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 424.0, "completion_length/correct/min": 107.0, "completion_length/correct/p25": 331.75, "completion_length/correct/p75": 573.75, "completion_length/correct/var": 36543.875, "completion_length/incorrect": 788.3125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 862.0, "completion_length/incorrect/min": 226.0, "completion_length/incorrect/p25": 570.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 68294.09375, "completion_length/max": 1024.0, "completion_length/median": 462.0, "completion_length/min": 107.0, "completion_length/p25": 354.0, "completion_length/p75": 672.25, "completion_length/var": 60521.4140625, "epoch": 0.9472, "feature_vector_variance/max_squared_error": 143030.625, "feature_vector_variance/metric": 28137.935546875, "generated_tokens/total": 31798880.0, "grad_norm": 0.05645059794187546, "grouped_std_rewards": 0.14176751673221588, "learning_rate": 2.882538935057563e-06, "loss": 0.0808, "mean_logprobs": -0.031982421875, "mean_logprobs/var": 0.00030517578125, "num_completions/total": 56832, "per_sentence_gradient_norm": 1.2457354068756104, "per_sentence_gradient_norm/max": 138.29269409179688, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 36.89769744873047, "per_sentence_gradient_norm/var": 64.32072448730469, "per_token_feature_norm": 196.8806610107422, "per_token_feature_norm/max": 308.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 71.0, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 429.83416748046875, "per_token_full_gradient_variance/max_squared_error": 61.4448127746582, "per_token_full_gradient_variance/variance": 0.016281340271234512, "per_token_gradient_norm": 1.5893809795379639, "per_token_gradient_norm/max": 3251.61669921875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1698.94091796875, "per_token_policy_error_norm": 0.017734244465827942, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.016281044110655785, "policy_entropy": 0.033536188304424286, "policy_entropy/max": 3.609375, "policy_entropy/median": 1.1932570487260818e-08, "policy_entropy/min": 3.6761229910836635e-19, "policy_entropy/p25": 1.482476363889873e-10, "policy_entropy/p75": 3.606081008911133e-06, "policy_entropy/var": 0.018988313153386116, "policy_error_vector_variance/max_squared_error": 2.0080230236053467, "policy_error_vector_variance/metric": 0.017720559611916542, "policy_loss": 0.08082907646894455, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.5420572757720947, "policy_sharpness": 9.172426223754883, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.063854217529297, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.165145605802536, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.165145605802536, "sentence_full_gradient_variance/max_squared_error": 1136491.625, "sentence_full_gradient_variance/metric": 2763.9609375, "sentence_full_gradient_variance/p75": 22.126924514770508, "sentence_full_gradient_variance/p90": 219.33692932128906, "sentence_full_gradient_variance/p95": 219.33692932128906, "sentence_full_gradient_variance/p99": 47001.1328125, "state_level_variance/metric": 6.741976737976074, "state_level_variance_full_gradient/metric": 313.33001708984375, "step": 74 }, { "accuracy_reward": 0.7747396230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17474570870399475, "action_level_variance/metric": 49.41538619995117, "action_level_variance_full_gradient/metric": 1193.5599365234375, "adam_stats/lr_effective_max": 1.2360928849375341e-05, "adam_stats/lr_effective_mean": 7.253587487598967e-12, "adam_stats/lr_effective_min": -1.1712681043718476e-05, "adam_stats/m_t_max": 0.0006839659763500094, "adam_stats/m_t_mean": -5.310904493960322e-12, "adam_stats/m_t_min": -0.000791732338257134, "adam_stats/v_t_max": 2.2434087441070005e-05, "adam_stats/v_t_mean": 1.9904004659732077e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.057418763637542725, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.446566581726074, "all_logprobs": -0.029396144673228264, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.75, "all_logprobs/p1": -0.8359375, "all_logprobs/p10": -0.00193023681640625, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04157419502735138, "clip_ratio": 0.0, "completion_length": 517.7825927734375, "completion_length/correct": 429.90924072265625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 402.0, "completion_length/correct/min": 158.0, "completion_length/correct/p25": 309.5, "completion_length/correct/p75": 511.5, "completion_length/correct/var": 26901.62890625, "completion_length/incorrect": 820.0057373046875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 193.0, "completion_length/incorrect/p25": 592.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 63536.7265625, "completion_length/max": 1024.0, "completion_length/median": 455.0, "completion_length/min": 158.0, "completion_length/p25": 336.75, "completion_length/p75": 625.75, "completion_length/var": 61673.96484375, "epoch": 0.96, "feature_vector_variance/max_squared_error": 151404.0625, "feature_vector_variance/metric": 28182.013671875, "generated_tokens/total": 32196536.0, "grad_norm": 0.121567003428936, "grouped_std_rewards": 0.09894049167633057, "learning_rate": 2.6790929273509547e-06, "loss": 0.0574, "mean_logprobs": -0.029541015625, "mean_logprobs/var": 0.0002288818359375, "num_completions/total": 57600, "per_sentence_gradient_norm": 0.8771073818206787, "per_sentence_gradient_norm/max": 132.02346801757812, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 26.68716049194336, "per_sentence_gradient_norm/var": 48.70949172973633, "per_token_feature_norm": 196.50596618652344, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 75.5, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 462.9799499511719, "per_token_full_gradient_variance/max_squared_error": 440.45159912109375, "per_token_full_gradient_variance/variance": 0.029162922874093056, "per_token_gradient_norm": 1.4095969200134277, "per_token_gradient_norm/max": 6619.0341796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2647.44677734375, "per_token_policy_error_norm": 0.016994658857584, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01556717325001955, "policy_entropy": 0.03255298361182213, "policy_entropy/max": 3.328125, "policy_entropy/median": 1.1525116860866547e-08, "policy_entropy/min": 6.708500942254059e-19, "policy_entropy/p25": 1.6189005691558123e-10, "policy_entropy/p75": 2.905726432800293e-06, "policy_entropy/var": 0.018792156130075455, "policy_error_vector_variance/max_squared_error": 2.008357524871826, "policy_error_vector_variance/metric": 0.01698676124215126, "policy_loss": 0.057418759912252426, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.79339599609375, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.4465668201446533, "policy_sharpness": 9.198108673095703, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.937145709991455, "reward": 0.7747396230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17474570870399475, "rewards/accuracy_reward": 0.7747396230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17474570870399475, "sentence_full_gradient_variance/max_squared_error": 454493.40625, "sentence_full_gradient_variance/metric": 1345.58349609375, "sentence_full_gradient_variance/p75": 35.425682067871094, "sentence_full_gradient_variance/p90": 47.37882995605469, "sentence_full_gradient_variance/p95": 47.37882995605469, "sentence_full_gradient_variance/p99": 36788.34375, "state_level_variance/metric": 5.464527606964111, "state_level_variance_full_gradient/metric": 152.02377319335938, "step": 75 }, { "accuracy_reward": 0.7786458730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17258122563362122, "action_level_variance/metric": 43.576934814453125, "action_level_variance_full_gradient/metric": 1638.5750732421875, "adam_stats/lr_effective_max": 1.0499445124878548e-05, "adam_stats/lr_effective_mean": -8.420355558647064e-12, "adam_stats/lr_effective_min": -1.0511000255064573e-05, "adam_stats/m_t_max": 0.0005884849815629423, "adam_stats/m_t_mean": -5.382006472431922e-12, "adam_stats/m_t_min": -0.0007220481638796628, "adam_stats/v_t_max": 2.2411772079067305e-05, "adam_stats/v_t_mean": 1.988624325974242e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.12608498334884644, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.012199640274048, "all_logprobs": -0.030485445633530617, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.0, "all_logprobs/p1": -0.87109375, "all_logprobs/p10": -0.003173828125, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.068359375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.0421394482254982, "clip_ratio": 0.0, "completion_length": 519.0859375, "completion_length/correct": 446.0, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 415.0, "completion_length/correct/min": 125.0, "completion_length/correct/p25": 293.25, "completion_length/correct/p75": 574.75, "completion_length/correct/var": 39433.87890625, "completion_length/incorrect": 776.176513671875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 811.0, "completion_length/incorrect/min": 259.0, "completion_length/incorrect/p25": 573.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 57754.65625, "completion_length/max": 1024.0, "completion_length/median": 464.0, "completion_length/min": 125.0, "completion_length/p25": 327.75, "completion_length/p75": 674.25, "completion_length/var": 62233.44921875, "epoch": 0.9728, "feature_vector_variance/max_squared_error": 154587.5625, "feature_vector_variance/metric": 28131.455078125, "generated_tokens/total": 32595196.0, "grad_norm": 0.07429850846529007, "grouped_std_rewards": 0.14593136310577393, "learning_rate": 2.4815204523085656e-06, "loss": -0.1261, "mean_logprobs": -0.029541015625, "mean_logprobs/var": 0.00025177001953125, "num_completions/total": 58368, "per_sentence_gradient_norm": 1.0872466564178467, "per_sentence_gradient_norm/max": 93.37310791015625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 36.18450164794922, "per_sentence_gradient_norm/var": 42.45010757446289, "per_token_feature_norm": 196.780029296875, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 71.5, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 458.72344970703125, "per_token_full_gradient_variance/max_squared_error": 84.47527313232422, "per_token_full_gradient_variance/variance": 0.01787598617374897, "per_token_gradient_norm": 1.4746599197387695, "per_token_gradient_norm/max": 4319.75341796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1671.3372802734375, "per_token_policy_error_norm": 0.017577145248651505, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0158714447170496, "policy_entropy": 0.03436492383480072, "policy_entropy/max": 3.75, "policy_entropy/median": 1.0244548320770264e-08, "policy_entropy/min": 1.1604351377383915e-19, "policy_entropy/p25": 1.2187229003757238e-10, "policy_entropy/p75": 3.904104232788086e-06, "policy_entropy/var": 0.019929125905036926, "policy_error_vector_variance/max_squared_error": 2.005575656890869, "policy_error_vector_variance/metric": 0.01756259799003601, "policy_loss": -0.12608496844768524, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.012199640274048, "policy_sharpness": 9.151885986328125, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.177688121795654, "reward": 0.7786458730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17258122563362122, "rewards/accuracy_reward": 0.7786458730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17258122563362122, "sentence_full_gradient_variance/max_squared_error": 208267.078125, "sentence_full_gradient_variance/metric": 1841.5916748046875, "sentence_full_gradient_variance/p75": 41.38446044921875, "sentence_full_gradient_variance/p90": 112.9713134765625, "sentence_full_gradient_variance/p95": 112.9713134765625, "sentence_full_gradient_variance/p99": 61275.3046875, "state_level_variance/metric": 4.309906482696533, "state_level_variance_full_gradient/metric": 203.0165252685547, "step": 76 }, { "accuracy_reward": 0.8125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.152542382478714, "action_level_variance/metric": 96.4063720703125, "action_level_variance_full_gradient/metric": 3888.75390625, "adam_stats/lr_effective_max": 1.0144003681489266e-05, "adam_stats/lr_effective_mean": -1.7040821878588908e-11, "adam_stats/lr_effective_min": -1.0595240382826887e-05, "adam_stats/m_t_max": 0.0006715432391501963, "adam_stats/m_t_mean": -1.0609834191765977e-11, "adam_stats/m_t_min": -0.0007658101385459304, "adam_stats/v_t_max": 2.2389376681530848e-05, "adam_stats/v_t_mean": 1.988241819447789e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.03178253769874573, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.4822776317596436, "all_logprobs": -0.028119832277297974, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.375, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.001171112060546875, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.048583984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04048696905374527, "clip_ratio": 0.0, "completion_length": 518.0416870117188, "completion_length/correct": 455.30450439453125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 415.0, "completion_length/correct/min": 117.0, "completion_length/correct/p25": 313.75, "completion_length/correct/p75": 584.0, "completion_length/correct/var": 37907.26953125, "completion_length/incorrect": 789.9027709960938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 888.0, "completion_length/incorrect/min": 166.0, "completion_length/incorrect/p25": 596.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 68469.0546875, "completion_length/max": 1024.0, "completion_length/median": 461.0, "completion_length/min": 117.0, "completion_length/p25": 329.75, "completion_length/p75": 664.25, "completion_length/var": 60633.83984375, "epoch": 0.9856, "feature_vector_variance/max_squared_error": 160617.078125, "feature_vector_variance/metric": 27795.412109375, "generated_tokens/total": 32993052.0, "grad_norm": 0.1951788365840912, "grouped_std_rewards": 0.1314670592546463, "learning_rate": 2.29006222155752e-06, "loss": 0.0318, "mean_logprobs": -0.0294189453125, "mean_logprobs/var": 0.0002880096435546875, "num_completions/total": 59136, "per_sentence_gradient_norm": 1.2952903509140015, "per_sentence_gradient_norm/max": 180.33578491210938, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 34.466243743896484, "per_sentence_gradient_norm/var": 94.85210418701172, "per_token_feature_norm": 196.50601196289062, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 61.75, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 448.0071105957031, "per_token_full_gradient_variance/max_squared_error": 647.3351440429688, "per_token_full_gradient_variance/variance": 0.030097736045718193, "per_token_gradient_norm": 1.641278862953186, "per_token_gradient_norm/max": 6404.70849609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3053.4130859375, "per_token_policy_error_norm": 0.016312843188643456, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015255246311426163, "policy_entropy": 0.03050745651125908, "policy_entropy/max": 3.578125, "policy_entropy/median": 8.149072527885437e-09, "policy_entropy/min": 1.5839516113655416e-19, "policy_entropy/p25": 1.1550582712516189e-10, "policy_entropy/p75": 1.9371509552001953e-06, "policy_entropy/var": 0.017337895929813385, "policy_error_vector_variance/max_squared_error": 2.0022568702697754, "policy_error_vector_variance/metric": 0.01630568876862526, "policy_loss": 0.03178253024816513, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.4822776317596436, "policy_sharpness": 9.245481491088867, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.661639213562012, "reward": 0.8125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.152542382478714, "rewards/accuracy_reward": 0.8125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.152542382478714, "sentence_full_gradient_variance/max_squared_error": 1019314.4375, "sentence_full_gradient_variance/metric": 4416.7392578125, "sentence_full_gradient_variance/p75": 22.248361587524414, "sentence_full_gradient_variance/p90": 148.55743408203125, "sentence_full_gradient_variance/p95": 148.55743408203125, "sentence_full_gradient_variance/p99": 75278.5625, "state_level_variance/metric": 10.482208251953125, "state_level_variance_full_gradient/metric": 527.985595703125, "step": 77 }, { "accuracy_reward": 0.8307291865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1408015638589859, "action_level_variance/metric": 21.0767822265625, "action_level_variance_full_gradient/metric": 983.0633544921875, "adam_stats/lr_effective_max": 9.410242455487605e-06, "adam_stats/lr_effective_mean": -1.5687026330701848e-11, "adam_stats/lr_effective_min": -9.395273082191125e-06, "adam_stats/m_t_max": 0.00047087445273064077, "adam_stats/m_t_mean": -1.0244256831715148e-11, "adam_stats/m_t_min": -0.0005320635391399264, "adam_stats/v_t_max": 2.2367208657669835e-05, "adam_stats/v_t_mean": 1.9866083604547224e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.021463921293616295, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.5591771602630615, "all_logprobs": -0.030282435938715935, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.25, "all_logprobs/p1": -0.87109375, "all_logprobs/p10": -0.0020599365234375, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04350804537534714, "clip_ratio": 0.0, "completion_length": 456.62371826171875, "completion_length/correct": 390.1959228515625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 358.0, "completion_length/correct/min": 109.0, "completion_length/correct/p25": 266.25, "completion_length/correct/p75": 474.75, "completion_length/correct/var": 29047.388671875, "completion_length/incorrect": 782.6307983398438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 90.0, "completion_length/incorrect/p25": 531.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 79970.1875, "completion_length/max": 1024.0, "completion_length/median": 388.0, "completion_length/min": 90.0, "completion_length/p25": 282.0, "completion_length/p75": 557.5, "completion_length/var": 59258.26171875, "epoch": 0.9984, "feature_vector_variance/max_squared_error": 153410.953125, "feature_vector_variance/metric": 28308.740234375, "generated_tokens/total": 33343738.0, "grad_norm": 0.08809278905391693, "grouped_std_rewards": 0.08469172567129135, "learning_rate": 2.104951497460118e-06, "loss": -0.0215, "mean_logprobs": -0.031494140625, "mean_logprobs/var": 0.000820159912109375, "num_completions/total": 59904, "per_sentence_gradient_norm": 0.6003960967063904, "per_sentence_gradient_norm/max": 79.79315948486328, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 23.59941864013672, "per_sentence_gradient_norm/var": 20.743316650390625, "per_token_feature_norm": 197.00148010253906, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 77.0, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 465.6045837402344, "per_token_full_gradient_variance/max_squared_error": 74.28070831298828, "per_token_full_gradient_variance/variance": 0.012707293964922428, "per_token_gradient_norm": 0.9813051223754883, "per_token_gradient_norm/max": 3548.893310546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1009.575439453125, "per_token_policy_error_norm": 0.017360210418701172, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015861481428146362, "policy_entropy": 0.03372737765312195, "policy_entropy/max": 3.375, "policy_entropy/median": 9.89530235528946e-09, "policy_entropy/min": 4.3198680309969317e-19, "policy_entropy/p25": 1.3369572116062045e-10, "policy_entropy/p75": 2.7418136596679688e-06, "policy_entropy/var": 0.020320944488048553, "policy_error_vector_variance/max_squared_error": 2.0018019676208496, "policy_error_vector_variance/metric": 0.01734257861971855, "policy_loss": 0.06036906689405441, "policy_loss/max": 5.7954301834106445, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.3498647212982178, "policy_sharpness": 9.186145782470703, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.042375087738037, "reward": 0.8307291865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1408015638589859, "rewards/accuracy_reward": 0.8307291865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1408015638589859, "sentence_full_gradient_variance/max_squared_error": 323130.625, "sentence_full_gradient_variance/metric": 1114.050048828125, "sentence_full_gradient_variance/p75": 20.649866104125977, "sentence_full_gradient_variance/p90": 24.96649932861328, "sentence_full_gradient_variance/p95": 24.96649932861328, "sentence_full_gradient_variance/p99": 19883.033203125, "state_level_variance/metric": 2.298060417175293, "state_level_variance_full_gradient/metric": 130.9866180419922, "step": 78 }, { "accuracy_reward": 0.8515625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.12656861543655396, "action_level_variance/metric": 16.59099578857422, "action_level_variance_full_gradient/metric": 793.699951171875, "adam_stats/lr_effective_max": 8.677020559844095e-06, "adam_stats/lr_effective_mean": 1.5527093499834166e-12, "adam_stats/lr_effective_min": -8.861087735567708e-06, "adam_stats/m_t_max": 0.00045430456520989537, "adam_stats/m_t_mean": -7.285987785321524e-12, "adam_stats/m_t_min": -0.0005349332350306213, "adam_stats/v_t_max": 2.234493149444461e-05, "adam_stats/v_t_mean": 1.984746785324565e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.04765627533197403, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.0673366785049438, "all_logprobs": -0.026915472000837326, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.25, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.00118255615234375, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.044677734375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03797418624162674, "clip_ratio": 0.0, "completion_length": 491.69012451171875, "completion_length/correct": 439.342529296875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 397.0, "completion_length/correct/min": 97.0, "completion_length/correct/p25": 301.0, "completion_length/correct/p75": 512.0, "completion_length/correct/var": 35740.2890625, "completion_length/incorrect": 792.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 164.0, "completion_length/incorrect/p25": 543.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 85311.171875, "completion_length/max": 1024.0, "completion_length/median": 416.0, "completion_length/min": 97.0, "completion_length/p25": 315.0, "completion_length/p75": 613.0, "completion_length/var": 58737.8203125, "epoch": 1.0128, "feature_vector_variance/max_squared_error": 151647.953125, "feature_vector_variance/metric": 27894.939453125, "generated_tokens/total": 33721356.0, "grad_norm": 0.05761045962572098, "grouped_std_rewards": 0.09798028320074081, "learning_rate": 1.9264138089195424e-06, "loss": -0.0477, "mean_logprobs": -0.028076171875, "mean_logprobs/var": 0.0002841949462890625, "num_completions/total": 60672, "per_sentence_gradient_norm": 0.5547307729721069, "per_sentence_gradient_norm/max": 54.28512191772461, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 17.306150436401367, "per_sentence_gradient_norm/var": 16.30449867248535, "per_token_feature_norm": 196.77020263671875, "per_token_feature_norm/max": 312.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 78.0, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 454.80322265625, "per_token_full_gradient_variance/max_squared_error": 116.74906158447266, "per_token_full_gradient_variance/variance": 0.009253404103219509, "per_token_gradient_norm": 0.7990337610244751, "per_token_gradient_norm/max": 4851.5458984375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 892.1572265625, "per_token_policy_error_norm": 0.015544820576906204, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01429332047700882, "policy_entropy": 0.02994520589709282, "policy_entropy/max": 3.59375, "policy_entropy/median": 1.0186340659856796e-08, "policy_entropy/min": 6.505213034913027e-19, "policy_entropy/p25": 1.355147105641663e-10, "policy_entropy/p75": 2.6226043701171875e-06, "policy_entropy/var": 0.017138205468654633, "policy_error_vector_variance/max_squared_error": 2.0042858123779297, "policy_error_vector_variance/metric": 0.015534822829067707, "policy_loss": -0.04765627905726433, "policy_loss/max": 9.659050941467285, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.0673366785049438, "policy_sharpness": 9.243281364440918, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.641942024230957, "reward": 0.8515625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.12656861543655396, "rewards/accuracy_reward": 0.8515625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.12656861543655396, "sentence_full_gradient_variance/max_squared_error": 141465.90625, "sentence_full_gradient_variance/metric": 886.7861328125, "sentence_full_gradient_variance/p75": 40.04995346069336, "sentence_full_gradient_variance/p90": 51.50922775268555, "sentence_full_gradient_variance/p95": 51.50922775268555, "sentence_full_gradient_variance/p99": 37024.2109375, "state_level_variance/metric": 1.7847391366958618, "state_level_variance_full_gradient/metric": 93.08612060546875, "step": 79 }, { "accuracy_reward": 0.8125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.152542382478714, "action_level_variance/metric": 89.44206237792969, "action_level_variance_full_gradient/metric": 3009.40966796875, "adam_stats/lr_effective_max": 7.584751529066125e-06, "adam_stats/lr_effective_mean": -7.989467594449184e-12, "adam_stats/lr_effective_min": -7.938259841466788e-06, "adam_stats/m_t_max": 0.00037072712439112365, "adam_stats/m_t_mean": -7.054257351868376e-12, "adam_stats/m_t_min": -0.00046389229828491807, "adam_stats/v_t_max": 2.2322969016386196e-05, "adam_stats/v_t_mean": 1.9831471534392797e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.06704483926296234, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.976349115371704, "all_logprobs": -0.03245178610086441, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.125, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.00311279296875, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.0751953125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.0482412613928318, "clip_ratio": 0.0, "completion_length": 517.2135620117188, "completion_length/correct": 461.9038391113281, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 427.0, "completion_length/correct/min": 131.0, "completion_length/correct/p25": 324.75, "completion_length/correct/p75": 566.0, "completion_length/correct/var": 34822.640625, "completion_length/incorrect": 756.888916015625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 792.0, "completion_length/incorrect/min": 252.0, "completion_length/incorrect/p25": 542.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 65303.671875, "completion_length/max": 1024.0, "completion_length/median": 458.0, "completion_length/min": 131.0, "completion_length/p25": 344.0, "completion_length/p75": 658.75, "completion_length/var": 53733.79296875, "epoch": 1.0256, "feature_vector_variance/max_squared_error": 160952.1875, "feature_vector_variance/metric": 28400.240234375, "generated_tokens/total": 34118576.0, "grad_norm": 0.09122723340988159, "grouped_std_rewards": 0.16364824771881104, "learning_rate": 1.7546666766076658e-06, "loss": -0.067, "mean_logprobs": -0.032470703125, "mean_logprobs/var": 0.0004444122314453125, "num_completions/total": 61440, "per_sentence_gradient_norm": 1.57900071144104, "per_sentence_gradient_norm/max": 116.95780944824219, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 46.397037506103516, "per_sentence_gradient_norm/var": 87.06217193603516, "per_token_feature_norm": 196.4755401611328, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 77.5, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 449.421630859375, "per_token_full_gradient_variance/max_squared_error": 82.57443237304688, "per_token_full_gradient_variance/variance": 0.024124013260006905, "per_token_gradient_norm": 2.0392327308654785, "per_token_gradient_norm/max": 5939.56396484375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2468.70556640625, "per_token_policy_error_norm": 0.018475327640771866, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.017209505662322044, "policy_entropy": 0.03537244349718094, "policy_entropy/max": 3.46875, "policy_entropy/median": 1.1408701539039612e-08, "policy_entropy/min": 1.3213713977167085e-18, "policy_entropy/p25": 1.446096575818956e-10, "policy_entropy/p75": 4.231929779052734e-06, "policy_entropy/var": 0.022030631080269814, "policy_error_vector_variance/max_squared_error": 2.0028445720672607, "policy_error_vector_variance/metric": 0.018465179949998856, "policy_loss": -0.06704483926296234, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.976349115371704, "policy_sharpness": 9.150017738342285, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.192208290100098, "reward": 0.8125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.152542382478714, "rewards/accuracy_reward": 0.8125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.152542382478714, "sentence_full_gradient_variance/max_squared_error": 882715.625, "sentence_full_gradient_variance/metric": 3411.725341796875, "sentence_full_gradient_variance/p75": 35.47242736816406, "sentence_full_gradient_variance/p90": 93.32637786865234, "sentence_full_gradient_variance/p95": 93.32637786865234, "sentence_full_gradient_variance/p99": 92318.171875, "state_level_variance/metric": 8.778456687927246, "state_level_variance_full_gradient/metric": 402.31585693359375, "step": 80 }, { "accuracy_reward": 0.7721354365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1761716902256012, "action_level_variance/metric": 57.73186492919922, "action_level_variance_full_gradient/metric": 1170.2166748046875, "adam_stats/lr_effective_max": 7.43958298698999e-06, "adam_stats/lr_effective_mean": -6.919872660993143e-12, "adam_stats/lr_effective_min": -7.4437898547330406e-06, "adam_stats/m_t_max": 0.0004122371901758015, "adam_stats/m_t_mean": -5.415182625229109e-12, "adam_stats/m_t_min": -0.00046556827146559954, "adam_stats/v_t_max": 2.23018359974958e-05, "adam_stats/v_t_mean": 1.981846110832297e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.09068338572978973, "advantages/max": 5.795430660247803, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.079622745513916, "all_logprobs": -0.029549673199653625, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -12.125, "all_logprobs/p1": -0.828125, "all_logprobs/p10": -0.0021820068359375, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04163314774632454, "clip_ratio": 0.0, "completion_length": 538.1159057617188, "completion_length/correct": 464.2344055175781, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 409.0, "completion_length/correct/min": 116.0, "completion_length/correct/p25": 306.0, "completion_length/correct/p75": 606.0, "completion_length/correct/var": 39810.44140625, "completion_length/incorrect": 788.4685668945312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 792.0, "completion_length/incorrect/min": 47.0, "completion_length/incorrect/p25": 619.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 43451.05859375, "completion_length/max": 1024.0, "completion_length/median": 479.0, "completion_length/min": 47.0, "completion_length/p25": 329.75, "completion_length/p75": 717.25, "completion_length/var": 59104.984375, "epoch": 1.0384, "feature_vector_variance/max_squared_error": 141646.671875, "feature_vector_variance/metric": 27981.74609375, "generated_tokens/total": 34531848.0, "grad_norm": 0.1131129264831543, "grouped_std_rewards": 0.1389230340719223, "learning_rate": 1.5899193479495858e-06, "loss": 0.0907, "mean_logprobs": -0.0306396484375, "mean_logprobs/var": 0.000324249267578125, "num_completions/total": 62208, "per_sentence_gradient_norm": 1.1777269840240479, "per_sentence_gradient_norm/max": 101.63443756103516, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 35.12853240966797, "per_sentence_gradient_norm/var": 56.41828918457031, "per_token_feature_norm": 196.72259521484375, "per_token_feature_norm/max": 298.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 60.0, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 409.9010925292969, "per_token_full_gradient_variance/max_squared_error": 159.40811157226562, "per_token_full_gradient_variance/variance": 0.024091195315122604, "per_token_gradient_norm": 1.4967259168624878, "per_token_gradient_norm/max": 4829.89697265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1901.1129150390625, "per_token_policy_error_norm": 0.017096536234021187, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015677494928240776, "policy_entropy": 0.03269781544804573, "policy_entropy/max": 3.8125, "policy_entropy/median": 9.89530235528946e-09, "policy_entropy/min": 1.212951180468158e-18, "policy_entropy/p25": 1.1732481652870774e-10, "policy_entropy/p75": 3.2782554626464844e-06, "policy_entropy/var": 0.01859026774764061, "policy_error_vector_variance/max_squared_error": 2.007674217224121, "policy_error_vector_variance/metric": 0.017088670283555984, "policy_loss": 0.09068337082862854, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -5.795430660247803, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.079622745513916, "policy_sharpness": 9.186427116394043, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.9793925285339355, "reward": 0.7721354365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1761716902256012, "rewards/accuracy_reward": 0.7721354365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1761716902256012, "sentence_full_gradient_variance/max_squared_error": 153320.375, "sentence_full_gradient_variance/metric": 1323.10791015625, "sentence_full_gradient_variance/p75": 23.44948959350586, "sentence_full_gradient_variance/p90": 79.7909164428711, "sentence_full_gradient_variance/p95": 79.7909164428711, "sentence_full_gradient_variance/p99": 53101.28125, "state_level_variance/metric": 5.890804767608643, "state_level_variance_full_gradient/metric": 152.8912353515625, "step": 81 }, { "accuracy_reward": 0.7877604365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1674119383096695, "action_level_variance/metric": 90.58407592773438, "action_level_variance_full_gradient/metric": 4599.7275390625, "adam_stats/lr_effective_max": 6.804763870604802e-06, "adam_stats/lr_effective_mean": 2.7729730404241204e-12, "adam_stats/lr_effective_min": -6.512456820928492e-06, "adam_stats/m_t_max": 0.00023599903215654194, "adam_stats/m_t_mean": -8.02264851773593e-12, "adam_stats/m_t_min": -0.0002885104331653565, "adam_stats/v_t_max": 2.227959703304805e-05, "adam_stats/v_t_mean": 1.9808629563022873e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.020529991015791893, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.6594252586364746, "all_logprobs": -0.028749605640769005, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.1875, "all_logprobs/p1": -0.828125, "all_logprobs/p10": -0.00193023681640625, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.0595703125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.040290266275405884, "clip_ratio": 0.0, "completion_length": 501.13934326171875, "completion_length/correct": 441.3355407714844, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 391.0, "completion_length/correct/min": 106.0, "completion_length/correct/p25": 307.0, "completion_length/correct/p75": 545.0, "completion_length/correct/var": 34548.99609375, "completion_length/incorrect": 723.1104125976562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 778.0, "completion_length/incorrect/min": 104.0, "completion_length/incorrect/p25": 460.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 83993.21875, "completion_length/max": 1024.0, "completion_length/median": 433.0, "completion_length/min": 104.0, "completion_length/p25": 318.0, "completion_length/p75": 628.25, "completion_length/var": 58239.2109375, "epoch": 1.0512, "feature_vector_variance/max_squared_error": 136805.875, "feature_vector_variance/metric": 28178.9140625, "generated_tokens/total": 34916724.0, "grad_norm": 0.13081279397010803, "grouped_std_rewards": 0.17170998454093933, "learning_rate": 1.432372542187895e-06, "loss": -0.0205, "mean_logprobs": -0.0303955078125, "mean_logprobs/var": 0.0004367828369140625, "num_completions/total": 62976, "per_sentence_gradient_norm": 1.6043403148651123, "per_sentence_gradient_norm/max": 111.388427734375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 45.87720489501953, "per_sentence_gradient_norm/var": 88.12491607666016, "per_token_feature_norm": 196.44100952148438, "per_token_feature_norm/max": 294.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 73.5, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 461.7132873535156, "per_token_full_gradient_variance/max_squared_error": 253.4027099609375, "per_token_full_gradient_variance/variance": 0.026221664622426033, "per_token_gradient_norm": 1.8258357048034668, "per_token_gradient_norm/max": 5441.63671875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2150.41064453125, "per_token_policy_error_norm": 0.016620798036456108, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01513983216136694, "policy_entropy": 0.032032083719968796, "policy_entropy/max": 3.59375, "policy_entropy/median": 1.0069925338029861e-08, "policy_entropy/min": 3.8455295805345235e-19, "policy_entropy/p25": 1.2823875294998288e-10, "policy_entropy/p75": 3.11434268951416e-06, "policy_entropy/var": 0.01799621246755123, "policy_error_vector_variance/max_squared_error": 2.0064733028411865, "policy_error_vector_variance/metric": 0.016612738370895386, "policy_loss": -0.02052999474108219, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.6594252586364746, "policy_sharpness": 9.19926643371582, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.912602424621582, "reward": 0.7877604365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1674119383096695, "rewards/accuracy_reward": 0.7877604365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1674119383096695, "sentence_full_gradient_variance/max_squared_error": 1029661.3125, "sentence_full_gradient_variance/metric": 5144.5244140625, "sentence_full_gradient_variance/p75": 141.2679901123047, "sentence_full_gradient_variance/p90": 540.9395141601562, "sentence_full_gradient_variance/p95": 540.9395141601562, "sentence_full_gradient_variance/p99": 162752.28125, "state_level_variance/metric": 8.841197967529297, "state_level_variance_full_gradient/metric": 544.7977905273438, "step": 82 }, { "accuracy_reward": 0.8294271230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14166226983070374, "action_level_variance/metric": 28.923137664794922, "action_level_variance_full_gradient/metric": 2930.829833984375, "adam_stats/lr_effective_max": 6.071923053241335e-06, "adam_stats/lr_effective_mean": 1.2807282144533527e-11, "adam_stats/lr_effective_min": -5.978995886835037e-06, "adam_stats/m_t_max": 0.0002681314945220947, "adam_stats/m_t_mean": -4.8714626083024015e-12, "adam_stats/m_t_min": -0.00028659903910011053, "adam_stats/v_t_max": 2.2257876480580308e-05, "adam_stats/v_t_mean": 1.97953806124751e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.003776390105485916, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.4040011167526245, "all_logprobs": -0.027399959042668343, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.1875, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.0013275146484375, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.048583984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.039946556091308594, "clip_ratio": 0.0, "completion_length": 504.7981872558594, "completion_length/correct": 445.945068359375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 411.0, "completion_length/correct/min": 115.0, "completion_length/correct/p25": 301.0, "completion_length/correct/p75": 549.0, "completion_length/correct/var": 37022.3984375, "completion_length/incorrect": 790.9771118164062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 877.0, "completion_length/incorrect/min": 319.0, "completion_length/incorrect/p25": 568.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 60904.63671875, "completion_length/max": 1024.0, "completion_length/median": 444.0, "completion_length/min": 115.0, "completion_length/p25": 325.5, "completion_length/p75": 612.5, "completion_length/var": 57886.4453125, "epoch": 1.064, "feature_vector_variance/max_squared_error": 143077.5625, "feature_vector_variance/metric": 27758.048828125, "generated_tokens/total": 35304408.0, "grad_norm": 0.12468641996383667, "grouped_std_rewards": 0.12625354528427124, "learning_rate": 1.282218205837188e-06, "loss": 0.0038, "mean_logprobs": -0.0284423828125, "mean_logprobs/var": 0.0002841949462890625, "num_completions/total": 63744, "per_sentence_gradient_norm": 0.8549712896347046, "per_sentence_gradient_norm/max": 61.64815139770508, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 29.513216018676758, "per_sentence_gradient_norm/var": 28.228918075561523, "per_token_feature_norm": 196.88172912597656, "per_token_feature_norm/max": 304.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 441.9793395996094, "per_token_full_gradient_variance/max_squared_error": 55.28718185424805, "per_token_full_gradient_variance/variance": 0.011727241799235344, "per_token_gradient_norm": 1.028143048286438, "per_token_gradient_norm/max": 3279.25537109375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 979.095703125, "per_token_policy_error_norm": 0.0158176701515913, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.014688625000417233, "policy_entropy": 0.029963770881295204, "policy_entropy/max": 3.78125, "policy_entropy/median": 8.905772119760513e-09, "policy_entropy/min": 2.6291902682773483e-18, "policy_entropy/p25": 1.2460077414289117e-10, "policy_entropy/p75": 2.0265579223632812e-06, "policy_entropy/var": 0.01703691855072975, "policy_error_vector_variance/max_squared_error": 2.004276990890503, "policy_error_vector_variance/metric": 0.01580706797540188, "policy_loss": 0.003776390105485916, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.4040011167526245, "policy_sharpness": 9.238099098205566, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.670091152191162, "reward": 0.8294271230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14166226983070374, "rewards/accuracy_reward": 0.8294271230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14166226983070374, "sentence_full_gradient_variance/max_squared_error": 689756.375, "sentence_full_gradient_variance/metric": 3318.005615234375, "sentence_full_gradient_variance/p75": 38.6873664855957, "sentence_full_gradient_variance/p90": 117.32637023925781, "sentence_full_gradient_variance/p95": 117.32637023925781, "sentence_full_gradient_variance/p99": 50789.6484375, "state_level_variance/metric": 2.914778470993042, "state_level_variance_full_gradient/metric": 387.1756286621094, "step": 83 }, { "accuracy_reward": 0.8098958730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15416531264781952, "action_level_variance/metric": 38.647552490234375, "action_level_variance_full_gradient/metric": 1487.351318359375, "adam_stats/lr_effective_max": 5.438349035102874e-06, "adam_stats/lr_effective_mean": 1.521032431950342e-11, "adam_stats/lr_effective_min": -5.3445146477315575e-06, "adam_stats/m_t_max": 0.00029543033451773226, "adam_stats/m_t_mean": -1.6606166311841641e-12, "adam_stats/m_t_min": -0.0002600968291517347, "adam_stats/v_t_max": 2.2236776203499176e-05, "adam_stats/v_t_mean": 1.978437596042437e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.02780270203948021, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.4712846279144287, "all_logprobs": -0.0281668733805418, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -14.125, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.0013275146484375, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.048583984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04085457697510719, "clip_ratio": 0.0, "completion_length": 523.1823120117188, "completion_length/correct": 447.519287109375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 404.0, "completion_length/correct/min": 112.0, "completion_length/correct/p25": 297.25, "completion_length/correct/p75": 580.0, "completion_length/correct/var": 39458.84765625, "completion_length/incorrect": 845.5274047851562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 968.0, "completion_length/incorrect/min": 284.0, "completion_length/incorrect/p25": 679.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 47557.90625, "completion_length/max": 1024.0, "completion_length/median": 453.0, "completion_length/min": 112.0, "completion_length/p25": 315.75, "completion_length/p75": 685.25, "completion_length/var": 65359.90625, "epoch": 1.0768, "feature_vector_variance/max_squared_error": 145975.1875, "feature_vector_variance/metric": 27987.060546875, "generated_tokens/total": 35706212.0, "grad_norm": 0.1549830436706543, "grouped_std_rewards": 0.10735715925693512, "learning_rate": 1.1396392788268054e-06, "loss": -0.0278, "mean_logprobs": -0.0283203125, "mean_logprobs/var": 0.000518798828125, "num_completions/total": 64512, "per_sentence_gradient_norm": 0.8441437482833862, "per_sentence_gradient_norm/max": 98.03079223632812, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 25.445579528808594, "per_sentence_gradient_norm/var": 37.98443603515625, "per_token_feature_norm": 195.0069122314453, "per_token_feature_norm/max": 306.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 70.0, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 439.74639892578125, "per_token_full_gradient_variance/max_squared_error": 106.23310852050781, "per_token_full_gradient_variance/variance": 0.017112793400883675, "per_token_gradient_norm": 1.2688864469528198, "per_token_gradient_norm/max": 5034.634765625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1631.1092529296875, "per_token_policy_error_norm": 0.016239378601312637, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.014981604181230068, "policy_entropy": 0.030890818685293198, "policy_entropy/max": 3.71875, "policy_entropy/median": 1.0826624929904938e-08, "policy_entropy/min": 2.625802136488331e-19, "policy_entropy/p25": 1.382431946694851e-10, "policy_entropy/p75": 2.6971101760864258e-06, "policy_entropy/var": 0.018298355862498283, "policy_error_vector_variance/max_squared_error": 2.002779960632324, "policy_error_vector_variance/metric": 0.016231363639235497, "policy_loss": -0.02780270203948021, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958683013916016, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.4712846279144287, "policy_sharpness": 9.235997200012207, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.703773498535156, "reward": 0.8098958730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15416531264781952, "rewards/accuracy_reward": 0.8098958730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15416531264781952, "sentence_full_gradient_variance/max_squared_error": 564258.25, "sentence_full_gradient_variance/metric": 1667.6448974609375, "sentence_full_gradient_variance/p75": 62.354164123535156, "sentence_full_gradient_variance/p90": 151.02622985839844, "sentence_full_gradient_variance/p95": 151.02622985839844, "sentence_full_gradient_variance/p99": 44710.31640625, "state_level_variance/metric": 4.161716938018799, "state_level_variance_full_gradient/metric": 180.29367065429688, "step": 84 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16514559090137482, "action_level_variance/metric": 52.004638671875, "action_level_variance_full_gradient/metric": 2207.140869140625, "adam_stats/lr_effective_max": 5.112356120662298e-06, "adam_stats/lr_effective_mean": 1.641554774056697e-11, "adam_stats/lr_effective_min": -4.853202881349716e-06, "adam_stats/m_t_max": 0.0002567797200754285, "adam_stats/m_t_mean": 2.245405033782233e-12, "adam_stats/m_t_min": -0.0003624950477387756, "adam_stats/v_t_max": 2.2214539058040828e-05, "adam_stats/v_t_mean": 1.9772495273018276e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.04308555647730827, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.6907334327697754, "all_logprobs": -0.027023766189813614, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.625, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.00138092041015625, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.048583984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.037613626569509506, "clip_ratio": 0.0, "completion_length": 530.1744995117188, "completion_length/correct": 445.3009948730469, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 400.0, "completion_length/correct/min": 106.0, "completion_length/correct/p25": 307.0, "completion_length/correct/p75": 539.25, "completion_length/correct/var": 37364.55078125, "completion_length/incorrect": 852.6937866210938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 962.0, "completion_length/incorrect/min": 210.0, "completion_length/incorrect/p25": 771.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 50898.87109375, "completion_length/max": 1024.0, "completion_length/median": 456.0, "completion_length/min": 106.0, "completion_length/p25": 332.75, "completion_length/p75": 692.0, "completion_length/var": 67530.53125, "epoch": 1.0896, "feature_vector_variance/max_squared_error": 145282.25, "feature_vector_variance/metric": 27851.369140625, "generated_tokens/total": 36113388.0, "grad_norm": 0.13637347519397736, "grouped_std_rewards": 0.1268601268529892, "learning_rate": 1.0048094716167097e-06, "loss": 0.0431, "mean_logprobs": -0.0274658203125, "mean_logprobs/var": 0.0002498626708984375, "num_completions/total": 65280, "per_sentence_gradient_norm": 1.114296793937683, "per_sentence_gradient_norm/max": 103.17479705810547, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 38.15834045410156, "per_sentence_gradient_norm/var": 50.829158782958984, "per_token_feature_norm": 195.79791259765625, "per_token_feature_norm/max": 300.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 419.455078125, "per_token_full_gradient_variance/max_squared_error": 192.4973907470703, "per_token_full_gradient_variance/variance": 0.024260634556412697, "per_token_gradient_norm": 1.5253345966339111, "per_token_gradient_norm/max": 5109.63330078125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2143.70166015625, "per_token_policy_error_norm": 0.01560838334262371, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.014351770281791687, "policy_entropy": 0.03018096648156643, "policy_entropy/max": 3.375, "policy_entropy/median": 9.255018085241318e-09, "policy_entropy/min": 9.215718466126788e-19, "policy_entropy/p25": 1.1186784831807017e-10, "policy_entropy/p75": 2.5331974029541016e-06, "policy_entropy/var": 0.017033914104104042, "policy_error_vector_variance/max_squared_error": 2.0034494400024414, "policy_error_vector_variance/metric": 0.015598185360431671, "policy_loss": 0.04308555647730827, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.690732717514038, "policy_sharpness": 9.234465599060059, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.689882755279541, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16514559090137482, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16514559090137482, "sentence_full_gradient_variance/max_squared_error": 973724.0, "sentence_full_gradient_variance/metric": 2498.41357421875, "sentence_full_gradient_variance/p75": 37.76551818847656, "sentence_full_gradient_variance/p90": 107.89083862304688, "sentence_full_gradient_variance/p95": 107.89083862304688, "sentence_full_gradient_variance/p99": 32758.193359375, "state_level_variance/metric": 5.314279556274414, "state_level_variance_full_gradient/metric": 291.27264404296875, "step": 85 }, { "accuracy_reward": 0.7643229365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18036825954914093, "action_level_variance/metric": 38.712554931640625, "action_level_variance_full_gradient/metric": 1373.9085693359375, "adam_stats/lr_effective_max": 4.1195803532900754e-06, "adam_stats/lr_effective_mean": 1.9044247082100618e-11, "adam_stats/lr_effective_min": -3.859626303892583e-06, "adam_stats/m_t_max": 0.0002682266931515187, "adam_stats/m_t_mean": 3.4691808887404862e-12, "adam_stats/m_t_min": -0.00032362292404286563, "adam_stats/v_t_max": 2.219233829237055e-05, "adam_stats/v_t_mean": 1.975689360375621e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.08716358989477158, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.488162636756897, "all_logprobs": -0.02969014085829258, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.375, "all_logprobs/p1": -0.83984375, "all_logprobs/p10": -0.001953125, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.042942777276039124, "clip_ratio": 0.0, "completion_length": 539.12109375, "completion_length/correct": 465.4957275390625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 406.0, "completion_length/correct/min": 88.0, "completion_length/correct/p25": 294.0, "completion_length/correct/p75": 616.5, "completion_length/correct/var": 46971.703125, "completion_length/incorrect": 777.8950805664062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 798.0, "completion_length/incorrect/min": 257.0, "completion_length/incorrect/p25": 582.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 54941.81640625, "completion_length/max": 1024.0, "completion_length/median": 471.0, "completion_length/min": 88.0, "completion_length/p25": 327.0, "completion_length/p75": 727.25, "completion_length/var": 66383.6328125, "epoch": 1.1024, "feature_vector_variance/max_squared_error": 157313.0625, "feature_vector_variance/metric": 28044.10546875, "generated_tokens/total": 36527432.0, "grad_norm": 0.09365833550691605, "grouped_std_rewards": 0.1394900679588318, "learning_rate": 8.778930535580476e-07, "loss": 0.0872, "mean_logprobs": -0.029541015625, "mean_logprobs/var": 0.0004177093505859375, "num_completions/total": 66048, "per_sentence_gradient_norm": 1.0068674087524414, "per_sentence_gradient_norm/max": 84.47734832763672, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 28.396408081054688, "per_sentence_gradient_norm/var": 37.747920989990234, "per_token_feature_norm": 195.5677032470703, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 77.0, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 429.7409973144531, "per_token_full_gradient_variance/max_squared_error": 117.85619354248047, "per_token_full_gradient_variance/variance": 0.014473500661551952, "per_token_gradient_norm": 1.303521752357483, "per_token_gradient_norm/max": 3490.340087890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1242.1754150390625, "per_token_policy_error_norm": 0.017002558335661888, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015637267380952835, "policy_entropy": 0.03301960602402687, "policy_entropy/max": 3.6875, "policy_entropy/median": 1.2514647096395493e-08, "policy_entropy/min": 2.744386749103933e-19, "policy_entropy/p25": 1.3369572116062045e-10, "policy_entropy/p75": 4.082918167114258e-06, "policy_entropy/var": 0.019863251596689224, "policy_error_vector_variance/max_squared_error": 2.0015645027160645, "policy_error_vector_variance/metric": 0.016982946544885635, "policy_loss": 0.08716358244419098, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.4881625175476074, "policy_sharpness": 9.186837196350098, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.992346286773682, "reward": 0.7643229365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18036825954914093, "rewards/accuracy_reward": 0.7643229365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18036825954914093, "sentence_full_gradient_variance/max_squared_error": 454919.78125, "sentence_full_gradient_variance/metric": 1552.325439453125, "sentence_full_gradient_variance/p75": 39.93497848510742, "sentence_full_gradient_variance/p90": 48.46435546875, "sentence_full_gradient_variance/p95": 48.46435546875, "sentence_full_gradient_variance/p99": 43917.69140625, "state_level_variance/metric": 3.865553379058838, "state_level_variance_full_gradient/metric": 178.41673278808594, "step": 86 }, { "accuracy_reward": 0.7669271230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17898298799991608, "action_level_variance/metric": 143.537841796875, "action_level_variance_full_gradient/metric": 6719.12060546875, "adam_stats/lr_effective_max": 3.4853264878620394e-06, "adam_stats/lr_effective_mean": 1.474498995124307e-11, "adam_stats/lr_effective_min": -3.532250502757961e-06, "adam_stats/m_t_max": 0.0003246151318307966, "adam_stats/m_t_mean": 3.724079204736608e-12, "adam_stats/m_t_min": -0.0003237299097236246, "adam_stats/v_t_max": 2.2170150259626098e-05, "adam_stats/v_t_mean": 1.975005879326086e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.012872045859694481, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.135019063949585, "all_logprobs": -0.02732694521546364, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.6875, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.00150299072265625, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.048583984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03788808733224869, "clip_ratio": 0.0, "completion_length": 545.7565307617188, "completion_length/correct": 472.17657470703125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 411.0, "completion_length/correct/min": 134.0, "completion_length/correct/p25": 316.0, "completion_length/correct/p75": 604.0, "completion_length/correct/var": 42942.62109375, "completion_length/incorrect": 787.8714599609375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 913.0, "completion_length/incorrect/min": 221.0, "completion_length/incorrect/p25": 517.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 72035.90625, "completion_length/max": 1024.0, "completion_length/median": 475.0, "completion_length/min": 134.0, "completion_length/p25": 338.0, "completion_length/p75": 725.0, "completion_length/var": 67476.4296875, "epoch": 1.1152, "feature_vector_variance/max_squared_error": 146211.453125, "feature_vector_variance/metric": 27599.00390625, "generated_tokens/total": 36946572.0, "grad_norm": 0.1586829423904419, "grouped_std_rewards": 0.14111018180847168, "learning_rate": 7.59044652756249e-07, "loss": 0.0129, "mean_logprobs": -0.0284423828125, "mean_logprobs/var": 0.0002689361572265625, "num_completions/total": 66816, "per_sentence_gradient_norm": 1.5641368627548218, "per_sentence_gradient_norm/max": 198.0440216064453, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 34.13520431518555, "per_sentence_gradient_norm/var": 141.27525329589844, "per_token_feature_norm": 196.03868103027344, "per_token_feature_norm/max": 306.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 75.0, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 404.642333984375, "per_token_full_gradient_variance/max_squared_error": 130.1831817626953, "per_token_full_gradient_variance/variance": 0.03107142634689808, "per_token_gradient_norm": 1.7487787008285522, "per_token_gradient_norm/max": 5831.9375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3145.035888671875, "per_token_policy_error_norm": 0.015919292345643044, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.014578931033611298, "policy_entropy": 0.03028734028339386, "policy_entropy/max": 3.8125, "policy_entropy/median": 1.0477378964424133e-08, "policy_entropy/min": 1.3688052427629493e-18, "policy_entropy/p25": 1.318767317570746e-10, "policy_entropy/p75": 2.726912498474121e-06, "policy_entropy/var": 0.016871685162186623, "policy_error_vector_variance/max_squared_error": 2.003575325012207, "policy_error_vector_variance/metric": 0.015912646427750587, "policy_loss": 0.012872045859694481, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.135019063949585, "policy_sharpness": 9.227245330810547, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.721079349517822, "reward": 0.7669271230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17898298799991608, "rewards/accuracy_reward": 0.7669271230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17898298799991608, "sentence_full_gradient_variance/max_squared_error": 2216214.0, "sentence_full_gradient_variance/metric": 7604.42138671875, "sentence_full_gradient_variance/p75": 81.20115661621094, "sentence_full_gradient_variance/p90": 230.4092254638672, "sentence_full_gradient_variance/p95": 230.4092254638672, "sentence_full_gradient_variance/p99": 100942.578125, "state_level_variance/metric": 15.658818244934082, "state_level_variance_full_gradient/metric": 885.3014526367188, "step": 87 }, { "accuracy_reward": 0.8046875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1573704481124878, "action_level_variance/metric": 56.2296142578125, "action_level_variance_full_gradient/metric": 2230.073486328125, "adam_stats/lr_effective_max": 2.9373175038927e-06, "adam_stats/lr_effective_mean": 1.3277794662369669e-11, "adam_stats/lr_effective_min": -2.9140146580175497e-06, "adam_stats/m_t_max": 0.000271275028353557, "adam_stats/m_t_mean": 4.222273572440649e-12, "adam_stats/m_t_min": -0.0003013704845216125, "adam_stats/v_t_max": 2.2148265998112038e-05, "adam_stats/v_t_mean": 1.9732774442227097e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0524759404361248, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.5659246444702148, "all_logprobs": -0.029239073395729065, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -15.25, "all_logprobs/p1": -0.8359375, "all_logprobs/p10": -0.00193023681640625, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04234690964221954, "clip_ratio": 0.0, "completion_length": 515.7357177734375, "completion_length/correct": 448.8527526855469, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 409.0, "completion_length/correct/min": 125.0, "completion_length/correct/p25": 319.0, "completion_length/correct/p75": 515.75, "completion_length/correct/var": 36391.8046875, "completion_length/incorrect": 791.2933349609375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 920.0, "completion_length/incorrect/min": 241.0, "completion_length/incorrect/p25": 567.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 67915.640625, "completion_length/max": 1024.0, "completion_length/median": 434.0, "completion_length/min": 125.0, "completion_length/p25": 336.0, "completion_length/p75": 663.0, "completion_length/var": 60922.41796875, "epoch": 1.1280000000000001, "feature_vector_variance/max_squared_error": 152899.515625, "feature_vector_variance/metric": 27787.693359375, "generated_tokens/total": 37342656.0, "grad_norm": 0.07969631999731064, "grouped_std_rewards": 0.1335289478302002, "learning_rate": 6.484090676804927e-07, "loss": -0.0525, "mean_logprobs": -0.029541015625, "mean_logprobs/var": 0.00030517578125, "num_completions/total": 67584, "per_sentence_gradient_norm": 1.0752084255218506, "per_sentence_gradient_norm/max": 106.4428939819336, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 27.970417022705078, "per_sentence_gradient_norm/var": 55.14533615112305, "per_token_feature_norm": 196.3902130126953, "per_token_feature_norm/max": 312.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 71.0, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 408.5749206542969, "per_token_full_gradient_variance/max_squared_error": 183.71495056152344, "per_token_full_gradient_variance/variance": 0.016878562048077583, "per_token_gradient_norm": 1.3241322040557861, "per_token_gradient_norm/max": 5430.6572265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1759.436767578125, "per_token_policy_error_norm": 0.016874678432941437, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015651468187570572, "policy_entropy": 0.032073598355054855, "policy_entropy/max": 3.578125, "policy_entropy/median": 1.2514647096395493e-08, "policy_entropy/min": 3.1170812458958252e-19, "policy_entropy/p25": 1.482476363889873e-10, "policy_entropy/p75": 3.933906555175781e-06, "policy_entropy/var": 0.018071642145514488, "policy_error_vector_variance/max_squared_error": 2.0026166439056396, "policy_error_vector_variance/metric": 0.016865000128746033, "policy_loss": -0.052475955337285995, "policy_loss/max": 9.659050941467285, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.5659246444702148, "policy_sharpness": 9.197389602661133, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.893955230712891, "reward": 0.8046875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1573704481124878, "rewards/accuracy_reward": 0.8046875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1573704481124878, "sentence_full_gradient_variance/max_squared_error": 426586.21875, "sentence_full_gradient_variance/metric": 2493.7109375, "sentence_full_gradient_variance/p75": 57.773494720458984, "sentence_full_gradient_variance/p90": 291.614013671875, "sentence_full_gradient_variance/p95": 291.614013671875, "sentence_full_gradient_variance/p99": 57454.0703125, "state_level_variance/metric": 5.934444904327393, "state_level_variance_full_gradient/metric": 263.637451171875, "step": 88 }, { "accuracy_reward": 0.81640625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15008249878883362, "action_level_variance/metric": 32.08196258544922, "action_level_variance_full_gradient/metric": 2375.000244140625, "adam_stats/lr_effective_max": 2.4150551780621754e-06, "adam_stats/lr_effective_mean": 1.0274545970967441e-11, "adam_stats/lr_effective_min": -2.3421787318511633e-06, "adam_stats/m_t_max": 0.0002461193362250924, "adam_stats/m_t_mean": 4.07371402236234e-12, "adam_stats/m_t_min": -0.0002557839034125209, "adam_stats/v_t_max": 2.2126117983134463e-05, "adam_stats/v_t_mean": 1.9713315181635327e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.02947486937046051, "advantages/max": 9.659051895141602, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.4412994384765625, "all_logprobs": -0.02671622484922409, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.3125, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.00116729736328125, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.0458984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.037428222596645355, "clip_ratio": 0.0, "completion_length": 497.3099060058594, "completion_length/correct": 451.91387939453125, "completion_length/correct/max": 1022.0, "completion_length/correct/median": 433.0, "completion_length/correct/min": 74.0, "completion_length/correct/p25": 320.0, "completion_length/correct/p75": 547.5, "completion_length/correct/var": 31968.708984375, "completion_length/incorrect": 699.1773071289062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 696.0, "completion_length/incorrect/min": 183.0, "completion_length/incorrect/p25": 431.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 80397.0703125, "completion_length/max": 1024.0, "completion_length/median": 448.0, "completion_length/min": 74.0, "completion_length/p25": 333.0, "completion_length/p75": 623.0, "completion_length/var": 49942.55078125, "epoch": 1.1408, "feature_vector_variance/max_squared_error": 138380.078125, "feature_vector_variance/metric": 27744.62109375, "generated_tokens/total": 37724592.0, "grad_norm": 0.02809053845703602, "grouped_std_rewards": 0.10175129026174545, "learning_rate": 5.461210907490952e-07, "loss": -0.0295, "mean_logprobs": -0.027587890625, "mean_logprobs/var": 0.0002498626708984375, "num_completions/total": 68352, "per_sentence_gradient_norm": 0.747717559337616, "per_sentence_gradient_norm/max": 85.26595306396484, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 22.0987548828125, "per_sentence_gradient_norm/var": 31.563980102539062, "per_token_feature_norm": 196.59835815429688, "per_token_feature_norm/max": 286.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 76.5, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 437.2625427246094, "per_token_full_gradient_variance/max_squared_error": 102.55608367919922, "per_token_full_gradient_variance/variance": 0.013565638102591038, "per_token_gradient_norm": 0.8770765066146851, "per_token_gradient_norm/max": 5372.05029296875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1099.005126953125, "per_token_policy_error_norm": 0.015535247512161732, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.014372516423463821, "policy_entropy": 0.029407083988189697, "policy_entropy/max": 3.359375, "policy_entropy/median": 7.62520357966423e-09, "policy_entropy/min": 1.2366681029912785e-19, "policy_entropy/p25": 1.0049916454590857e-10, "policy_entropy/p75": 1.9222497940063477e-06, "policy_entropy/var": 0.016144974157214165, "policy_error_vector_variance/max_squared_error": 2.0027592182159424, "policy_error_vector_variance/metric": 0.015529937110841274, "policy_loss": -0.02947486750781536, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.4412997961044312, "policy_sharpness": 9.255793571472168, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.595062732696533, "reward": 0.81640625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15008249878883362, "rewards/accuracy_reward": 0.81640625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15008249878883362, "sentence_full_gradient_variance/max_squared_error": 544951.125, "sentence_full_gradient_variance/metric": 2688.2607421875, "sentence_full_gradient_variance/p75": 63.63618469238281, "sentence_full_gradient_variance/p90": 119.47059631347656, "sentence_full_gradient_variance/p95": 119.47059631347656, "sentence_full_gradient_variance/p99": 50559.05859375, "state_level_variance/metric": 3.4874916076660156, "state_level_variance_full_gradient/metric": 313.26043701171875, "step": 89 }, { "accuracy_reward": 0.7942708730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16361773014068604, "action_level_variance/metric": 93.67059326171875, "action_level_variance_full_gradient/metric": 2069.257080078125, "adam_stats/lr_effective_max": 1.8009864106716122e-06, "adam_stats/lr_effective_mean": 7.474522736861111e-12, "adam_stats/lr_effective_min": -1.7661024003245984e-06, "adam_stats/m_t_max": 0.00021778666996397078, "adam_stats/m_t_mean": 3.5247672836025146e-12, "adam_stats/m_t_min": -0.0002561454602982849, "adam_stats/v_t_max": 2.2104188246885315e-05, "adam_stats/v_t_mean": 1.9694391516916765e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.002959365723654628, "advantages/max": 12.9586820602417, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.6576592922210693, "all_logprobs": -0.027044668793678284, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.75, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.00116729736328125, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.04638671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03838534653186798, "clip_ratio": 0.0, "completion_length": 544.17578125, "completion_length/correct": 484.1098327636719, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 439.0, "completion_length/correct/min": 176.0, "completion_length/correct/p25": 341.0, "completion_length/correct/p75": 585.75, "completion_length/correct/var": 39318.265625, "completion_length/incorrect": 776.0759887695312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 828.0, "completion_length/incorrect/min": 199.0, "completion_length/incorrect/p25": 546.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 61855.97265625, "completion_length/max": 1024.0, "completion_length/median": 478.0, "completion_length/min": 176.0, "completion_length/p25": 362.75, "completion_length/p75": 695.5, "completion_length/var": 57827.796875, "epoch": 1.1536, "feature_vector_variance/max_squared_error": 152420.65625, "feature_vector_variance/metric": 27865.806640625, "generated_tokens/total": 38142520.0, "grad_norm": 0.03805512189865112, "grouped_std_rewards": 0.15628549456596375, "learning_rate": 4.5230534410568764e-07, "loss": 0.003, "mean_logprobs": -0.0281982421875, "mean_logprobs/var": 0.0002956390380859375, "num_completions/total": 69120, "per_sentence_gradient_norm": 1.2936160564422607, "per_sentence_gradient_norm/max": 212.2674560546875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 32.14263916015625, "per_sentence_gradient_norm/var": 92.11710357666016, "per_token_feature_norm": 196.38839721679688, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 72.0, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 417.8927307128906, "per_token_full_gradient_variance/max_squared_error": 310.24407958984375, "per_token_full_gradient_variance/variance": 0.026550086215138435, "per_token_gradient_norm": 1.686395287513733, "per_token_gradient_norm/max": 6708.10400390625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3002.8935546875, "per_token_policy_error_norm": 0.015624125488102436, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.014388036914169788, "policy_entropy": 0.029855409637093544, "policy_entropy/max": 3.625, "policy_entropy/median": 9.42964106798172e-09, "policy_entropy/min": 4.0996394647108136e-19, "policy_entropy/p25": 1.2460077414289117e-10, "policy_entropy/p75": 2.2798776626586914e-06, "policy_entropy/var": 0.01736637018620968, "policy_error_vector_variance/max_squared_error": 2.005350351333618, "policy_error_vector_variance/metric": 0.015612726099789143, "policy_loss": 0.0029593706130981445, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.6576592922210693, "policy_sharpness": 9.252580642700195, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.597815990447998, "reward": 0.7942708730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16361773014068604, "rewards/accuracy_reward": 0.7942708730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16361773014068604, "sentence_full_gradient_variance/max_squared_error": 367803.75, "sentence_full_gradient_variance/metric": 2325.22900390625, "sentence_full_gradient_variance/p75": 68.28144836425781, "sentence_full_gradient_variance/p90": 161.3927459716797, "sentence_full_gradient_variance/p95": 161.3927459716797, "sentence_full_gradient_variance/p99": 48708.9296875, "state_level_variance/metric": 10.14101791381836, "state_level_variance_full_gradient/metric": 255.9719696044922, "step": 90 }, { "accuracy_reward": 0.7643229365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18036825954914093, "action_level_variance/metric": 68.4593505859375, "action_level_variance_full_gradient/metric": 3418.065185546875, "adam_stats/lr_effective_max": 1.5775098063386395e-06, "adam_stats/lr_effective_mean": 6.482772218346922e-12, "adam_stats/lr_effective_min": -1.522703541922965e-06, "adam_stats/m_t_max": 0.0002817636705003679, "adam_stats/m_t_mean": 3.926607303195162e-12, "adam_stats/m_t_min": -0.0003553581773303449, "adam_stats/v_t_max": 2.2087115212343633e-05, "adam_stats/v_t_mean": 1.968693003756572e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.05486205220222473, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.298250675201416, "all_logprobs": -0.03193388506770134, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.78125, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.002471923828125, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.06298828125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.048084158450365067, "clip_ratio": 0.0, "completion_length": 516.9166870117188, "completion_length/correct": 448.62689208984375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 415.0, "completion_length/correct/min": 111.0, "completion_length/correct/p25": 300.0, "completion_length/correct/p75": 567.0, "completion_length/correct/var": 37000.63671875, "completion_length/incorrect": 738.3867797851562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 722.0, "completion_length/incorrect/min": 129.0, "completion_length/incorrect/p25": 548.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 60092.51171875, "completion_length/max": 1024.0, "completion_length/median": 467.0, "completion_length/min": 111.0, "completion_length/p25": 328.5, "completion_length/p75": 676.25, "completion_length/var": 57515.46875, "epoch": 1.1663999999999999, "feature_vector_variance/max_squared_error": 149110.546875, "feature_vector_variance/metric": 28183.564453125, "generated_tokens/total": 38539512.0, "grad_norm": 0.17483805119991302, "grouped_std_rewards": 0.14843714237213135, "learning_rate": 3.6707612778634855e-07, "loss": -0.0549, "mean_logprobs": -0.032470703125, "mean_logprobs/var": 0.000812530517578125, "num_completions/total": 69888, "per_sentence_gradient_norm": 1.3659261465072632, "per_sentence_gradient_norm/max": 107.27810668945312, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 45.13869094848633, "per_sentence_gradient_norm/var": 66.68041229248047, "per_token_feature_norm": 196.35646057128906, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 70.0, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 441.8651428222656, "per_token_full_gradient_variance/max_squared_error": 220.16990661621094, "per_token_full_gradient_variance/variance": 0.024517182260751724, "per_token_gradient_norm": 1.6850732564926147, "per_token_gradient_norm/max": 5603.849609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2276.2177734375, "per_token_policy_error_norm": 0.01810491271317005, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01677706092596054, "policy_entropy": 0.034816134721040726, "policy_entropy/max": 3.8125, "policy_entropy/median": 1.123407855629921e-08, "policy_entropy/min": 3.9471735342050396e-19, "policy_entropy/p25": 1.191438059322536e-10, "policy_entropy/p75": 3.635883331298828e-06, "policy_entropy/var": 0.022179152816534042, "policy_error_vector_variance/max_squared_error": 2.006716728210449, "policy_error_vector_variance/metric": 0.01808885484933853, "policy_loss": -0.05486205965280533, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.298250436782837, "policy_sharpness": 9.169217109680176, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.122412204742432, "reward": 0.7643229365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18036825954914093, "rewards/accuracy_reward": 0.7643229365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18036825954914093, "sentence_full_gradient_variance/max_squared_error": 623840.875, "sentence_full_gradient_variance/metric": 3864.35546875, "sentence_full_gradient_variance/p75": 27.25602912902832, "sentence_full_gradient_variance/p90": 271.04888916015625, "sentence_full_gradient_variance/p95": 271.04888916015625, "sentence_full_gradient_variance/p99": 91671.8984375, "state_level_variance/metric": 6.762103080749512, "state_level_variance_full_gradient/metric": 446.2899169921875, "step": 91 }, { "accuracy_reward": 0.8333333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.13906998932361603, "action_level_variance/metric": 49.035701751708984, "action_level_variance_full_gradient/metric": 579.0643920898438, "adam_stats/lr_effective_max": 1.3660818467542413e-06, "adam_stats/lr_effective_mean": 4.469571848048082e-12, "adam_stats/lr_effective_min": -1.3826737585986848e-06, "adam_stats/m_t_max": 0.00024099879374261945, "adam_stats/m_t_mean": 3.191822240539155e-12, "adam_stats/m_t_min": -0.0003409939236007631, "adam_stats/v_t_max": 2.206509634561371e-05, "adam_stats/v_t_mean": 1.9669179479597787e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.04086063802242279, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.8699793815612793, "all_logprobs": -0.025867275893688202, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.375, "all_logprobs/p1": -0.7578125, "all_logprobs/p10": -0.00116729736328125, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.041015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03531525656580925, "clip_ratio": 0.0, "completion_length": 509.90106201171875, "completion_length/correct": 426.6156311035156, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 372.0, "completion_length/correct/min": 83.0, "completion_length/correct/p25": 291.0, "completion_length/correct/p75": 500.5, "completion_length/correct/var": 39986.37890625, "completion_length/incorrect": 926.328125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 194.0, "completion_length/incorrect/p25": 834.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 27472.28515625, "completion_length/max": 1024.0, "completion_length/median": 421.0, "completion_length/min": 83.0, "completion_length/p25": 312.0, "completion_length/p75": 689.5, "completion_length/var": 72589.6875, "epoch": 1.1792, "feature_vector_variance/max_squared_error": 148455.5, "feature_vector_variance/metric": 27869.30859375, "generated_tokens/total": 38931116.0, "grad_norm": 0.060980647802352905, "grouped_std_rewards": 0.08911971747875214, "learning_rate": 2.905372804626083e-07, "loss": -0.0409, "mean_logprobs": -0.027099609375, "mean_logprobs/var": 0.0002269744873046875, "num_completions/total": 70656, "per_sentence_gradient_norm": 0.7020925283432007, "per_sentence_gradient_norm/max": 147.70693969726562, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 19.437387466430664, "per_sentence_gradient_norm/var": 48.606056213378906, "per_token_feature_norm": 195.8765869140625, "per_token_feature_norm/max": 308.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 77.5, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 405.5885925292969, "per_token_full_gradient_variance/max_squared_error": 361.23101806640625, "per_token_full_gradient_variance/variance": 0.023665284737944603, "per_token_gradient_norm": 1.2122199535369873, "per_token_gradient_norm/max": 6791.607421875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2365.48974609375, "per_token_policy_error_norm": 0.015038774348795414, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.013682480901479721, "policy_entropy": 0.029090238735079765, "policy_entropy/max": 3.8125, "policy_entropy/median": 1.0360963642597198e-08, "policy_entropy/min": 2.439454888092385e-19, "policy_entropy/p25": 1.3642420526593924e-10, "policy_entropy/p75": 2.6971101760864258e-06, "policy_entropy/var": 0.016109097748994827, "policy_error_vector_variance/max_squared_error": 2.0051844120025635, "policy_error_vector_variance/metric": 0.01503008883446455, "policy_loss": -0.04086063802242279, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.8699795007705688, "policy_sharpness": 9.254340171813965, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.563468933105469, "reward": 0.8333333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.13906998932361603, "rewards/accuracy_reward": 0.8333333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.13906998932361603, "sentence_full_gradient_variance/max_squared_error": 132302.96875, "sentence_full_gradient_variance/metric": 650.3529052734375, "sentence_full_gradient_variance/p75": 23.79503059387207, "sentence_full_gradient_variance/p90": 45.63874435424805, "sentence_full_gradient_variance/p95": 45.63874435424805, "sentence_full_gradient_variance/p99": 8610.0009765625, "state_level_variance/metric": 5.695861339569092, "state_level_variance_full_gradient/metric": 71.28851318359375, "step": 92 }, { "accuracy_reward": 0.7747396230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17474567890167236, "action_level_variance/metric": 38.678306579589844, "action_level_variance_full_gradient/metric": 823.4767456054688, "adam_stats/lr_effective_max": 1.0214297390120919e-06, "adam_stats/lr_effective_mean": 3.984232648662767e-12, "adam_stats/lr_effective_min": -1.0089909210364567e-06, "adam_stats/m_t_max": 0.00018538675794843584, "adam_stats/m_t_mean": 3.5763926542475843e-12, "adam_stats/m_t_min": -0.00026545010041445494, "adam_stats/v_t_max": 2.2043179342290387e-05, "adam_stats/v_t_mean": 1.965127496492136e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.05480830371379852, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.4677362442016602, "all_logprobs": -0.0303953904658556, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.9375, "all_logprobs/p1": -0.87109375, "all_logprobs/p10": -0.002471923828125, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.062255859375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.043318722397089005, "clip_ratio": 0.0, "completion_length": 513.6979370117188, "completion_length/correct": 445.35125732421875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 403.0, "completion_length/correct/min": 153.0, "completion_length/correct/p25": 305.0, "completion_length/correct/p75": 562.5, "completion_length/correct/var": 33869.1015625, "completion_length/incorrect": 748.7630004882812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 785.0, "completion_length/incorrect/min": 149.0, "completion_length/incorrect/p25": 539.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 64975.3671875, "completion_length/max": 1024.0, "completion_length/median": 465.0, "completion_length/min": 149.0, "completion_length/p25": 319.0, "completion_length/p75": 651.25, "completion_length/var": 56887.39453125, "epoch": 1.192, "feature_vector_variance/max_squared_error": 150321.234375, "feature_vector_variance/metric": 28293.95703125, "generated_tokens/total": 39325636.0, "grad_norm": 0.06754282116889954, "grouped_std_rewards": 0.11073581874370575, "learning_rate": 2.2278205293002645e-07, "loss": 0.0548, "mean_logprobs": -0.031982421875, "mean_logprobs/var": 0.000858306884765625, "num_completions/total": 71424, "per_sentence_gradient_norm": 0.9126266241073608, "per_sentence_gradient_norm/max": 65.44204711914062, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 38.70530319213867, "per_sentence_gradient_norm/var": 37.89475631713867, "per_token_feature_norm": 195.41693115234375, "per_token_feature_norm/max": 312.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 435.4613952636719, "per_token_full_gradient_variance/max_squared_error": 66.32959747314453, "per_token_full_gradient_variance/variance": 0.017028125002980232, "per_token_gradient_norm": 1.377332091331482, "per_token_gradient_norm/max": 3735.7451171875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1607.640380859375, "per_token_policy_error_norm": 0.0174521803855896, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015812797471880913, "policy_entropy": 0.03417279198765755, "policy_entropy/max": 3.671875, "policy_entropy/median": 1.3154931366443634e-08, "policy_entropy/min": 1.2366681029912785e-19, "policy_entropy/p25": 1.5006662579253316e-10, "policy_entropy/p75": 4.827976226806641e-06, "policy_entropy/var": 0.02038382925093174, "policy_error_vector_variance/max_squared_error": 2.004011869430542, "policy_error_vector_variance/metric": 0.017437396571040154, "policy_loss": 0.054808299988508224, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.4677362442016602, "policy_sharpness": 9.161359786987305, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.1466803550720215, "reward": 0.7747396230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17474567890167236, "rewards/accuracy_reward": 0.7747396230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17474567890167236, "sentence_full_gradient_variance/max_squared_error": 157758.703125, "sentence_full_gradient_variance/metric": 932.0711669921875, "sentence_full_gradient_variance/p75": 16.32276725769043, "sentence_full_gradient_variance/p90": 22.648447036743164, "sentence_full_gradient_variance/p95": 22.648447036743164, "sentence_full_gradient_variance/p99": 37880.89453125, "state_level_variance/metric": 4.0440263748168945, "state_level_variance_full_gradient/metric": 108.5942611694336, "step": 93 }, { "accuracy_reward": 0.828125, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14251956343650818, "action_level_variance/metric": 79.48138427734375, "action_level_variance_full_gradient/metric": 1585.8314208984375, "adam_stats/lr_effective_max": 6.905211193952709e-07, "adam_stats/lr_effective_mean": 4.2757884906319266e-12, "adam_stats/lr_effective_min": -6.845161806268152e-07, "adam_stats/m_t_max": 0.00020604286692105234, "adam_stats/m_t_mean": 7.022387879529468e-12, "adam_stats/m_t_min": -0.00023165714810602367, "adam_stats/v_t_max": 2.2021238692104816e-05, "adam_stats/v_t_mean": 1.963906034324614e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.008596561849117279, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.0695366859436035, "all_logprobs": -0.0280531644821167, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.5, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.00150299072265625, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.048583984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.041200846433639526, "clip_ratio": 0.0, "completion_length": 492.6614685058594, "completion_length/correct": 444.14306640625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 421.0, "completion_length/correct/min": 87.0, "completion_length/correct/p25": 304.0, "completion_length/correct/p75": 560.0, "completion_length/correct/var": 33422.8203125, "completion_length/incorrect": 726.4318237304688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 797.0, "completion_length/incorrect/min": 250.0, "completion_length/incorrect/p25": 451.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 83490.328125, "completion_length/max": 1024.0, "completion_length/median": 445.0, "completion_length/min": 87.0, "completion_length/p25": 313.0, "completion_length/p75": 613.25, "completion_length/var": 53287.484375, "epoch": 1.2048, "feature_vector_variance/max_squared_error": 152560.40625, "feature_vector_variance/metric": 27994.818359375, "generated_tokens/total": 39704000.0, "grad_norm": 0.12969200313091278, "grouped_std_rewards": 0.09798028320074081, "learning_rate": 1.6389299449645734e-07, "loss": -0.0086, "mean_logprobs": -0.0281982421875, "mean_logprobs/var": 0.00031280517578125, "num_completions/total": 72192, "per_sentence_gradient_norm": 0.8366098403930664, "per_sentence_gradient_norm/max": 224.83224487304688, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 21.177059173583984, "per_sentence_gradient_norm/var": 78.88418579101562, "per_token_feature_norm": 195.41287231445312, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 66.5, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 420.3399658203125, "per_token_full_gradient_variance/max_squared_error": 324.02532958984375, "per_token_full_gradient_variance/variance": 0.02013252303004265, "per_token_gradient_norm": 1.1647013425827026, "per_token_gradient_norm/max": 6705.78466796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2292.361572265625, "per_token_policy_error_norm": 0.016075102612376213, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01496710442006588, "policy_entropy": 0.030857164412736893, "policy_entropy/max": 3.671875, "policy_entropy/median": 1.0011717677116394e-08, "policy_entropy/min": 1.1722935989999517e-18, "policy_entropy/p25": 1.191438059322536e-10, "policy_entropy/p75": 3.129243850708008e-06, "policy_entropy/var": 0.01806401088833809, "policy_error_vector_variance/max_squared_error": 2.0042724609375, "policy_error_vector_variance/metric": 0.01606266386806965, "policy_loss": -0.00859655998647213, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -7.48191499710083, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.0695366859436035, "policy_sharpness": 9.227049827575684, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.738064289093018, "reward": 0.828125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14251956343650818, "rewards/accuracy_reward": 0.828125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14251956343650818, "sentence_full_gradient_variance/max_squared_error": 563480.75, "sentence_full_gradient_variance/metric": 1799.72265625, "sentence_full_gradient_variance/p75": 17.74240493774414, "sentence_full_gradient_variance/p90": 42.57767868041992, "sentence_full_gradient_variance/p95": 42.57767868041992, "sentence_full_gradient_variance/p99": 39385.921875, "state_level_variance/metric": 9.332470893859863, "state_level_variance_full_gradient/metric": 213.8914031982422, "step": 94 }, { "accuracy_reward": 0.7760416865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17402757704257965, "action_level_variance/metric": 67.2182846069336, "action_level_variance_full_gradient/metric": 2565.11181640625, "adam_stats/lr_effective_max": 4.975449883204419e-07, "adam_stats/lr_effective_mean": 2.6239806768385554e-12, "adam_stats/lr_effective_min": -4.873015768680489e-07, "adam_stats/m_t_max": 0.0001975806662812829, "adam_stats/m_t_mean": 5.315533603555966e-12, "adam_stats/m_t_min": -0.00017053518968168646, "adam_stats/v_t_max": 2.2000634999130853e-05, "adam_stats/v_t_mean": 1.9621643719547333e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.018862014636397362, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 1.8505131006240845, "all_logprobs": -0.029462696984410286, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.25, "all_logprobs/p1": -0.83203125, "all_logprobs/p10": -0.00183868408203125, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.056396484375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.043496835976839066, "clip_ratio": 0.0, "completion_length": 539.6315307617188, "completion_length/correct": 458.45806884765625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 421.0, "completion_length/correct/min": 110.0, "completion_length/correct/p25": 330.5, "completion_length/correct/p75": 548.25, "completion_length/correct/var": 32447.49609375, "completion_length/incorrect": 820.906982421875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1016.0, "completion_length/incorrect/min": 186.0, "completion_length/incorrect/p25": 610.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 67588.78125, "completion_length/max": 1024.0, "completion_length/median": 459.0, "completion_length/min": 110.0, "completion_length/p25": 350.0, "completion_length/p75": 684.0, "completion_length/var": 63101.68359375, "epoch": 1.2176, "feature_vector_variance/max_squared_error": 153851.703125, "feature_vector_variance/metric": 28155.90234375, "generated_tokens/total": 40118436.0, "grad_norm": 0.07021607458591461, "grouped_std_rewards": 0.1342749297618866, "learning_rate": 1.1394185240843985e-07, "loss": 0.0189, "mean_logprobs": -0.0301513671875, "mean_logprobs/var": 0.00061798095703125, "num_completions/total": 72960, "per_sentence_gradient_norm": 1.0441372394561768, "per_sentence_gradient_norm/max": 172.20687866210938, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 27.708738327026367, "per_sentence_gradient_norm/var": 66.21429443359375, "per_token_feature_norm": 195.74996948242188, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 73.0, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 451.0323181152344, "per_token_full_gradient_variance/max_squared_error": 562.7159423828125, "per_token_full_gradient_variance/variance": 0.020014068111777306, "per_token_gradient_norm": 1.3700031042099, "per_token_gradient_norm/max": 6733.1552734375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2175.005126953125, "per_token_policy_error_norm": 0.016804691404104233, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015470690093934536, "policy_entropy": 0.032631609588861465, "policy_entropy/max": 3.828125, "policy_entropy/median": 1.123407855629921e-08, "policy_entropy/min": 1.2400562347802957e-18, "policy_entropy/p25": 1.355147105641663e-10, "policy_entropy/p75": 3.2186508178710938e-06, "policy_entropy/var": 0.01990281604230404, "policy_error_vector_variance/max_squared_error": 2.002206563949585, "policy_error_vector_variance/metric": 0.016789237037301064, "policy_loss": 0.01886202208697796, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.8505128622055054, "policy_sharpness": 9.202552795410156, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.921722412109375, "reward": 0.7760416865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17402757704257965, "rewards/accuracy_reward": 0.7760416865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17402757704257965, "sentence_full_gradient_variance/max_squared_error": 1318124.25, "sentence_full_gradient_variance/metric": 2903.3447265625, "sentence_full_gradient_variance/p75": 30.977537155151367, "sentence_full_gradient_variance/p90": 146.05255126953125, "sentence_full_gradient_variance/p95": 146.05255126953125, "sentence_full_gradient_variance/p99": 37441.2734375, "state_level_variance/metric": 7.389033794403076, "state_level_variance_full_gradient/metric": 338.23297119140625, "step": 95 }, { "accuracy_reward": 0.7942708730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16361773014068604, "action_level_variance/metric": 120.13545989990234, "action_level_variance_full_gradient/metric": 2584.577880859375, "adam_stats/lr_effective_max": 2.9939675982859626e-07, "adam_stats/lr_effective_mean": 1.5493246876413513e-12, "adam_stats/lr_effective_min": -3.033204336588824e-07, "adam_stats/m_t_max": 0.0002022366679739207, "adam_stats/m_t_mean": 6.429426869375421e-12, "adam_stats/m_t_min": -0.0002347347472095862, "adam_stats/v_t_max": 2.1979603843647055e-05, "adam_stats/v_t_mean": 1.9607017832240503e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.11035484075546265, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.8678925037384033, "all_logprobs": -0.030396586284041405, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.0, "all_logprobs/p1": -0.87890625, "all_logprobs/p10": -0.002471923828125, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04415137320756912, "clip_ratio": 0.0, "completion_length": 472.6224060058594, "completion_length/correct": 395.93115234375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 358.0, "completion_length/correct/min": 113.0, "completion_length/correct/p25": 280.25, "completion_length/correct/p75": 469.75, "completion_length/correct/var": 29218.4921875, "completion_length/incorrect": 768.7088623046875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 905.0, "completion_length/incorrect/min": 168.0, "completion_length/incorrect/p25": 538.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 84335.75, "completion_length/max": 1024.0, "completion_length/median": 394.0, "completion_length/min": 113.0, "completion_length/p25": 296.0, "completion_length/p75": 561.0, "completion_length/var": 63199.39453125, "epoch": 1.2304, "feature_vector_variance/max_squared_error": 156552.21875, "feature_vector_variance/metric": 28012.642578125, "generated_tokens/total": 40481408.0, "grad_norm": 0.10897623002529144, "grouped_std_rewards": 0.09771482646465302, "learning_rate": 7.298948443822229e-08, "loss": 0.1104, "mean_logprobs": -0.0299072265625, "mean_logprobs/var": 0.0003337860107421875, "num_completions/total": 73728, "per_sentence_gradient_norm": 1.2698116302490234, "per_sentence_gradient_norm/max": 225.00650024414062, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 34.571876525878906, "per_sentence_gradient_norm/var": 118.67756652832031, "per_token_feature_norm": 196.3115997314453, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 437.1883239746094, "per_token_full_gradient_variance/max_squared_error": 301.9869384765625, "per_token_full_gradient_variance/variance": 0.04540349543094635, "per_token_gradient_norm": 2.0440378189086914, "per_token_gradient_norm/max": 6708.10400390625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4302.7001953125, "per_token_policy_error_norm": 0.017513709142804146, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01614418253302574, "policy_entropy": 0.033311378210783005, "policy_entropy/max": 3.609375, "policy_entropy/median": 1.501757651567459e-08, "policy_entropy/min": 3.6930636500287495e-19, "policy_entropy/p25": 1.673470251262188e-10, "policy_entropy/p75": 4.798173904418945e-06, "policy_entropy/var": 0.019601233303546906, "policy_error_vector_variance/max_squared_error": 2.005265712738037, "policy_error_vector_variance/metric": 0.017502907663583755, "policy_loss": 0.11035484075546265, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.8678925037384033, "policy_sharpness": 9.175540924072266, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.0303850173950195, "reward": 0.7942708730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16361773014068604, "rewards/accuracy_reward": 0.7942708730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16361773014068604, "sentence_full_gradient_variance/max_squared_error": 971041.625, "sentence_full_gradient_variance/metric": 2914.68896484375, "sentence_full_gradient_variance/p75": 19.942188262939453, "sentence_full_gradient_variance/p90": 267.2137451171875, "sentence_full_gradient_variance/p95": 267.2137451171875, "sentence_full_gradient_variance/p99": 51591.33203125, "state_level_variance/metric": 13.545610427856445, "state_level_variance_full_gradient/metric": 330.1107177734375, "step": 96 }, { "accuracy_reward": 0.80859375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15497168898582458, "action_level_variance/metric": 169.33407592773438, "action_level_variance_full_gradient/metric": 3207.869140625, "adam_stats/lr_effective_max": 1.7684654096683516e-07, "adam_stats/lr_effective_mean": 7.734475494965143e-13, "adam_stats/lr_effective_min": -1.7943710872714291e-07, "adam_stats/m_t_max": 0.0002824611437972635, "adam_stats/m_t_mean": 5.316150297751676e-12, "adam_stats/m_t_min": -0.0002358593192184344, "adam_stats/v_t_max": 2.19584908336401e-05, "adam_stats/v_t_mean": 1.9598430951034418e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.06858018040657043, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.742650270462036, "all_logprobs": -0.025921538472175598, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.625, "all_logprobs/p1": -0.7578125, "all_logprobs/p10": -0.0008697509765625, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.0380859375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03779463469982147, "clip_ratio": 0.0, "completion_length": 515.1575927734375, "completion_length/correct": 432.9726257324219, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 388.0, "completion_length/correct/min": 102.0, "completion_length/correct/p25": 297.0, "completion_length/correct/p75": 527.0, "completion_length/correct/var": 37992.828125, "completion_length/incorrect": 862.346923828125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 301.0, "completion_length/incorrect/p25": 715.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 50543.73828125, "completion_length/max": 1024.0, "completion_length/median": 441.0, "completion_length/min": 102.0, "completion_length/p25": 319.0, "completion_length/p75": 641.5, "completion_length/var": 68903.3203125, "epoch": 1.2432, "feature_vector_variance/max_squared_error": 153544.84375, "feature_vector_variance/metric": 27532.22265625, "generated_tokens/total": 40877052.0, "grad_norm": 0.1919463723897934, "grouped_std_rewards": 0.13415014743804932, "learning_rate": 4.108578473795033e-08, "loss": 0.0686, "mean_logprobs": -0.0272216796875, "mean_logprobs/var": 0.00029754638671875, "num_completions/total": 74496, "per_sentence_gradient_norm": 1.4361460208892822, "per_sentence_gradient_norm/max": 286.48187255859375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 28.63850975036621, "per_sentence_gradient_norm/var": 167.48965454101562, "per_token_feature_norm": 196.34426879882812, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 75.5, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 430.79412841796875, "per_token_full_gradient_variance/max_squared_error": 255.70272827148438, "per_token_full_gradient_variance/variance": 0.053180135786533356, "per_token_gradient_norm": 2.2794079780578613, "per_token_gradient_norm/max": 6819.44189453125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5417.72314453125, "per_token_policy_error_norm": 0.014911157079041004, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.013910171575844288, "policy_entropy": 0.028424404561519623, "policy_entropy/max": 3.6875, "policy_entropy/median": 6.373738870024681e-09, "policy_entropy/min": 5.353248226647178e-19, "policy_entropy/p25": 8.139977580867708e-11, "policy_entropy/p75": 1.601874828338623e-06, "policy_entropy/var": 0.01646890491247177, "policy_error_vector_variance/max_squared_error": 2.0023255348205566, "policy_error_vector_variance/metric": 0.014901043847203255, "policy_loss": 0.06858018040657043, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.7426509857177734, "policy_sharpness": 9.280298233032227, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.444155216217041, "reward": 0.80859375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15497168898582458, "rewards/accuracy_reward": 0.80859375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15497168898582458, "sentence_full_gradient_variance/max_squared_error": 752168.0, "sentence_full_gradient_variance/metric": 3618.0517578125, "sentence_full_gradient_variance/p75": 83.34222412109375, "sentence_full_gradient_variance/p90": 187.63308715820312, "sentence_full_gradient_variance/p95": 187.63308715820312, "sentence_full_gradient_variance/p99": 46808.3359375, "state_level_variance/metric": 19.305339813232422, "state_level_variance_full_gradient/metric": 410.1829528808594, "step": 97 }, { "accuracy_reward": 0.79296875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16438336670398712, "action_level_variance/metric": 79.26306915283203, "action_level_variance_full_gradient/metric": 3382.568115234375, "adam_stats/lr_effective_max": 7.171917815185225e-08, "adam_stats/lr_effective_mean": 3.314490338007098e-13, "adam_stats/lr_effective_min": -7.184538475257796e-08, "adam_stats/m_t_max": 0.0002470222534611821, "adam_stats/m_t_mean": 4.8050743071959e-12, "adam_stats/m_t_min": -0.00022831377282273024, "adam_stats/v_t_max": 2.19365356315393e-05, "adam_stats/v_t_mean": 1.957960052770269e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.0602160207927227, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.83127498626709, "all_logprobs": -0.029182234779000282, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.875, "all_logprobs/p1": -0.828125, "all_logprobs/p10": -0.00193023681640625, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.05313730239868164, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04324497655034065, "clip_ratio": 0.0, "completion_length": 509.93231201171875, "completion_length/correct": 456.6141357421875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 409.0, "completion_length/correct/min": 110.0, "completion_length/correct/p25": 299.0, "completion_length/correct/p75": 586.0, "completion_length/correct/var": 38421.46875, "completion_length/incorrect": 714.1509399414062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 730.0, "completion_length/incorrect/min": 116.0, "completion_length/incorrect/p25": 464.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 74778.8125, "completion_length/max": 1024.0, "completion_length/median": 451.0, "completion_length/min": 110.0, "completion_length/p25": 317.75, "completion_length/p75": 682.0, "completion_length/var": 56763.65625, "epoch": 1.256, "feature_vector_variance/max_squared_error": 154424.515625, "feature_vector_variance/metric": 27986.291015625, "generated_tokens/total": 41268680.0, "grad_norm": 0.0455927774310112, "grouped_std_rewards": 0.12651899456977844, "learning_rate": 1.8269623051318517e-08, "loss": 0.0602, "mean_logprobs": -0.031005859375, "mean_logprobs/var": 0.0004730224609375, "num_completions/total": 75264, "per_sentence_gradient_norm": 1.2216618061065674, "per_sentence_gradient_norm/max": 148.19847106933594, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 32.419673919677734, "per_sentence_gradient_norm/var": 77.87200164794922, "per_token_feature_norm": 195.8297576904297, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 74.5, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 424.8902587890625, "per_token_full_gradient_variance/max_squared_error": 214.61558532714844, "per_token_full_gradient_variance/variance": 0.027624240145087242, "per_token_gradient_norm": 1.6074069738388062, "per_token_gradient_norm/max": 5728.79541015625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2711.512451171875, "per_token_policy_error_norm": 0.01686340570449829, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01590263843536377, "policy_entropy": 0.03139704465866089, "policy_entropy/max": 3.390625, "policy_entropy/median": 1.1350493878126144e-08, "policy_entropy/min": 7.199780051661553e-20, "policy_entropy/p25": 1.2005330063402653e-10, "policy_entropy/p75": 3.7550926208496094e-06, "policy_entropy/var": 0.017732640728354454, "policy_error_vector_variance/max_squared_error": 2.002332925796509, "policy_error_vector_variance/metric": 0.01685638166964054, "policy_loss": 0.060216024518013, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.831275224685669, "policy_sharpness": 9.205327987670898, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.857627868652344, "reward": 0.79296875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16438336670398712, "rewards/accuracy_reward": 0.79296875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16438336670398712, "sentence_full_gradient_variance/max_squared_error": 710504.25, "sentence_full_gradient_variance/metric": 3832.139404296875, "sentence_full_gradient_variance/p75": 74.57160949707031, "sentence_full_gradient_variance/p90": 90.85758209228516, "sentence_full_gradient_variance/p95": 90.85758209228516, "sentence_full_gradient_variance/p99": 71643.921875, "state_level_variance/metric": 8.504009246826172, "state_level_variance_full_gradient/metric": 449.5709228515625, "step": 98 }, { "accuracy_reward": 0.7760416865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17402757704257965, "action_level_variance/metric": 257.63201904296875, "action_level_variance_full_gradient/metric": 1763.6556396484375, "adam_stats/lr_effective_max": 1.8432759318898206e-08, "adam_stats/lr_effective_mean": 8.646140986327822e-14, "adam_stats/lr_effective_min": -1.847510588959267e-08, "adam_stats/m_t_max": 0.00022396315762307495, "adam_stats/m_t_mean": 4.765787157273715e-12, "adam_stats/m_t_min": -0.0001944724644999951, "adam_stats/v_t_max": 2.191460953326896e-05, "adam_stats/v_t_mean": 1.9562014768464975e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.011481260880827904, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.221123695373535, "all_logprobs": -0.028029922395944595, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -14.75, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.0015106201171875, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.048583984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.0402788370847702, "clip_ratio": 0.0, "completion_length": 515.4388427734375, "completion_length/correct": 439.6358947753906, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 392.0, "completion_length/correct/min": 129.0, "completion_length/correct/p25": 321.75, "completion_length/correct/p75": 509.25, "completion_length/correct/var": 32606.046875, "completion_length/incorrect": 778.1046752929688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 842.0, "completion_length/incorrect/min": 251.0, "completion_length/incorrect/p25": 570.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 62867.38671875, "completion_length/max": 1024.0, "completion_length/median": 432.0, "completion_length/min": 129.0, "completion_length/p25": 342.75, "completion_length/p75": 629.25, "completion_length/var": 59246.98828125, "epoch": 1.2688, "feature_vector_variance/max_squared_error": 156290.390625, "feature_vector_variance/metric": 28028.927734375, "generated_tokens/total": 41664536.0, "grad_norm": 0.0730169266462326, "grouped_std_rewards": 0.08298343420028687, "learning_rate": 4.568797356781784e-09, "loss": -0.0115, "mean_logprobs": -0.02880859375, "mean_logprobs/var": 0.0003414154052734375, "num_completions/total": 76032, "per_sentence_gradient_norm": 1.297257900238037, "per_sentence_gradient_norm/max": 401.2535400390625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 27.473979949951172, "per_sentence_gradient_norm/var": 256.2828369140625, "per_token_feature_norm": 195.5924835205078, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 73.0, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 425.8218688964844, "per_token_full_gradient_variance/max_squared_error": 985.2333984375, "per_token_full_gradient_variance/variance": 0.04809734225273132, "per_token_gradient_norm": 2.1284847259521484, "per_token_gradient_norm/max": 8346.6259765625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5630.87646484375, "per_token_policy_error_norm": 0.016118964180350304, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.014794688671827316, "policy_entropy": 0.031188612803816795, "policy_entropy/max": 3.796875, "policy_entropy/median": 1.257285475730896e-08, "policy_entropy/min": 1.2061749168901237e-18, "policy_entropy/p25": 1.4279066817834973e-10, "policy_entropy/p75": 3.7401914596557617e-06, "policy_entropy/var": 0.018452171236276627, "policy_error_vector_variance/max_squared_error": 2.002683162689209, "policy_error_vector_variance/metric": 0.016108687967061996, "policy_loss": -0.011481260880827904, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.221123695373535, "policy_sharpness": 9.219609260559082, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.765659332275391, "reward": 0.7760416865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17402757704257965, "rewards/accuracy_reward": 0.7760416865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17402757704257965, "sentence_full_gradient_variance/max_squared_error": 793071.375, "sentence_full_gradient_variance/metric": 2001.16748046875, "sentence_full_gradient_variance/p75": 23.604286193847656, "sentence_full_gradient_variance/p90": 34.98870849609375, "sentence_full_gradient_variance/p95": 34.98870849609375, "sentence_full_gradient_variance/p99": 19335.27734375, "state_level_variance/metric": 30.842395782470703, "state_level_variance_full_gradient/metric": 237.5116424560547, "step": 99 }, { "accuracy_reward": 0.8190104365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1484256237745285, "action_level_variance/metric": 58.66514205932617, "action_level_variance_full_gradient/metric": 3066.75732421875, "adam_stats/lr_effective_max": -0.0, "adam_stats/lr_effective_mean": 0.0, "adam_stats/lr_effective_min": -0.0, "adam_stats/m_t_max": 0.00040696311043575406, "adam_stats/m_t_mean": 5.9885408264237494e-12, "adam_stats/m_t_min": -0.00020782486535608768, "adam_stats/v_t_max": 2.189270526287146e-05, "adam_stats/v_t_mean": 1.9548269253322204e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.050441108644008636, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.6711056232452393, "all_logprobs": -0.028130115941166878, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.25, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.00162506103515625, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.049560546875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.039417706429958344, "clip_ratio": 0.0, "completion_length": 508.60546875, "completion_length/correct": 461.4451599121094, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 423.0, "completion_length/correct/min": 101.0, "completion_length/correct/p25": 320.0, "completion_length/correct/p75": 565.0, "completion_length/correct/var": 38580.13671875, "completion_length/incorrect": 722.014404296875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 754.0, "completion_length/incorrect/min": 146.0, "completion_length/incorrect/p25": 497.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 76937.4765625, "completion_length/max": 1024.0, "completion_length/median": 465.0, "completion_length/min": 101.0, "completion_length/p25": 326.0, "completion_length/p75": 632.25, "completion_length/var": 55508.71484375, "epoch": 1.2816, "feature_vector_variance/max_squared_error": 137803.015625, "feature_vector_variance/metric": 27902.37109375, "generated_tokens/total": 42055144.0, "grad_norm": 0.12006210535764694, "grouped_std_rewards": 0.13712528347969055, "learning_rate": 0.0, "loss": -0.0504, "mean_logprobs": -0.029296875, "mean_logprobs/var": 0.00028228759765625, "num_completions/total": 76800, "per_sentence_gradient_norm": 1.1538474559783936, "per_sentence_gradient_norm/max": 105.17897033691406, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 35.603363037109375, "per_sentence_gradient_norm/var": 57.40851974487305, "per_token_feature_norm": 195.49586486816406, "per_token_feature_norm/max": 296.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 77.5, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 409.1693420410156, "per_token_full_gradient_variance/max_squared_error": 102.61132049560547, "per_token_full_gradient_variance/variance": 0.02449967712163925, "per_token_gradient_norm": 1.4395694732666016, "per_token_gradient_norm/max": 5004.6357421875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2094.911865234375, "per_token_policy_error_norm": 0.01629091240465641, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.014985471032559872, "policy_entropy": 0.031198062002658844, "policy_entropy/max": 3.59375, "policy_entropy/median": 1.1641532182693481e-08, "policy_entropy/min": 1.7618285302889447e-18, "policy_entropy/p25": 1.3642420526593924e-10, "policy_entropy/p75": 3.2782554626464844e-06, "policy_entropy/var": 0.017642108723521233, "policy_error_vector_variance/max_squared_error": 2.0027055740356445, "policy_error_vector_variance/metric": 0.016283942386507988, "policy_loss": -0.050441108644008636, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.6711056232452393, "policy_sharpness": 9.212113380432129, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.826419830322266, "reward": 0.8190104365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1484256237745285, "rewards/accuracy_reward": 0.8190104365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1484256237745285, "sentence_full_gradient_variance/max_squared_error": 717126.0625, "sentence_full_gradient_variance/metric": 3436.891357421875, "sentence_full_gradient_variance/p75": 139.37844848632812, "sentence_full_gradient_variance/p90": 194.73728942871094, "sentence_full_gradient_variance/p95": 194.73728942871094, "sentence_full_gradient_variance/p99": 88709.7421875, "state_level_variance/metric": 6.06495475769043, "state_level_variance_full_gradient/metric": 370.13427734375, "step": 100 }, { "adam_stats/lr_effective_max": -0.0, "adam_stats/lr_effective_mean": 0.0, "adam_stats/lr_effective_min": -0.0, "adam_stats/m_t_max": 0.00040696311043575406, "adam_stats/m_t_mean": 5.9885408264237494e-12, "adam_stats/m_t_min": -0.00020782486535608768, "adam_stats/v_t_max": 2.189270526287146e-05, "adam_stats/v_t_mean": 1.9548269253322204e-12, "adam_stats/v_t_min": 0.0, "epoch": 1.2816, "step": 100, "total_flos": 0.0, "train_loss": -0.0018190278578549624, "train_runtime": 72604.0469, "train_samples_per_second": 1.058, "train_steps_per_second": 0.001 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 24, "trial_name": null, "trial_params": null }