{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08, "eval_steps": 10, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1894736886024475, "action_level_variance/metric": 1138.5277099609375, "action_level_variance_full_gradient/metric": 4343.05322265625, "adam_stats/lr_effective_max": 4.743430508824531e-06, "adam_stats/lr_effective_mean": -6.267696084361063e-11, "adam_stats/lr_effective_min": -4.743435965792742e-06, "adam_stats/m_t_max": 0.008556028828024864, "adam_stats/m_t_mean": 1.3839424051198534e-10, "adam_stats/m_t_min": -0.011783753521740437, "adam_stats/v_t_max": 1.3885498447052669e-05, "adam_stats/v_t_mean": 4.901722474096037e-13, "adam_stats/v_t_min": 0.0, "advantages": -1.2417634698280722e-09, "advantages/max": 0.9680583477020264, "advantages/median": 0.36585545539855957, "advantages/min": -2.560988187789917, "advantages/p25": 0.032377004623413086, "advantages/p75": 0.5588920712471008, "advantages/var": 0.9469107985496521, "all_logprobs": -0.18518340587615967, "all_logprobs/max": 0.0, "all_logprobs/median": -9.5367431640625e-05, "all_logprobs/min": -10.0, "all_logprobs/p1": -2.828125, "all_logprobs/p10": -0.54296875, "all_logprobs/p25": -0.040283203125, "all_logprobs/p5": -1.1441402435302734, "all_logprobs/p75": -9.5367431640625e-07, "all_logprobs/var": 0.32445457577705383, "clip_ratio": 0.0, "completion_length": 498.04168701171875, "completion_length/correct": 472.84722900390625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 396.0, "completion_length/correct/min": 175.0, "completion_length/correct/p25": 327.0, "completion_length/correct/p75": 559.75, "completion_length/correct/var": 58331.45703125, "completion_length/incorrect": 573.625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 486.0, "completion_length/incorrect/min": 17.0, "completion_length/incorrect/p25": 119.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 167742.84375, "completion_length/max": 1024.0, "completion_length/median": 415.0, "completion_length/min": 17.0, "completion_length/p25": 316.5, "completion_length/p75": 661.25, "completion_length/var": 86130.84375, "epoch": 0.0008, "feature_vector_variance/max_squared_error": 96514.34375, "feature_vector_variance/metric": 25623.115234375, "generated_tokens/total": 47812.0, "grad_norm": 3.8121871948242188, "grouped_std_rewards": 0.42352813482284546, "learning_rate": 1.5e-06, "loss": -0.0, "mean_logprobs": -0.21484375, "mean_logprobs/var": 0.0186767578125, "num_completions/total": 96, "per_sentence_gradient_norm": 25.107540130615234, "per_sentence_gradient_norm/max": 215.31503295898438, "per_sentence_gradient_norm/median": 13.355048179626465, "per_sentence_gradient_norm/min": 4.699602127075195, "per_sentence_gradient_norm/p25": 10.303136825561523, "per_sentence_gradient_norm/p75": 22.499595642089844, "per_sentence_gradient_norm/p85": 40.8219108581543, "per_sentence_gradient_norm/p90": 50.60018539428711, "per_sentence_gradient_norm/p95": 78.6885986328125, "per_sentence_gradient_norm/p99": 180.45497131347656, "per_sentence_gradient_norm/var": 1098.900390625, "per_token_feature_norm": 169.11961364746094, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 161.0, "per_token_feature_norm/min": 62.5, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 2541.601318359375, "per_token_full_gradient_variance/max_squared_error": 16.105764389038086, "per_token_full_gradient_variance/variance": 0.03580053150653839, "per_token_gradient_norm": 21.46100616455078, "per_token_gradient_norm/max": 967.0531616210938, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4200.36865234375, "per_token_policy_error_norm": 0.0943555235862732, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.07768786698579788, "policy_entropy": 0.20558790862560272, "policy_entropy/max": 3.78125, "policy_entropy/median": 0.0010223388671875, "policy_entropy/min": 4.689582056016661e-13, "policy_entropy/p25": 1.5616416931152344e-05, "policy_entropy/p75": 0.1826171875, "policy_entropy/var": 0.17379343509674072, "policy_error_vector_variance/max_squared_error": 2.01072359085083, "policy_error_vector_variance/metric": 0.09376528859138489, "policy_loss": -6.208817460162663e-09, "policy_loss/max": 2.560988187789917, "policy_loss/median": -0.36585545539855957, "policy_loss/min": -0.9680584669113159, "policy_loss/p25": -0.5588920712471008, "policy_loss/p75": -0.032377004623413086, "policy_loss/var": 0.9469107985496521, "policy_sharpness": 6.696777820587158, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.46246337890625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 15.086150169372559, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.1894736886024475, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1894736886024475, "sentence_full_gradient_variance/max_squared_error": 82019.8359375, "sentence_full_gradient_variance/metric": 4381.30859375, "sentence_full_gradient_variance/p75": 1586.7591552734375, "sentence_full_gradient_variance/p90": 9744.365234375, "sentence_full_gradient_variance/p95": 18740.9296875, "sentence_full_gradient_variance/p99": 75460.6484375, "state_level_variance/metric": 24.100452423095703, "state_level_variance_full_gradient/metric": 38.25520324707031, "step": 1 }, { "accuracy_reward": 0.5833333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24561403691768646, "action_level_variance/metric": 582.4110107421875, "action_level_variance_full_gradient/metric": 3211.7275390625, "adam_stats/lr_effective_max": 1.2765935025527142e-05, "adam_stats/lr_effective_mean": -1.4229276090738097e-10, "adam_stats/lr_effective_min": -1.2765907740686089e-05, "adam_stats/m_t_max": 0.00647706026211381, "adam_stats/m_t_mean": 1.8809936247476955e-11, "adam_stats/m_t_min": -0.0056439596228301525, "adam_stats/v_t_max": 2.236978616565466e-05, "adam_stats/v_t_mean": 8.678278907124637e-13, "adam_stats/v_t_min": 0.0, "advantages": -7.450580596923828e-09, "advantages/max": 1.249750018119812, "advantages/median": 0.24990005791187286, "advantages/min": -3.7485008239746094, "advantages/p25": -0.18746250867843628, "advantages/p75": 0.5588920712471008, "advantages/var": 0.7890772819519043, "all_logprobs": -0.14142143726348877, "all_logprobs/max": 0.0, "all_logprobs/median": -8.940696716308594e-06, "all_logprobs/min": -10.125, "all_logprobs/p1": -2.46875, "all_logprobs/p10": -0.373046875, "all_logprobs/p25": -0.010009765625, "all_logprobs/p5": -0.8984375, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.23498055338859558, "clip_ratio": 0.0, "completion_length": 738.0521240234375, "completion_length/correct": 680.6607666015625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 771.0, "completion_length/correct/min": 127.0, "completion_length/correct/p25": 399.5, "completion_length/correct/p75": 958.0, "completion_length/correct/var": 90084.734375, "completion_length/incorrect": 818.4000244140625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 43.0, "completion_length/incorrect/p25": 669.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 81983.6328125, "completion_length/max": 1024.0, "completion_length/median": 790.0, "completion_length/min": 43.0, "completion_length/p25": 558.5, "completion_length/p75": 1024.0, "completion_length/var": 90470.5859375, "epoch": 0.0016, "feature_vector_variance/max_squared_error": 91466.5, "feature_vector_variance/metric": 24552.298828125, "generated_tokens/total": 118665.0, "grad_norm": 2.076301097869873, "grouped_std_rewards": 0.35385680198669434, "learning_rate": 3e-06, "loss": -0.0, "mean_logprobs": -0.154296875, "mean_logprobs/var": 0.0087890625, "num_completions/total": 192, "per_sentence_gradient_norm": 13.940669059753418, "per_sentence_gradient_norm/max": 210.4659423828125, "per_sentence_gradient_norm/median": 8.88543701171875, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 3.499828338623047, "per_sentence_gradient_norm/p75": 15.022483825683594, "per_sentence_gradient_norm/p85": 20.53899574279785, "per_sentence_gradient_norm/p90": 25.224660873413086, "per_sentence_gradient_norm/p95": 46.480369567871094, "per_sentence_gradient_norm/p99": 92.15412902832031, "per_sentence_gradient_norm/var": 618.0702514648438, "per_token_feature_norm": 158.70587158203125, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 148.0, "per_token_feature_norm/min": 62.25, "per_token_feature_norm/p25": 122.5, "per_token_feature_norm/p75": 188.0, "per_token_feature_norm/var": 2217.42529296875, "per_token_full_gradient_variance/max_squared_error": 2219.236328125, "per_token_full_gradient_variance/variance": 0.045789044350385666, "per_token_gradient_norm": 10.376666069030762, "per_token_gradient_norm/max": 1112.5433349609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1795.4515380859375, "per_token_policy_error_norm": 0.07382401078939438, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06187732517719269, "policy_entropy": 0.15884296596050262, "policy_entropy/max": 3.671875, "policy_entropy/median": 0.000118255615234375, "policy_entropy/min": 1.9895196601282805e-13, "policy_entropy/p25": 4.6193599700927734e-06, "policy_entropy/p75": 0.061279296875, "policy_entropy/var": 0.13023671507835388, "policy_error_vector_variance/max_squared_error": 2.014791965484619, "policy_error_vector_variance/metric": 0.0734901875257492, "policy_loss": -9.313225746154785e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": -0.24990005791187286, "policy_loss/min": -1.2497501373291016, "policy_loss/p25": -0.5588920712471008, "policy_loss/p75": 0.18746249377727509, "policy_loss/var": 0.7890772223472595, "policy_sharpness": 7.394958019256592, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.62109375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.574806213378906, "reward": 0.5833333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24561403691768646, "rewards/accuracy_reward": 0.5833333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24561403691768646, "sentence_full_gradient_variance/max_squared_error": 151481.5, "sentence_full_gradient_variance/metric": 3258.94189453125, "sentence_full_gradient_variance/p75": 1104.8408203125, "sentence_full_gradient_variance/p90": 3429.1416015625, "sentence_full_gradient_variance/p95": 7031.39404296875, "sentence_full_gradient_variance/p99": 42275.11328125, "state_level_variance/metric": 78.74612426757812, "state_level_variance_full_gradient/metric": 47.21476364135742, "step": 2 }, { "accuracy_reward": 0.4895833432674408, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.25252193212509155, "action_level_variance/metric": 451.6849365234375, "action_level_variance_full_gradient/metric": 2788.561279296875, "adam_stats/lr_effective_max": 2.23465176532045e-05, "adam_stats/lr_effective_mean": 2.378083624118843e-13, "adam_stats/lr_effective_min": -2.23561619350221e-05, "adam_stats/m_t_max": 0.007832624018192291, "adam_stats/m_t_mean": 1.280972411477066e-10, "adam_stats/m_t_min": -0.011331818997859955, "adam_stats/v_t_max": 3.2515446946490556e-05, "adam_stats/v_t_mean": 1.3475308108940043e-12, "adam_stats/v_t_min": 0.0, "advantages": 8.692344621863413e-09, "advantages/max": 0.7498500347137451, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": 0.0, "advantages/p75": 0.5588920712471008, "advantages/var": 0.6312869191169739, "all_logprobs": -0.16496773064136505, "all_logprobs/max": 0.0, "all_logprobs/median": -3.838539123535156e-05, "all_logprobs/min": -11.4375, "all_logprobs/p1": -2.90625, "all_logprobs/p10": -0.43124961853027344, "all_logprobs/p25": -0.015869140625, "all_logprobs/p5": -1.0546875, "all_logprobs/p75": -8.344650268554688e-07, "all_logprobs/var": 0.306011825799942, "clip_ratio": 0.0, "completion_length": 703.15625, "completion_length/correct": 587.3829345703125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 599.0, "completion_length/correct/min": 75.0, "completion_length/correct/p25": 434.0, "completion_length/correct/p75": 753.0, "completion_length/correct/var": 78344.9375, "completion_length/incorrect": 814.2040405273438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 130.0, "completion_length/incorrect/p25": 578.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 99146.9609375, "completion_length/max": 1024.0, "completion_length/median": 735.0, "completion_length/min": 75.0, "completion_length/p25": 505.5, "completion_length/p75": 1024.0, "completion_length/var": 101022.4609375, "epoch": 0.0024, "feature_vector_variance/max_squared_error": 90383.8125, "feature_vector_variance/metric": 24701.337890625, "generated_tokens/total": 186168.0, "grad_norm": 1.7529001235961914, "grouped_std_rewards": 0.2945820093154907, "learning_rate": 4.5e-06, "loss": -0.0, "mean_logprobs": -0.1962890625, "mean_logprobs/var": 0.0201416015625, "num_completions/total": 288, "per_sentence_gradient_norm": 15.979316711425781, "per_sentence_gradient_norm/max": 156.61935424804688, "per_sentence_gradient_norm/median": 7.687700271606445, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 16.931028366088867, "per_sentence_gradient_norm/p85": 33.955257415771484, "per_sentence_gradient_norm/p90": 37.722198486328125, "per_sentence_gradient_norm/p95": 55.351715087890625, "per_sentence_gradient_norm/p99": 143.74124145507812, "per_sentence_gradient_norm/var": 689.0007934570312, "per_token_feature_norm": 162.56959533691406, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 61.5, "per_token_feature_norm/p25": 123.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 2574.69580078125, "per_token_full_gradient_variance/max_squared_error": 10.352337837219238, "per_token_full_gradient_variance/variance": 0.015681849792599678, "per_token_gradient_norm": 10.191628456115723, "per_token_gradient_norm/max": 898.3866577148438, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1937.7078857421875, "per_token_policy_error_norm": 0.08221165090799332, "per_token_policy_error_norm/max": 1.9921875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06978306919336319, "policy_entropy": 0.18185774981975555, "policy_entropy/max": 3.5625, "policy_entropy/median": 0.0004596710205078125, "policy_entropy/min": 4.2810199829546036e-13, "policy_entropy/p25": 1.4662742614746094e-05, "policy_entropy/p75": 0.08935546875, "policy_entropy/var": 0.16851508617401123, "policy_error_vector_variance/max_squared_error": 2.00417160987854, "policy_error_vector_variance/metric": 0.08165788650512695, "policy_loss": -9.934107758624577e-09, "policy_loss/max": 2.560988187789917, "policy_loss/median": 0.0, "policy_loss/min": -0.7498500347137451, "policy_loss/p25": -0.5588920712471008, "policy_loss/p75": 0.0, "policy_loss/var": 0.6312869191169739, "policy_sharpness": 7.0428876876831055, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.8238282203674316, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.684300422668457, "reward": 0.4895833432674408, "reward/max": 1.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.25252193212509155, "rewards/accuracy_reward": 0.4895833432674408, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.25252193212509155, "sentence_full_gradient_variance/max_squared_error": 61512.7890625, "sentence_full_gradient_variance/metric": 2802.751953125, "sentence_full_gradient_variance/p75": 2004.522216796875, "sentence_full_gradient_variance/p90": 3803.3154296875, "sentence_full_gradient_variance/p95": 14829.052734375, "sentence_full_gradient_variance/p99": 32972.42578125, "state_level_variance/metric": 310.0429992675781, "state_level_variance_full_gradient/metric": 14.190422058105469, "step": 3 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1894737035036087, "action_level_variance/metric": 813.1802978515625, "action_level_variance_full_gradient/metric": 3275.77392578125, "adam_stats/lr_effective_max": 3.283174373791553e-05, "adam_stats/lr_effective_mean": 7.600232249105332e-11, "adam_stats/lr_effective_min": -3.281781391706318e-05, "adam_stats/m_t_max": 0.01064806804060936, "adam_stats/m_t_mean": 1.4451026486561602e-10, "adam_stats/m_t_min": -0.016833819448947906, "adam_stats/v_t_max": 3.688543802127242e-05, "adam_stats/v_t_mean": 1.852784635783533e-12, "adam_stats/v_t_min": 0.0, "advantages": 8.692344621863413e-09, "advantages/max": 3.7485008239746094, "advantages/median": 0.36585545539855957, "advantages/min": -3.7485008239746094, "advantages/p25": 0.12495002895593643, "advantages/p75": 0.36585545539855957, "advantages/var": 0.9467464089393616, "all_logprobs": -0.15560346841812134, "all_logprobs/max": 0.0, "all_logprobs/median": -3.218650817871094e-05, "all_logprobs/min": -9.75, "all_logprobs/p1": -2.59375, "all_logprobs/p10": -0.423828125, "all_logprobs/p25": -0.018798828125, "all_logprobs/p5": -0.9921875, "all_logprobs/p75": -9.5367431640625e-07, "all_logprobs/var": 0.2615965008735657, "clip_ratio": 0.0, "completion_length": 576.78125, "completion_length/correct": 485.625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 449.0, "completion_length/correct/min": 212.0, "completion_length/correct/p25": 354.25, "completion_length/correct/p75": 554.75, "completion_length/correct/var": 36771.0, "completion_length/incorrect": 850.25, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 70.0, "completion_length/incorrect/p25": 894.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 100028.71875, "completion_length/max": 1024.0, "completion_length/median": 473.0, "completion_length/min": 70.0, "completion_length/p25": 371.75, "completion_length/p75": 883.0, "completion_length/var": 76889.75, "epoch": 0.0032, "feature_vector_variance/max_squared_error": 89323.4765625, "feature_vector_variance/metric": 24763.669921875, "generated_tokens/total": 241539.0, "grad_norm": 3.164357900619507, "grouped_std_rewards": 0.3110433518886566, "learning_rate": 6e-06, "loss": 0.0, "mean_logprobs": -0.1591796875, "mean_logprobs/var": 0.00701904296875, "num_completions/total": 384, "per_sentence_gradient_norm": 14.955917358398438, "per_sentence_gradient_norm/max": 148.05709838867188, "per_sentence_gradient_norm/median": 6.213871955871582, "per_sentence_gradient_norm/min": 2.494331121444702, "per_sentence_gradient_norm/p25": 4.910482883453369, "per_sentence_gradient_norm/p75": 8.49228286743164, "per_sentence_gradient_norm/p85": 10.001150131225586, "per_sentence_gradient_norm/p90": 22.19702911376953, "per_sentence_gradient_norm/p95": 74.22400665283203, "per_sentence_gradient_norm/p99": 142.68955993652344, "per_sentence_gradient_norm/var": 802.6531982421875, "per_token_feature_norm": 163.85060119628906, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 124.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 2503.84521484375, "per_token_full_gradient_variance/max_squared_error": 29.363210678100586, "per_token_full_gradient_variance/variance": 0.03301377221941948, "per_token_gradient_norm": 13.637566566467285, "per_token_gradient_norm/max": 1343.603271484375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4125.677734375, "per_token_policy_error_norm": 0.08082542568445206, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0681987777352333, "policy_entropy": 0.17332801222801208, "policy_entropy/max": 3.765625, "policy_entropy/median": 0.000385284423828125, "policy_entropy/min": 4.907185768843192e-14, "policy_entropy/p25": 1.4871358871459961e-05, "policy_entropy/p75": 0.10009765625, "policy_entropy/var": 0.14074963331222534, "policy_error_vector_variance/max_squared_error": 2.01674747467041, "policy_error_vector_variance/metric": 0.08044783771038055, "policy_loss": -5.587935447692871e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": -0.36585545539855957, "policy_loss/min": -3.7485008239746094, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": -0.12495001405477524, "policy_loss/var": 0.9467465281486511, "policy_sharpness": 7.129134178161621, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.996063232421875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.170385360717773, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.1894737035036087, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1894737035036087, "sentence_full_gradient_variance/max_squared_error": 89071.6796875, "sentence_full_gradient_variance/metric": 3373.395263671875, "sentence_full_gradient_variance/p75": 601.68310546875, "sentence_full_gradient_variance/p90": 870.6654052734375, "sentence_full_gradient_variance/p95": 10043.6220703125, "sentence_full_gradient_variance/p99": 69433.8125, "state_level_variance/metric": 38.32288360595703, "state_level_variance_full_gradient/metric": 97.62139892578125, "step": 4 }, { "accuracy_reward": 0.65625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2279605269432068, "action_level_variance/metric": 448.19732666015625, "action_level_variance_full_gradient/metric": 1774.7210693359375, "adam_stats/lr_effective_max": 4.377448931336403e-05, "adam_stats/lr_effective_mean": 2.8796939832709256e-10, "adam_stats/lr_effective_min": -4.372098555904813e-05, "adam_stats/m_t_max": 0.007511101197451353, "adam_stats/m_t_mean": 1.6933402379945561e-12, "adam_stats/m_t_min": -0.0086620282381773, "adam_stats/v_t_max": 4.849510878557339e-05, "adam_stats/v_t_mean": 2.3606019482896867e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.2417634698280722e-09, "advantages/max": 0.7498500347137451, "advantages/median": 0.24990005791187286, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.46501490473747253, "advantages/var": 0.7890477180480957, "all_logprobs": -0.15248696506023407, "all_logprobs/max": 0.0, "all_logprobs/median": -2.1696090698242188e-05, "all_logprobs/min": -9.9375, "all_logprobs/p1": -2.625, "all_logprobs/p10": -0.39453125, "all_logprobs/p25": -0.015716552734375, "all_logprobs/p5": -0.97265625, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.26378458738327026, "clip_ratio": 0.0, "completion_length": 603.6458740234375, "completion_length/correct": 516.8095703125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 462.0, "completion_length/correct/min": 174.0, "completion_length/correct/p25": 400.0, "completion_length/correct/p75": 601.0, "completion_length/correct/var": 35985.25390625, "completion_length/incorrect": 769.4242553710938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 942.0, "completion_length/incorrect/min": 88.0, "completion_length/incorrect/p25": 591.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 105170.5, "completion_length/max": 1024.0, "completion_length/median": 538.0, "completion_length/min": 88.0, "completion_length/p25": 403.0, "completion_length/p75": 837.25, "completion_length/var": 73458.0859375, "epoch": 0.004, "feature_vector_variance/max_squared_error": 96478.4296875, "feature_vector_variance/metric": 24437.4765625, "generated_tokens/total": 299489.0, "grad_norm": 2.461315155029297, "grouped_std_rewards": 0.3288986086845398, "learning_rate": 7.5e-06, "loss": -0.0, "mean_logprobs": -0.16796875, "mean_logprobs/var": 0.0174560546875, "num_completions/total": 480, "per_sentence_gradient_norm": 15.082133293151855, "per_sentence_gradient_norm/max": 164.12115478515625, "per_sentence_gradient_norm/median": 7.157534122467041, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 3.028630256652832, "per_sentence_gradient_norm/p75": 16.378902435302734, "per_sentence_gradient_norm/p85": 29.927453994750977, "per_sentence_gradient_norm/p90": 42.5842399597168, "per_sentence_gradient_norm/p95": 55.178890228271484, "per_sentence_gradient_norm/p99": 92.38841247558594, "per_sentence_gradient_norm/var": 557.0254516601562, "per_token_feature_norm": 160.97073364257812, "per_token_feature_norm/max": 340.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 64.0, "per_token_feature_norm/p25": 123.5, "per_token_feature_norm/p75": 191.0, "per_token_feature_norm/var": 2297.514404296875, "per_token_full_gradient_variance/max_squared_error": 9.491678237915039, "per_token_full_gradient_variance/variance": 0.02223486267030239, "per_token_gradient_norm": 13.00401782989502, "per_token_gradient_norm/max": 1116.8189697265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2908.282958984375, "per_token_policy_error_norm": 0.07875539362430573, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06625501811504364, "policy_entropy": 0.16755154728889465, "policy_entropy/max": 3.75, "policy_entropy/median": 0.0002689361572265625, "policy_entropy/min": 5.684341886080802e-14, "policy_entropy/p25": 6.139278411865234e-06, "policy_entropy/p75": 0.0869140625, "policy_entropy/var": 0.13807779550552368, "policy_error_vector_variance/max_squared_error": 2.0162465572357178, "policy_error_vector_variance/metric": 0.07838809490203857, "policy_loss": -1.30385160446167e-08, "policy_loss/max": 3.7485008239746094, "policy_loss/median": -0.3658554255962372, "policy_loss/min": -0.7498500943183899, "policy_loss/p25": -0.46501490473747253, "policy_loss/p75": 0.0, "policy_loss/var": 0.7890477180480957, "policy_sharpness": 7.192956447601318, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.1836423873901367, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.873746871948242, "reward": 0.65625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2279605269432068, "rewards/accuracy_reward": 0.65625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2279605269432068, "sentence_full_gradient_variance/max_squared_error": 30824.51171875, "sentence_full_gradient_variance/metric": 1857.6854248046875, "sentence_full_gradient_variance/p75": 621.0525512695312, "sentence_full_gradient_variance/p90": 2393.96630859375, "sentence_full_gradient_variance/p95": 14780.52734375, "sentence_full_gradient_variance/p99": 25683.5859375, "state_level_variance/metric": 157.2456817626953, "state_level_variance_full_gradient/metric": 82.9642333984375, "step": 5 }, { "accuracy_reward": 0.6979166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21304824948310852, "action_level_variance/metric": 232.21041870117188, "action_level_variance_full_gradient/metric": 2798.943115234375, "adam_stats/lr_effective_max": 5.498558311956003e-05, "adam_stats/lr_effective_mean": 3.3163108370537486e-10, "adam_stats/lr_effective_min": -5.491543561220169e-05, "adam_stats/m_t_max": 0.006299750413745642, "adam_stats/m_t_mean": 8.894118623016534e-13, "adam_stats/m_t_min": -0.00795451644808054, "adam_stats/v_t_max": 4.925557368551381e-05, "adam_stats/v_t_mean": 2.5543057252663814e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.1796752907855534e-08, "advantages/max": 0.6526548862457275, "advantages/median": 0.24990005791187286, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.46501490473747253, "advantages/var": 0.6312379240989685, "all_logprobs": -0.13446396589279175, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-05, "all_logprobs/min": -9.75, "all_logprobs/p1": -2.3756256103515625, "all_logprobs/p10": -0.333984375, "all_logprobs/p25": -0.0086669921875, "all_logprobs/p5": -0.86328125, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.2159535139799118, "clip_ratio": 0.0, "completion_length": 563.5104370117188, "completion_length/correct": 439.0149230957031, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 365.0, "completion_length/correct/min": 123.0, "completion_length/correct/p25": 290.0, "completion_length/correct/p75": 496.5, "completion_length/correct/var": 48772.83203125, "completion_length/incorrect": 851.137939453125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 131.0, "completion_length/incorrect/p25": 846.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 92979.5546875, "completion_length/max": 1024.0, "completion_length/median": 427.0, "completion_length/min": 123.0, "completion_length/p25": 308.75, "completion_length/p75": 898.5, "completion_length/var": 97474.0390625, "epoch": 0.0048, "feature_vector_variance/max_squared_error": 87195.453125, "feature_vector_variance/metric": 23319.310546875, "generated_tokens/total": 353586.0, "grad_norm": 0.7121232748031616, "grouped_std_rewards": 0.26317334175109863, "learning_rate": 9e-06, "loss": -0.0, "mean_logprobs": -0.154296875, "mean_logprobs/var": 0.0062255859375, "num_completions/total": 576, "per_sentence_gradient_norm": 10.618456840515137, "per_sentence_gradient_norm/max": 103.50605010986328, "per_sentence_gradient_norm/median": 6.674565315246582, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 11.506678581237793, "per_sentence_gradient_norm/p85": 17.511587142944336, "per_sentence_gradient_norm/p90": 25.357872009277344, "per_sentence_gradient_norm/p95": 40.96968078613281, "per_sentence_gradient_norm/p99": 83.12347412109375, "per_sentence_gradient_norm/var": 295.8896179199219, "per_token_feature_norm": 156.0871124267578, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 145.0, "per_token_feature_norm/min": 63.25, "per_token_feature_norm/p25": 120.0, "per_token_feature_norm/p75": 185.0, "per_token_feature_norm/var": 2157.205078125, "per_token_full_gradient_variance/max_squared_error": 7.806212902069092, "per_token_full_gradient_variance/variance": 0.01447908952832222, "per_token_gradient_norm": 8.919943809509277, "per_token_gradient_norm/max": 1430.5216064453125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1813.464599609375, "per_token_policy_error_norm": 0.07118138670921326, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06016341224312782, "policy_entropy": 0.15006570518016815, "policy_entropy/max": 3.65625, "policy_entropy/median": 0.0001583099365234375, "policy_entropy/min": 1.3528733688872308e-11, "policy_entropy/p25": 6.973743438720703e-06, "policy_entropy/p75": 0.053955078125, "policy_entropy/var": 0.11853045970201492, "policy_error_vector_variance/max_squared_error": 2.013052225112915, "policy_error_vector_variance/metric": 0.07096028327941895, "policy_loss": -1.6763806343078613e-08, "policy_loss/max": 3.7485008239746094, "policy_loss/median": -0.24990005791187286, "policy_loss/min": -0.6526549458503723, "policy_loss/p25": -0.46501490473747253, "policy_loss/p75": 0.0, "policy_loss/var": 0.6312379240989685, "policy_sharpness": 7.4353156089782715, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.873046875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.056812286376953, "reward": 0.6979166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21304824948310852, "rewards/accuracy_reward": 0.6979166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21304824948310852, "sentence_full_gradient_variance/max_squared_error": 101223.0703125, "sentence_full_gradient_variance/metric": 2848.77294921875, "sentence_full_gradient_variance/p75": 1311.20068359375, "sentence_full_gradient_variance/p90": 1926.9703369140625, "sentence_full_gradient_variance/p95": 12812.236328125, "sentence_full_gradient_variance/p99": 47717.9453125, "state_level_variance/metric": 90.13217163085938, "state_level_variance_full_gradient/metric": 49.82998275756836, "step": 6 }, { "accuracy_reward": 0.7395833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19462722539901733, "action_level_variance/metric": 81.97062683105469, "action_level_variance_full_gradient/metric": 190.1073455810547, "adam_stats/lr_effective_max": 6.615719757974148e-05, "adam_stats/lr_effective_mean": 5.610070186179428e-10, "adam_stats/lr_effective_min": -6.611825665459037e-05, "adam_stats/m_t_max": 0.006811133120208979, "adam_stats/m_t_mean": -1.4835681294567138e-11, "adam_stats/m_t_min": -0.007629035506397486, "adam_stats/v_t_max": 4.984480983694084e-05, "adam_stats/v_t_mean": 2.6060965451629725e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 2.560988187789917, "advantages/median": 0.36585545539855957, "advantages/min": -2.560988187789917, "advantages/p25": -0.36585545539855957, "advantages/p75": 0.46501490473747253, "advantages/var": 0.7890477180480957, "all_logprobs": -0.12592962384223938, "all_logprobs/max": 0.0, "all_logprobs/median": -5.125999450683594e-06, "all_logprobs/min": -9.375, "all_logprobs/p1": -2.234375, "all_logprobs/p10": -0.3125, "all_logprobs/p25": -0.005645751953125, "all_logprobs/p5": -0.82421875, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.1961560696363449, "clip_ratio": 0.0, "completion_length": 597.3958740234375, "completion_length/correct": 457.6760559082031, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 333.0, "completion_length/correct/min": 150.0, "completion_length/correct/p25": 230.5, "completion_length/correct/p75": 718.0, "completion_length/correct/var": 76123.875, "completion_length/incorrect": 994.199951171875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 631.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 10673.0, "completion_length/max": 1024.0, "completion_length/median": 558.0, "completion_length/min": 150.0, "completion_length/p25": 248.0, "completion_length/p75": 1024.0, "completion_length/var": 114812.5859375, "epoch": 0.0056, "feature_vector_variance/max_squared_error": 94241.0546875, "feature_vector_variance/metric": 24188.65234375, "generated_tokens/total": 410936.0, "grad_norm": 0.38530030846595764, "grouped_std_rewards": 0.3125036060810089, "learning_rate": 1.05e-05, "loss": -0.0, "mean_logprobs": -0.12890625, "mean_logprobs/var": 0.0029754638671875, "num_completions/total": 672, "per_sentence_gradient_norm": 9.510358810424805, "per_sentence_gradient_norm/max": 45.22455596923828, "per_sentence_gradient_norm/median": 5.181307315826416, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 3.147808074951172, "per_sentence_gradient_norm/p75": 13.6947660446167, "per_sentence_gradient_norm/p85": 19.491003036499023, "per_sentence_gradient_norm/p90": 25.15069580078125, "per_sentence_gradient_norm/p95": 36.31909942626953, "per_sentence_gradient_norm/p99": 43.4914436340332, "per_sentence_gradient_norm/var": 118.70967102050781, "per_token_feature_norm": 156.92787170410156, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 148.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 121.5, "per_token_feature_norm/p75": 186.0, "per_token_feature_norm/var": 2039.895751953125, "per_token_full_gradient_variance/max_squared_error": 22.805234909057617, "per_token_full_gradient_variance/variance": 0.02144252508878708, "per_token_gradient_norm": 12.506327629089355, "per_token_gradient_norm/max": 864.6536254882812, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2592.107666015625, "per_token_policy_error_norm": 0.06776315718889236, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.057132747024297714, "policy_entropy": 0.14043959975242615, "policy_entropy/max": 3.71875, "policy_entropy/median": 7.104873657226562e-05, "policy_entropy/min": 1.0613732115416497e-13, "policy_entropy/p25": 3.516674041748047e-06, "policy_entropy/p75": 0.03662109375, "policy_entropy/var": 0.10660138726234436, "policy_error_vector_variance/max_squared_error": 2.01414155960083, "policy_error_vector_variance/metric": 0.0676216334104538, "policy_loss": -1.2417634920325327e-08, "policy_loss/max": 2.560988187789917, "policy_loss/median": -0.36585545539855957, "policy_loss/min": -2.560988187789917, "policy_loss/p25": -0.46501487493515015, "policy_loss/p75": 0.36585545539855957, "policy_loss/var": 0.7890476584434509, "policy_sharpness": 7.621069431304932, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.185546398162842, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.646533966064453, "reward": 0.7395833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19462722539901733, "rewards/accuracy_reward": 0.7395833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19462722539901733, "sentence_full_gradient_variance/max_squared_error": 7234.51416015625, "sentence_full_gradient_variance/metric": 346.94622802734375, "sentence_full_gradient_variance/p75": 431.01483154296875, "sentence_full_gradient_variance/p90": 506.30255126953125, "sentence_full_gradient_variance/p95": 572.4138793945312, "sentence_full_gradient_variance/p99": 1578.2120361328125, "state_level_variance/metric": 48.75079345703125, "state_level_variance_full_gradient/metric": 156.83883666992188, "step": 7 }, { "accuracy_reward": 0.9583333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.04035087302327156, "action_level_variance/metric": 123.9972152709961, "action_level_variance_full_gradient/metric": 1484.6634521484375, "adam_stats/lr_effective_max": 7.68834215705283e-05, "adam_stats/lr_effective_mean": 8.668984419202275e-10, "adam_stats/lr_effective_min": -7.63759744586423e-05, "adam_stats/m_t_max": 0.008958677761256695, "adam_stats/m_t_mean": 1.1839443314620723e-10, "adam_stats/m_t_min": -0.015627510845661163, "adam_stats/v_t_max": 6.653110904153436e-05, "adam_stats/v_t_mean": 3.114547682259161e-12, "adam_stats/v_t_min": 0.0, "advantages": 2.4835269396561444e-09, "advantages/max": 0.36585545539855957, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.47333940863609314, "all_logprobs": -0.10834338515996933, "all_logprobs/max": 0.0, "all_logprobs/median": -2.6226043701171875e-06, "all_logprobs/min": -8.4375, "all_logprobs/p1": -2.140625, "all_logprobs/p10": -0.2265625, "all_logprobs/p25": -0.00150299072265625, "all_logprobs/p5": -0.69140625, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.16716250777244568, "clip_ratio": 0.0, "completion_length": 385.5520935058594, "completion_length/correct": 371.09783935546875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 320.0, "completion_length/correct/min": 159.0, "completion_length/correct/p25": 272.75, "completion_length/correct/p75": 424.75, "completion_length/correct/var": 24091.654296875, "completion_length/incorrect": 718.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 533.0, "completion_length/incorrect/min": 291.0, "completion_length/incorrect/p25": 472.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 134608.671875, "completion_length/max": 1024.0, "completion_length/median": 325.0, "completion_length/min": 159.0, "completion_length/p25": 273.75, "completion_length/p75": 433.5, "completion_length/var": 32183.93359375, "epoch": 0.0064, "feature_vector_variance/max_squared_error": 93041.125, "feature_vector_variance/metric": 24968.53515625, "generated_tokens/total": 447949.0, "grad_norm": 1.6229597330093384, "grouped_std_rewards": 0.14026084542274475, "learning_rate": 1.2e-05, "loss": -0.0, "mean_logprobs": -0.11572265625, "mean_logprobs/var": 0.002685546875, "num_completions/total": 768, "per_sentence_gradient_norm": 4.3854193687438965, "per_sentence_gradient_norm/max": 79.10680389404297, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 4.361687183380127, "per_sentence_gradient_norm/p85": 5.0273051261901855, "per_sentence_gradient_norm/p90": 5.906221389770508, "per_sentence_gradient_norm/p95": 8.507341384887695, "per_sentence_gradient_norm/p99": 68.74177551269531, "per_sentence_gradient_norm/var": 137.3899688720703, "per_token_feature_norm": 158.6668243408203, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 64.0, "per_token_feature_norm/p25": 124.0, "per_token_feature_norm/p75": 188.0, "per_token_feature_norm/var": 1845.1998291015625, "per_token_full_gradient_variance/max_squared_error": 14.871668815612793, "per_token_full_gradient_variance/variance": 0.016173847019672394, "per_token_gradient_norm": 5.938019752502441, "per_token_gradient_norm/max": 1344.3060302734375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2138.898681640625, "per_token_policy_error_norm": 0.05837217718362808, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04986048862338066, "policy_entropy": 0.12036816030740738, "policy_entropy/max": 3.75, "policy_entropy/median": 3.790855407714844e-05, "policy_entropy/min": 2.7569058147491887e-12, "policy_entropy/p25": 3.293156623840332e-06, "policy_entropy/p75": 0.01141357421875, "policy_entropy/var": 0.08984915912151337, "policy_error_vector_variance/max_squared_error": 2.00469708442688, "policy_error_vector_variance/metric": 0.05825141444802284, "policy_loss": -4.967053879312289e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -0.36585548520088196, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.0, "policy_loss/var": 0.47333940863609314, "policy_sharpness": 7.9376912117004395, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.74609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.533782005310059, "reward": 0.9583333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.04035087302327156, "rewards/accuracy_reward": 0.9583333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.04035087302327156, "sentence_full_gradient_variance/max_squared_error": 111525.546875, "sentence_full_gradient_variance/metric": 1559.7830810546875, "sentence_full_gradient_variance/p75": 286.57598876953125, "sentence_full_gradient_variance/p90": 483.04144287109375, "sentence_full_gradient_variance/p95": 636.99609375, "sentence_full_gradient_variance/p99": 28447.3359375, "state_level_variance/metric": 23.653715133666992, "state_level_variance_full_gradient/metric": 75.11981201171875, "step": 8 }, { "accuracy_reward": 0.8229166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14725878834724426, "action_level_variance/metric": 118.19198608398438, "action_level_variance_full_gradient/metric": 81.57909393310547, "adam_stats/lr_effective_max": 8.87876667547971e-05, "adam_stats/lr_effective_mean": 9.32827259880753e-10, "adam_stats/lr_effective_min": -8.719105971977115e-05, "adam_stats/m_t_max": 0.007928532548248768, "adam_stats/m_t_mean": 1.0350947876602845e-10, "adam_stats/m_t_min": -0.014418763108551502, "adam_stats/v_t_max": 6.647017289651558e-05, "adam_stats/v_t_mean": 3.1197919681674735e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.2417634698280722e-09, "advantages/max": 3.7485008239746094, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.4733055531978607, "all_logprobs": -0.13315045833587646, "all_logprobs/max": 0.0, "all_logprobs/median": -5.7220458984375e-06, "all_logprobs/min": -7.90625, "all_logprobs/p1": -2.359375, "all_logprobs/p10": -0.34765625, "all_logprobs/p25": -0.00732421875, "all_logprobs/p5": -0.8548822402954102, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.20760898292064667, "clip_ratio": 0.0, "completion_length": 466.0833435058594, "completion_length/correct": 349.4936828613281, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 338.0, "completion_length/correct/min": 148.0, "completion_length/correct/p25": 253.5, "completion_length/correct/p75": 402.5, "completion_length/correct/var": 25840.45703125, "completion_length/incorrect": 1007.8823852539062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 888.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 1477.110595703125, "completion_length/max": 1024.0, "completion_length/median": 373.0, "completion_length/min": 148.0, "completion_length/p25": 288.5, "completion_length/p75": 483.0, "completion_length/var": 85298.25, "epoch": 0.0072, "feature_vector_variance/max_squared_error": 91624.4140625, "feature_vector_variance/metric": 24611.798828125, "generated_tokens/total": 492693.0, "grad_norm": 0.19959266483783722, "grouped_std_rewards": 0.125, "learning_rate": 1.3500000000000001e-05, "loss": 0.0, "mean_logprobs": -0.1533203125, "mean_logprobs/var": 0.006591796875, "num_completions/total": 864, "per_sentence_gradient_norm": 3.726144790649414, "per_sentence_gradient_norm/max": 84.93630981445312, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 4.022140026092529, "per_sentence_gradient_norm/p85": 4.602584362030029, "per_sentence_gradient_norm/p90": 5.058947563171387, "per_sentence_gradient_norm/p95": 6.926912784576416, "per_sentence_gradient_norm/p99": 68.96110534667969, "per_sentence_gradient_norm/var": 129.04541015625, "per_token_feature_norm": 158.24337768554688, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 62.5, "per_token_feature_norm/p25": 122.5, "per_token_feature_norm/p75": 188.0, "per_token_feature_norm/var": 2032.0418701171875, "per_token_full_gradient_variance/max_squared_error": 58.254783630371094, "per_token_full_gradient_variance/variance": 0.01862727291882038, "per_token_gradient_norm": 5.810554504394531, "per_token_gradient_norm/max": 1286.7900390625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2209.2763671875, "per_token_policy_error_norm": 0.07136156409978867, "per_token_policy_error_norm/max": 1.9921875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06031043455004692, "policy_entropy": 0.14659570157527924, "policy_entropy/max": 3.640625, "policy_entropy/median": 7.82012939453125e-05, "policy_entropy/min": 1.2434497875801753e-13, "policy_entropy/p25": 3.2782554626464844e-06, "policy_entropy/p75": 0.044921875, "policy_entropy/var": 0.1112082377076149, "policy_error_vector_variance/max_squared_error": 2.005659818649292, "policy_error_vector_variance/metric": 0.0712573230266571, "policy_loss": 1.2417634698280722e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -3.7485008239746094, "policy_loss/p25": -0.24990004301071167, "policy_loss/p75": 0.0, "policy_loss/var": 0.4733055531978607, "policy_sharpness": 7.55373477935791, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.011474609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.806401252746582, "reward": 0.8229166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14725878834724426, "rewards/accuracy_reward": 0.8229166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14725878834724426, "sentence_full_gradient_variance/max_squared_error": 5893.88427734375, "sentence_full_gradient_variance/metric": 151.76882934570312, "sentence_full_gradient_variance/p75": 127.48352813720703, "sentence_full_gradient_variance/p90": 213.06600952148438, "sentence_full_gradient_variance/p95": 311.10540771484375, "sentence_full_gradient_variance/p99": 614.582763671875, "state_level_variance/metric": 20.275421142578125, "state_level_variance_full_gradient/metric": 70.18977355957031, "step": 9 }, { "accuracy_reward": 0.8333333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14035087823867798, "action_level_variance/metric": 0.0, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 8.88319918885827e-05, "adam_stats/lr_effective_mean": 9.33206067976755e-10, "adam_stats/lr_effective_min": -8.723428618395701e-05, "adam_stats/m_t_max": 0.007135679014027119, "adam_stats/m_t_mean": 9.315855031832854e-11, "adam_stats/m_t_min": -0.012976886704564095, "adam_stats/v_t_max": 6.640370702371001e-05, "adam_stats/v_t_mean": 3.1166720679959292e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.1302037239074707, "all_logprobs/max": 0.0, "all_logprobs/median": -4.0531158447265625e-06, "all_logprobs/min": -14.25, "all_logprobs/p1": -2.234375, "all_logprobs/p10": -0.341796875, "all_logprobs/p25": -0.00860595703125, "all_logprobs/p5": -0.83203125, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.19880987703800201, "clip_ratio": 0.0, "completion_length": 557.46875, "completion_length/correct": 469.9125061035156, "completion_length/correct/max": 789.0, "completion_length/correct/median": 494.0, "completion_length/correct/min": 154.0, "completion_length/correct/p25": 379.75, "completion_length/correct/p75": 567.75, "completion_length/correct/var": 25588.7890625, "completion_length/incorrect": 995.25, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 752.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 6406.86669921875, "completion_length/max": 1024.0, "completion_length/median": 520.0, "completion_length/min": 154.0, "completion_length/p25": 400.0, "completion_length/p75": 675.25, "completion_length/var": 61024.66796875, "epoch": 0.008, "feature_vector_variance/max_squared_error": 97244.984375, "feature_vector_variance/metric": 23121.994140625, "generated_tokens/total": 546210.0, "grad_norm": 0.0, "grouped_std_rewards": 0.0, "learning_rate": 1.5e-05, "loss": 0.0, "mean_logprobs": -0.126953125, "mean_logprobs/var": 0.002105712890625, "num_completions/total": 960, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 154.02784729003906, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 145.0, "per_token_feature_norm/min": 64.0, "per_token_feature_norm/p25": 120.5, "per_token_feature_norm/p75": 181.0, "per_token_feature_norm/var": 1917.5379638671875, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.07091032713651657, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05951835215091705, "policy_entropy": 0.14375855028629303, "policy_entropy/max": 3.03125, "policy_entropy/median": 5.793571472167969e-05, "policy_entropy/min": 1.6342482922482304e-12, "policy_entropy/p25": 2.60770320892334e-06, "policy_entropy/p75": 0.051513671875, "policy_entropy/var": 0.10140535980463028, "policy_error_vector_variance/max_squared_error": 2.0024609565734863, "policy_error_vector_variance/metric": 0.07084111124277115, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 7.547513961791992, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.986144781112671, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.7349214553833, "reward": 0.8333333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14035087823867798, "rewards/accuracy_reward": 0.8333333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14035087823867798, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 10 }, { "accuracy_reward": 0.6458333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2311403751373291, "action_level_variance/metric": 150.46885681152344, "action_level_variance_full_gradient/metric": 340.1212463378906, "adam_stats/lr_effective_max": 9.069589577848092e-05, "adam_stats/lr_effective_mean": 7.04220515235221e-10, "adam_stats/lr_effective_min": -9.050891821971163e-05, "adam_stats/m_t_max": 0.004276332911103964, "adam_stats/m_t_mean": 4.2558904317369084e-11, "adam_stats/m_t_min": -0.01006786897778511, "adam_stats/v_t_max": 6.747239967808127e-05, "adam_stats/v_t_mean": 3.16784857894159e-12, "adam_stats/v_t_min": 0.0, "advantages": -7.450580596923828e-09, "advantages/max": 3.7485008239746094, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": -0.24990005791187286, "advantages/p75": 0.24990005791187286, "advantages/var": 0.7889722585678101, "all_logprobs": -0.1366558074951172, "all_logprobs/max": 0.0, "all_logprobs/median": -6.198883056640625e-06, "all_logprobs/min": -10.25, "all_logprobs/p1": -2.359375, "all_logprobs/p10": -0.361328125, "all_logprobs/p25": -0.010009765625, "all_logprobs/p5": -0.890625, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.21384307742118835, "clip_ratio": 0.0, "completion_length": 714.09375, "completion_length/correct": 558.0967407226562, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 572.0, "completion_length/correct/min": 227.0, "completion_length/correct/p25": 393.0, "completion_length/correct/p75": 696.75, "completion_length/correct/var": 45475.7578125, "completion_length/incorrect": 998.558837890625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 639.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 5328.80029296875, "completion_length/max": 1024.0, "completion_length/median": 701.0, "completion_length/min": 227.0, "completion_length/p25": 504.0, "completion_length/p75": 1024.0, "completion_length/var": 75894.09375, "epoch": 0.0088, "feature_vector_variance/max_squared_error": 98008.8359375, "feature_vector_variance/metric": 24670.12890625, "generated_tokens/total": 614763.0, "grad_norm": 0.4115792512893677, "grouped_std_rewards": 0.29645755887031555, "learning_rate": 1.4995431202643219e-05, "loss": 0.0, "mean_logprobs": -0.1396484375, "mean_logprobs/var": 0.003570556640625, "num_completions/total": 1056, "per_sentence_gradient_norm": 11.180437088012695, "per_sentence_gradient_norm/max": 83.10139465332031, "per_sentence_gradient_norm/median": 5.028938293457031, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 2.267458438873291, "per_sentence_gradient_norm/p75": 18.66493034362793, "per_sentence_gradient_norm/p85": 20.669527053833008, "per_sentence_gradient_norm/p90": 25.82408905029297, "per_sentence_gradient_norm/p95": 34.88328552246094, "per_sentence_gradient_norm/p99": 72.1296157836914, "per_sentence_gradient_norm/var": 209.87107849121094, "per_token_feature_norm": 158.688720703125, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 63.0, "per_token_feature_norm/p25": 124.0, "per_token_feature_norm/p75": 188.0, "per_token_feature_norm/var": 2018.8990478515625, "per_token_full_gradient_variance/max_squared_error": 20.891422271728516, "per_token_full_gradient_variance/variance": 0.02133459970355034, "per_token_gradient_norm": 12.748570442199707, "per_token_gradient_norm/max": 1424.7232666015625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2750.39697265625, "per_token_policy_error_norm": 0.07318998128175735, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06133966147899628, "policy_entropy": 0.1514711081981659, "policy_entropy/max": 3.109375, "policy_entropy/median": 8.392333984375e-05, "policy_entropy/min": 4.746425474877469e-12, "policy_entropy/p25": 3.084540367126465e-06, "policy_entropy/p75": 0.06103515625, "policy_entropy/var": 0.11336775124073029, "policy_error_vector_variance/max_squared_error": 2.010373115539551, "policy_error_vector_variance/metric": 0.07314302027225494, "policy_loss": 6.208817460162663e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -3.7485008239746094, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.24990007281303406, "policy_loss/var": 0.7889722585678101, "policy_sharpness": 7.456811904907227, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.810546636581421, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.038201332092285, "reward": 0.6458333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2311403751373291, "rewards/accuracy_reward": 0.6458333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2311403751373291, "sentence_full_gradient_variance/max_squared_error": 5725.65771484375, "sentence_full_gradient_variance/metric": 401.05511474609375, "sentence_full_gradient_variance/p75": 77.73699951171875, "sentence_full_gradient_variance/p90": 1582.7830810546875, "sentence_full_gradient_variance/p95": 2387.22119140625, "sentence_full_gradient_variance/p99": 5230.43603515625, "state_level_variance/metric": 79.94446563720703, "state_level_variance_full_gradient/metric": 60.933868408203125, "step": 11 }, { "accuracy_reward": 0.6770833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.22094298899173737, "action_level_variance/metric": 182.52850341796875, "action_level_variance_full_gradient/metric": 1518.4134521484375, "adam_stats/lr_effective_max": 9.27029104786925e-05, "adam_stats/lr_effective_mean": 6.22136397954165e-10, "adam_stats/lr_effective_min": -9.288093860959634e-05, "adam_stats/m_t_max": 0.005773433018475771, "adam_stats/m_t_mean": -2.555731841435982e-11, "adam_stats/m_t_min": -0.00696147233247757, "adam_stats/v_t_max": 6.807463796576485e-05, "adam_stats/v_t_mean": 3.520848109589414e-12, "adam_stats/v_t_min": 0.0, "advantages": -1.2417634698280722e-09, "advantages/max": 2.0150647163391113, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": -0.46501490473747253, "advantages/p75": 0.36585545539855957, "advantages/var": 0.6312763690948486, "all_logprobs": -0.13699489831924438, "all_logprobs/max": 0.0, "all_logprobs/median": -6.9141387939453125e-06, "all_logprobs/min": -12.0, "all_logprobs/p1": -2.359375, "all_logprobs/p10": -0.357421875, "all_logprobs/p25": -0.0094451904296875, "all_logprobs/p5": -0.8984375, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.2172524780035019, "clip_ratio": 0.0, "completion_length": 545.3541870117188, "completion_length/correct": 439.984619140625, "completion_length/correct/max": 974.0, "completion_length/correct/median": 390.0, "completion_length/correct/min": 270.0, "completion_length/correct/p25": 346.0, "completion_length/correct/p75": 463.0, "completion_length/correct/var": 24607.205078125, "completion_length/incorrect": 766.290283203125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 771.0, "completion_length/incorrect/min": 217.0, "completion_length/incorrect/p25": 533.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 71882.2734375, "completion_length/max": 1024.0, "completion_length/median": 441.0, "completion_length/min": 217.0, "completion_length/p25": 365.75, "completion_length/p75": 682.0, "completion_length/var": 62802.1484375, "epoch": 0.0096, "feature_vector_variance/max_squared_error": 95671.3125, "feature_vector_variance/metric": 25067.58203125, "generated_tokens/total": 667117.0, "grad_norm": 0.8804919719696045, "grouped_std_rewards": 0.2836841940879822, "learning_rate": 1.4981730376948682e-05, "loss": 0.0, "mean_logprobs": -0.1328125, "mean_logprobs/var": 0.0038909912109375, "num_completions/total": 1152, "per_sentence_gradient_norm": 11.51404857635498, "per_sentence_gradient_norm/max": 91.35311126708984, "per_sentence_gradient_norm/median": 6.820333480834961, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 13.901996612548828, "per_sentence_gradient_norm/p85": 25.169025421142578, "per_sentence_gradient_norm/p90": 28.795175552368164, "per_sentence_gradient_norm/p95": 42.10042953491211, "per_sentence_gradient_norm/p99": 77.49221801757812, "per_sentence_gradient_norm/var": 264.8675842285156, "per_token_feature_norm": 160.19947814941406, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 125.5, "per_token_feature_norm/p75": 188.0, "per_token_feature_norm/var": 2042.706298828125, "per_token_full_gradient_variance/max_squared_error": 5.2558088302612305, "per_token_full_gradient_variance/variance": 0.016616741195321083, "per_token_gradient_norm": 12.84264850616455, "per_token_gradient_norm/max": 969.8142700195312, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2310.343505859375, "per_token_policy_error_norm": 0.0725119560956955, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06014318764209747, "policy_entropy": 0.15452194213867188, "policy_entropy/max": 3.078125, "policy_entropy/median": 9.202957153320312e-05, "policy_entropy/min": 1.1823875212257917e-14, "policy_entropy/p25": 4.023313522338867e-06, "policy_entropy/p75": 0.056640625, "policy_entropy/var": 0.12042012065649033, "policy_error_vector_variance/max_squared_error": 2.0131940841674805, "policy_error_vector_variance/metric": 0.07241977006196976, "policy_loss": 0.0, "policy_loss/max": 2.560988187789917, "policy_loss/median": 0.0, "policy_loss/min": -2.0150644779205322, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": 0.46501490473747253, "policy_loss/var": 0.6312763094902039, "policy_sharpness": 7.469605922698975, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.8125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.27285099029541, "reward": 0.6770833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.22094298899173737, "rewards/accuracy_reward": 0.6770833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.22094298899173737, "sentence_full_gradient_variance/max_squared_error": 31843.224609375, "sentence_full_gradient_variance/metric": 1574.3089599609375, "sentence_full_gradient_variance/p75": 1429.8643798828125, "sentence_full_gradient_variance/p90": 3083.259765625, "sentence_full_gradient_variance/p95": 7784.7314453125, "sentence_full_gradient_variance/p99": 21716.908203125, "state_level_variance/metric": 109.18570709228516, "state_level_variance_full_gradient/metric": 55.8956298828125, "step": 12 }, { "accuracy_reward": 0.78125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17269736528396606, "action_level_variance/metric": 77.98149108886719, "action_level_variance_full_gradient/metric": 627.46630859375, "adam_stats/lr_effective_max": 9.130014223046601e-05, "adam_stats/lr_effective_mean": 7.091995879449087e-10, "adam_stats/lr_effective_min": -9.15484270080924e-05, "adam_stats/m_t_max": 0.005294942297041416, "adam_stats/m_t_mean": -2.1700951799430435e-11, "adam_stats/m_t_min": -0.0055906581692397594, "adam_stats/v_t_max": 6.813131767557934e-05, "adam_stats/v_t_mean": 3.533553658008337e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 1.4358407258987427, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": 0.0, "advantages/p75": 0.36585545539855957, "advantages/var": 0.4734647274017334, "all_logprobs": -0.11848224699497223, "all_logprobs/max": 0.0, "all_logprobs/median": -2.1457672119140625e-06, "all_logprobs/min": -9.0625, "all_logprobs/p1": -2.140625, "all_logprobs/p10": -0.28125, "all_logprobs/p25": -0.003631591796875, "all_logprobs/p5": -0.7578125, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.18138113617897034, "clip_ratio": 0.0, "completion_length": 504.04168701171875, "completion_length/correct": 455.413330078125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 417.0, "completion_length/correct/min": 192.0, "completion_length/correct/p25": 284.5, "completion_length/correct/p75": 619.5, "completion_length/correct/var": 47278.89453125, "completion_length/incorrect": 677.7142944335938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 625.0, "completion_length/incorrect/min": 256.0, "completion_length/incorrect/p25": 378.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 102474.2109375, "completion_length/max": 1024.0, "completion_length/median": 419.0, "completion_length/min": 192.0, "completion_length/p25": 317.5, "completion_length/p75": 652.0, "completion_length/var": 66935.6015625, "epoch": 0.0104, "feature_vector_variance/max_squared_error": 92955.78125, "feature_vector_variance/metric": 24465.013671875, "generated_tokens/total": 715505.0, "grad_norm": 0.28197237849235535, "grouped_std_rewards": 0.22277939319610596, "learning_rate": 1.495891421526205e-05, "loss": -0.0, "mean_logprobs": -0.12451171875, "mean_logprobs/var": 0.0020294189453125, "num_completions/total": 1248, "per_sentence_gradient_norm": 7.504874229431152, "per_sentence_gradient_norm/max": 79.55028533935547, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 11.278911590576172, "per_sentence_gradient_norm/p85": 15.509666442871094, "per_sentence_gradient_norm/p90": 19.16253089904785, "per_sentence_gradient_norm/p95": 24.860355377197266, "per_sentence_gradient_norm/p99": 51.23508071899414, "per_sentence_gradient_norm/var": 137.74607849121094, "per_token_feature_norm": 158.1454315185547, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 66.5, "per_token_feature_norm/p25": 126.0, "per_token_feature_norm/p75": 185.0, "per_token_feature_norm/var": 1717.5924072265625, "per_token_full_gradient_variance/max_squared_error": 7.284719944000244, "per_token_full_gradient_variance/variance": 0.013112150132656097, "per_token_gradient_norm": 8.970497131347656, "per_token_gradient_norm/max": 942.3635864257812, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1820.5157470703125, "per_token_policy_error_norm": 0.06422401964664459, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.054343391209840775, "policy_entropy": 0.1301928609609604, "policy_entropy/max": 3.796875, "policy_entropy/median": 3.218650817871094e-05, "policy_entropy/min": 2.0747847884194925e-12, "policy_entropy/p25": 1.9669532775878906e-06, "policy_entropy/p75": 0.025634765625, "policy_entropy/var": 0.09540846198797226, "policy_error_vector_variance/max_squared_error": 2.011937379837036, "policy_error_vector_variance/metric": 0.06416834890842438, "policy_loss": -4.967053879312289e-09, "policy_loss/max": 2.560988187789917, "policy_loss/median": 0.0, "policy_loss/min": -1.4358408451080322, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": 0.0, "policy_loss/var": 0.47346481680870056, "policy_sharpness": 7.775909423828125, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.463531494140625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.008116722106934, "reward": 0.78125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17269736528396606, "rewards/accuracy_reward": 0.78125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17269736528396606, "sentence_full_gradient_variance/max_squared_error": 11227.896484375, "sentence_full_gradient_variance/metric": 716.57421875, "sentence_full_gradient_variance/p75": 590.3955078125, "sentence_full_gradient_variance/p90": 2487.78076171875, "sentence_full_gradient_variance/p95": 3586.857421875, "sentence_full_gradient_variance/p99": 5483.79345703125, "state_level_variance/metric": 75.84428405761719, "state_level_variance_full_gradient/metric": 89.10800170898438, "step": 13 }, { "accuracy_reward": 0.6458333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2311403751373291, "action_level_variance/metric": 232.11373901367188, "action_level_variance_full_gradient/metric": 1731.568603515625, "adam_stats/lr_effective_max": 9.343584679299966e-05, "adam_stats/lr_effective_mean": 5.553257298451797e-10, "adam_stats/lr_effective_min": -9.233685705112293e-05, "adam_stats/m_t_max": 0.004463849123567343, "adam_stats/m_t_mean": -2.9704908238031846e-12, "adam_stats/m_t_min": -0.004754621535539627, "adam_stats/v_t_max": 6.846886390121654e-05, "adam_stats/v_t_mean": 3.5476782102306093e-12, "adam_stats/v_t_min": 0.0, "advantages": 2.4835269396561444e-09, "advantages/max": 3.7485008239746094, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.631074070930481, "all_logprobs": -0.17739708721637726, "all_logprobs/max": 0.0, "all_logprobs/median": -2.3365020751953125e-05, "all_logprobs/min": -11.5, "all_logprobs/p1": -2.671875, "all_logprobs/p10": -0.53515625, "all_logprobs/p25": -0.0380859375, "all_logprobs/p5": -1.140625, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.2819225490093231, "clip_ratio": 0.0, "completion_length": 615.2604370117188, "completion_length/correct": 459.6451416015625, "completion_length/correct/max": 946.0, "completion_length/correct/median": 358.0, "completion_length/correct/min": 216.0, "completion_length/correct/p25": 310.0, "completion_length/correct/p75": 535.5, "completion_length/correct/var": 43928.52734375, "completion_length/incorrect": 899.0294189453125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 991.0, "completion_length/incorrect/min": 310.0, "completion_length/incorrect/p25": 827.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 30744.998046875, "completion_length/max": 1024.0, "completion_length/median": 539.0, "completion_length/min": 216.0, "completion_length/p25": 327.0, "completion_length/p75": 888.25, "completion_length/var": 83510.1875, "epoch": 0.0112, "feature_vector_variance/max_squared_error": 89429.6875, "feature_vector_variance/metric": 27149.55078125, "generated_tokens/total": 774570.0, "grad_norm": 0.35539528727531433, "grouped_std_rewards": 0.1666666716337204, "learning_rate": 1.4927010515561777e-05, "loss": -0.0, "mean_logprobs": -0.1611328125, "mean_logprobs/var": 0.007080078125, "num_completions/total": 1344, "per_sentence_gradient_norm": 5.545619964599609, "per_sentence_gradient_norm/max": 100.04156494140625, "per_sentence_gradient_norm/median": 2.552288770675659, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.6201906204223633, "per_sentence_gradient_norm/p85": 6.349043846130371, "per_sentence_gradient_norm/p90": 7.223651885986328, "per_sentence_gradient_norm/p95": 9.577326774597168, "per_sentence_gradient_norm/p99": 88.8755111694336, "per_sentence_gradient_norm/var": 240.80511474609375, "per_token_feature_norm": 168.79403686523438, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 161.0, "per_token_feature_norm/min": 65.0, "per_token_feature_norm/p25": 130.0, "per_token_feature_norm/p75": 201.0, "per_token_feature_norm/var": 2431.436279296875, "per_token_full_gradient_variance/max_squared_error": 26.55648422241211, "per_token_full_gradient_variance/variance": 0.015371494926512241, "per_token_gradient_norm": 5.889245986938477, "per_token_gradient_norm/max": 1324.8607177734375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2040.6566162109375, "per_token_policy_error_norm": 0.0928434282541275, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.07571487128734589, "policy_entropy": 0.19668073952198029, "policy_entropy/max": 3.765625, "policy_entropy/median": 0.0002841949462890625, "policy_entropy/min": 5.030642569181509e-12, "policy_entropy/p25": 5.066394805908203e-06, "policy_entropy/p75": 0.1728515625, "policy_entropy/var": 0.15663114190101624, "policy_error_vector_variance/max_squared_error": 2.017179012298584, "policy_error_vector_variance/metric": 0.09275607764720917, "policy_loss": -3.725290298461914e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -3.7485008239746094, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.0, "policy_loss/var": 0.631074070930481, "policy_sharpness": 6.95391845703125, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.7125000953674316, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.69171142578125, "reward": 0.6458333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2311403751373291, "rewards/accuracy_reward": 0.6458333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2311403751373291, "sentence_full_gradient_variance/max_squared_error": 108453.015625, "sentence_full_gradient_variance/metric": 1781.6903076171875, "sentence_full_gradient_variance/p75": 256.3547668457031, "sentence_full_gradient_variance/p90": 290.497802734375, "sentence_full_gradient_variance/p95": 332.28564453125, "sentence_full_gradient_variance/p99": 52143.5859375, "state_level_variance/metric": 24.828144073486328, "state_level_variance_full_gradient/metric": 50.12164306640625, "step": 14 }, { "accuracy_reward": 0.7708333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17850877344608307, "action_level_variance/metric": 92.89384460449219, "action_level_variance_full_gradient/metric": 596.2012939453125, "adam_stats/lr_effective_max": 9.261594823328778e-05, "adam_stats/lr_effective_mean": 4.443168888368376e-10, "adam_stats/lr_effective_min": -9.205420064972714e-05, "adam_stats/m_t_max": 0.004134193528443575, "adam_stats/m_t_mean": -1.193958126810557e-11, "adam_stats/m_t_min": -0.004086898639798164, "adam_stats/v_t_max": 6.842162838438526e-05, "adam_stats/v_t_mean": 3.555953491732322e-12, "adam_stats/v_t_min": 0.0, "advantages": 4.967053879312289e-09, "advantages/max": 0.5588920712471008, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": 0.0, "advantages/p75": 0.36585545539855957, "advantages/var": 0.31562647223472595, "all_logprobs": -0.1455487757921219, "all_logprobs/max": 0.0, "all_logprobs/median": -5.245208740234375e-06, "all_logprobs/min": -11.625, "all_logprobs/p1": -2.40625, "all_logprobs/p10": -0.39453125, "all_logprobs/p25": -0.01239013671875, "all_logprobs/p5": -0.97265625, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.2315896451473236, "clip_ratio": 0.0, "completion_length": 598.9166870117188, "completion_length/correct": 497.52703857421875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 453.0, "completion_length/correct/min": 228.0, "completion_length/correct/p25": 327.25, "completion_length/correct/p75": 607.0, "completion_length/correct/var": 44114.9921875, "completion_length/incorrect": 939.95458984375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 316.0, "completion_length/incorrect/p25": 974.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 34258.33203125, "completion_length/max": 1024.0, "completion_length/median": 509.0, "completion_length/min": 228.0, "completion_length/p25": 340.5, "completion_length/p75": 837.0, "completion_length/var": 76413.46875, "epoch": 0.012, "feature_vector_variance/max_squared_error": 93499.0546875, "feature_vector_variance/metric": 26170.6484375, "generated_tokens/total": 832066.0, "grad_norm": 0.2610909938812256, "grouped_std_rewards": 0.13146311044692993, "learning_rate": 1.488605814759156e-05, "loss": -0.0, "mean_logprobs": -0.1474609375, "mean_logprobs/var": 0.0029144287109375, "num_completions/total": 1440, "per_sentence_gradient_norm": 5.001887321472168, "per_sentence_gradient_norm/max": 86.26012420654297, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 7.196537017822266, "per_sentence_gradient_norm/p85": 9.940986633300781, "per_sentence_gradient_norm/p90": 11.685072898864746, "per_sentence_gradient_norm/p95": 16.94385528564453, "per_sentence_gradient_norm/p99": 58.88629150390625, "per_sentence_gradient_norm/var": 141.2722625732422, "per_token_feature_norm": 163.22091674804688, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 154.0, "per_token_feature_norm/min": 65.0, "per_token_feature_norm/p25": 127.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 2139.121337890625, "per_token_full_gradient_variance/max_squared_error": 4.394201278686523, "per_token_full_gradient_variance/variance": 0.008992353454232216, "per_token_gradient_norm": 5.65797758102417, "per_token_gradient_norm/max": 1043.402587890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1263.849853515625, "per_token_policy_error_norm": 0.07766668498516083, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06470795720815659, "policy_entropy": 0.159958153963089, "policy_entropy/max": 2.75, "policy_entropy/median": 7.343292236328125e-05, "policy_entropy/min": 2.753353101070388e-13, "policy_entropy/p25": 2.8908252716064453e-06, "policy_entropy/p75": 0.0703125, "policy_entropy/var": 0.12293452024459839, "policy_error_vector_variance/max_squared_error": 2.0105600357055664, "policy_error_vector_variance/metric": 0.077626071870327, "policy_loss": 0.0, "policy_loss/max": 2.560988187789917, "policy_loss/median": 0.0, "policy_loss/min": -0.5588921308517456, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": 0.0, "policy_loss/var": 0.31562647223472595, "policy_sharpness": 7.41864538192749, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.623046636581421, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.367473602294922, "reward": 0.7708333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17850877344608307, "rewards/accuracy_reward": 0.7708333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17850877344608307, "sentence_full_gradient_variance/max_squared_error": 48145.73828125, "sentence_full_gradient_variance/metric": 615.4126586914062, "sentence_full_gradient_variance/p75": 9.503803253173828, "sentence_full_gradient_variance/p90": 666.909423828125, "sentence_full_gradient_variance/p95": 814.3694458007812, "sentence_full_gradient_variance/p99": 3298.15869140625, "state_level_variance/metric": 63.2552490234375, "state_level_variance_full_gradient/metric": 19.211362838745117, "step": 15 }, { "accuracy_reward": 0.625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2368421107530594, "action_level_variance/metric": 73.94310760498047, "action_level_variance_full_gradient/metric": 1018.22021484375, "adam_stats/lr_effective_max": 9.31609611143358e-05, "adam_stats/lr_effective_mean": 7.973766091851076e-10, "adam_stats/lr_effective_min": -9.335901995655149e-05, "adam_stats/m_t_max": 0.007865087129175663, "adam_stats/m_t_mean": 5.645008419041808e-11, "adam_stats/m_t_min": -0.006910694297403097, "adam_stats/v_t_max": 7.097101479303092e-05, "adam_stats/v_t_mean": 4.043970019962373e-12, "adam_stats/v_t_min": 0.0, "advantages": 9.934107758624577e-09, "advantages/max": 0.9680583477020264, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.5588920712471008, "advantages/var": 0.6312593817710876, "all_logprobs": -0.16759102046489716, "all_logprobs/max": 0.0, "all_logprobs/median": -1.4901161193847656e-05, "all_logprobs/min": -10.1875, "all_logprobs/p1": -2.609375, "all_logprobs/p10": -0.478515625, "all_logprobs/p25": -0.0245361328125, "all_logprobs/p5": -1.09375, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.277558296918869, "clip_ratio": 0.0, "completion_length": 538.8333740234375, "completion_length/correct": 417.85003662109375, "completion_length/correct/max": 978.0, "completion_length/correct/median": 445.0, "completion_length/correct/min": 98.0, "completion_length/correct/p25": 150.0, "completion_length/correct/p75": 585.75, "completion_length/correct/var": 55707.453125, "completion_length/incorrect": 740.4722290039062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 779.0, "completion_length/incorrect/min": 256.0, "completion_length/incorrect/p25": 514.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 74814.375, "completion_length/max": 1024.0, "completion_length/median": 521.0, "completion_length/min": 98.0, "completion_length/p25": 328.75, "completion_length/p75": 725.5, "completion_length/var": 86812.1796875, "epoch": 0.0128, "feature_vector_variance/max_squared_error": 89351.6015625, "feature_vector_variance/metric": 27658.26953125, "generated_tokens/total": 883794.0, "grad_norm": 1.3742337226867676, "grouped_std_rewards": 0.28765982389450073, "learning_rate": 1.4836107005503543e-05, "loss": 0.0, "mean_logprobs": -0.169921875, "mean_logprobs/var": 0.0038299560546875, "num_completions/total": 1536, "per_sentence_gradient_norm": 12.34817886352539, "per_sentence_gradient_norm/max": 55.6866340637207, "per_sentence_gradient_norm/median": 8.85838794708252, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 19.99883270263672, "per_sentence_gradient_norm/p85": 26.175779342651367, "per_sentence_gradient_norm/p90": 31.86789321899414, "per_sentence_gradient_norm/p95": 37.465972900390625, "per_sentence_gradient_norm/p99": 52.75445556640625, "per_sentence_gradient_norm/var": 190.0936279296875, "per_token_feature_norm": 172.17449951171875, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 164.0, "per_token_feature_norm/min": 66.5, "per_token_feature_norm/p25": 132.0, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 2467.1494140625, "per_token_full_gradient_variance/max_squared_error": 36.67562484741211, "per_token_full_gradient_variance/variance": 0.01787647418677807, "per_token_gradient_norm": 12.663516998291016, "per_token_gradient_norm/max": 1171.9921875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2268.252685546875, "per_token_policy_error_norm": 0.08693592250347137, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.07143358141183853, "policy_entropy": 0.18461710214614868, "policy_entropy/max": 3.375, "policy_entropy/median": 0.00019073486328125, "policy_entropy/min": 2.4202861936828413e-14, "policy_entropy/p25": 5.21540641784668e-06, "policy_entropy/p75": 0.1220703125, "policy_entropy/var": 0.14679577946662903, "policy_error_vector_variance/max_squared_error": 2.01833438873291, "policy_error_vector_variance/metric": 0.086866095662117, "policy_loss": -3.1044087300813317e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -0.9680584669113159, "policy_loss/p25": -0.5588920712471008, "policy_loss/p75": 0.0, "policy_loss/var": 0.6312593817710876, "policy_sharpness": 7.125705718994141, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.88134765625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.336709022521973, "reward": 0.625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2368421107530594, "rewards/accuracy_reward": 0.625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2368421107530594, "sentence_full_gradient_variance/max_squared_error": 13979.73046875, "sentence_full_gradient_variance/metric": 1052.626220703125, "sentence_full_gradient_variance/p75": 970.397705078125, "sentence_full_gradient_variance/p90": 2988.8291015625, "sentence_full_gradient_variance/p95": 3932.614013671875, "sentence_full_gradient_variance/p99": 11296.05078125, "state_level_variance/metric": 142.55020141601562, "state_level_variance_full_gradient/metric": 34.40602111816406, "step": 16 }, { "accuracy_reward": 0.8020833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16041667759418488, "action_level_variance/metric": 28.940458297729492, "action_level_variance_full_gradient/metric": 249.53646850585938, "adam_stats/lr_effective_max": 8.989783236756921e-05, "adam_stats/lr_effective_mean": 8.267230788838731e-10, "adam_stats/lr_effective_min": -9.23468978726305e-05, "adam_stats/m_t_max": 0.007241847459226847, "adam_stats/m_t_mean": 4.877974066341828e-11, "adam_stats/m_t_min": -0.006027363706380129, "adam_stats/v_t_max": 7.090374128893018e-05, "adam_stats/v_t_mean": 4.042551016159024e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 1.4358407258987427, "advantages/median": 0.0, "advantages/min": -0.9680583477020264, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.31566235423088074, "all_logprobs": -0.13285228610038757, "all_logprobs/max": 0.0, "all_logprobs/median": -2.1457672119140625e-06, "all_logprobs/min": -7.78125, "all_logprobs/p1": -2.328125, "all_logprobs/p10": -0.349609375, "all_logprobs/p25": -0.006103515625, "all_logprobs/p5": -0.85546875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.20595625042915344, "clip_ratio": 0.0, "completion_length": 529.8021240234375, "completion_length/correct": 468.2077941894531, "completion_length/correct/max": 982.0, "completion_length/correct/median": 459.0, "completion_length/correct/min": 186.0, "completion_length/correct/p25": 241.0, "completion_length/correct/p75": 655.0, "completion_length/correct/var": 59188.56640625, "completion_length/incorrect": 779.4210815429688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 749.0, "completion_length/incorrect/min": 587.0, "completion_length/incorrect/p25": 677.0, "completion_length/incorrect/p75": 816.5, "completion_length/incorrect/var": 21307.14453125, "completion_length/max": 1024.0, "completion_length/median": 538.0, "completion_length/min": 186.0, "completion_length/p25": 260.0, "completion_length/p75": 754.5, "completion_length/var": 66924.9453125, "epoch": 0.0136, "feature_vector_variance/max_squared_error": 104967.5, "feature_vector_variance/metric": 26457.353515625, "generated_tokens/total": 934655.0, "grad_norm": 0.18304288387298584, "grouped_std_rewards": 0.165851891040802, "learning_rate": 1.4777217947069972e-05, "loss": -0.0, "mean_logprobs": -0.12451171875, "mean_logprobs/var": 0.003875732421875, "num_completions/total": 1632, "per_sentence_gradient_norm": 6.704625129699707, "per_sentence_gradient_norm/max": 54.8637809753418, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 11.442097663879395, "per_sentence_gradient_norm/p85": 18.287513732910156, "per_sentence_gradient_norm/p90": 20.071203231811523, "per_sentence_gradient_norm/p95": 31.59203338623047, "per_sentence_gradient_norm/p99": 43.0875358581543, "per_sentence_gradient_norm/var": 133.64572143554688, "per_token_feature_norm": 163.3302459716797, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 157.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 127.5, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 2044.7894287109375, "per_token_full_gradient_variance/max_squared_error": 2.510366678237915, "per_token_full_gradient_variance/variance": 0.011169173754751682, "per_token_gradient_norm": 9.950902938842773, "per_token_gradient_norm/max": 543.2637939453125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1508.7535400390625, "per_token_policy_error_norm": 0.0706758201122284, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05842520669102669, "policy_entropy": 0.14787565171718597, "policy_entropy/max": 2.796875, "policy_entropy/median": 3.0517578125e-05, "policy_entropy/min": 2.7000623958883807e-12, "policy_entropy/p25": 1.3336539268493652e-06, "policy_entropy/p75": 0.040283203125, "policy_entropy/var": 0.11585160344839096, "policy_error_vector_variance/max_squared_error": 2.0065646171569824, "policy_error_vector_variance/metric": 0.07059681415557861, "policy_loss": 0.0, "policy_loss/max": 0.9680584669113159, "policy_loss/median": 0.0, "policy_loss/min": -1.4358408451080322, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.3156624138355255, "policy_sharpness": 7.649021625518799, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.2342529296875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.65176010131836, "reward": 0.8020833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16041667759418488, "rewards/accuracy_reward": 0.8020833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16041667759418488, "sentence_full_gradient_variance/max_squared_error": 3195.684814453125, "sentence_full_gradient_variance/metric": 257.35498046875, "sentence_full_gradient_variance/p75": 380.33984375, "sentence_full_gradient_variance/p90": 883.5117797851562, "sentence_full_gradient_variance/p95": 1233.0286865234375, "sentence_full_gradient_variance/p99": 3030.9306640625, "state_level_variance/metric": 126.14628601074219, "state_level_variance_full_gradient/metric": 7.8185319900512695, "step": 17 }, { "accuracy_reward": 0.5416666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.25087717175483704, "action_level_variance/metric": 113.4173812866211, "action_level_variance_full_gradient/metric": 111.29156494140625, "adam_stats/lr_effective_max": 8.784553210716695e-05, "adam_stats/lr_effective_mean": 7.424389436572199e-10, "adam_stats/lr_effective_min": -9.054647671291605e-05, "adam_stats/m_t_max": 0.006447853520512581, "adam_stats/m_t_mean": 4.9422081010996877e-11, "adam_stats/m_t_min": -0.004998364020138979, "adam_stats/v_t_max": 7.086201367201284e-05, "adam_stats/v_t_mean": 4.040438122965284e-12, "adam_stats/v_t_min": 0.0, "advantages": -1.2417634698280722e-09, "advantages/max": 3.7485008239746094, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": -0.24990005791187286, "advantages/p75": 0.0, "advantages/var": 0.47336119413375854, "all_logprobs": -0.15721489489078522, "all_logprobs/max": 0.0, "all_logprobs/median": -7.3909759521484375e-06, "all_logprobs/min": -10.0, "all_logprobs/p1": -2.640625, "all_logprobs/p10": -0.435546875, "all_logprobs/p25": -0.0152740478515625, "all_logprobs/p5": -1.0078125, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.26223018765449524, "clip_ratio": 0.0, "completion_length": 740.875, "completion_length/correct": 590.7115478515625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 571.0, "completion_length/correct/min": 343.0, "completion_length/correct/p25": 510.25, "completion_length/correct/p75": 634.25, "completion_length/correct/var": 19159.623046875, "completion_length/incorrect": 918.3409423828125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1003.0, "completion_length/incorrect/min": 555.0, "completion_length/incorrect/p25": 847.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 18021.765625, "completion_length/max": 1024.0, "completion_length/median": 701.0, "completion_length/min": 343.0, "completion_length/p25": 560.25, "completion_length/p75": 997.0, "completion_length/var": 45372.328125, "epoch": 0.0144, "feature_vector_variance/max_squared_error": 94053.609375, "feature_vector_variance/metric": 26814.74609375, "generated_tokens/total": 1005779.0, "grad_norm": 0.19097808003425598, "grouped_std_rewards": 0.15786893665790558, "learning_rate": 1.4709462719537392e-05, "loss": 0.0, "mean_logprobs": -0.1494140625, "mean_logprobs/var": 0.004425048828125, "num_completions/total": 1728, "per_sentence_gradient_norm": 5.937616348266602, "per_sentence_gradient_norm/max": 66.1469497680664, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 5.5249834060668945, "per_sentence_gradient_norm/p85": 10.360934257507324, "per_sentence_gradient_norm/p90": 13.014142036437988, "per_sentence_gradient_norm/p95": 37.1132698059082, "per_sentence_gradient_norm/p99": 65.11956787109375, "per_sentence_gradient_norm/var": 161.80287170410156, "per_token_feature_norm": 166.1398162841797, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 157.0, "per_token_feature_norm/min": 63.75, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 2332.126953125, "per_token_full_gradient_variance/max_squared_error": 42.62063980102539, "per_token_full_gradient_variance/variance": 0.012987656518816948, "per_token_gradient_norm": 7.0863728523254395, "per_token_gradient_norm/max": 1300.08544921875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1742.27099609375, "per_token_policy_error_norm": 0.08147549629211426, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06802653521299362, "policy_entropy": 0.17331020534038544, "policy_entropy/max": 3.703125, "policy_entropy/median": 0.0001010894775390625, "policy_entropy/min": 8.126832540256146e-14, "policy_entropy/p25": 3.0957162380218506e-06, "policy_entropy/p75": 0.083984375, "policy_entropy/var": 0.1437690556049347, "policy_error_vector_variance/max_squared_error": 2.01859974861145, "policy_error_vector_variance/metric": 0.08138490468263626, "policy_loss": 9.934107758624577e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -3.7485008239746094, "policy_loss/p25": 0.0, "policy_loss/p75": 0.24990007281303406, "policy_loss/var": 0.47336119413375854, "policy_sharpness": 7.310771465301514, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.300732374191284, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.851638793945312, "reward": 0.5416666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.25087717175483704, "rewards/accuracy_reward": 0.5416666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.25087717175483704, "sentence_full_gradient_variance/max_squared_error": 4709.494140625, "sentence_full_gradient_variance/metric": 130.88583374023438, "sentence_full_gradient_variance/p75": 83.43861389160156, "sentence_full_gradient_variance/p90": 149.8746337890625, "sentence_full_gradient_variance/p95": 451.63287353515625, "sentence_full_gradient_variance/p99": 1616.4910888671875, "state_level_variance/metric": 64.54635620117188, "state_level_variance_full_gradient/metric": 19.594266891479492, "step": 18 }, { "accuracy_reward": 0.5104166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.25252193212509155, "action_level_variance/metric": 328.3791198730469, "action_level_variance_full_gradient/metric": 999.1796264648438, "adam_stats/lr_effective_max": 8.763791265664622e-05, "adam_stats/lr_effective_mean": 6.634409688288656e-10, "adam_stats/lr_effective_min": -8.960759441833943e-05, "adam_stats/m_t_max": 0.005707700736820698, "adam_stats/m_t_mean": 5.190083779416099e-11, "adam_stats/m_t_min": -0.005020378157496452, "adam_stats/v_t_max": 7.080837531248108e-05, "adam_stats/v_t_mean": 4.0410040764993216e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.2417634698280722e-09, "advantages/max": 2.560988187789917, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.3155708611011505, "all_logprobs": -0.15251493453979492, "all_logprobs/max": 0.0, "all_logprobs/median": -9.179115295410156e-06, "all_logprobs/min": -13.25, "all_logprobs/p1": -2.4375, "all_logprobs/p10": -0.443359375, "all_logprobs/p25": -0.0181884765625, "all_logprobs/p5": -0.98046875, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.23740527033805847, "clip_ratio": 0.0, "completion_length": 647.9791870117188, "completion_length/correct": 419.3061218261719, "completion_length/correct/max": 942.0, "completion_length/correct/median": 373.0, "completion_length/correct/min": 248.0, "completion_length/correct/p25": 342.0, "completion_length/correct/p75": 474.0, "completion_length/correct/var": 17794.884765625, "completion_length/incorrect": 886.3829345703125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 299.0, "completion_length/incorrect/p25": 793.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 53276.11328125, "completion_length/max": 1024.0, "completion_length/median": 505.0, "completion_length/min": 248.0, "completion_length/p25": 368.5, "completion_length/p75": 1024.0, "completion_length/var": 89878.3359375, "epoch": 0.0152, "feature_vector_variance/max_squared_error": 95089.25, "feature_vector_variance/metric": 26147.533203125, "generated_tokens/total": 1067985.0, "grad_norm": 0.32319390773773193, "grouped_std_rewards": 0.09859417378902435, "learning_rate": 1.4632923872213653e-05, "loss": 0.0, "mean_logprobs": -0.1572265625, "mean_logprobs/var": 0.004150390625, "num_completions/total": 1824, "per_sentence_gradient_norm": 5.054895401000977, "per_sentence_gradient_norm/max": 169.73876953125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 5.953456878662109, "per_sentence_gradient_norm/p85": 6.873531818389893, "per_sentence_gradient_norm/p90": 7.460110664367676, "per_sentence_gradient_norm/p95": 8.90131664276123, "per_sentence_gradient_norm/p99": 64.62716674804688, "per_sentence_gradient_norm/var": 363.7979736328125, "per_token_feature_norm": 163.54627990722656, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 155.0, "per_token_feature_norm/min": 63.25, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 2133.11669921875, "per_token_full_gradient_variance/max_squared_error": 8.082551956176758, "per_token_full_gradient_variance/variance": 0.008793987333774567, "per_token_gradient_norm": 4.793278217315674, "per_token_gradient_norm/max": 1317.773681640625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1539.321044921875, "per_token_policy_error_norm": 0.0812867134809494, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06667532026767731, "policy_entropy": 0.16936711966991425, "policy_entropy/max": 2.890625, "policy_entropy/median": 0.00012302398681640625, "policy_entropy/min": 3.4638958368304884e-13, "policy_entropy/p25": 4.500150680541992e-06, "policy_entropy/p75": 0.09326171875, "policy_entropy/var": 0.12943223118782043, "policy_error_vector_variance/max_squared_error": 2.01814341545105, "policy_error_vector_variance/metric": 0.0812193751335144, "policy_loss": 0.0, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -2.560988187789917, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.3155708611011505, "policy_sharpness": 7.29220724105835, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.244117498397827, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.73425006866455, "reward": 0.5104166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.25252193212509155, "rewards/accuracy_reward": 0.5104166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.25252193212509155, "sentence_full_gradient_variance/max_squared_error": 85992.2890625, "sentence_full_gradient_variance/metric": 1003.2376708984375, "sentence_full_gradient_variance/p75": 1.0826438665390015, "sentence_full_gradient_variance/p90": 399.3274841308594, "sentence_full_gradient_variance/p95": 449.65435791015625, "sentence_full_gradient_variance/p99": 8152.4111328125, "state_level_variance/metric": 62.583595275878906, "state_level_variance_full_gradient/metric": 4.05826473236084, "step": 19 }, { "accuracy_reward": 0.65625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2279605269432068, "action_level_variance/metric": 144.99746704101562, "action_level_variance_full_gradient/metric": 1399.25830078125, "adam_stats/lr_effective_max": 8.991910726763308e-05, "adam_stats/lr_effective_mean": 5.919716938862507e-10, "adam_stats/lr_effective_min": -8.765458187554032e-05, "adam_stats/m_t_max": 0.005226194392889738, "adam_stats/m_t_mean": 5.457094845451316e-11, "adam_stats/m_t_min": -0.005654416047036648, "adam_stats/v_t_max": 7.091298175510019e-05, "adam_stats/v_t_mean": 4.044359031701861e-12, "adam_stats/v_t_min": 0.0, "advantages": -1.4901161193847656e-08, "advantages/max": 3.7485008239746094, "advantages/median": 0.0, "advantages/min": -2.0150647163391113, "advantages/p25": -0.24990005791187286, "advantages/p75": 0.46501490473747253, "advantages/var": 0.7890509366989136, "all_logprobs": -0.13579030334949493, "all_logprobs/max": 0.0, "all_logprobs/median": -2.86102294921875e-06, "all_logprobs/min": -12.0, "all_logprobs/p1": -2.28125, "all_logprobs/p10": -0.3720703125, "all_logprobs/p25": -0.0098876953125, "all_logprobs/p5": -0.8984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.20472629368305206, "clip_ratio": 0.0, "completion_length": 637.25, "completion_length/correct": 541.3175048828125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 494.0, "completion_length/correct/min": 206.0, "completion_length/correct/p25": 390.5, "completion_length/correct/p75": 688.0, "completion_length/correct/var": 45069.96484375, "completion_length/incorrect": 820.3939819335938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 440.0, "completion_length/incorrect/p25": 572.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 54560.4296875, "completion_length/max": 1024.0, "completion_length/median": 560.0, "completion_length/min": 206.0, "completion_length/p25": 448.5, "completion_length/p75": 847.25, "completion_length/var": 65546.7421875, "epoch": 0.016, "feature_vector_variance/max_squared_error": 102314.0390625, "feature_vector_variance/metric": 26392.455078125, "generated_tokens/total": 1129161.0, "grad_norm": 0.3008836805820465, "grouped_std_rewards": 0.32861435413360596, "learning_rate": 1.4547694655894313e-05, "loss": 0.0, "mean_logprobs": -0.138671875, "mean_logprobs/var": 0.001983642578125, "num_completions/total": 1920, "per_sentence_gradient_norm": 12.118898391723633, "per_sentence_gradient_norm/max": 66.06289672851562, "per_sentence_gradient_norm/median": 7.765167236328125, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 3.813175916671753, "per_sentence_gradient_norm/p75": 14.173909187316895, "per_sentence_gradient_norm/p85": 24.71735382080078, "per_sentence_gradient_norm/p90": 31.151479721069336, "per_sentence_gradient_norm/p95": 46.095577239990234, "per_sentence_gradient_norm/p99": 54.50918960571289, "per_sentence_gradient_norm/var": 192.1359405517578, "per_token_feature_norm": 163.4048614501953, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 157.0, "per_token_feature_norm/min": 63.75, "per_token_feature_norm/p25": 129.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 1947.876708984375, "per_token_full_gradient_variance/max_squared_error": 10.134103775024414, "per_token_full_gradient_variance/variance": 0.019581792876124382, "per_token_gradient_norm": 12.497159957885742, "per_token_gradient_norm/max": 1122.3245849609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2437.451904296875, "per_token_policy_error_norm": 0.07326676696538925, "per_token_policy_error_norm/max": 1.9921875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.060187894850969315, "policy_entropy": 0.1517128199338913, "policy_entropy/max": 3.703125, "policy_entropy/median": 4.00543212890625e-05, "policy_entropy/min": 6.794564910705958e-14, "policy_entropy/p25": 1.5869736671447754e-06, "policy_entropy/p75": 0.06060791015625, "policy_entropy/var": 0.11308775097131729, "policy_error_vector_variance/max_squared_error": 2.0091891288757324, "policy_error_vector_variance/metric": 0.07320227473974228, "policy_loss": 1.7384689243726825e-08, "policy_loss/max": 2.0150647163391113, "policy_loss/median": 0.0, "policy_loss/min": -3.7485008239746094, "policy_loss/p25": -0.46501490473747253, "policy_loss/p75": 0.24990007281303406, "policy_loss/var": 0.7890509366989136, "policy_sharpness": 7.531118869781494, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.9314956665039062, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.877495765686035, "reward": 0.65625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2279605269432068, "rewards/accuracy_reward": 0.65625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2279605269432068, "sentence_full_gradient_variance/max_squared_error": 21096.068359375, "sentence_full_gradient_variance/metric": 1455.3905029296875, "sentence_full_gradient_variance/p75": 1031.0126953125, "sentence_full_gradient_variance/p90": 3981.602783203125, "sentence_full_gradient_variance/p95": 7033.8623046875, "sentence_full_gradient_variance/p99": 17445.794921875, "state_level_variance/metric": 65.03929138183594, "state_level_variance_full_gradient/metric": 56.132198333740234, "step": 20 }, { "accuracy_reward": 0.8541666865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.12587718665599823, "action_level_variance/metric": 78.19511413574219, "action_level_variance_full_gradient/metric": 1199.25537109375, "adam_stats/lr_effective_max": 8.978484402177855e-05, "adam_stats/lr_effective_mean": 5.964650440226649e-10, "adam_stats/lr_effective_min": -8.694823191035539e-05, "adam_stats/m_t_max": 0.004319053143262863, "adam_stats/m_t_mean": 5.499283320387072e-11, "adam_stats/m_t_min": -0.005626083817332983, "adam_stats/v_t_max": 7.08709194441326e-05, "adam_stats/v_t_mean": 4.051814005839871e-12, "adam_stats/v_t_min": 0.0, "advantages": 9.934107758624577e-09, "advantages/max": 0.8537459373474121, "advantages/median": 0.24990005791187286, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.36585545539855957, "advantages/var": 0.6312281489372253, "all_logprobs": -0.13358263671398163, "all_logprobs/max": 0.0, "all_logprobs/median": -2.6226043701171875e-06, "all_logprobs/min": -11.5625, "all_logprobs/p1": -2.28125, "all_logprobs/p10": -0.3515625, "all_logprobs/p25": -0.00994873046875, "all_logprobs/p5": -0.86328125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.20275896787643433, "clip_ratio": 0.0, "completion_length": 547.4479370117188, "completion_length/correct": 539.46337890625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 509.0, "completion_length/correct/min": 266.0, "completion_length/correct/p25": 387.0, "completion_length/correct/p75": 633.0, "completion_length/correct/var": 35727.5625, "completion_length/incorrect": 594.2142944335938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 424.0, "completion_length/incorrect/min": 294.0, "completion_length/incorrect/p25": 384.0, "completion_length/incorrect/p75": 838.25, "completion_length/incorrect/var": 73149.4140625, "completion_length/max": 1024.0, "completion_length/median": 509.0, "completion_length/min": 266.0, "completion_length/p25": 385.0, "completion_length/p75": 683.75, "completion_length/var": 40849.69921875, "epoch": 0.0168, "feature_vector_variance/max_squared_error": 95044.234375, "feature_vector_variance/metric": 26566.55078125, "generated_tokens/total": 1181716.0, "grad_norm": 0.3067169487476349, "grouped_std_rewards": 0.25852102041244507, "learning_rate": 1.4453878909250906e-05, "loss": -0.0, "mean_logprobs": -0.134765625, "mean_logprobs/var": 0.0023193359375, "num_completions/total": 2016, "per_sentence_gradient_norm": 10.095888137817383, "per_sentence_gradient_norm/max": 60.935829162597656, "per_sentence_gradient_norm/median": 4.712649822235107, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 13.127487182617188, "per_sentence_gradient_norm/p85": 24.97233772277832, "per_sentence_gradient_norm/p90": 30.22577667236328, "per_sentence_gradient_norm/p95": 36.30855941772461, "per_sentence_gradient_norm/p99": 44.22948455810547, "per_sentence_gradient_norm/var": 166.74472045898438, "per_token_feature_norm": 164.17459106445312, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 159.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 130.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 1860.9276123046875, "per_token_full_gradient_variance/max_squared_error": 10.63788890838623, "per_token_full_gradient_variance/variance": 0.014761022292077541, "per_token_gradient_norm": 10.628225326538086, "per_token_gradient_norm/max": 1055.7301025390625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1902.1395263671875, "per_token_policy_error_norm": 0.07249440252780914, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06060387194156647, "policy_entropy": 0.14770366251468658, "policy_entropy/max": 3.59375, "policy_entropy/median": 3.838539123535156e-05, "policy_entropy/min": 1.2323475573339238e-14, "policy_entropy/p25": 9.760260581970215e-07, "policy_entropy/p75": 0.06103515625, "policy_entropy/var": 0.10774436593055725, "policy_error_vector_variance/max_squared_error": 2.002166509628296, "policy_error_vector_variance/metric": 0.07245922088623047, "policy_loss": -9.934107758624577e-09, "policy_loss/max": 3.7485010623931885, "policy_loss/median": -0.24990005791187286, "policy_loss/min": -0.8537459373474121, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": 0.0, "policy_loss/var": 0.6312282085418701, "policy_sharpness": 7.537813186645508, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.95068359375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.767333030700684, "reward": 0.8541666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.12587718665599823, "rewards/accuracy_reward": 0.8541666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.12587718665599823, "sentence_full_gradient_variance/max_squared_error": 52257.078125, "sentence_full_gradient_variance/metric": 1203.3590087890625, "sentence_full_gradient_variance/p75": 521.453369140625, "sentence_full_gradient_variance/p90": 2536.5234375, "sentence_full_gradient_variance/p95": 3273.95751953125, "sentence_full_gradient_variance/p99": 12650.755859375, "state_level_variance/metric": 110.03984069824219, "state_level_variance_full_gradient/metric": 4.103818416595459, "step": 21 }, { "accuracy_reward": 0.8020833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16041666269302368, "action_level_variance/metric": 31.514266967773438, "action_level_variance_full_gradient/metric": 194.3203125, "adam_stats/lr_effective_max": 8.754045848036185e-05, "adam_stats/lr_effective_mean": 4.981344781107566e-10, "adam_stats/lr_effective_min": -8.705310028744861e-05, "adam_stats/m_t_max": 0.003679728601127863, "adam_stats/m_t_mean": 4.839286263380593e-11, "adam_stats/m_t_min": -0.004648435860872269, "adam_stats/v_t_max": 7.081727380864322e-05, "adam_stats/v_t_mean": 4.051396804843899e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.2417634920325327e-08, "advantages/max": 0.9680583477020264, "advantages/median": 0.0, "advantages/min": -1.6766761541366577, "advantages/p25": 0.0, "advantages/p75": 0.5588920712471008, "advantages/var": 0.4734908938407898, "all_logprobs": -0.12917713820934296, "all_logprobs/max": 0.0, "all_logprobs/median": -1.0728836059570312e-06, "all_logprobs/min": -8.3125, "all_logprobs/p1": -2.296875, "all_logprobs/p10": -0.345703125, "all_logprobs/p25": -0.003753662109375, "all_logprobs/p5": -0.859375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.19762282073497772, "clip_ratio": 0.0, "completion_length": 673.0208740234375, "completion_length/correct": 623.259765625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 655.0, "completion_length/correct/min": 306.0, "completion_length/correct/p25": 553.0, "completion_length/correct/p75": 714.0, "completion_length/correct/var": 27944.69140625, "completion_length/incorrect": 874.6842041015625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 536.0, "completion_length/incorrect/p25": 721.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 34385.890625, "completion_length/max": 1024.0, "completion_length/median": 672.0, "completion_length/min": 306.0, "completion_length/p25": 578.75, "completion_length/p75": 750.5, "completion_length/var": 39011.60546875, "epoch": 0.0176, "feature_vector_variance/max_squared_error": 110053.421875, "feature_vector_variance/metric": 25990.04296875, "generated_tokens/total": 1246326.0, "grad_norm": 0.20756778120994568, "grouped_std_rewards": 0.24599313735961914, "learning_rate": 1.4351590932319506e-05, "loss": -0.0, "mean_logprobs": -0.125, "mean_logprobs/var": 0.001953125, "num_completions/total": 2112, "per_sentence_gradient_norm": 9.111806869506836, "per_sentence_gradient_norm/max": 47.199867248535156, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 15.679437637329102, "per_sentence_gradient_norm/p85": 20.65740394592285, "per_sentence_gradient_norm/p90": 23.103839874267578, "per_sentence_gradient_norm/p95": 28.53179359436035, "per_sentence_gradient_norm/p99": 42.70986557006836, "per_sentence_gradient_norm/var": 120.46896362304688, "per_token_feature_norm": 161.81216430664062, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 155.0, "per_token_feature_norm/min": 62.0, "per_token_feature_norm/p25": 127.0, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 2009.3602294921875, "per_token_full_gradient_variance/max_squared_error": 3.3133766651153564, "per_token_full_gradient_variance/variance": 0.013006567023694515, "per_token_gradient_norm": 11.303940773010254, "per_token_gradient_norm/max": 688.485107421875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1852.250732421875, "per_token_policy_error_norm": 0.07006752490997314, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0583597868680954, "policy_entropy": 0.1417120099067688, "policy_entropy/max": 3.703125, "policy_entropy/median": 1.704692840576172e-05, "policy_entropy/min": 2.1871393585115584e-14, "policy_entropy/p25": 6.854534149169922e-07, "policy_entropy/p75": 0.0264892578125, "policy_entropy/var": 0.11193172633647919, "policy_error_vector_variance/max_squared_error": 2.009669065475464, "policy_error_vector_variance/metric": 0.07001856714487076, "policy_loss": -9.313225746154785e-09, "policy_loss/max": 1.6766760349273682, "policy_loss/median": 0.0, "policy_loss/min": -0.9680584669113159, "policy_loss/p25": -0.5588920712471008, "policy_loss/p75": 0.0, "policy_loss/var": 0.4734908640384674, "policy_sharpness": 7.76309871673584, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.48822021484375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.270991325378418, "reward": 0.8020833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16041666269302368, "rewards/accuracy_reward": 0.8020833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16041666269302368, "sentence_full_gradient_variance/max_squared_error": 2480.097412109375, "sentence_full_gradient_variance/metric": 241.6322021484375, "sentence_full_gradient_variance/p75": 247.40029907226562, "sentence_full_gradient_variance/p90": 761.5435791015625, "sentence_full_gradient_variance/p95": 1080.9603271484375, "sentence_full_gradient_variance/p99": 2391.041259765625, "state_level_variance/metric": 107.60334777832031, "state_level_variance_full_gradient/metric": 47.31189727783203, "step": 22 }, { "accuracy_reward": 0.6354166865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.23410087823867798, "action_level_variance/metric": 41.32252502441406, "action_level_variance_full_gradient/metric": 2131.8115234375, "adam_stats/lr_effective_max": 8.510369661962613e-05, "adam_stats/lr_effective_mean": 5.08656727848944e-10, "adam_stats/lr_effective_min": -8.670624811202288e-05, "adam_stats/m_t_max": 0.0054483069106936455, "adam_stats/m_t_mean": 5.0012393532083976e-11, "adam_stats/m_t_min": -0.006991209927946329, "adam_stats/v_t_max": 7.153471960918978e-05, "adam_stats/v_t_mean": 4.0849138308041155e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 2.0150647163391113, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": -0.46501490473747253, "advantages/p75": 0.24990005791187286, "advantages/var": 0.6312422156333923, "all_logprobs": -0.1408182829618454, "all_logprobs/max": 0.0, "all_logprobs/median": -1.430511474609375e-06, "all_logprobs/min": -9.375, "all_logprobs/p1": -2.421875, "all_logprobs/p10": -0.38671875, "all_logprobs/p25": -0.006622314453125, "all_logprobs/p5": -0.9296875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.22171783447265625, "clip_ratio": 0.0, "completion_length": 610.03125, "completion_length/correct": 471.7704772949219, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 401.0, "completion_length/correct/min": 287.0, "completion_length/correct/p25": 360.0, "completion_length/correct/p75": 523.0, "completion_length/correct/var": 31253.9140625, "completion_length/incorrect": 851.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 278.0, "completion_length/incorrect/p25": 743.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 83672.3515625, "completion_length/max": 1024.0, "completion_length/median": 457.0, "completion_length/min": 278.0, "completion_length/p25": 375.25, "completion_length/p75": 1024.0, "completion_length/var": 83352.4375, "epoch": 0.0184, "feature_vector_variance/max_squared_error": 116408.9921875, "feature_vector_variance/metric": 28282.53515625, "generated_tokens/total": 1304889.0, "grad_norm": 0.3763035237789154, "grouped_std_rewards": 0.2687790095806122, "learning_rate": 1.4240955347243754e-05, "loss": 0.0, "mean_logprobs": -0.1455078125, "mean_logprobs/var": 0.004302978515625, "num_completions/total": 2208, "per_sentence_gradient_norm": 10.122821807861328, "per_sentence_gradient_norm/max": 44.4835090637207, "per_sentence_gradient_norm/median": 4.2257256507873535, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 18.654958724975586, "per_sentence_gradient_norm/p85": 26.412616729736328, "per_sentence_gradient_norm/p90": 29.37175750732422, "per_sentence_gradient_norm/p95": 35.31779098510742, "per_sentence_gradient_norm/p99": 41.23414993286133, "per_sentence_gradient_norm/var": 152.82366943359375, "per_token_feature_norm": 167.9691619873047, "per_token_feature_norm/max": 338.0, "per_token_feature_norm/median": 163.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 132.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 2119.859619140625, "per_token_full_gradient_variance/max_squared_error": 7.320008277893066, "per_token_full_gradient_variance/variance": 0.011911559849977493, "per_token_gradient_norm": 10.674378395080566, "per_token_gradient_norm/max": 719.189208984375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1591.848876953125, "per_token_policy_error_norm": 0.0747484639286995, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06166977807879448, "policy_entropy": 0.15499523282051086, "policy_entropy/max": 3.28125, "policy_entropy/median": 2.181529998779297e-05, "policy_entropy/min": 5.790923296444817e-13, "policy_entropy/p25": 8.791685104370117e-07, "policy_entropy/p75": 0.04052734375, "policy_entropy/var": 0.1251021772623062, "policy_error_vector_variance/max_squared_error": 1.9983916282653809, "policy_error_vector_variance/metric": 0.07470566034317017, "policy_loss": 7.450580596923828e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -2.0150644779205322, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.46501490473747253, "policy_loss/var": 0.6312422156333923, "policy_sharpness": 7.663271427154541, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.1923828125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.83141040802002, "reward": 0.6354166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.23410087823867798, "rewards/accuracy_reward": 0.6354166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.23410087823867798, "sentence_full_gradient_variance/max_squared_error": 111577.5859375, "sentence_full_gradient_variance/metric": 2157.70263671875, "sentence_full_gradient_variance/p75": 290.36859130859375, "sentence_full_gradient_variance/p90": 5126.6474609375, "sentence_full_gradient_variance/p95": 6776.423828125, "sentence_full_gradient_variance/p99": 15616.642578125, "state_level_variance/metric": 134.99026489257812, "state_level_variance_full_gradient/metric": 25.891265869140625, "step": 23 }, { "accuracy_reward": 0.7083333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2087719589471817, "action_level_variance/metric": 21.49742889404297, "action_level_variance_full_gradient/metric": 211.1060028076172, "adam_stats/lr_effective_max": 8.550218626623973e-05, "adam_stats/lr_effective_mean": 5.919237322515869e-10, "adam_stats/lr_effective_min": -8.56921324157156e-05, "adam_stats/m_t_max": 0.005170505028218031, "adam_stats/m_t_mean": 5.625336654824231e-11, "adam_stats/m_t_min": -0.0061963400803506374, "adam_stats/v_t_max": 7.146409916458651e-05, "adam_stats/v_t_mean": 4.0922391343622966e-12, "adam_stats/v_t_min": 0.0, "advantages": -4.967053879312289e-09, "advantages/max": 1.6766761541366577, "advantages/median": 0.0, "advantages/min": -0.5588920712471008, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.15782417356967926, "all_logprobs": -0.13009078800678253, "all_logprobs/max": 0.0, "all_logprobs/median": -1.9073486328125e-06, "all_logprobs/min": -8.1875, "all_logprobs/p1": -2.21875, "all_logprobs/p10": -0.34765625, "all_logprobs/p25": -0.006744384765625, "all_logprobs/p5": -0.84375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.19581890106201172, "clip_ratio": 0.0, "completion_length": 488.0625, "completion_length/correct": 470.9264831542969, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 373.0, "completion_length/correct/min": 199.0, "completion_length/correct/p25": 263.25, "completion_length/correct/p75": 606.25, "completion_length/correct/var": 54241.29296875, "completion_length/incorrect": 529.6785888671875, "completion_length/incorrect/max": 738.0, "completion_length/incorrect/median": 494.0, "completion_length/incorrect/min": 331.0, "completion_length/incorrect/p25": 407.25, "completion_length/incorrect/p75": 660.5, "completion_length/incorrect/var": 19018.818359375, "completion_length/max": 1024.0, "completion_length/median": 456.0, "completion_length/min": 199.0, "completion_length/p25": 314.5, "completion_length/p75": 641.0, "completion_length/var": 44380.375, "epoch": 0.0192, "feature_vector_variance/max_squared_error": 107151.453125, "feature_vector_variance/metric": 25706.796875, "generated_tokens/total": 1351743.0, "grad_norm": 0.22930490970611572, "grouped_std_rewards": 0.07453560084104538, "learning_rate": 1.4122106946441953e-05, "loss": 0.0, "mean_logprobs": -0.130859375, "mean_logprobs/var": 0.0009002685546875, "num_completions/total": 2304, "per_sentence_gradient_norm": 2.314661979675293, "per_sentence_gradient_norm/max": 35.766944885253906, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 5.776341438293457, "per_sentence_gradient_norm/p90": 7.608890533447266, "per_sentence_gradient_norm/p95": 9.863965034484863, "per_sentence_gradient_norm/p99": 33.86786651611328, "per_sentence_gradient_norm/var": 47.43626022338867, "per_token_feature_norm": 161.44845581054688, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 156.0, "per_token_feature_norm/min": 65.0, "per_token_feature_norm/p25": 127.5, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 1857.1055908203125, "per_token_full_gradient_variance/max_squared_error": 4.095851898193359, "per_token_full_gradient_variance/variance": 0.005133291240781546, "per_token_gradient_norm": 3.566225051879883, "per_token_gradient_norm/max": 612.091552734375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 729.0700073242188, "per_token_policy_error_norm": 0.07054656744003296, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.058294136077165604, "policy_entropy": 0.14401781558990479, "policy_entropy/max": 3.625, "policy_entropy/median": 2.777576446533203e-05, "policy_entropy/min": 1.8540724511240114e-14, "policy_entropy/p25": 1.214444637298584e-06, "policy_entropy/p75": 0.043212890625, "policy_entropy/var": 0.10562777519226074, "policy_error_vector_variance/max_squared_error": 2.0047993659973145, "policy_error_vector_variance/metric": 0.0704445093870163, "policy_loss": 0.0, "policy_loss/max": 0.5588920712471008, "policy_loss/median": 0.0, "policy_loss/min": -1.6766761541366577, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.15782415866851807, "policy_sharpness": 7.661238670349121, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.17962646484375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.48950481414795, "reward": 0.7083333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2087719589471817, "rewards/accuracy_reward": 0.7083333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2087719589471817, "sentence_full_gradient_variance/max_squared_error": 10844.1796875, "sentence_full_gradient_variance/metric": 217.37359619140625, "sentence_full_gradient_variance/p75": 1.2535171508789062, "sentence_full_gradient_variance/p90": 467.11663818359375, "sentence_full_gradient_variance/p95": 701.587890625, "sentence_full_gradient_variance/p99": 2397.697265625, "state_level_variance/metric": 32.14596176147461, "state_level_variance_full_gradient/metric": 6.267586708068848, "step": 24 }, { "accuracy_reward": 0.8229166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14725877344608307, "action_level_variance/metric": 60.281654357910156, "action_level_variance_full_gradient/metric": 466.3555603027344, "adam_stats/lr_effective_max": 8.532890205970034e-05, "adam_stats/lr_effective_mean": 5.967539240536723e-10, "adam_stats/lr_effective_min": -8.177696872735396e-05, "adam_stats/m_t_max": 0.004722118843346834, "adam_stats/m_t_mean": 5.4828648565763416e-11, "adam_stats/m_t_min": -0.005916977301239967, "adam_stats/v_t_max": 7.140421075746417e-05, "adam_stats/v_t_mean": 4.090504844567189e-12, "adam_stats/v_t_min": 0.0, "advantages": -3.725290298461914e-09, "advantages/max": 1.0976732969284058, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": 0.0, "advantages/p75": 0.46501490473747253, "advantages/var": 0.6312683820724487, "all_logprobs": -0.1083558052778244, "all_logprobs/max": 0.0, "all_logprobs/median": -7.152557373046875e-07, "all_logprobs/min": -8.375, "all_logprobs/p1": -2.0625, "all_logprobs/p10": -0.251953125, "all_logprobs/p25": -0.001190185546875, "all_logprobs/p5": -0.697265625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.15876047313213348, "clip_ratio": 0.0, "completion_length": 716.9896240234375, "completion_length/correct": 684.2152099609375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 708.0, "completion_length/correct/min": 243.0, "completion_length/correct/p25": 508.0, "completion_length/correct/p75": 890.0, "completion_length/correct/var": 58435.48046875, "completion_length/incorrect": 869.2941284179688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1002.0, "completion_length/incorrect/min": 488.0, "completion_length/incorrect/p25": 806.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 41970.47265625, "completion_length/max": 1024.0, "completion_length/median": 738.0, "completion_length/min": 243.0, "completion_length/p25": 523.75, "completion_length/p75": 942.75, "completion_length/var": 60091.546875, "epoch": 0.02, "feature_vector_variance/max_squared_error": 105971.6484375, "feature_vector_variance/metric": 25482.080078125, "generated_tokens/total": 1420574.0, "grad_norm": 0.20178154110908508, "grouped_std_rewards": 0.27668970823287964, "learning_rate": 1.3995190528383292e-05, "loss": -0.0, "mean_logprobs": -0.1142578125, "mean_logprobs/var": 0.00139617919921875, "num_completions/total": 2400, "per_sentence_gradient_norm": 7.460555553436279, "per_sentence_gradient_norm/max": 46.18832015991211, "per_sentence_gradient_norm/median": 4.6427741050720215, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 10.424604415893555, "per_sentence_gradient_norm/p85": 13.476651191711426, "per_sentence_gradient_norm/p90": 15.831134796142578, "per_sentence_gradient_norm/p95": 28.866865158081055, "per_sentence_gradient_norm/p99": 44.49406051635742, "per_sentence_gradient_norm/var": 94.16178131103516, "per_token_feature_norm": 161.3905029296875, "per_token_feature_norm/max": 312.0, "per_token_feature_norm/median": 156.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 129.0, "per_token_feature_norm/p75": 191.0, "per_token_feature_norm/var": 1785.9212646484375, "per_token_full_gradient_variance/max_squared_error": 4.483343124389648, "per_token_full_gradient_variance/variance": 0.011300601996481419, "per_token_gradient_norm": 8.723588943481445, "per_token_gradient_norm/max": 800.3887939453125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1454.8114013671875, "per_token_policy_error_norm": 0.0596536360681057, "per_token_policy_error_norm/max": 1.9921875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04991930350661278, "policy_entropy": 0.11979874223470688, "policy_entropy/max": 2.546875, "policy_entropy/median": 1.1324882507324219e-05, "policy_entropy/min": 6.323830348264892e-13, "policy_entropy/p25": 5.848705768585205e-07, "policy_entropy/p75": 0.0096435546875, "policy_entropy/var": 0.08762868493795395, "policy_error_vector_variance/max_squared_error": 2.0002832412719727, "policy_error_vector_variance/metric": 0.059611618518829346, "policy_loss": 1.2417634698280722e-09, "policy_loss/max": 2.560988187789917, "policy_loss/median": 0.0, "policy_loss/min": -1.0976732969284058, "policy_loss/p25": -0.46501490473747253, "policy_loss/p75": 0.0, "policy_loss/var": 0.6312683820724487, "policy_sharpness": 7.9879679679870605, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.24609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.330304145812988, "reward": 0.8229166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14725877344608307, "rewards/accuracy_reward": 0.8229166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14725877344608307, "sentence_full_gradient_variance/max_squared_error": 15954.814453125, "sentence_full_gradient_variance/metric": 480.04217529296875, "sentence_full_gradient_variance/p75": 132.13140869140625, "sentence_full_gradient_variance/p90": 510.787841796875, "sentence_full_gradient_variance/p95": 1687.6802978515625, "sentence_full_gradient_variance/p99": 13876.0439453125, "state_level_variance/metric": 44.000244140625, "state_level_variance_full_gradient/metric": 13.68653392791748, "step": 25 }, { "accuracy_reward": 0.6145833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.23936404287815094, "action_level_variance/metric": 208.10662841796875, "action_level_variance_full_gradient/metric": 1778.1307373046875, "adam_stats/lr_effective_max": 8.193542453227565e-05, "adam_stats/lr_effective_mean": 7.139989155469095e-10, "adam_stats/lr_effective_min": -8.162394806277007e-05, "adam_stats/m_t_max": 0.005372953601181507, "adam_stats/m_t_mean": 7.511026522966091e-11, "adam_stats/m_t_min": -0.007327232975512743, "adam_stats/v_t_max": 7.173357880674303e-05, "adam_stats/v_t_mean": 4.109489658288279e-12, "adam_stats/v_t_min": 0.0, "advantages": 2.4835269396561444e-09, "advantages/max": 3.7485008239746094, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.6311296820640564, "all_logprobs": -0.1526039093732834, "all_logprobs/max": 0.0, "all_logprobs/median": -4.649162292480469e-06, "all_logprobs/min": -9.1875, "all_logprobs/p1": -2.564687728881836, "all_logprobs/p10": -0.4296875, "all_logprobs/p25": -0.01263427734375, "all_logprobs/p5": -0.98046875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.24929948151111603, "clip_ratio": 0.0, "completion_length": 651.9479370117188, "completion_length/correct": 524.4576416015625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 522.0, "completion_length/correct/min": 203.0, "completion_length/correct/p25": 432.5, "completion_length/correct/p75": 599.5, "completion_length/correct/var": 22100.044921875, "completion_length/incorrect": 855.2432861328125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 331.0, "completion_length/incorrect/p25": 686.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 58387.91796875, "completion_length/max": 1024.0, "completion_length/median": 588.0, "completion_length/min": 203.0, "completion_length/p25": 472.5, "completion_length/p75": 987.75, "completion_length/var": 61809.6015625, "epoch": 0.0208, "feature_vector_variance/max_squared_error": 95025.3046875, "feature_vector_variance/metric": 27124.16015625, "generated_tokens/total": 1483161.0, "grad_norm": 0.34277400374412537, "grouped_std_rewards": 0.19953560829162598, "learning_rate": 1.3860360721173195e-05, "loss": -0.0, "mean_logprobs": -0.14453125, "mean_logprobs/var": 0.004608154296875, "num_completions/total": 2496, "per_sentence_gradient_norm": 7.703402519226074, "per_sentence_gradient_norm/max": 108.4537353515625, "per_sentence_gradient_norm/median": 3.1335251331329346, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 8.258855819702148, "per_sentence_gradient_norm/p85": 10.651653289794922, "per_sentence_gradient_norm/p90": 12.302282333374023, "per_sentence_gradient_norm/p95": 28.780113220214844, "per_sentence_gradient_norm/p99": 74.48090362548828, "per_sentence_gradient_norm/var": 250.1966552734375, "per_token_feature_norm": 166.57264709472656, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 159.0, "per_token_feature_norm/min": 65.0, "per_token_feature_norm/p25": 130.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 2167.397705078125, "per_token_full_gradient_variance/max_squared_error": 13.127923011779785, "per_token_full_gradient_variance/variance": 0.012456374242901802, "per_token_gradient_norm": 7.0107927322387695, "per_token_gradient_norm/max": 1171.523681640625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1677.6787109375, "per_token_policy_error_norm": 0.07990137487649918, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06604956090450287, "policy_entropy": 0.16889692842960358, "policy_entropy/max": 3.65625, "policy_entropy/median": 6.532669067382812e-05, "policy_entropy/min": 1.1057821325266559e-13, "policy_entropy/p25": 1.5795230865478516e-06, "policy_entropy/p75": 0.07373046875, "policy_entropy/var": 0.1406596601009369, "policy_error_vector_variance/max_squared_error": 2.0191195011138916, "policy_error_vector_variance/metric": 0.07967043668031693, "policy_loss": -8.692344621863413e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -3.7485010623931885, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.0, "policy_loss/var": 0.6311296820640564, "policy_sharpness": 7.385436058044434, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.565673828125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.591129302978516, "reward": 0.6145833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.23936404287815094, "rewards/accuracy_reward": 0.6145833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.23936404287815094, "sentence_full_gradient_variance/max_squared_error": 46647.7265625, "sentence_full_gradient_variance/metric": 1786.580078125, "sentence_full_gradient_variance/p75": 206.43194580078125, "sentence_full_gradient_variance/p90": 1909.7569580078125, "sentence_full_gradient_variance/p95": 9671.037109375, "sentence_full_gradient_variance/p99": 35563.8515625, "state_level_variance/metric": 62.988563537597656, "state_level_variance_full_gradient/metric": 8.449564933776855, "step": 26 }, { "accuracy_reward": 0.7291666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19956141710281372, "action_level_variance/metric": 199.65667724609375, "action_level_variance_full_gradient/metric": 1605.9638671875, "adam_stats/lr_effective_max": 8.303529466502368e-05, "adam_stats/lr_effective_mean": 4.897333094611156e-10, "adam_stats/lr_effective_min": -8.275121945189312e-05, "adam_stats/m_t_max": 0.004524379037320614, "adam_stats/m_t_mean": 4.2173230185849064e-11, "adam_stats/m_t_min": -0.005544704385101795, "adam_stats/v_t_max": 7.177205407060683e-05, "adam_stats/v_t_mean": 4.1517076235231265e-12, "adam_stats/v_t_min": 0.0, "advantages": -7.450580596923828e-09, "advantages/max": 1.249750018119812, "advantages/median": 0.24990005791187286, "advantages/min": -3.7485008239746094, "advantages/p25": -0.7498500347137451, "advantages/p75": 0.46501490473747253, "advantages/var": 0.7890505194664001, "all_logprobs": -0.15000031888484955, "all_logprobs/max": 0.0, "all_logprobs/median": -4.76837158203125e-06, "all_logprobs/min": -11.3125, "all_logprobs/p1": -2.46875, "all_logprobs/p10": -0.416015625, "all_logprobs/p25": -0.01531982421875, "all_logprobs/p5": -0.984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.23772463202476501, "clip_ratio": 0.0, "completion_length": 570.1979370117188, "completion_length/correct": 529.9000244140625, "completion_length/correct/max": 977.0, "completion_length/correct/median": 490.0, "completion_length/correct/min": 292.0, "completion_length/correct/p25": 429.75, "completion_length/correct/p75": 612.0, "completion_length/correct/var": 23020.29296875, "completion_length/incorrect": 678.6923217773438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 650.0, "completion_length/incorrect/min": 336.0, "completion_length/incorrect/p25": 569.75, "completion_length/incorrect/p75": 751.5, "completion_length/incorrect/var": 41653.26171875, "completion_length/max": 1024.0, "completion_length/median": 530.0, "completion_length/min": 292.0, "completion_length/p25": 441.5, "completion_length/p75": 666.0, "completion_length/var": 32099.5078125, "epoch": 0.0216, "feature_vector_variance/max_squared_error": 96910.6640625, "feature_vector_variance/metric": 28109.673828125, "generated_tokens/total": 1537900.0, "grad_norm": 0.46730807423591614, "grouped_std_rewards": 0.3324463367462158, "learning_rate": 1.3717781794162813e-05, "loss": 0.0, "mean_logprobs": -0.1455078125, "mean_logprobs/var": 0.002044677734375, "num_completions/total": 2592, "per_sentence_gradient_norm": 12.619086265563965, "per_sentence_gradient_norm/max": 81.3091812133789, "per_sentence_gradient_norm/median": 10.130807876586914, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 4.071462631225586, "per_sentence_gradient_norm/p75": 14.656478881835938, "per_sentence_gradient_norm/p85": 16.953842163085938, "per_sentence_gradient_norm/p90": 23.895843505859375, "per_sentence_gradient_norm/p95": 38.083961486816406, "per_sentence_gradient_norm/p99": 80.82382202148438, "per_sentence_gradient_norm/var": 238.41014099121094, "per_token_feature_norm": 168.15846252441406, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 163.0, "per_token_feature_norm/min": 63.5, "per_token_feature_norm/p25": 131.0, "per_token_feature_norm/p75": 200.0, "per_token_feature_norm/var": 2182.40234375, "per_token_full_gradient_variance/max_squared_error": 8.853758811950684, "per_token_full_gradient_variance/variance": 0.021385453641414642, "per_token_gradient_norm": 14.256152153015137, "per_token_gradient_norm/max": 1312.56103515625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2818.362060546875, "per_token_policy_error_norm": 0.0799131765961647, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06654956191778183, "policy_entropy": 0.1636987328529358, "policy_entropy/max": 3.6875, "policy_entropy/median": 6.4849853515625e-05, "policy_entropy/min": 1.438849039914203e-13, "policy_entropy/p25": 1.2367963790893555e-06, "policy_entropy/p75": 0.08203125, "policy_entropy/var": 0.12785513699054718, "policy_error_vector_variance/max_squared_error": 2.0114572048187256, "policy_error_vector_variance/metric": 0.07986379414796829, "policy_loss": 9.934107758624577e-09, "policy_loss/max": 3.7485010623931885, "policy_loss/median": -0.24990005791187286, "policy_loss/min": -1.2497501373291016, "policy_loss/p25": -0.46501484513282776, "policy_loss/p75": 0.7498500347137451, "policy_loss/var": 0.7890505194664001, "policy_sharpness": 7.385828971862793, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.556610107421875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.399796485900879, "reward": 0.7291666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19956141710281372, "rewards/accuracy_reward": 0.7291666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19956141710281372, "sentence_full_gradient_variance/max_squared_error": 43843.8515625, "sentence_full_gradient_variance/metric": 1647.2431640625, "sentence_full_gradient_variance/p75": 1106.3785400390625, "sentence_full_gradient_variance/p90": 3706.75, "sentence_full_gradient_variance/p95": 7561.7509765625, "sentence_full_gradient_variance/p99": 17366.908203125, "state_level_variance/metric": 58.498260498046875, "state_level_variance_full_gradient/metric": 41.27940368652344, "step": 27 }, { "accuracy_reward": 0.8125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1539473831653595, "action_level_variance/metric": 26.233802795410156, "action_level_variance_full_gradient/metric": 448.97113037109375, "adam_stats/lr_effective_max": 8.338541374541819e-05, "adam_stats/lr_effective_mean": 4.523960095426105e-10, "adam_stats/lr_effective_min": -8.026746945688501e-05, "adam_stats/m_t_max": 0.004060735460370779, "adam_stats/m_t_mean": 3.1742910383547596e-11, "adam_stats/m_t_min": -0.004933776333928108, "adam_stats/v_t_max": 7.170060416683555e-05, "adam_stats/v_t_mean": 4.148447210750028e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.36585545539855957, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.15780235826969147, "all_logprobs": -0.1517612338066101, "all_logprobs/max": 0.0, "all_logprobs/median": -7.987022399902344e-06, "all_logprobs/min": -9.5, "all_logprobs/p1": -2.484375, "all_logprobs/p10": -0.4296875, "all_logprobs/p25": -0.0181884765625, "all_logprobs/p5": -0.9765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.23752987384796143, "clip_ratio": 0.0, "completion_length": 532.4375, "completion_length/correct": 426.974365234375, "completion_length/correct/max": 953.0, "completion_length/correct/median": 323.0, "completion_length/correct/min": 206.0, "completion_length/correct/p25": 270.25, "completion_length/correct/p75": 529.25, "completion_length/correct/var": 42608.1796875, "completion_length/incorrect": 989.4444580078125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 402.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 21493.556640625, "completion_length/max": 1024.0, "completion_length/median": 402.0, "completion_length/min": 206.0, "completion_length/p25": 279.75, "completion_length/p75": 780.0, "completion_length/var": 87086.0078125, "epoch": 0.0224, "feature_vector_variance/max_squared_error": 92150.84375, "feature_vector_variance/metric": 27259.12890625, "generated_tokens/total": 1589014.0, "grad_norm": 0.1652439534664154, "grouped_std_rewards": 0.056927502155303955, "learning_rate": 1.3567627457812107e-05, "loss": 0.0, "mean_logprobs": -0.134765625, "mean_logprobs/var": 0.002716064453125, "num_completions/total": 2688, "per_sentence_gradient_norm": 1.896836519241333, "per_sentence_gradient_norm/max": 46.83121871948242, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 5.30265998840332, "per_sentence_gradient_norm/p90": 6.641767501831055, "per_sentence_gradient_norm/p95": 7.862387657165527, "per_sentence_gradient_norm/p99": 40.00543212890625, "per_sentence_gradient_norm/var": 43.032386779785156, "per_token_feature_norm": 166.17807006835938, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 161.0, "per_token_feature_norm/min": 61.75, "per_token_feature_norm/p25": 131.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 2043.559814453125, "per_token_full_gradient_variance/max_squared_error": 5.157991886138916, "per_token_full_gradient_variance/variance": 0.0032230825163424015, "per_token_gradient_norm": 2.0713133811950684, "per_token_gradient_norm/max": 856.8505859375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 480.3925476074219, "per_token_policy_error_norm": 0.08045492321252823, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06665843725204468, "policy_entropy": 0.16741250455379486, "policy_entropy/max": 3.421875, "policy_entropy/median": 0.0001087188720703125, "policy_entropy/min": 1.0080825063596421e-13, "policy_entropy/p25": 1.30385160446167e-06, "policy_entropy/p75": 0.09423828125, "policy_entropy/var": 0.12818798422813416, "policy_error_vector_variance/max_squared_error": 2.0207762718200684, "policy_error_vector_variance/metric": 0.08037261664867401, "policy_loss": 0.0, "policy_loss/max": 2.560988187789917, "policy_loss/median": 0.0, "policy_loss/min": -0.36585548520088196, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.15780235826969147, "policy_sharpness": 7.305532455444336, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.30792236328125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.498984336853027, "reward": 0.8125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1539473831653595, "rewards/accuracy_reward": 0.8125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1539473831653595, "sentence_full_gradient_variance/max_squared_error": 36865.3203125, "sentence_full_gradient_variance/metric": 459.7430114746094, "sentence_full_gradient_variance/p75": 2.154376268386841, "sentence_full_gradient_variance/p90": 479.6233215332031, "sentence_full_gradient_variance/p95": 648.5706787109375, "sentence_full_gradient_variance/p99": 2490.61083984375, "state_level_variance/metric": 21.587932586669922, "state_level_variance_full_gradient/metric": 10.771881103515625, "step": 28 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1894737035036087, "action_level_variance/metric": 62.87006759643555, "action_level_variance_full_gradient/metric": 346.640625, "adam_stats/lr_effective_max": 7.972597086336464e-05, "adam_stats/lr_effective_mean": 4.4290934808621785e-10, "adam_stats/lr_effective_min": -8.013436308829114e-05, "adam_stats/m_t_max": 0.00379885733127594, "adam_stats/m_t_mean": 2.741546226148195e-11, "adam_stats/m_t_min": -0.0048737479373812675, "adam_stats/v_t_max": 7.164768612710759e-05, "adam_stats/v_t_mean": 4.148889998917271e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.2417634698280722e-09, "advantages/max": 3.7485008239746094, "advantages/median": 0.0, "advantages/min": -1.6766761541366577, "advantages/p25": -0.062475014477968216, "advantages/p75": 0.1397230178117752, "advantages/var": 0.47342151403427124, "all_logprobs": -0.14113196730613708, "all_logprobs/max": 0.0, "all_logprobs/median": -6.9141387939453125e-06, "all_logprobs/min": -10.6875, "all_logprobs/p1": -2.34375, "all_logprobs/p10": -0.392578125, "all_logprobs/p25": -0.0142822265625, "all_logprobs/p5": -0.8984375, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.21134817600250244, "clip_ratio": 0.0, "completion_length": 509.54168701171875, "completion_length/correct": 387.1111145019531, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 323.0, "completion_length/correct/min": 160.0, "completion_length/correct/p25": 268.75, "completion_length/correct/p75": 373.5, "completion_length/correct/var": 42872.21484375, "completion_length/incorrect": 876.8333740234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 249.0, "completion_length/incorrect/p25": 914.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 73156.9296875, "completion_length/max": 1024.0, "completion_length/median": 341.0, "completion_length/min": 160.0, "completion_length/p25": 286.5, "completion_length/p75": 802.5, "completion_length/var": 95194.0703125, "epoch": 0.0232, "feature_vector_variance/max_squared_error": 92127.4609375, "feature_vector_variance/metric": 25083.552734375, "generated_tokens/total": 1637930.0, "grad_norm": 0.2396669089794159, "grouped_std_rewards": 0.19598786532878876, "learning_rate": 1.3410080652050414e-05, "loss": 0.0, "mean_logprobs": -0.140625, "mean_logprobs/var": 0.00191497802734375, "num_completions/total": 2784, "per_sentence_gradient_norm": 6.462464809417725, "per_sentence_gradient_norm/max": 70.21163177490234, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 9.848140716552734, "per_sentence_gradient_norm/p85": 12.825509071350098, "per_sentence_gradient_norm/p90": 19.764202117919922, "per_sentence_gradient_norm/p95": 24.59998321533203, "per_sentence_gradient_norm/p99": 35.35793685913086, "per_sentence_gradient_norm/var": 106.49417877197266, "per_token_feature_norm": 160.451171875, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 71.5, "per_token_feature_norm/p25": 125.0, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 2070.059814453125, "per_token_full_gradient_variance/max_squared_error": 12.603743553161621, "per_token_full_gradient_variance/variance": 0.014354047365486622, "per_token_gradient_norm": 8.721025466918945, "per_token_gradient_norm/max": 1172.4315185546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1794.64111328125, "per_token_policy_error_norm": 0.07628726959228516, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0628119632601738, "policy_entropy": 0.15783651173114777, "policy_entropy/max": 2.84375, "policy_entropy/median": 9.393692016601562e-05, "policy_entropy/min": 1.6910917111090384e-12, "policy_entropy/p25": 2.3096799850463867e-06, "policy_entropy/p75": 0.080078125, "policy_entropy/var": 0.11571070551872253, "policy_error_vector_variance/max_squared_error": 2.0082967281341553, "policy_error_vector_variance/metric": 0.07620260864496231, "policy_loss": -4.967053879312289e-09, "policy_loss/max": 1.6766760349273682, "policy_loss/median": 0.0, "policy_loss/min": -3.7485008239746094, "policy_loss/p25": -0.139723002910614, "policy_loss/p75": 0.062475014477968216, "policy_loss/var": 0.47342148423194885, "policy_sharpness": 7.366631031036377, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.558593511581421, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.249780654907227, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.1894737035036087, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1894737035036087, "sentence_full_gradient_variance/max_squared_error": 8913.1201171875, "sentence_full_gradient_variance/metric": 381.0989685058594, "sentence_full_gradient_variance/p75": 114.47064208984375, "sentence_full_gradient_variance/p90": 925.4273681640625, "sentence_full_gradient_variance/p95": 1600.2421875, "sentence_full_gradient_variance/p99": 6142.93603515625, "state_level_variance/metric": 55.7330322265625, "state_level_variance_full_gradient/metric": 34.458351135253906, "step": 29 }, { "accuracy_reward": 0.8229166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14725877344608307, "action_level_variance/metric": 101.2880859375, "action_level_variance_full_gradient/metric": 1199.399169921875, "adam_stats/lr_effective_max": 8.214217814384028e-05, "adam_stats/lr_effective_mean": 5.371572631140964e-10, "adam_stats/lr_effective_min": -8.163668098859489e-05, "adam_stats/m_t_max": 0.0026627208571881056, "adam_stats/m_t_mean": 3.29645651053756e-11, "adam_stats/m_t_min": -0.002726216334849596, "adam_stats/v_t_max": 7.185164577094838e-05, "adam_stats/v_t_mean": 4.162139816826782e-12, "adam_stats/v_t_min": 0.0, "advantages": -9.934107758624577e-09, "advantages/max": 1.0976732969284058, "advantages/median": 0.24990005791187286, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.36585545539855957, "advantages/var": 0.7889966368675232, "all_logprobs": -0.12932848930358887, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-06, "all_logprobs/min": -9.1875, "all_logprobs/p1": -2.3125, "all_logprobs/p10": -0.337890625, "all_logprobs/p25": -0.00372314453125, "all_logprobs/p5": -0.84765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.20210224390029907, "clip_ratio": 0.0, "completion_length": 692.4271240234375, "completion_length/correct": 681.4937133789062, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 659.0, "completion_length/correct/min": 307.0, "completion_length/correct/p25": 500.5, "completion_length/correct/p75": 889.0, "completion_length/correct/var": 48822.71875, "completion_length/incorrect": 743.2352905273438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 696.0, "completion_length/incorrect/min": 359.0, "completion_length/incorrect/p25": 590.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 51516.6953125, "completion_length/max": 1024.0, "completion_length/median": 666.0, "completion_length/min": 307.0, "completion_length/p25": 500.75, "completion_length/p75": 894.25, "completion_length/var": 49323.86328125, "epoch": 0.024, "feature_vector_variance/max_squared_error": 102696.5, "feature_vector_variance/metric": 27882.25, "generated_tokens/total": 1704403.0, "grad_norm": 0.45823365449905396, "grouped_std_rewards": 0.3001876771450043, "learning_rate": 1.3245333323392335e-05, "loss": -0.0, "mean_logprobs": -0.1318359375, "mean_logprobs/var": 0.00579833984375, "num_completions/total": 2880, "per_sentence_gradient_norm": 11.259275436401367, "per_sentence_gradient_norm/max": 63.87250518798828, "per_sentence_gradient_norm/median": 4.34694766998291, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 2.902448892593384, "per_sentence_gradient_norm/p75": 9.309327125549316, "per_sentence_gradient_norm/p85": 30.306917190551758, "per_sentence_gradient_norm/p90": 35.89203643798828, "per_sentence_gradient_norm/p95": 43.561641693115234, "per_sentence_gradient_norm/p99": 57.76420593261719, "per_sentence_gradient_norm/var": 233.0945587158203, "per_token_feature_norm": 168.1956329345703, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 163.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 132.0, "per_token_feature_norm/p75": 200.0, "per_token_feature_norm/var": 2001.20361328125, "per_token_full_gradient_variance/max_squared_error": 7.651361465454102, "per_token_full_gradient_variance/variance": 0.015214016661047935, "per_token_gradient_norm": 11.346871376037598, "per_token_gradient_norm/max": 1367.5584716796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2036.81689453125, "per_token_policy_error_norm": 0.06945203989744186, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05791051313281059, "policy_entropy": 0.14212100207805634, "policy_entropy/max": 2.953125, "policy_entropy/median": 1.8477439880371094e-05, "policy_entropy/min": 4.3021142204224816e-15, "policy_entropy/p25": 9.08970832824707e-07, "policy_entropy/p75": 0.0264892578125, "policy_entropy/var": 0.11169590055942535, "policy_error_vector_variance/max_squared_error": 2.0137434005737305, "policy_error_vector_variance/metric": 0.06940463185310364, "policy_loss": -4.967053879312289e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": -0.24990005791187286, "policy_loss/min": -1.0976734161376953, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": 0.0, "policy_loss/var": 0.7889966368675232, "policy_sharpness": 7.76448392868042, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.48822021484375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.353423118591309, "reward": 0.8229166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14725877344608307, "rewards/accuracy_reward": 0.8229166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14725877344608307, "sentence_full_gradient_variance/max_squared_error": 67198.5234375, "sentence_full_gradient_variance/metric": 1233.3837890625, "sentence_full_gradient_variance/p75": 313.06597900390625, "sentence_full_gradient_variance/p90": 1277.726806640625, "sentence_full_gradient_variance/p95": 2377.984375, "sentence_full_gradient_variance/p99": 16131.1875, "state_level_variance/metric": 162.85067749023438, "state_level_variance_full_gradient/metric": 33.984779357910156, "step": 30 }, { "accuracy_reward": 0.7708333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17850877344608307, "action_level_variance/metric": 190.3406982421875, "action_level_variance_full_gradient/metric": 1315.802001953125, "adam_stats/lr_effective_max": 7.677285611862317e-05, "adam_stats/lr_effective_mean": 4.749031168316264e-10, "adam_stats/lr_effective_min": -7.652540080016479e-05, "adam_stats/m_t_max": 0.0024215567391365767, "adam_stats/m_t_mean": 4.5410505217224895e-11, "adam_stats/m_t_min": -0.0026473812758922577, "adam_stats/v_t_max": 7.178355008363724e-05, "adam_stats/v_t_mean": 4.1677143507168335e-12, "adam_stats/v_t_min": 0.0, "advantages": -1.2417634698280722e-09, "advantages/max": 3.7485008239746094, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": 0.0, "advantages/p75": 0.36585545539855957, "advantages/var": 0.47339966893196106, "all_logprobs": -0.15305866301059723, "all_logprobs/max": 0.0, "all_logprobs/median": -4.172325134277344e-06, "all_logprobs/min": -10.0, "all_logprobs/p1": -2.421875, "all_logprobs/p10": -0.453125, "all_logprobs/p25": -0.0198974609375, "all_logprobs/p5": -0.984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.23305422067642212, "clip_ratio": 0.0, "completion_length": 562.7916870117188, "completion_length/correct": 540.2162475585938, "completion_length/correct/max": 961.0, "completion_length/correct/median": 521.0, "completion_length/correct/min": 323.0, "completion_length/correct/p25": 453.0, "completion_length/correct/p75": 587.75, "completion_length/correct/var": 15119.76171875, "completion_length/incorrect": 638.727294921875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 589.0, "completion_length/incorrect/min": 360.0, "completion_length/incorrect/p25": 397.0, "completion_length/incorrect/p75": 818.0, "completion_length/incorrect/var": 56051.2578125, "completion_length/max": 1024.0, "completion_length/median": 528.0, "completion_length/min": 323.0, "completion_length/p25": 448.0, "completion_length/p75": 643.75, "completion_length/var": 25740.9453125, "epoch": 0.0248, "feature_vector_variance/max_squared_error": 96536.8125, "feature_vector_variance/metric": 26986.755859375, "generated_tokens/total": 1758431.0, "grad_norm": 0.33889520168304443, "grouped_std_rewards": 0.17837977409362793, "learning_rate": 1.3073586191080456e-05, "loss": 0.0, "mean_logprobs": -0.1513671875, "mean_logprobs/var": 0.002471923828125, "num_completions/total": 2976, "per_sentence_gradient_norm": 7.9315948486328125, "per_sentence_gradient_norm/max": 99.26710510253906, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 9.75379753112793, "per_sentence_gradient_norm/p85": 12.813129425048828, "per_sentence_gradient_norm/p90": 15.063289642333984, "per_sentence_gradient_norm/p95": 29.21175765991211, "per_sentence_gradient_norm/p99": 82.75471496582031, "per_sentence_gradient_norm/var": 246.5806884765625, "per_token_feature_norm": 165.80795288085938, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 158.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 129.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 2153.197998046875, "per_token_full_gradient_variance/max_squared_error": 10.402689933776855, "per_token_full_gradient_variance/variance": 0.014168127439916134, "per_token_gradient_norm": 7.6569671630859375, "per_token_gradient_norm/max": 1469.6466064453125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2001.0780029296875, "per_token_policy_error_norm": 0.0823395773768425, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06733652949333191, "policy_entropy": 0.16931982338428497, "policy_entropy/max": 3.75, "policy_entropy/median": 5.817413330078125e-05, "policy_entropy/min": 1.1032841307212493e-15, "policy_entropy/p25": 1.0728836059570312e-06, "policy_entropy/p75": 0.1015625, "policy_entropy/var": 0.1265062540769577, "policy_error_vector_variance/max_squared_error": 2.0057454109191895, "policy_error_vector_variance/metric": 0.08227182924747467, "policy_loss": 0.0, "policy_loss/max": 2.560988187789917, "policy_loss/median": 0.0, "policy_loss/min": -3.7485008239746094, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": 0.0, "policy_loss/var": 0.47339966893196106, "policy_sharpness": 7.328464984893799, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.37005615234375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.603972434997559, "reward": 0.7708333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17850877344608307, "rewards/accuracy_reward": 0.7708333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17850877344608307, "sentence_full_gradient_variance/max_squared_error": 28863.8359375, "sentence_full_gradient_variance/metric": 1321.3616943359375, "sentence_full_gradient_variance/p75": 471.7676696777344, "sentence_full_gradient_variance/p90": 2605.96630859375, "sentence_full_gradient_variance/p95": 10726.0927734375, "sentence_full_gradient_variance/p99": 14786.8984375, "state_level_variance/metric": 78.68124389648438, "state_level_variance_full_gradient/metric": 5.559590816497803, "step": 31 }, { "accuracy_reward": 0.8541666865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.12587720155715942, "action_level_variance/metric": 411.52520751953125, "action_level_variance_full_gradient/metric": 1186.1033935546875, "adam_stats/lr_effective_max": 7.608313171658665e-05, "adam_stats/lr_effective_mean": 2.93481183799571e-10, "adam_stats/lr_effective_min": -7.640783587703481e-05, "adam_stats/m_t_max": 0.0028965643141418695, "adam_stats/m_t_mean": 4.0811212048685874e-11, "adam_stats/m_t_min": -0.0035667254123836756, "adam_stats/v_t_max": 7.18519659130834e-05, "adam_stats/v_t_mean": 4.173270236329518e-12, "adam_stats/v_t_min": 0.0, "advantages": 4.967053879312289e-09, "advantages/max": 0.9680583477020264, "advantages/median": 0.24990005791187286, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.36585545539855957, "advantages/var": 0.6312209367752075, "all_logprobs": -0.22958879172801971, "all_logprobs/max": 0.0, "all_logprobs/median": -6.246566772460938e-05, "all_logprobs/min": -10.3125, "all_logprobs/p1": -3.028437614440918, "all_logprobs/p10": -0.7578125, "all_logprobs/p25": -0.080078125, "all_logprobs/p5": -1.46875, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.3867851197719574, "clip_ratio": 0.0, "completion_length": 562.6979370117188, "completion_length/correct": 514.5365600585938, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 460.0, "completion_length/correct/min": 194.0, "completion_length/correct/p25": 350.75, "completion_length/correct/p75": 633.5, "completion_length/correct/var": 45098.44921875, "completion_length/incorrect": 844.7857666015625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 314.0, "completion_length/incorrect/p25": 690.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 72933.875, "completion_length/max": 1024.0, "completion_length/median": 514.0, "completion_length/min": 194.0, "completion_length/p25": 360.75, "completion_length/p75": 686.75, "completion_length/var": 62161.515625, "epoch": 0.0256, "feature_vector_variance/max_squared_error": 105369.0078125, "feature_vector_variance/metric": 28696.833984375, "generated_tokens/total": 1812450.0, "grad_norm": 0.4089006781578064, "grouped_std_rewards": 0.25184595584869385, "learning_rate": 1.2895048502539883e-05, "loss": 0.0, "mean_logprobs": -0.2080078125, "mean_logprobs/var": 0.0145263671875, "num_completions/total": 3072, "per_sentence_gradient_norm": 16.593595504760742, "per_sentence_gradient_norm/max": 144.07081604003906, "per_sentence_gradient_norm/median": 5.668575763702393, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 24.746423721313477, "per_sentence_gradient_norm/p85": 31.596757888793945, "per_sentence_gradient_norm/p90": 39.74261474609375, "per_sentence_gradient_norm/p95": 57.39695739746094, "per_sentence_gradient_norm/p99": 114.96392822265625, "per_sentence_gradient_norm/var": 656.8717651367188, "per_token_feature_norm": 175.69354248046875, "per_token_feature_norm/max": 336.0, "per_token_feature_norm/median": 167.0, "per_token_feature_norm/min": 54.75, "per_token_feature_norm/p25": 134.0, "per_token_feature_norm/p75": 211.0, "per_token_feature_norm/var": 2837.341064453125, "per_token_full_gradient_variance/max_squared_error": 13.787522315979004, "per_token_full_gradient_variance/variance": 0.03598305955529213, "per_token_gradient_norm": 21.36408805847168, "per_token_gradient_norm/max": 1336.281982421875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4572.27001953125, "per_token_policy_error_norm": 0.11472029983997345, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.01171875, "per_token_policy_error_norm/var": 0.09027650952339172, "policy_entropy": 0.25432124733924866, "policy_entropy/max": 3.75, "policy_entropy/median": 0.00070953369140625, "policy_entropy/min": 1.8047785488306545e-12, "policy_entropy/p25": 6.16908073425293e-06, "policy_entropy/p75": 0.2958984375, "policy_entropy/var": 0.23486045002937317, "policy_error_vector_variance/max_squared_error": 2.0189855098724365, "policy_error_vector_variance/metric": 0.11452531069517136, "policy_loss": 0.0, "policy_loss/max": 3.7485008239746094, "policy_loss/median": -0.24990004301071167, "policy_loss/min": -0.9680584669113159, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": 0.0, "policy_loss/var": 0.6312209367752075, "policy_sharpness": 6.605530261993408, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.186328172683716, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 15.750519752502441, "reward": 0.8541666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.12587720155715942, "rewards/accuracy_reward": 0.8541666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.12587720155715942, "sentence_full_gradient_variance/max_squared_error": 36596.11328125, "sentence_full_gradient_variance/metric": 1208.075439453125, "sentence_full_gradient_variance/p75": 388.218505859375, "sentence_full_gradient_variance/p90": 779.8992919921875, "sentence_full_gradient_variance/p95": 992.3577880859375, "sentence_full_gradient_variance/p99": 30160.966796875, "state_level_variance/metric": 317.06927490234375, "state_level_variance_full_gradient/metric": 21.97218894958496, "step": 32 }, { "accuracy_reward": 0.5833333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24561403691768646, "action_level_variance/metric": 141.27464294433594, "action_level_variance_full_gradient/metric": 192.1442108154297, "adam_stats/lr_effective_max": 7.553628529421985e-05, "adam_stats/lr_effective_mean": 1.2432070650714167e-10, "adam_stats/lr_effective_min": -7.438778266077861e-05, "adam_stats/m_t_max": 0.0022620591334998608, "adam_stats/m_t_mean": 3.232131229324864e-11, "adam_stats/m_t_min": -0.002544769551604986, "adam_stats/v_t_max": 7.182437548181042e-05, "adam_stats/v_t_mean": 4.1721331250910154e-12, "adam_stats/v_t_min": 0.0, "advantages": 9.934107758624577e-09, "advantages/max": 2.560988187789917, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": -0.09146386384963989, "advantages/p75": 0.5588920712471008, "advantages/var": 0.6312237977981567, "all_logprobs": -0.20318852365016937, "all_logprobs/max": 0.0, "all_logprobs/median": -1.33514404296875e-05, "all_logprobs/min": -12.9375, "all_logprobs/p1": -2.90625, "all_logprobs/p10": -0.65625, "all_logprobs/p25": -0.045166015625, "all_logprobs/p5": -1.296875, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.3467327058315277, "clip_ratio": 0.0, "completion_length": 737.96875, "completion_length/correct": 705.4107666015625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 683.0, "completion_length/correct/min": 409.0, "completion_length/correct/p25": 571.0, "completion_length/correct/p75": 804.0, "completion_length/correct/var": 27791.845703125, "completion_length/incorrect": 783.5499877929688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 989.0, "completion_length/incorrect/min": 258.0, "completion_length/incorrect/p25": 500.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 74923.3828125, "completion_length/max": 1024.0, "completion_length/median": 702.0, "completion_length/min": 258.0, "completion_length/p25": 555.0, "completion_length/p75": 995.5, "completion_length/var": 48347.69921875, "epoch": 0.0264, "feature_vector_variance/max_squared_error": 92307.0078125, "feature_vector_variance/metric": 27688.6171875, "generated_tokens/total": 1883295.0, "grad_norm": 0.24880123138427734, "grouped_std_rewards": 0.2529153823852539, "learning_rate": 1.270993777844248e-05, "loss": -0.0, "mean_logprobs": -0.19921875, "mean_logprobs/var": 0.007293701171875, "num_completions/total": 3168, "per_sentence_gradient_norm": 13.036540031433105, "per_sentence_gradient_norm/max": 76.62461853027344, "per_sentence_gradient_norm/median": 7.66611385345459, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 19.813058853149414, "per_sentence_gradient_norm/p85": 25.80048942565918, "per_sentence_gradient_norm/p90": 32.151187896728516, "per_sentence_gradient_norm/p95": 47.64801788330078, "per_sentence_gradient_norm/p99": 64.1220703125, "per_sentence_gradient_norm/var": 266.4200744628906, "per_token_feature_norm": 168.9635772705078, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 157.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 129.0, "per_token_feature_norm/p75": 200.0, "per_token_feature_norm/var": 2737.6181640625, "per_token_full_gradient_variance/max_squared_error": 41.274169921875, "per_token_full_gradient_variance/variance": 0.021741658449172974, "per_token_gradient_norm": 15.185221672058105, "per_token_gradient_norm/max": 1219.72705078125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3079.333984375, "per_token_policy_error_norm": 0.10286794602870941, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.08160035312175751, "policy_entropy": 0.22315923869609833, "policy_entropy/max": 3.71875, "policy_entropy/median": 0.00017261505126953125, "policy_entropy/min": 3.069544618483633e-12, "policy_entropy/p25": 3.56137752532959e-06, "policy_entropy/p75": 0.1982421875, "policy_entropy/var": 0.19951972365379333, "policy_error_vector_variance/max_squared_error": 2.0175933837890625, "policy_error_vector_variance/metric": 0.10276467353105545, "policy_loss": -4.967053879312289e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -2.560988187789917, "policy_loss/p25": -0.558892011642456, "policy_loss/p75": 0.09146386384963989, "policy_loss/var": 0.6312237977981567, "policy_sharpness": 6.969016075134277, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.5875000953674316, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 15.168740272521973, "reward": 0.5833333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24561403691768646, "rewards/accuracy_reward": 0.5833333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24561403691768646, "sentence_full_gradient_variance/max_squared_error": 3590.31982421875, "sentence_full_gradient_variance/metric": 222.83285522460938, "sentence_full_gradient_variance/p75": 39.18004608154297, "sentence_full_gradient_variance/p90": 802.3668212890625, "sentence_full_gradient_variance/p95": 1144.5577392578125, "sentence_full_gradient_variance/p99": 2341.1015625, "state_level_variance/metric": 157.4398651123047, "state_level_variance_full_gradient/metric": 30.688634872436523, "step": 33 }, { "accuracy_reward": 0.8333333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14035087823867798, "action_level_variance/metric": 94.73582458496094, "action_level_variance_full_gradient/metric": 11.954765319824219, "adam_stats/lr_effective_max": 7.450216799043119e-05, "adam_stats/lr_effective_mean": 1.4128537229041171e-10, "adam_stats/lr_effective_min": -7.259018457261845e-05, "adam_stats/m_t_max": 0.0026889294385910034, "adam_stats/m_t_mean": 3.502674192357169e-11, "adam_stats/m_t_min": -0.003761240281164646, "adam_stats/v_t_max": 7.196891965577379e-05, "adam_stats/v_t_mean": 4.172554662895678e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.2417634698280722e-09, "advantages/max": 1.4358407258987427, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.47342148423194885, "all_logprobs": -0.1446329951286316, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-06, "all_logprobs/min": -9.5, "all_logprobs/p1": -2.390625, "all_logprobs/p10": -0.392578125, "all_logprobs/p25": -0.00750732421875, "all_logprobs/p5": -0.9609375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.23356088995933533, "clip_ratio": 0.0, "completion_length": 642.6771240234375, "completion_length/correct": 567.1500244140625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 488.0, "completion_length/correct/min": 260.0, "completion_length/correct/p25": 375.75, "completion_length/correct/p75": 740.75, "completion_length/correct/var": 52194.02734375, "completion_length/incorrect": 1020.3125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 974.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 157.5625, "completion_length/max": 1024.0, "completion_length/median": 552.0, "completion_length/min": 260.0, "completion_length/p25": 411.0, "completion_length/p75": 922.5, "completion_length/var": 72250.265625, "epoch": 0.0272, "feature_vector_variance/max_squared_error": 111101.59375, "feature_vector_variance/metric": 27193.15625, "generated_tokens/total": 1944992.0, "grad_norm": 0.23558977246284485, "grouped_std_rewards": 0.19598786532878876, "learning_rate": 1.2518479547691437e-05, "loss": 0.0, "mean_logprobs": -0.146484375, "mean_logprobs/var": 0.00165557861328125, "num_completions/total": 3264, "per_sentence_gradient_norm": 7.215389251708984, "per_sentence_gradient_norm/max": 86.33208465576172, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 9.49163818359375, "per_sentence_gradient_norm/p85": 14.881322860717773, "per_sentence_gradient_norm/p90": 22.510774612426758, "per_sentence_gradient_norm/p95": 27.22629737854004, "per_sentence_gradient_norm/p99": 41.34311294555664, "per_sentence_gradient_norm/var": 148.81536865234375, "per_token_feature_norm": 165.207275390625, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 158.0, "per_token_feature_norm/min": 65.0, "per_token_feature_norm/p25": 127.5, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 2230.77685546875, "per_token_full_gradient_variance/max_squared_error": 14.034013748168945, "per_token_full_gradient_variance/variance": 0.01584519073367119, "per_token_gradient_norm": 10.320134162902832, "per_token_gradient_norm/max": 1235.7166748046875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2246.843505859375, "per_token_policy_error_norm": 0.07676704972982407, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06364238262176514, "policy_entropy": 0.15654096007347107, "policy_entropy/max": 3.796875, "policy_entropy/median": 1.823902130126953e-05, "policy_entropy/min": 1.709743457922741e-14, "policy_entropy/p25": 5.476176738739014e-07, "policy_entropy/p75": 0.044921875, "policy_entropy/var": 0.12591570615768433, "policy_error_vector_variance/max_squared_error": 2.0119869709014893, "policy_error_vector_variance/metric": 0.07669985294342041, "policy_loss": -5.587935447692871e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -1.4358407258987427, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.0, "policy_loss/var": 0.4734214246273041, "policy_sharpness": 7.630279541015625, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.1405029296875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.773698806762695, "reward": 0.8333333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14035087823867798, "rewards/accuracy_reward": 0.8333333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14035087823867798, "sentence_full_gradient_variance/max_squared_error": 324.97357177734375, "sentence_full_gradient_variance/metric": 63.69841766357422, "sentence_full_gradient_variance/p75": 21.64945411682129, "sentence_full_gradient_variance/p90": 244.2179718017578, "sentence_full_gradient_variance/p95": 273.3352355957031, "sentence_full_gradient_variance/p99": 308.06500244140625, "state_level_variance/metric": 70.14044189453125, "state_level_variance_full_gradient/metric": 51.74365234375, "step": 34 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1894736886024475, "action_level_variance/metric": 173.8055877685547, "action_level_variance_full_gradient/metric": 241.27127075195312, "adam_stats/lr_effective_max": 6.93113834131509e-05, "adam_stats/lr_effective_mean": 9.765227515501351e-11, "adam_stats/lr_effective_min": -7.102880772436038e-05, "adam_stats/m_t_max": 0.002853386104106903, "adam_stats/m_t_mean": 4.122334418266149e-11, "adam_stats/m_t_min": -0.0037360682617872953, "adam_stats/v_t_max": 7.190927135525271e-05, "adam_stats/v_t_mean": 4.173014798297681e-12, "adam_stats/v_t_min": 0.0, "advantages": 4.967053879312289e-09, "advantages/max": 0.5588920712471008, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": 0.0, "advantages/p75": 0.36585545539855957, "advantages/var": 0.4734289050102234, "all_logprobs": -0.16659829020500183, "all_logprobs/max": 0.0, "all_logprobs/median": -4.887580871582031e-06, "all_logprobs/min": -8.625, "all_logprobs/p1": -2.6726560592651367, "all_logprobs/p10": -0.5, "all_logprobs/p25": -0.0205078125, "all_logprobs/p5": -1.0625, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.26949867606163025, "clip_ratio": 0.0, "completion_length": 669.75, "completion_length/correct": 568.5416870117188, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 549.0, "completion_length/correct/min": 321.0, "completion_length/correct/p25": 424.75, "completion_length/correct/p75": 651.25, "completion_length/correct/var": 29725.6875, "completion_length/incorrect": 973.375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 452.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 20381.548828125, "completion_length/max": 1024.0, "completion_length/median": 624.0, "completion_length/min": 321.0, "completion_length/p25": 452.75, "completion_length/p75": 994.0, "completion_length/var": 58203.375, "epoch": 0.028, "feature_vector_variance/max_squared_error": 97907.4375, "feature_vector_variance/metric": 27574.91015625, "generated_tokens/total": 2009288.0, "grad_norm": 0.27618882060050964, "grouped_std_rewards": 0.1883906126022339, "learning_rate": 1.2320907072649045e-05, "loss": 0.0, "mean_logprobs": -0.1640625, "mean_logprobs/var": 0.0029296875, "num_completions/total": 3360, "per_sentence_gradient_norm": 9.110847473144531, "per_sentence_gradient_norm/max": 88.7201156616211, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 9.32433795928955, "per_sentence_gradient_norm/p85": 15.924432754516602, "per_sentence_gradient_norm/p90": 19.374046325683594, "per_sentence_gradient_norm/p95": 51.66749572753906, "per_sentence_gradient_norm/p99": 66.11959075927734, "per_sentence_gradient_norm/var": 267.4328308105469, "per_token_feature_norm": 167.32281494140625, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 158.0, "per_token_feature_norm/min": 61.25, "per_token_feature_norm/p25": 130.0, "per_token_feature_norm/p75": 198.0, "per_token_feature_norm/var": 2320.9794921875, "per_token_full_gradient_variance/max_squared_error": 9.72361946105957, "per_token_full_gradient_variance/variance": 0.017418280243873596, "per_token_gradient_norm": 10.39966869354248, "per_token_gradient_norm/max": 936.7214965820312, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2296.8720703125, "per_token_policy_error_norm": 0.08687013387680054, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0712939202785492, "policy_entropy": 0.18286900222301483, "policy_entropy/max": 2.84375, "policy_entropy/median": 6.818771362304688e-05, "policy_entropy/min": 3.019806626980426e-14, "policy_entropy/p25": 1.7061829566955566e-06, "policy_entropy/p75": 0.10546875, "policy_entropy/var": 0.15052150189876556, "policy_error_vector_variance/max_squared_error": 2.015157461166382, "policy_error_vector_variance/metric": 0.0868019163608551, "policy_loss": -2.4835269396561444e-09, "policy_loss/max": 2.560988426208496, "policy_loss/median": 0.0, "policy_loss/min": -0.5588921308517456, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": 0.0, "policy_loss/var": 0.4734289050102234, "policy_sharpness": 7.259827613830566, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.136914014816284, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.112260818481445, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.1894736886024475, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1894736886024475, "sentence_full_gradient_variance/max_squared_error": 14118.6875, "sentence_full_gradient_variance/metric": 263.15966796875, "sentence_full_gradient_variance/p75": 92.87052917480469, "sentence_full_gradient_variance/p90": 358.66461181640625, "sentence_full_gradient_variance/p95": 675.723876953125, "sentence_full_gradient_variance/p99": 2276.172607421875, "state_level_variance/metric": 122.04520416259766, "state_level_variance_full_gradient/metric": 21.888402938842773, "step": 35 }, { "accuracy_reward": 0.7291666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19956141710281372, "action_level_variance/metric": 83.75871276855469, "action_level_variance_full_gradient/metric": 396.5758972167969, "adam_stats/lr_effective_max": 6.919843144714832e-05, "adam_stats/lr_effective_mean": 6.443519329213387e-11, "adam_stats/lr_effective_min": -6.934908014954999e-05, "adam_stats/m_t_max": 0.002002145629376173, "adam_stats/m_t_mean": 4.295961503197887e-11, "adam_stats/m_t_min": -0.0025751078501343727, "adam_stats/v_t_max": 7.189935422502458e-05, "adam_stats/v_t_mean": 4.173599833789954e-12, "adam_stats/v_t_min": 0.0, "advantages": -4.967053879312289e-09, "advantages/max": 1.4358407258987427, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": -0.6526548862457275, "advantages/p75": 0.3906453251838684, "advantages/var": 0.6312791705131531, "all_logprobs": -0.13112948834896088, "all_logprobs/max": 0.0, "all_logprobs/median": -2.6226043701171875e-06, "all_logprobs/min": -7.34375, "all_logprobs/p1": -2.34375, "all_logprobs/p10": -0.32421875, "all_logprobs/p25": -0.0059814453125, "all_logprobs/p5": -0.8515625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.20638206601142883, "clip_ratio": 0.0, "completion_length": 616.1146240234375, "completion_length/correct": 536.0, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 494.0, "completion_length/correct/min": 117.0, "completion_length/correct/p25": 238.25, "completion_length/correct/p75": 857.5, "completion_length/correct/var": 91237.8828125, "completion_length/incorrect": 831.8077392578125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 904.0, "completion_length/incorrect/min": 303.0, "completion_length/incorrect/p25": 637.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 54712.640625, "completion_length/max": 1024.0, "completion_length/median": 587.0, "completion_length/min": 117.0, "completion_length/p25": 283.0, "completion_length/p75": 931.75, "completion_length/var": 98127.640625, "epoch": 0.0288, "feature_vector_variance/max_squared_error": 100314.0546875, "feature_vector_variance/metric": 26970.619140625, "generated_tokens/total": 2068435.0, "grad_norm": 0.2897487282752991, "grouped_std_rewards": 0.2872319221496582, "learning_rate": 1.2117461064942437e-05, "loss": 0.0, "mean_logprobs": -0.142578125, "mean_logprobs/var": 0.0028839111328125, "num_completions/total": 3456, "per_sentence_gradient_norm": 9.719913482666016, "per_sentence_gradient_norm/max": 63.70923614501953, "per_sentence_gradient_norm/median": 8.33792781829834, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 14.239152908325195, "per_sentence_gradient_norm/p85": 19.367979049682617, "per_sentence_gradient_norm/p90": 23.56642723083496, "per_sentence_gradient_norm/p95": 34.81418228149414, "per_sentence_gradient_norm/p99": 48.312828063964844, "per_sentence_gradient_norm/var": 142.80044555664062, "per_token_feature_norm": 165.5814208984375, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 160.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 133.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 1784.4249267578125, "per_token_full_gradient_variance/max_squared_error": 20.07535743713379, "per_token_full_gradient_variance/variance": 0.018362674862146378, "per_token_gradient_norm": 12.739327430725098, "per_token_gradient_norm/max": 887.7425537109375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2396.56982421875, "per_token_policy_error_norm": 0.07018671929836273, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05978285148739815, "policy_entropy": 0.1427827924489975, "policy_entropy/max": 3.15625, "policy_entropy/median": 3.814697265625e-05, "policy_entropy/min": 4.6351811278100286e-15, "policy_entropy/p25": 1.4230608940124512e-06, "policy_entropy/p75": 0.0390625, "policy_entropy/var": 0.10907807946205139, "policy_error_vector_variance/max_squared_error": 2.0107831954956055, "policy_error_vector_variance/metric": 0.07014932483434677, "policy_loss": 1.2417634698280722e-09, "policy_loss/max": 2.560988187789917, "policy_loss/median": 0.0, "policy_loss/min": -1.4358407258987427, "policy_loss/p25": -0.3906453251838684, "policy_loss/p75": 0.6526548266410828, "policy_loss/var": 0.6312791109085083, "policy_sharpness": 7.640685558319092, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.2342529296875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.45596694946289, "reward": 0.7291666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19956141710281372, "rewards/accuracy_reward": 0.7291666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19956141710281372, "sentence_full_gradient_variance/max_squared_error": 7405.29248046875, "sentence_full_gradient_variance/metric": 438.73785400390625, "sentence_full_gradient_variance/p75": 514.2972412109375, "sentence_full_gradient_variance/p90": 1362.62744140625, "sentence_full_gradient_variance/p95": 2032.6160888671875, "sentence_full_gradient_variance/p99": 5485.326171875, "state_level_variance/metric": 75.34699249267578, "state_level_variance_full_gradient/metric": 42.161888122558594, "step": 36 }, { "accuracy_reward": 0.90625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.08585526049137115, "action_level_variance/metric": 240.00161743164062, "action_level_variance_full_gradient/metric": 773.6923217773438, "adam_stats/lr_effective_max": 6.857331754872575e-05, "adam_stats/lr_effective_mean": -2.2264254675996575e-11, "adam_stats/lr_effective_min": -6.882708112243563e-05, "adam_stats/m_t_max": 0.0017694910056889057, "adam_stats/m_t_mean": 4.514542212286088e-11, "adam_stats/m_t_min": -0.002051393734291196, "adam_stats/v_t_max": 7.195220678113401e-05, "adam_stats/v_t_mean": 4.180494058564355e-12, "adam_stats/v_t_min": 0.0, "advantages": 6.208817460162663e-09, "advantages/max": 0.8537459373474121, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.47337016463279724, "all_logprobs": -0.12406311184167862, "all_logprobs/max": 0.0, "all_logprobs/median": -5.960464477539062e-07, "all_logprobs/min": -10.0, "all_logprobs/p1": -2.234375, "all_logprobs/p10": -0.3125, "all_logprobs/p25": -0.00372314453125, "all_logprobs/p5": -0.82421875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1873098760843277, "clip_ratio": 0.0, "completion_length": 637.8958740234375, "completion_length/correct": 613.7586059570312, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 567.0, "completion_length/correct/min": 240.0, "completion_length/correct/p25": 437.5, "completion_length/correct/p75": 798.5, "completion_length/correct/var": 43922.83984375, "completion_length/incorrect": 871.2222290039062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 974.0, "completion_length/incorrect/min": 317.0, "completion_length/incorrect/p25": 810.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 53829.4453125, "completion_length/max": 1024.0, "completion_length/median": 603.0, "completion_length/min": 240.0, "completion_length/p25": 450.5, "completion_length/p75": 844.75, "completion_length/var": 49985.859375, "epoch": 0.0296, "feature_vector_variance/max_squared_error": 116214.4296875, "feature_vector_variance/metric": 28118.90625, "generated_tokens/total": 2129673.0, "grad_norm": 0.3308764696121216, "grouped_std_rewards": 0.16872458159923553, "learning_rate": 1.1908389392193549e-05, "loss": -0.0, "mean_logprobs": -0.130859375, "mean_logprobs/var": 0.002410888671875, "num_completions/total": 3552, "per_sentence_gradient_norm": 5.797083854675293, "per_sentence_gradient_norm/max": 120.53291320800781, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 6.584291934967041, "per_sentence_gradient_norm/p85": 8.079483032226562, "per_sentence_gradient_norm/p90": 8.704219818115234, "per_sentence_gradient_norm/p95": 10.506547927856445, "per_sentence_gradient_norm/p99": 103.91668701171875, "per_sentence_gradient_norm/var": 262.1396484375, "per_token_feature_norm": 169.93948364257812, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 167.0, "per_token_feature_norm/min": 62.25, "per_token_feature_norm/p25": 136.0, "per_token_feature_norm/p75": 201.0, "per_token_feature_norm/var": 1898.6640625, "per_token_full_gradient_variance/max_squared_error": 12.68773365020752, "per_token_full_gradient_variance/variance": 0.009762571193277836, "per_token_gradient_norm": 6.049296855926514, "per_token_gradient_norm/max": 1434.9730224609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1570.9063720703125, "per_token_policy_error_norm": 0.06743814796209335, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.056570250540971756, "policy_entropy": 0.13594114780426025, "policy_entropy/max": 2.5625, "policy_entropy/median": 9.59634780883789e-06, "policy_entropy/min": 5.245803791353865e-15, "policy_entropy/p25": 3.073364496231079e-07, "policy_entropy/p75": 0.0264892578125, "policy_entropy/var": 0.10129594802856445, "policy_error_vector_variance/max_squared_error": 2.011650800704956, "policy_error_vector_variance/metric": 0.067405566573143, "policy_loss": -6.829699028543246e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -0.8537459969520569, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.0, "policy_loss/var": 0.47337016463279724, "policy_sharpness": 7.799320220947266, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.5, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.132417678833008, "reward": 0.90625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.08585526049137115, "rewards/accuracy_reward": 0.90625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.08585526049137115, "sentence_full_gradient_variance/max_squared_error": 61906.1640625, "sentence_full_gradient_variance/metric": 800.5853271484375, "sentence_full_gradient_variance/p75": 55.7514762878418, "sentence_full_gradient_variance/p90": 259.21435546875, "sentence_full_gradient_variance/p95": 355.33984375, "sentence_full_gradient_variance/p99": 11387.4677734375, "state_level_variance/metric": 41.289024353027344, "state_level_variance_full_gradient/metric": 26.89293670654297, "step": 37 }, { "accuracy_reward": 0.6979166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21304824948310852, "action_level_variance/metric": 35.812713623046875, "action_level_variance_full_gradient/metric": 302.69598388671875, "adam_stats/lr_effective_max": 6.410318746929988e-05, "adam_stats/lr_effective_mean": -5.689881552028986e-11, "adam_stats/lr_effective_min": -6.654034223174676e-05, "adam_stats/m_t_max": 0.001565075945109129, "adam_stats/m_t_mean": 3.0637832548752186e-11, "adam_stats/m_t_min": -0.0015700701624155045, "adam_stats/v_t_max": 7.194615318439901e-05, "adam_stats/v_t_mean": 4.178368154944545e-12, "adam_stats/v_t_min": 0.0, "advantages": 3.725290298461914e-09, "advantages/max": 2.0150647163391113, "advantages/median": 0.0, "advantages/min": -0.46501490473747253, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.15781643986701965, "all_logprobs": -0.1460394412279129, "all_logprobs/max": 0.0, "all_logprobs/median": -1.7881393432617188e-06, "all_logprobs/min": -12.625, "all_logprobs/p1": -2.375, "all_logprobs/p10": -0.4113283157348633, "all_logprobs/p25": -0.0142822265625, "all_logprobs/p5": -0.96484375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.22640320658683777, "clip_ratio": 0.0, "completion_length": 591.5104370117188, "completion_length/correct": 473.19403076171875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 475.0, "completion_length/correct/min": 213.0, "completion_length/correct/p25": 378.0, "completion_length/correct/p75": 535.5, "completion_length/correct/var": 29307.4921875, "completion_length/incorrect": 864.862060546875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 914.0, "completion_length/incorrect/min": 293.0, "completion_length/incorrect/p25": 804.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 32503.98046875, "completion_length/max": 1024.0, "completion_length/median": 522.0, "completion_length/min": 213.0, "completion_length/p25": 438.75, "completion_length/p75": 812.75, "completion_length/var": 62623.53515625, "epoch": 0.0304, "feature_vector_variance/max_squared_error": 102247.28125, "feature_vector_variance/metric": 29560.76171875, "generated_tokens/total": 2186458.0, "grad_norm": 0.16538475453853607, "grouped_std_rewards": 0.06718548387289047, "learning_rate": 1.1693946776030601e-05, "loss": -0.0, "mean_logprobs": -0.140625, "mean_logprobs/var": 0.003265380859375, "num_completions/total": 3648, "per_sentence_gradient_norm": 2.1772208213806152, "per_sentence_gradient_norm/max": 52.43284225463867, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 3.812253475189209, "per_sentence_gradient_norm/p90": 5.88901424407959, "per_sentence_gradient_norm/p95": 8.38940143585205, "per_sentence_gradient_norm/p99": 40.09732437133789, "per_sentence_gradient_norm/var": 57.87876892089844, "per_token_feature_norm": 172.46469116210938, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 169.0, "per_token_feature_norm/min": 69.0, "per_token_feature_norm/p25": 138.0, "per_token_feature_norm/p75": 202.0, "per_token_feature_norm/var": 1981.9658203125, "per_token_full_gradient_variance/max_squared_error": 3.44612979888916, "per_token_full_gradient_variance/variance": 0.003344866679981351, "per_token_gradient_norm": 2.580906629562378, "per_token_gradient_norm/max": 779.2633056640625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 494.5385437011719, "per_token_policy_error_norm": 0.07770328223705292, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06357956677675247, "policy_entropy": 0.16367517411708832, "policy_entropy/max": 3.84375, "policy_entropy/median": 2.5987625122070312e-05, "policy_entropy/min": 3.924811864397526e-17, "policy_entropy/p25": 5.550682544708252e-07, "policy_entropy/p75": 0.08056640625, "policy_entropy/var": 0.12720176577568054, "policy_error_vector_variance/max_squared_error": 2.0087482929229736, "policy_error_vector_variance/metric": 0.07751215994358063, "policy_loss": -2.4835269396561444e-09, "policy_loss/max": 0.4650149345397949, "policy_loss/median": 0.0, "policy_loss/min": -2.0150647163391113, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.15781643986701965, "policy_sharpness": 7.442629814147949, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.6285400390625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.329361915588379, "reward": 0.6979166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21304824948310852, "rewards/accuracy_reward": 0.6979166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21304824948310852, "sentence_full_gradient_variance/max_squared_error": 15507.1337890625, "sentence_full_gradient_variance/metric": 320.86041259765625, "sentence_full_gradient_variance/p75": 3.6328909397125244, "sentence_full_gradient_variance/p90": 3.6363484859466553, "sentence_full_gradient_variance/p95": 430.5517272949219, "sentence_full_gradient_variance/p99": 8135.5625, "state_level_variance/metric": 28.44173812866211, "state_level_variance_full_gradient/metric": 18.164459228515625, "step": 38 }, { "accuracy_reward": 0.6458333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2311403602361679, "action_level_variance/metric": 180.96737670898438, "action_level_variance_full_gradient/metric": 200.7168426513672, "adam_stats/lr_effective_max": 6.433238013414666e-05, "adam_stats/lr_effective_mean": -2.5984429885550497e-11, "adam_stats/lr_effective_min": -6.546773511217907e-05, "adam_stats/m_t_max": 0.0016334233805537224, "adam_stats/m_t_mean": 4.5354141449405194e-12, "adam_stats/m_t_min": -0.001120822038501501, "adam_stats/v_t_max": 7.219620601972565e-05, "adam_stats/v_t_mean": 4.1842501685707134e-12, "adam_stats/v_t_min": 0.0, "advantages": -3.725290298461914e-09, "advantages/max": 2.0150647163391113, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": 0.0, "advantages/p75": 0.36585545539855957, "advantages/var": 0.4734352231025696, "all_logprobs": -0.17007356882095337, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-06, "all_logprobs/min": -13.3125, "all_logprobs/p1": -2.703125, "all_logprobs/p10": -0.486328125, "all_logprobs/p25": -0.0206298828125, "all_logprobs/p5": -1.09375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.2893839478492737, "clip_ratio": 0.0, "completion_length": 786.0625, "completion_length/correct": 684.54833984375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 636.0, "completion_length/correct/min": 280.0, "completion_length/correct/p25": 517.0, "completion_length/correct/p75": 895.25, "completion_length/correct/var": 51374.7734375, "completion_length/incorrect": 971.1764526367188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 540.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 16461.54296875, "completion_length/max": 1024.0, "completion_length/median": 809.0, "completion_length/min": 280.0, "completion_length/p25": 592.5, "completion_length/p75": 1024.0, "completion_length/var": 57695.72265625, "epoch": 0.0312, "feature_vector_variance/max_squared_error": 107756.875, "feature_vector_variance/metric": 30827.7421875, "generated_tokens/total": 2261920.0, "grad_norm": 0.28236308693885803, "grouped_std_rewards": 0.1912984699010849, "learning_rate": 1.1474394481749037e-05, "loss": 0.0, "mean_logprobs": -0.1572265625, "mean_logprobs/var": 0.00762939453125, "num_completions/total": 3744, "per_sentence_gradient_norm": 8.550750732421875, "per_sentence_gradient_norm/max": 109.39848327636719, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 7.028140068054199, "per_sentence_gradient_norm/p85": 16.548824310302734, "per_sentence_gradient_norm/p90": 20.487594604492188, "per_sentence_gradient_norm/p95": 41.109107971191406, "per_sentence_gradient_norm/p99": 81.33544158935547, "per_sentence_gradient_norm/var": 290.6351013183594, "per_token_feature_norm": 175.4633331298828, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 172.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 138.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 2272.663330078125, "per_token_full_gradient_variance/max_squared_error": 13.557751655578613, "per_token_full_gradient_variance/variance": 0.01477675512433052, "per_token_gradient_norm": 9.81340217590332, "per_token_gradient_norm/max": 989.9420166015625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2083.7197265625, "per_token_policy_error_norm": 0.0876932442188263, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.07234068959951401, "policy_entropy": 0.1853138506412506, "policy_entropy/max": 3.859375, "policy_entropy/median": 5.0067901611328125e-05, "policy_entropy/min": 1.942890293094024e-15, "policy_entropy/p25": 4.470348358154297e-07, "policy_entropy/p75": 0.1103515625, "policy_entropy/var": 0.16008605062961578, "policy_error_vector_variance/max_squared_error": 2.0139319896698, "policy_error_vector_variance/metric": 0.08760097622871399, "policy_loss": 1.2417634698280722e-09, "policy_loss/max": 2.560988187789917, "policy_loss/median": 0.0, "policy_loss/min": -2.0150644779205322, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": 0.0, "policy_loss/var": 0.4734351634979248, "policy_sharpness": 7.286251544952393, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.241455078125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.000069618225098, "reward": 0.6458333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2311403602361679, "rewards/accuracy_reward": 0.6458333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2311403602361679, "sentence_full_gradient_variance/max_squared_error": 7249.955078125, "sentence_full_gradient_variance/metric": 201.813232421875, "sentence_full_gradient_variance/p75": 0.46209436655044556, "sentence_full_gradient_variance/p90": 91.64437866210938, "sentence_full_gradient_variance/p95": 214.66390991210938, "sentence_full_gradient_variance/p99": 6849.94482421875, "state_level_variance/metric": 141.54092407226562, "state_level_variance_full_gradient/metric": 1.096413016319275, "step": 39 }, { "accuracy_reward": 0.8020833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16041667759418488, "action_level_variance/metric": 127.43363952636719, "action_level_variance_full_gradient/metric": 269.739990234375, "adam_stats/lr_effective_max": 6.219044735189527e-05, "adam_stats/lr_effective_mean": -9.193250327665936e-11, "adam_stats/lr_effective_min": -6.246589327929541e-05, "adam_stats/m_t_max": 0.0010389921953901649, "adam_stats/m_t_mean": 4.616666424150928e-12, "adam_stats/m_t_min": -0.001094951992854476, "adam_stats/v_t_max": 7.217810343718156e-05, "adam_stats/v_t_mean": 4.184854719702091e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.2417634698280722e-09, "advantages/max": 2.560988187789917, "advantages/median": 0.0, "advantages/min": -1.4358407258987427, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.3156311511993408, "all_logprobs": -0.1572684645652771, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-06, "all_logprobs/min": -8.6875, "all_logprobs/p1": -2.40625, "all_logprobs/p10": -0.474609375, "all_logprobs/p25": -0.02978515625, "all_logprobs/p5": -1.015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.23125538229942322, "clip_ratio": 0.0, "completion_length": 568.3646240234375, "completion_length/correct": 584.3506469726562, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 522.0, "completion_length/correct/min": 350.0, "completion_length/correct/p25": 439.0, "completion_length/correct/p75": 750.0, "completion_length/correct/var": 31846.3359375, "completion_length/incorrect": 503.5789489746094, "completion_length/incorrect/max": 767.0, "completion_length/incorrect/median": 484.0, "completion_length/incorrect/min": 400.0, "completion_length/incorrect/p25": 423.0, "completion_length/incorrect/p75": 526.0, "completion_length/incorrect/var": 10774.9248046875, "completion_length/max": 1024.0, "completion_length/median": 507.0, "completion_length/min": 350.0, "completion_length/p25": 434.75, "completion_length/p75": 706.25, "completion_length/var": 28565.205078125, "epoch": 0.032, "feature_vector_variance/max_squared_error": 95442.375, "feature_vector_variance/metric": 30155.966796875, "generated_tokens/total": 2316483.0, "grad_norm": 0.3115637004375458, "grouped_std_rewards": 0.13671310245990753, "learning_rate": 1.125e-05, "loss": -0.0, "mean_logprobs": -0.1650390625, "mean_logprobs/var": 0.003936767578125, "num_completions/total": 3840, "per_sentence_gradient_norm": 7.4508795738220215, "per_sentence_gradient_norm/max": 85.16881561279297, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 12.130087852478027, "per_sentence_gradient_norm/p85": 14.349814414978027, "per_sentence_gradient_norm/p90": 17.252422332763672, "per_sentence_gradient_norm/p95": 37.972103118896484, "per_sentence_gradient_norm/p99": 80.04135131835938, "per_sentence_gradient_norm/var": 233.26052856445312, "per_token_feature_norm": 174.92657470703125, "per_token_feature_norm/max": 312.0, "per_token_feature_norm/median": 175.0, "per_token_feature_norm/min": 67.5, "per_token_feature_norm/p25": 139.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 1976.527099609375, "per_token_full_gradient_variance/max_squared_error": 5.047487735748291, "per_token_full_gradient_variance/variance": 0.011620833538472652, "per_token_gradient_norm": 7.949068546295166, "per_token_gradient_norm/max": 897.1061401367188, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1442.430908203125, "per_token_policy_error_norm": 0.0857531875371933, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06995312869548798, "policy_entropy": 0.17280679941177368, "policy_entropy/max": 3.15625, "policy_entropy/median": 5.078315734863281e-05, "policy_entropy/min": 4.163336342344337e-17, "policy_entropy/p25": 3.650784492492676e-07, "policy_entropy/p75": 0.14111328125, "policy_entropy/var": 0.1207398995757103, "policy_error_vector_variance/max_squared_error": 2.0147523880004883, "policy_error_vector_variance/metric": 0.08561504632234573, "policy_loss": -7.450580596923828e-09, "policy_loss/max": 1.4358407258987427, "policy_loss/median": 0.0, "policy_loss/min": -2.560988426208496, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.3156311511993408, "policy_sharpness": 7.176560878753662, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.998779296875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.85442066192627, "reward": 0.8020833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16041667759418488, "rewards/accuracy_reward": 0.8020833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16041667759418488, "sentence_full_gradient_variance/max_squared_error": 7228.18701171875, "sentence_full_gradient_variance/metric": 285.067138671875, "sentence_full_gradient_variance/p75": 26.153535842895508, "sentence_full_gradient_variance/p90": 410.2095031738281, "sentence_full_gradient_variance/p95": 1300.2647705078125, "sentence_full_gradient_variance/p99": 5640.69384765625, "state_level_variance/metric": 133.63406372070312, "state_level_variance_full_gradient/metric": 15.32713508605957, "step": 40 }, { "accuracy_reward": 0.8541666865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.12587721645832062, "action_level_variance/metric": 18.397245407104492, "action_level_variance_full_gradient/metric": 179.818115234375, "adam_stats/lr_effective_max": 6.043129542376846e-05, "adam_stats/lr_effective_mean": -1.5261150676515456e-10, "adam_stats/lr_effective_min": -6.025818584021181e-05, "adam_stats/m_t_max": 0.001440260442905128, "adam_stats/m_t_mean": 1.6096774520940604e-12, "adam_stats/m_t_min": -0.00132663338445127, "adam_stats/v_t_max": 7.215391815407202e-05, "adam_stats/v_t_mean": 4.186536100431182e-12, "adam_stats/v_t_min": 0.0, "advantages": 2.4835269396561444e-09, "advantages/max": 0.9680583477020264, "advantages/median": 0.0, "advantages/min": -1.249750018119812, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.31566518545150757, "all_logprobs": -0.14332780241966248, "all_logprobs/max": 0.0, "all_logprobs/median": -8.344650268554688e-07, "all_logprobs/min": -8.625, "all_logprobs/p1": -2.392031669616699, "all_logprobs/p10": -0.38671875, "all_logprobs/p25": -0.007598876953125, "all_logprobs/p5": -0.9453125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.22883467376232147, "clip_ratio": 0.0, "completion_length": 584.2916870117188, "completion_length/correct": 515.207275390625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 441.0, "completion_length/correct/min": 216.0, "completion_length/correct/p25": 336.25, "completion_length/correct/p75": 625.75, "completion_length/correct/var": 54137.41796875, "completion_length/incorrect": 988.9285888671875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 533.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 17220.0703125, "completion_length/max": 1024.0, "completion_length/median": 536.0, "completion_length/min": 216.0, "completion_length/p25": 353.0, "completion_length/p75": 749.5, "completion_length/var": 76764.0390625, "epoch": 0.0328, "feature_vector_variance/max_squared_error": 113737.5390625, "feature_vector_variance/metric": 30332.4453125, "generated_tokens/total": 2372575.0, "grad_norm": 0.22388166189193726, "grouped_std_rewards": 0.16939961910247803, "learning_rate": 1.1021036720894182e-05, "loss": -0.0, "mean_logprobs": -0.1328125, "mean_logprobs/var": 0.0040283203125, "num_completions/total": 3936, "per_sentence_gradient_norm": 7.805316925048828, "per_sentence_gradient_norm/max": 49.26043701171875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 16.43855857849121, "per_sentence_gradient_norm/p85": 20.59054946899414, "per_sentence_gradient_norm/p90": 28.182893753051758, "per_sentence_gradient_norm/p95": 33.36107635498047, "per_sentence_gradient_norm/p99": 43.48922348022461, "per_sentence_gradient_norm/var": 156.1372833251953, "per_token_feature_norm": 175.3141326904297, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 173.0, "per_token_feature_norm/min": 67.5, "per_token_feature_norm/p25": 142.0, "per_token_feature_norm/p75": 206.0, "per_token_feature_norm/var": 1925.2890625, "per_token_full_gradient_variance/max_squared_error": 1.9012762308120728, "per_token_full_gradient_variance/variance": 0.012997891753911972, "per_token_gradient_norm": 11.849893569946289, "per_token_gradient_norm/max": 447.957275390625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1771.251708984375, "per_token_policy_error_norm": 0.07574658840894699, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06322943419218063, "policy_entropy": 0.1566137671470642, "policy_entropy/max": 3.28125, "policy_entropy/median": 1.2636184692382812e-05, "policy_entropy/min": 1.0480505352461478e-13, "policy_entropy/p25": 2.4959444999694824e-07, "policy_entropy/p75": 0.048095703125, "policy_entropy/var": 0.12646439671516418, "policy_error_vector_variance/max_squared_error": 2.0148956775665283, "policy_error_vector_variance/metric": 0.0756547674536705, "policy_loss": -2.4835269396561444e-09, "policy_loss/max": 1.249750018119812, "policy_loss/median": 0.0, "policy_loss/min": -0.9680584669113159, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.31566524505615234, "policy_sharpness": 7.622044086456299, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.111328125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.869515419006348, "reward": 0.8541666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.12587721645832062, "rewards/accuracy_reward": 0.8541666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.12587721645832062, "sentence_full_gradient_variance/max_squared_error": 4146.45703125, "sentence_full_gradient_variance/metric": 207.4609832763672, "sentence_full_gradient_variance/p75": 11.849114418029785, "sentence_full_gradient_variance/p90": 13.948049545288086, "sentence_full_gradient_variance/p95": 1446.097412109375, "sentence_full_gradient_variance/p99": 3692.53271484375, "state_level_variance/metric": 164.7161102294922, "state_level_variance_full_gradient/metric": 27.642873764038086, "step": 41 }, { "accuracy_reward": 0.5416666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2508772313594818, "action_level_variance/metric": 58.02216720581055, "action_level_variance_full_gradient/metric": 1052.5831298828125, "adam_stats/lr_effective_max": 6.054328696336597e-05, "adam_stats/lr_effective_mean": -4.0058584921398577e-11, "adam_stats/lr_effective_min": -5.940127812209539e-05, "adam_stats/m_t_max": 0.0009270908194594085, "adam_stats/m_t_mean": 3.2922197953921817e-12, "adam_stats/m_t_min": -0.001209787093102932, "adam_stats/v_t_max": 7.225396257126704e-05, "adam_stats/v_t_mean": 4.192964118271414e-12, "adam_stats/v_t_min": 0.0, "advantages": -2.4835269396561444e-09, "advantages/max": 1.249750018119812, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.31563395261764526, "all_logprobs": -0.16887293756008148, "all_logprobs/max": 0.0, "all_logprobs/median": -2.1457672119140625e-06, "all_logprobs/min": -8.9375, "all_logprobs/p1": -2.703125, "all_logprobs/p10": -0.478515625, "all_logprobs/p25": -0.01800537109375, "all_logprobs/p5": -1.140625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.2869384288787842, "clip_ratio": 0.0, "completion_length": 632.7396240234375, "completion_length/correct": 502.0000305175781, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 477.0, "completion_length/correct/min": 287.0, "completion_length/correct/p25": 372.75, "completion_length/correct/p75": 580.0, "completion_length/correct/var": 22402.90234375, "completion_length/incorrect": 787.25, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 799.0, "completion_length/incorrect/min": 333.0, "completion_length/incorrect/p25": 611.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 52154.09765625, "completion_length/max": 1024.0, "completion_length/median": 577.0, "completion_length/min": 287.0, "completion_length/p25": 426.0, "completion_length/p75": 805.75, "completion_length/var": 56046.68359375, "epoch": 0.0336, "feature_vector_variance/max_squared_error": 104347.1796875, "feature_vector_variance/metric": 32551.337890625, "generated_tokens/total": 2433318.0, "grad_norm": 0.27160805463790894, "grouped_std_rewards": 0.14026084542274475, "learning_rate": 1.078778360091808e-05, "loss": 0.0, "mean_logprobs": -0.1513671875, "mean_logprobs/var": 0.0101318359375, "num_completions/total": 4032, "per_sentence_gradient_norm": 4.62807559967041, "per_sentence_gradient_norm/max": 58.24956512451172, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 5.803122520446777, "per_sentence_gradient_norm/p85": 11.648799896240234, "per_sentence_gradient_norm/p90": 13.18095874786377, "per_sentence_gradient_norm/p95": 20.1944637298584, "per_sentence_gradient_norm/p99": 57.95506286621094, "per_sentence_gradient_norm/var": 99.07512664794922, "per_token_feature_norm": 180.21446228027344, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 179.0, "per_token_feature_norm/min": 67.5, "per_token_feature_norm/p25": 143.0, "per_token_feature_norm/p75": 212.0, "per_token_feature_norm/var": 2221.4208984375, "per_token_full_gradient_variance/max_squared_error": 5027.30908203125, "per_token_full_gradient_variance/variance": 0.0888509675860405, "per_token_gradient_norm": 4.012761116027832, "per_token_gradient_norm/max": 734.6034545898438, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 782.61181640625, "per_token_policy_error_norm": 0.08578003942966461, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.07021744549274445, "policy_entropy": 0.18725062906742096, "policy_entropy/max": 3.203125, "policy_entropy/median": 3.170967102050781e-05, "policy_entropy/min": 1.071365218763276e-14, "policy_entropy/p25": 1.7881393432617188e-07, "policy_entropy/p75": 0.09228515625, "policy_entropy/var": 0.16984930634498596, "policy_error_vector_variance/max_squared_error": 2.019453525543213, "policy_error_vector_variance/metric": 0.08563632518053055, "policy_loss": 3.725290298461914e-09, "policy_loss/max": 2.560988187789917, "policy_loss/median": 0.0, "policy_loss/min": -1.249750018119812, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.3156339228153229, "policy_sharpness": 7.326025009155273, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.305389404296875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.111671447753906, "reward": 0.5416666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2508772313594818, "rewards/accuracy_reward": 0.5416666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2508772313594818, "sentence_full_gradient_variance/max_squared_error": 34156.86328125, "sentence_full_gradient_variance/metric": 1078.287353515625, "sentence_full_gradient_variance/p75": 810.75537109375, "sentence_full_gradient_variance/p90": 2217.894287109375, "sentence_full_gradient_variance/p95": 3652.94091796875, "sentence_full_gradient_variance/p99": 16418.576171875, "state_level_variance/metric": 52.37677764892578, "state_level_variance_full_gradient/metric": 25.70416259765625, "step": 42 }, { "accuracy_reward": 0.5729166865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24725878238677979, "action_level_variance/metric": 39.150489807128906, "action_level_variance_full_gradient/metric": 343.75775146484375, "adam_stats/lr_effective_max": 6.0076697991462424e-05, "adam_stats/lr_effective_mean": -4.5332116532792455e-11, "adam_stats/lr_effective_min": -5.9572721511358395e-05, "adam_stats/m_t_max": 0.000985870254226029, "adam_stats/m_t_mean": -8.974597974498177e-12, "adam_stats/m_t_min": -0.0008959303959272802, "adam_stats/v_t_max": 7.218535029096529e-05, "adam_stats/v_t_mean": 4.1923205358618265e-12, "adam_stats/v_t_min": 0.0, "advantages": 8.692344621863413e-09, "advantages/max": 2.0150647163391113, "advantages/median": 0.0, "advantages/min": -1.4358407258987427, "advantages/p25": -0.46501490473747253, "advantages/p75": 0.0, "advantages/var": 0.47347837686538696, "all_logprobs": -0.1497071236371994, "all_logprobs/max": 0.0, "all_logprobs/median": -4.76837158203125e-07, "all_logprobs/min": -8.1875, "all_logprobs/p1": -2.578125, "all_logprobs/p10": -0.396484375, "all_logprobs/p25": -0.00787353515625, "all_logprobs/p5": -0.97265625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.2524147629737854, "clip_ratio": 0.0, "completion_length": 743.59375, "completion_length/correct": 702.0908813476562, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 666.0, "completion_length/correct/min": 401.0, "completion_length/correct/p25": 584.0, "completion_length/correct/p75": 790.0, "completion_length/correct/var": 33155.86328125, "completion_length/incorrect": 799.2682495117188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 854.0, "completion_length/incorrect/min": 438.0, "completion_length/incorrect/p25": 547.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 55997.1015625, "completion_length/max": 1024.0, "completion_length/median": 682.0, "completion_length/min": 401.0, "completion_length/p25": 573.5, "completion_length/p75": 1024.0, "completion_length/var": 44759.19140625, "epoch": 0.0344, "feature_vector_variance/max_squared_error": 104624.3359375, "feature_vector_variance/metric": 31640.798828125, "generated_tokens/total": 2504703.0, "grad_norm": 0.22005128860473633, "grouped_std_rewards": 0.23236232995986938, "learning_rate": 1.0550524823068504e-05, "loss": 0.0, "mean_logprobs": -0.1630859375, "mean_logprobs/var": 0.01116943359375, "num_completions/total": 4128, "per_sentence_gradient_norm": 7.4884161949157715, "per_sentence_gradient_norm/max": 53.27171325683594, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 9.739102363586426, "per_sentence_gradient_norm/p85": 16.837034225463867, "per_sentence_gradient_norm/p90": 22.23833465576172, "per_sentence_gradient_norm/p95": 33.67189025878906, "per_sentence_gradient_norm/p99": 46.94045639038086, "per_sentence_gradient_norm/var": 129.13143920898438, "per_token_feature_norm": 179.11917114257812, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 178.0, "per_token_feature_norm/min": 71.5, "per_token_feature_norm/p25": 146.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 1883.3408203125, "per_token_full_gradient_variance/max_squared_error": 3.999040126800537, "per_token_full_gradient_variance/variance": 0.009515632875263691, "per_token_gradient_norm": 7.826634407043457, "per_token_gradient_norm/max": 679.8639526367188, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1253.0345458984375, "per_token_policy_error_norm": 0.0774960070848465, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06467445939779282, "policy_entropy": 0.16409961879253387, "policy_entropy/max": 3.53125, "policy_entropy/median": 8.225440979003906e-06, "policy_entropy/min": 2.4147350785597155e-15, "policy_entropy/p25": 1.4062970876693726e-07, "policy_entropy/p75": 0.0498046875, "policy_entropy/var": 0.14202913641929626, "policy_error_vector_variance/max_squared_error": 1.9881459474563599, "policy_error_vector_variance/metric": 0.07727087289094925, "policy_loss": -6.208817349140361e-10, "policy_loss/max": 1.4358407258987427, "policy_loss/median": 0.0, "policy_loss/min": -2.0150644779205322, "policy_loss/p25": 0.0, "policy_loss/p75": 0.46501487493515015, "policy_loss/var": 0.4734783470630646, "policy_sharpness": 7.597949504852295, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.011474609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.215595245361328, "reward": 0.5729166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24725878238677979, "rewards/accuracy_reward": 0.5729166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24725878238677979, "sentence_full_gradient_variance/max_squared_error": 6563.46240234375, "sentence_full_gradient_variance/metric": 351.0240478515625, "sentence_full_gradient_variance/p75": 3.508763313293457, "sentence_full_gradient_variance/p90": 880.2188110351562, "sentence_full_gradient_variance/p95": 1558.833740234375, "sentence_full_gradient_variance/p99": 6486.6435546875, "state_level_variance/metric": 109.29927825927734, "state_level_variance_full_gradient/metric": 7.266334533691406, "step": 43 }, { "accuracy_reward": 0.6145833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.23936404287815094, "action_level_variance/metric": 23.229711532592773, "action_level_variance_full_gradient/metric": 744.7938842773438, "adam_stats/lr_effective_max": 5.673836130881682e-05, "adam_stats/lr_effective_mean": 4.7062444219481137e-11, "adam_stats/lr_effective_min": -5.865076673217118e-05, "adam_stats/m_t_max": 0.0015995195135474205, "adam_stats/m_t_mean": -5.661463485517881e-12, "adam_stats/m_t_min": -0.0013281479477882385, "adam_stats/v_t_max": 7.211740739876404e-05, "adam_stats/v_t_mean": 4.215476925861772e-12, "adam_stats/v_t_min": 0.0, "advantages": -9.934107758624577e-09, "advantages/max": 1.6766761541366577, "advantages/median": 0.0, "advantages/min": -0.8537459373474121, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.31565728783607483, "all_logprobs": -0.13366195559501648, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -11.4375, "all_logprobs/p1": -2.375, "all_logprobs/p10": -0.337890625, "all_logprobs/p25": -0.0031890869140625, "all_logprobs/p5": -0.87890625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.21977587044239044, "clip_ratio": 0.0, "completion_length": 701.7083740234375, "completion_length/correct": 575.0338745117188, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 590.0, "completion_length/correct/min": 289.0, "completion_length/correct/p25": 335.5, "completion_length/correct/p75": 711.5, "completion_length/correct/var": 51760.828125, "completion_length/incorrect": 903.7026977539062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 322.0, "completion_length/incorrect/p25": 899.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 50114.21875, "completion_length/max": 1024.0, "completion_length/median": 676.0, "completion_length/min": 289.0, "completion_length/p25": 462.25, "completion_length/p75": 1024.0, "completion_length/var": 76448.859375, "epoch": 0.0352, "feature_vector_variance/max_squared_error": 113296.9140625, "feature_vector_variance/metric": 30565.466796875, "generated_tokens/total": 2572067.0, "grad_norm": 0.3571113049983978, "grouped_std_rewards": 0.1599268615245819, "learning_rate": 1.0309549450619342e-05, "loss": 0.0, "mean_logprobs": -0.1396484375, "mean_logprobs/var": 0.0038604736328125, "num_completions/total": 4224, "per_sentence_gradient_norm": 7.673593044281006, "per_sentence_gradient_norm/max": 50.91804504394531, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 11.784261703491211, "per_sentence_gradient_norm/p85": 24.229293823242188, "per_sentence_gradient_norm/p90": 28.087379455566406, "per_sentence_gradient_norm/p95": 34.05250549316406, "per_sentence_gradient_norm/p99": 43.72726058959961, "per_sentence_gradient_norm/var": 156.3097381591797, "per_token_feature_norm": 177.40963745117188, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 176.0, "per_token_feature_norm/min": 69.5, "per_token_feature_norm/p25": 144.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 1916.9119873046875, "per_token_full_gradient_variance/max_squared_error": 4.4182939529418945, "per_token_full_gradient_variance/variance": 0.007810044568032026, "per_token_gradient_norm": 7.162797927856445, "per_token_gradient_norm/max": 619.9509887695312, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1107.94921875, "per_token_policy_error_norm": 0.07056022435426712, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.059141021221876144, "policy_entropy": 0.1453176736831665, "policy_entropy/max": 3.421875, "policy_entropy/median": 4.738569259643555e-06, "policy_entropy/min": 1.2656542480726785e-14, "policy_entropy/p25": 1.0803341865539551e-07, "policy_entropy/p75": 0.0228271484375, "policy_entropy/var": 0.12096654623746872, "policy_error_vector_variance/max_squared_error": 2.0013926029205322, "policy_error_vector_variance/metric": 0.07036435604095459, "policy_loss": 1.1796752907855534e-08, "policy_loss/max": 0.8537459373474121, "policy_loss/median": 0.0, "policy_loss/min": -1.6766761541366577, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.31565728783607483, "policy_sharpness": 7.801821231842041, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.5625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.378746032714844, "reward": 0.6145833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.23936404287815094, "rewards/accuracy_reward": 0.6145833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.23936404287815094, "sentence_full_gradient_variance/max_squared_error": 7420.7529296875, "sentence_full_gradient_variance/metric": 766.615234375, "sentence_full_gradient_variance/p75": 6.605424880981445, "sentence_full_gradient_variance/p90": 3256.36279296875, "sentence_full_gradient_variance/p95": 6441.0595703125, "sentence_full_gradient_variance/p99": 7359.03662109375, "state_level_variance/metric": 159.48440551757812, "state_level_variance_full_gradient/metric": 21.821456909179688, "step": 44 }, { "accuracy_reward": 0.875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.11052631586790085, "action_level_variance/metric": 51.8543701171875, "action_level_variance_full_gradient/metric": 251.55010986328125, "adam_stats/lr_effective_max": 5.6858352763811126e-05, "adam_stats/lr_effective_mean": 1.866186198629105e-11, "adam_stats/lr_effective_min": -5.644164411933161e-05, "adam_stats/m_t_max": 0.003136345185339451, "adam_stats/m_t_mean": -4.078762067677433e-12, "adam_stats/m_t_min": -0.002519796369597316, "adam_stats/v_t_max": 7.209036266431212e-05, "adam_stats/v_t_mean": 4.236277127700472e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.1175870895385742e-08, "advantages/max": 0.9680583477020264, "advantages/median": 0.24990005791187286, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.6311729550361633, "all_logprobs": -0.0926886647939682, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -9.375, "all_logprobs/p1": -1.9140625, "all_logprobs/p10": -0.166015625, "all_logprobs/p25": -0.0002334117889404297, "all_logprobs/p5": -0.578125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.13923391699790955, "clip_ratio": 0.0, "completion_length": 700.9375, "completion_length/correct": 683.547607421875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 656.0, "completion_length/correct/min": 231.0, "completion_length/correct/p25": 546.25, "completion_length/correct/p75": 898.25, "completion_length/correct/var": 55787.48046875, "completion_length/incorrect": 822.6666870117188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 827.0, "completion_length/incorrect/min": 569.0, "completion_length/incorrect/p25": 630.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 38042.2421875, "completion_length/max": 1024.0, "completion_length/median": 670.0, "completion_length/min": 231.0, "completion_length/p25": 557.0, "completion_length/p75": 903.5, "completion_length/var": 55284.66796875, "epoch": 0.036, "feature_vector_variance/max_squared_error": 121158.9765625, "feature_vector_variance/metric": 30912.18359375, "generated_tokens/total": 2639357.0, "grad_norm": 0.3002019226551056, "grouped_std_rewards": 0.22632715106010437, "learning_rate": 1.0065151074942516e-05, "loss": -0.0, "mean_logprobs": -0.099609375, "mean_logprobs/var": 0.0010986328125, "num_completions/total": 4320, "per_sentence_gradient_norm": 5.381115913391113, "per_sentence_gradient_norm/max": 52.14032745361328, "per_sentence_gradient_norm/median": 2.643852949142456, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 4.410536289215088, "per_sentence_gradient_norm/p85": 13.237117767333984, "per_sentence_gradient_norm/p90": 16.099449157714844, "per_sentence_gradient_norm/p95": 17.761043548583984, "per_sentence_gradient_norm/p99": 48.277015686035156, "per_sentence_gradient_norm/var": 74.92626953125, "per_token_feature_norm": 182.7447052001953, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 184.0, "per_token_feature_norm/min": 67.5, "per_token_feature_norm/p25": 154.0, "per_token_feature_norm/p75": 212.0, "per_token_feature_norm/var": 1602.63232421875, "per_token_full_gradient_variance/max_squared_error": 14.819523811340332, "per_token_full_gradient_variance/variance": 0.010585268959403038, "per_token_gradient_norm": 6.230982780456543, "per_token_gradient_norm/max": 1255.747802734375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1323.814697265625, "per_token_policy_error_norm": 0.050408706068992615, "per_token_policy_error_norm/max": 1.9921875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.042369991540908813, "policy_entropy": 0.10451171547174454, "policy_entropy/max": 3.53125, "policy_entropy/median": 1.55717134475708e-06, "policy_entropy/min": 8.222589276130066e-16, "policy_entropy/p25": 3.236345946788788e-08, "policy_entropy/p75": 0.0023040771484375, "policy_entropy/var": 0.07964339107275009, "policy_error_vector_variance/max_squared_error": 2.0035479068756104, "policy_error_vector_variance/metric": 0.05017566680908203, "policy_loss": -1.3659398057086491e-08, "policy_loss/max": 3.7485008239746094, "policy_loss/median": -0.24990005791187286, "policy_loss/min": -0.9680584669113159, "policy_loss/p25": -0.24990007281303406, "policy_loss/p75": 0.0, "policy_loss/var": 0.6311729550361633, "policy_sharpness": 8.244872093200684, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.5, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.289227485656738, "reward": 0.875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.11052631586790085, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.11052631586790085, "sentence_full_gradient_variance/max_squared_error": 4324.24755859375, "sentence_full_gradient_variance/metric": 258.21112060546875, "sentence_full_gradient_variance/p75": 71.24079132080078, "sentence_full_gradient_variance/p90": 868.3411865234375, "sentence_full_gradient_variance/p95": 1959.8056640625, "sentence_full_gradient_variance/p99": 2772.9345703125, "state_level_variance/metric": 30.6387939453125, "state_level_variance_full_gradient/metric": 6.660999298095703, "step": 45 }, { "accuracy_reward": 0.6875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21710526943206787, "action_level_variance/metric": 14.349052429199219, "action_level_variance_full_gradient/metric": 249.4747314453125, "adam_stats/lr_effective_max": 5.7156856200890616e-05, "adam_stats/lr_effective_mean": 1.0001151296012978e-10, "adam_stats/lr_effective_min": -5.759269333793782e-05, "adam_stats/m_t_max": 0.0051054260693490505, "adam_stats/m_t_mean": 5.9722331247069604e-12, "adam_stats/m_t_min": -0.0035983833950012922, "adam_stats/v_t_max": 7.225110311992466e-05, "adam_stats/v_t_mean": 4.286912838602497e-12, "adam_stats/v_t_min": 0.0, "advantages": -3.725290298461914e-09, "advantages/max": 1.0976732969284058, "advantages/median": 0.0, "advantages/min": -1.4358407258987427, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.3156619071960449, "all_logprobs": -0.12368281185626984, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -9.25, "all_logprobs/p1": -2.296875, "all_logprobs/p10": -0.28125, "all_logprobs/p25": -0.00180816650390625, "all_logprobs/p5": -0.8125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.2040897011756897, "clip_ratio": 0.0, "completion_length": 688.5625, "completion_length/correct": 578.4091186523438, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 648.0, "completion_length/correct/min": 106.0, "completion_length/correct/p25": 335.75, "completion_length/correct/p75": 805.75, "completion_length/correct/var": 79051.84375, "completion_length/incorrect": 930.9000244140625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 606.0, "completion_length/incorrect/p25": 866.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 20606.98828125, "completion_length/max": 1024.0, "completion_length/median": 747.0, "completion_length/min": 106.0, "completion_length/p25": 370.75, "completion_length/p75": 968.25, "completion_length/var": 87353.9609375, "epoch": 0.0368, "feature_vector_variance/max_squared_error": 123121.1875, "feature_vector_variance/metric": 29066.20703125, "generated_tokens/total": 2705459.0, "grad_norm": 0.3666095435619354, "grouped_std_rewards": 0.1651768535375595, "learning_rate": 9.817627457812105e-06, "loss": -0.0, "mean_logprobs": -0.125, "mean_logprobs/var": 0.0024871826171875, "num_completions/total": 4416, "per_sentence_gradient_norm": 4.7360992431640625, "per_sentence_gradient_norm/max": 30.145751953125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 7.039157390594482, "per_sentence_gradient_norm/p85": 13.849784851074219, "per_sentence_gradient_norm/p90": 18.863149642944336, "per_sentence_gradient_norm/p95": 22.039318084716797, "per_sentence_gradient_norm/p99": 28.65241241455078, "per_sentence_gradient_norm/var": 62.05736541748047, "per_token_feature_norm": 174.3594512939453, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 174.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 140.0, "per_token_feature_norm/p75": 206.0, "per_token_feature_norm/var": 1876.692626953125, "per_token_full_gradient_variance/max_squared_error": 2.0462286472320557, "per_token_full_gradient_variance/variance": 0.00616877619177103, "per_token_gradient_norm": 5.796092510223389, "per_token_gradient_norm/max": 458.3472595214844, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 854.8107299804688, "per_token_policy_error_norm": 0.06539110094308853, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05534958839416504, "policy_entropy": 0.13347174227237701, "policy_entropy/max": 3.203125, "policy_entropy/median": 4.0531158447265625e-06, "policy_entropy/min": 8.93729534823251e-15, "policy_entropy/p25": 7.264316082000732e-08, "policy_entropy/p75": 0.0140380859375, "policy_entropy/var": 0.10979003459215164, "policy_error_vector_variance/max_squared_error": 2.011521577835083, "policy_error_vector_variance/metric": 0.06519201397895813, "policy_loss": -7.450580596923828e-09, "policy_loss/max": 1.4358408451080322, "policy_loss/median": 0.0, "policy_loss/min": -1.0976734161376953, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.3156619369983673, "policy_sharpness": 7.91967248916626, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.62109375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.762212753295898, "reward": 0.6875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21710526943206787, "rewards/accuracy_reward": 0.6875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21710526943206787, "sentence_full_gradient_variance/max_squared_error": 4838.7236328125, "sentence_full_gradient_variance/metric": 293.5234069824219, "sentence_full_gradient_variance/p75": 0.07825279235839844, "sentence_full_gradient_variance/p90": 729.6937255859375, "sentence_full_gradient_variance/p95": 1655.2044677734375, "sentence_full_gradient_variance/p99": 4751.55615234375, "state_level_variance/metric": 57.550445556640625, "state_level_variance_full_gradient/metric": 44.0487174987793, "step": 46 }, { "accuracy_reward": 1.0, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 1.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": 0.0, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 5.015444548917003e-05, "adam_stats/lr_effective_mean": 8.77602365778607e-11, "adam_stats/lr_effective_min": -5.053678250988014e-05, "adam_stats/m_t_max": 0.004594883415848017, "adam_stats/m_t_mean": 5.375020307313294e-12, "adam_stats/m_t_min": -0.0032385450322180986, "adam_stats/v_t_max": 7.217885286081582e-05, "adam_stats/v_t_mean": 4.28262546953162e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.11514172703027725, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -8.6875, "all_logprobs/p1": -2.142812728881836, "all_logprobs/p10": -0.263671875, "all_logprobs/p25": -0.001560211181640625, "all_logprobs/p5": -0.75, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.17584869265556335, "clip_ratio": 0.0, "completion_length": 573.8229370117188, "completion_length/correct": 573.8229370117188, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 480.0, "completion_length/correct/min": 144.0, "completion_length/correct/p25": 362.75, "completion_length/correct/p75": 780.75, "completion_length/correct/var": 59813.63671875, "completion_length/max": 1024.0, "completion_length/median": 480.0, "completion_length/min": 144.0, "completion_length/p25": 362.75, "completion_length/p75": 780.75, "completion_length/var": 59813.63671875, "epoch": 0.0376, "feature_vector_variance/max_squared_error": 114661.9296875, "feature_vector_variance/metric": 30555.822265625, "generated_tokens/total": 2760546.0, "grad_norm": 0.0, "grouped_std_rewards": 0.0, "learning_rate": 9.567280168627493e-06, "loss": 0.0, "mean_logprobs": -0.11328125, "mean_logprobs/var": 0.00144195556640625, "num_completions/total": 4512, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 180.1948699951172, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 183.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 151.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 1615.95166015625, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.062498971819877625, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05254361033439636, "policy_entropy": 0.12623079121112823, "policy_entropy/max": 3.59375, "policy_entropy/median": 2.086162567138672e-06, "policy_entropy/min": 1.609823385706477e-15, "policy_entropy/p25": 2.7939677238464355e-08, "policy_entropy/p75": 0.01220703125, "policy_entropy/var": 0.09795446693897247, "policy_error_vector_variance/max_squared_error": 2.010298252105713, "policy_error_vector_variance/metric": 0.06232938915491104, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 7.957470417022705, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.99609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.58972454071045, "reward": 1.0, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 1.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.0, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 1.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 47 }, { "accuracy_reward": 0.7708333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17850875854492188, "action_level_variance/metric": 57.31904602050781, "action_level_variance_full_gradient/metric": 818.9473876953125, "adam_stats/lr_effective_max": 5.005556158721447e-05, "adam_stats/lr_effective_mean": 2.1983788400792292e-10, "adam_stats/lr_effective_min": -5.309766493155621e-05, "adam_stats/m_t_max": 0.0037588360719382763, "adam_stats/m_t_mean": 4.540424980437052e-11, "adam_stats/m_t_min": -0.004021385218948126, "adam_stats/v_t_max": 7.22556869732216e-05, "adam_stats/v_t_mean": 4.518440743089824e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.2417634698280722e-09, "advantages/max": 3.7485008239746094, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": 0.0, "advantages/p75": 0.36585545539855957, "advantages/var": 0.47339966893196106, "all_logprobs": -0.12013661861419678, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -10.0625, "all_logprobs/p1": -2.1298437118530273, "all_logprobs/p10": -0.28515625, "all_logprobs/p25": -0.00323486328125, "all_logprobs/p5": -0.77734375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.18498198688030243, "clip_ratio": 0.0, "completion_length": 568.4375, "completion_length/correct": 569.4324340820312, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 601.0, "completion_length/correct/min": 198.0, "completion_length/correct/p25": 463.75, "completion_length/correct/p75": 688.75, "completion_length/correct/var": 37893.7265625, "completion_length/incorrect": 565.0909423828125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 456.0, "completion_length/incorrect/min": 353.0, "completion_length/incorrect/p25": 427.25, "completion_length/incorrect/p75": 587.5, "completion_length/incorrect/var": 49015.8046875, "completion_length/max": 1024.0, "completion_length/median": 573.0, "completion_length/min": 198.0, "completion_length/p25": 440.75, "completion_length/p75": 684.25, "completion_length/var": 39956.77734375, "epoch": 0.0384, "feature_vector_variance/max_squared_error": 113017.2734375, "feature_vector_variance/metric": 32345.845703125, "generated_tokens/total": 2815116.0, "grad_norm": 0.7496240139007568, "grouped_std_rewards": 0.17837977409362793, "learning_rate": 9.314414216997507e-06, "loss": 0.0, "mean_logprobs": -0.12060546875, "mean_logprobs/var": 0.00153350830078125, "num_completions/total": 4608, "per_sentence_gradient_norm": 5.157897472381592, "per_sentence_gradient_norm/max": 53.08258819580078, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 5.019156455993652, "per_sentence_gradient_norm/p85": 9.838272094726562, "per_sentence_gradient_norm/p90": 13.393547058105469, "per_sentence_gradient_norm/p95": 28.256656646728516, "per_sentence_gradient_norm/p99": 39.636756896972656, "per_sentence_gradient_norm/var": 89.53791809082031, "per_token_feature_norm": 184.89788818359375, "per_token_feature_norm/max": 308.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 68.0, "per_token_feature_norm/p25": 159.0, "per_token_feature_norm/p75": 212.0, "per_token_feature_norm/var": 1436.1600341796875, "per_token_full_gradient_variance/max_squared_error": 12.993915557861328, "per_token_full_gradient_variance/variance": 0.011443193070590496, "per_token_gradient_norm": 6.287647724151611, "per_token_gradient_norm/max": 1194.8345947265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1359.9017333984375, "per_token_policy_error_norm": 0.06584099680185318, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05570366978645325, "policy_entropy": 0.12924028933048248, "policy_entropy/max": 3.765625, "policy_entropy/median": 3.2782554626464844e-06, "policy_entropy/min": 1.8821749714348357e-16, "policy_entropy/p25": 2.10711732506752e-08, "policy_entropy/p75": 0.023529052734375, "policy_entropy/var": 0.09423103928565979, "policy_error_vector_variance/max_squared_error": 2.011928081512451, "policy_error_vector_variance/metric": 0.0656459629535675, "policy_loss": 2.4835269396561444e-09, "policy_loss/max": 2.560988187789917, "policy_loss/median": 0.0, "policy_loss/min": -3.7485008239746094, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": 0.0, "policy_loss/var": 0.47339966893196106, "policy_sharpness": 7.817211627960205, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.61712646484375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.931750297546387, "reward": 0.7708333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17850875854492188, "rewards/accuracy_reward": 0.7708333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17850875854492188, "sentence_full_gradient_variance/max_squared_error": 43177.234375, "sentence_full_gradient_variance/metric": 844.1380615234375, "sentence_full_gradient_variance/p75": 334.4682312011719, "sentence_full_gradient_variance/p90": 864.7247314453125, "sentence_full_gradient_variance/p95": 1596.1572265625, "sentence_full_gradient_variance/p99": 10851.1513671875, "state_level_variance/metric": 41.84235763549805, "state_level_variance_full_gradient/metric": 25.19074821472168, "step": 48 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1666666716337204, "action_level_variance/metric": 28.864042282104492, "action_level_variance_full_gradient/metric": 755.3546752929688, "adam_stats/lr_effective_max": 5.135198080097325e-05, "adam_stats/lr_effective_mean": 9.563209946161777e-11, "adam_stats/lr_effective_min": -4.986518615623936e-05, "adam_stats/m_t_max": 0.0028991014696657658, "adam_stats/m_t_mean": 1.8247202360210935e-11, "adam_stats/m_t_min": -0.0027810093015432358, "adam_stats/v_t_max": 7.221776468213648e-05, "adam_stats/v_t_mean": 4.5592618222456416e-12, "adam_stats/v_t_min": 0.0, "advantages": 4.967053879312289e-09, "advantages/max": 0.5588920712471008, "advantages/median": 0.0, "advantages/min": -1.6766761541366577, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.15782415866851807, "all_logprobs": -0.1060996800661087, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -8.375, "all_logprobs/p1": -2.046875, "all_logprobs/p10": -0.23701143264770508, "all_logprobs/p25": -0.00193023681640625, "all_logprobs/p5": -0.69140625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.15797358751296997, "clip_ratio": 0.0, "completion_length": 669.2083740234375, "completion_length/correct": 607.3289794921875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 600.0, "completion_length/correct/min": 227.0, "completion_length/correct/p25": 470.75, "completion_length/correct/p75": 706.5, "completion_length/correct/var": 37873.71875, "completion_length/incorrect": 904.3500366210938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 299.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 61709.40234375, "completion_length/max": 1024.0, "completion_length/median": 623.0, "completion_length/min": 227.0, "completion_length/p25": 492.0, "completion_length/p75": 862.75, "completion_length/var": 56945.76953125, "epoch": 0.0392, "feature_vector_variance/max_squared_error": 121927.4375, "feature_vector_variance/metric": 32143.943359375, "generated_tokens/total": 2879360.0, "grad_norm": 0.3741137981414795, "grouped_std_rewards": 0.07453560084104538, "learning_rate": 9.059337681133194e-06, "loss": -0.0, "mean_logprobs": -0.10693359375, "mean_logprobs/var": 0.00193023681640625, "num_completions/total": 4704, "per_sentence_gradient_norm": 2.7780511379241943, "per_sentence_gradient_norm/max": 45.754180908203125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 6.910342216491699, "per_sentence_gradient_norm/p90": 9.661724090576172, "per_sentence_gradient_norm/p95": 12.1659517288208, "per_sentence_gradient_norm/p99": 44.29554748535156, "per_sentence_gradient_norm/var": 66.33890533447266, "per_token_feature_norm": 186.3511962890625, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 190.0, "per_token_feature_norm/min": 70.5, "per_token_feature_norm/p25": 161.0, "per_token_feature_norm/p75": 213.0, "per_token_feature_norm/var": 1458.680908203125, "per_token_full_gradient_variance/max_squared_error": 2.845564603805542, "per_token_full_gradient_variance/variance": 0.0024432865902781487, "per_token_gradient_norm": 1.6881380081176758, "per_token_gradient_norm/max": 528.02197265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 318.3748779296875, "per_token_policy_error_norm": 0.05903536453843117, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05010974034667015, "policy_entropy": 0.11502691358327866, "policy_entropy/max": 3.828125, "policy_entropy/median": 2.9653310775756836e-06, "policy_entropy/min": 1.196959198423997e-16, "policy_entropy/p25": 1.234002411365509e-08, "policy_entropy/p75": 0.0140380859375, "policy_entropy/var": 0.07967985421419144, "policy_error_vector_variance/max_squared_error": 2.010242223739624, "policy_error_vector_variance/metric": 0.0589846707880497, "policy_loss": -4.967053879312289e-09, "policy_loss/max": 1.6766761541366577, "policy_loss/median": 0.0, "policy_loss/min": -0.5588921308517456, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.15782415866851807, "policy_sharpness": 7.9387640953063965, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.74609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.256448745727539, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1666666716337204, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1666666716337204, "sentence_full_gradient_variance/max_squared_error": 17305.419921875, "sentence_full_gradient_variance/metric": 755.7879638671875, "sentence_full_gradient_variance/p75": 0.08662590384483337, "sentence_full_gradient_variance/p90": 1608.736572265625, "sentence_full_gradient_variance/p95": 2286.5400390625, "sentence_full_gradient_variance/p99": 14705.822265625, "state_level_variance/metric": 46.3054084777832, "state_level_variance_full_gradient/metric": 0.43312931060791016, "step": 49 }, { "accuracy_reward": 0.8229166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14725877344608307, "action_level_variance/metric": 6.748546123504639, "action_level_variance_full_gradient/metric": 0.8256150484085083, "adam_stats/lr_effective_max": 4.774187982548028e-05, "adam_stats/lr_effective_mean": -2.2072497996017404e-10, "adam_stats/lr_effective_min": -4.739432188216597e-05, "adam_stats/m_t_max": 0.005700775887817144, "adam_stats/m_t_mean": -2.6905743258365256e-11, "adam_stats/m_t_min": -0.008069315925240517, "adam_stats/v_t_max": 7.218393147923052e-05, "adam_stats/v_t_mean": 4.9090826889641725e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.2417634698280722e-09, "advantages/max": 0.24990005791187286, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.15776851773262024, "all_logprobs": -0.07811035960912704, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.9375, "all_logprobs/p1": -1.7890625, "all_logprobs/p10": -0.1064453125, "all_logprobs/p25": -4.5299530029296875e-05, "all_logprobs/p5": -0.447265625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12075011432170868, "clip_ratio": 0.0, "completion_length": 623.9166870117188, "completion_length/correct": 593.594970703125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 517.0, "completion_length/correct/min": 275.0, "completion_length/correct/p25": 416.5, "completion_length/correct/p75": 789.5, "completion_length/correct/var": 62360.08203125, "completion_length/incorrect": 764.8235473632812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 778.0, "completion_length/incorrect/min": 612.0, "completion_length/incorrect/p25": 692.0, "completion_length/incorrect/p75": 808.0, "completion_length/incorrect/var": 9470.154296875, "completion_length/max": 1024.0, "completion_length/median": 563.0, "completion_length/min": 275.0, "completion_length/p25": 443.0, "completion_length/p75": 806.5, "completion_length/var": 57113.3984375, "epoch": 0.04, "feature_vector_variance/max_squared_error": 131965.125, "feature_vector_variance/metric": 31289.56640625, "generated_tokens/total": 2939256.0, "grad_norm": 0.8301342129707336, "grouped_std_rewards": 0.0416666679084301, "learning_rate": 8.80236133250198e-06, "loss": -0.0, "mean_logprobs": -0.08056640625, "mean_logprobs/var": 0.00080108642578125, "num_completions/total": 4800, "per_sentence_gradient_norm": 0.593246340751648, "per_sentence_gradient_norm/max": 27.38275909423828, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 1.5256025791168213, "per_sentence_gradient_norm/p90": 1.8493125438690186, "per_sentence_gradient_norm/p95": 2.097219705581665, "per_sentence_gradient_norm/p99": 4.152571678161621, "per_sentence_gradient_norm/var": 8.171589851379395, "per_token_feature_norm": 193.9700469970703, "per_token_feature_norm/max": 308.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 70.0, "per_token_feature_norm/p25": 176.0, "per_token_feature_norm/p75": 216.0, "per_token_feature_norm/var": 1106.3577880859375, "per_token_full_gradient_variance/max_squared_error": 10.60145378112793, "per_token_full_gradient_variance/variance": 0.002854208694770932, "per_token_gradient_norm": 0.9538783431053162, "per_token_gradient_norm/max": 998.0676879882812, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 259.3094482421875, "per_token_policy_error_norm": 0.04289153218269348, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.037779729813337326, "policy_entropy": 0.08518762141466141, "policy_entropy/max": 3.796875, "policy_entropy/median": 1.7136335372924805e-07, "policy_entropy/min": 2.862293735361732e-17, "policy_entropy/p25": 2.1973391994833946e-09, "policy_entropy/p75": 0.000499725341796875, "policy_entropy/var": 0.05919582396745682, "policy_error_vector_variance/max_squared_error": 1.998643398284912, "policy_error_vector_variance/metric": 0.04279601573944092, "policy_loss": -3.725290298461914e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -0.24990007281303406, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.15776850283145905, "policy_sharpness": 8.448473930358887, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.229737281799316, "reward": 0.8229166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14725877344608307, "rewards/accuracy_reward": 0.8229166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14725877344608307, "sentence_full_gradient_variance/max_squared_error": 50.231719970703125, "sentence_full_gradient_variance/metric": 0.9641736745834351, "sentence_full_gradient_variance/p75": 0.027711722999811172, "sentence_full_gradient_variance/p90": 0.027711722999811172, "sentence_full_gradient_variance/p95": 0.027789877727627754, "sentence_full_gradient_variance/p99": 33.378631591796875, "state_level_variance/metric": 2.111647129058838, "state_level_variance_full_gradient/metric": 0.13855858147144318, "step": 50 }, { "accuracy_reward": 0.8125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1539473682641983, "action_level_variance/metric": 292.8590087890625, "action_level_variance_full_gradient/metric": 1171.745361328125, "adam_stats/lr_effective_max": 4.603386332746595e-05, "adam_stats/lr_effective_mean": -3.6803619180014735e-11, "adam_stats/lr_effective_min": -4.901022111880593e-05, "adam_stats/m_t_max": 0.007218100596219301, "adam_stats/m_t_mean": -9.236993982197905e-12, "adam_stats/m_t_min": -0.009508478455245495, "adam_stats/v_t_max": 7.211863703560084e-05, "adam_stats/v_t_mean": 4.98946457067051e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.2417634920325327e-08, "advantages/max": 2.560988187789917, "advantages/median": 0.24990005791187286, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.7888764142990112, "all_logprobs": -0.1422162503004074, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -8.8125, "all_logprobs/p1": -2.43359375, "all_logprobs/p10": -0.3828125, "all_logprobs/p25": -0.0086669921875, "all_logprobs/p5": -0.9453125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.2298155128955841, "clip_ratio": 0.0, "completion_length": 719.0208740234375, "completion_length/correct": 679.7692260742188, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 626.0, "completion_length/correct/min": 374.0, "completion_length/correct/p25": 504.5, "completion_length/correct/p75": 839.75, "completion_length/correct/var": 42067.8984375, "completion_length/incorrect": 889.1111450195312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 482.0, "completion_length/incorrect/p25": 755.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 37658.57421875, "completion_length/max": 1024.0, "completion_length/median": 703.0, "completion_length/min": 374.0, "completion_length/p25": 540.5, "completion_length/p75": 921.25, "completion_length/var": 47582.6328125, "epoch": 0.0408, "feature_vector_variance/max_squared_error": 117739.1015625, "feature_vector_variance/metric": 36022.40234375, "generated_tokens/total": 3008282.0, "grad_norm": 0.606854259967804, "grouped_std_rewards": 0.22359417378902435, "learning_rate": 8.543798257200491e-06, "loss": -0.0, "mean_logprobs": -0.1279296875, "mean_logprobs/var": 0.008056640625, "num_completions/total": 4896, "per_sentence_gradient_norm": 8.157821655273438, "per_sentence_gradient_norm/max": 142.54129028320312, "per_sentence_gradient_norm/median": 3.436054229736328, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 2.219231367111206, "per_sentence_gradient_norm/p75": 8.253040313720703, "per_sentence_gradient_norm/p85": 10.413839340209961, "per_sentence_gradient_norm/p90": 11.461929321289062, "per_sentence_gradient_norm/p95": 33.17072296142578, "per_sentence_gradient_norm/p99": 87.80381774902344, "per_sentence_gradient_norm/var": 312.3520202636719, "per_token_feature_norm": 193.30918884277344, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 75.0, "per_token_feature_norm/p25": 171.0, "per_token_feature_norm/p75": 216.0, "per_token_feature_norm/var": 1502.671142578125, "per_token_full_gradient_variance/max_squared_error": 24.071643829345703, "per_token_full_gradient_variance/variance": 0.02132178284227848, "per_token_gradient_norm": 9.242265701293945, "per_token_gradient_norm/max": 1423.2589111328125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2725.522216796875, "per_token_policy_error_norm": 0.0743880420923233, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06175732612609863, "policy_entropy": 0.15963134169578552, "policy_entropy/max": 3.09375, "policy_entropy/median": 6.198883056640625e-06, "policy_entropy/min": 4.787836793695988e-16, "policy_entropy/p25": 1.8044374883174896e-08, "policy_entropy/p75": 0.052490234375, "policy_entropy/var": 0.13504934310913086, "policy_error_vector_variance/max_squared_error": 2.017012119293213, "policy_error_vector_variance/metric": 0.07422316819429398, "policy_loss": -1.2417634920325327e-08, "policy_loss/max": 3.7485010623931885, "policy_loss/median": -0.24990005791187286, "policy_loss/min": -2.560988187789917, "policy_loss/p25": -0.24990007281303406, "policy_loss/p75": 0.0, "policy_loss/var": 0.788876473903656, "policy_sharpness": 7.55954647064209, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.998046636581421, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.245282173156738, "reward": 0.8125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1539473682641983, "rewards/accuracy_reward": 0.8125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1539473682641983, "sentence_full_gradient_variance/max_squared_error": 39584.9765625, "sentence_full_gradient_variance/metric": 1196.319091796875, "sentence_full_gradient_variance/p75": 115.9582290649414, "sentence_full_gradient_variance/p90": 192.81494140625, "sentence_full_gradient_variance/p95": 396.36370849609375, "sentence_full_gradient_variance/p99": 35745.55859375, "state_level_variance/metric": 41.45161437988281, "state_level_variance_full_gradient/metric": 24.573768615722656, "step": 51 }, { "accuracy_reward": 0.5625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24868421256542206, "action_level_variance/metric": 91.13957977294922, "action_level_variance_full_gradient/metric": 297.35784912109375, "adam_stats/lr_effective_max": 4.678680852521211e-05, "adam_stats/lr_effective_mean": -2.332289222861661e-11, "adam_stats/lr_effective_min": -4.7328390792245045e-05, "adam_stats/m_t_max": 0.00732026482000947, "adam_stats/m_t_mean": -1.856725190263475e-11, "adam_stats/m_t_min": -0.008932996541261673, "adam_stats/v_t_max": 7.248223846545443e-05, "adam_stats/v_t_mean": 4.99911874049519e-12, "adam_stats/v_t_min": 0.0, "advantages": -1.2417634698280722e-09, "advantages/max": 3.7485008239746094, "advantages/median": 0.0, "advantages/min": -0.6526548862457275, "advantages/p25": -0.24990005791187286, "advantages/p75": 0.0, "advantages/var": 0.3155972957611084, "all_logprobs": -0.10408671945333481, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.25, "all_logprobs/p1": -2.0, "all_logprobs/p10": -0.212890625, "all_logprobs/p25": -0.0009746551513671875, "all_logprobs/p5": -0.69140625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1551501601934433, "clip_ratio": 0.0, "completion_length": 671.40625, "completion_length/correct": 695.0, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 595.0, "completion_length/correct/min": 439.0, "completion_length/correct/p25": 534.75, "completion_length/correct/p75": 926.75, "completion_length/correct/var": 47142.265625, "completion_length/incorrect": 641.0714111328125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 587.0, "completion_length/incorrect/min": 187.0, "completion_length/incorrect/p25": 432.5, "completion_length/incorrect/p75": 978.5, "completion_length/incorrect/var": 69877.53125, "completion_length/max": 1024.0, "completion_length/median": 594.0, "completion_length/min": 187.0, "completion_length/p25": 492.0, "completion_length/p75": 955.0, "completion_length/var": 57181.3359375, "epoch": 0.0416, "feature_vector_variance/max_squared_error": 120078.703125, "feature_vector_variance/metric": 33191.8359375, "generated_tokens/total": 3072737.0, "grad_norm": 0.2895106375217438, "grouped_std_rewards": 0.12145226448774338, "learning_rate": 8.283963474507402e-06, "loss": 0.0, "mean_logprobs": -0.1142578125, "mean_logprobs/var": 0.0037841796875, "num_completions/total": 4992, "per_sentence_gradient_norm": 3.715784788131714, "per_sentence_gradient_norm/max": 98.3093490600586, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 6.475322723388672, "per_sentence_gradient_norm/p85": 7.5674519538879395, "per_sentence_gradient_norm/p90": 8.547676086425781, "per_sentence_gradient_norm/p95": 10.66307544708252, "per_sentence_gradient_norm/p99": 27.165048599243164, "per_sentence_gradient_norm/var": 115.73650360107422, "per_token_feature_norm": 191.89295959472656, "per_token_feature_norm/max": 308.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 70.5, "per_token_feature_norm/p25": 175.0, "per_token_feature_norm/p75": 213.0, "per_token_feature_norm/var": 1136.2191162109375, "per_token_full_gradient_variance/max_squared_error": 12.898237228393555, "per_token_full_gradient_variance/variance": 0.005790152586996555, "per_token_gradient_norm": 3.9086854457855225, "per_token_gradient_norm/max": 1317.773681640625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 841.3970336914062, "per_token_policy_error_norm": 0.05741100013256073, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04892773553729057, "policy_entropy": 0.11404970288276672, "policy_entropy/max": 3.828125, "policy_entropy/median": 4.5262277126312256e-07, "policy_entropy/min": 2.96637714392034e-16, "policy_entropy/p25": 2.735760062932968e-09, "policy_entropy/p75": 0.00811767578125, "policy_entropy/var": 0.08293288946151733, "policy_error_vector_variance/max_squared_error": 2.0066397190093994, "policy_error_vector_variance/metric": 0.057375941425561905, "policy_loss": 0.0, "policy_loss/max": 0.6526548862457275, "policy_loss/median": 0.0, "policy_loss/min": -3.7485008239746094, "policy_loss/p25": 0.0, "policy_loss/p75": 0.24990004301071167, "policy_loss/var": 0.3155972957611084, "policy_sharpness": 8.049287796020508, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.75, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.033404350280762, "reward": 0.5625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24868421256542206, "rewards/accuracy_reward": 0.5625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24868421256542206, "sentence_full_gradient_variance/max_squared_error": 24475.732421875, "sentence_full_gradient_variance/metric": 298.3404541015625, "sentence_full_gradient_variance/p75": 0.09491195529699326, "sentence_full_gradient_variance/p90": 123.44088745117188, "sentence_full_gradient_variance/p95": 262.5024719238281, "sentence_full_gradient_variance/p99": 2680.280029296875, "state_level_variance/metric": 34.905067443847656, "state_level_variance_full_gradient/metric": 0.9826111197471619, "step": 52 }, { "accuracy_reward": 0.71875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20427633821964264, "action_level_variance/metric": 53.57261276245117, "action_level_variance_full_gradient/metric": 804.89306640625, "adam_stats/lr_effective_max": 4.6622320951428264e-05, "adam_stats/lr_effective_mean": 1.8911547675082296e-11, "adam_stats/lr_effective_min": -4.6900269808247685e-05, "adam_stats/m_t_max": 0.009957379661500454, "adam_stats/m_t_mean": -2.7753022102672276e-11, "adam_stats/m_t_min": -0.01049331109970808, "adam_stats/v_t_max": 7.245320739457384e-05, "adam_stats/v_t_mean": 5.2157432019195316e-12, "adam_stats/v_t_min": 0.0, "advantages": 2.4835269396561444e-09, "advantages/max": 2.560988187789917, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": -0.36585545539855957, "advantages/p75": 0.36585545539855957, "advantages/var": 0.63124018907547, "all_logprobs": -0.06652133166790009, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.8125, "all_logprobs/p1": -1.6015625, "all_logprobs/p10": -0.06201171875, "all_logprobs/p25": -1.6689300537109375e-05, "all_logprobs/p5": -0.357421875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.09867437928915024, "clip_ratio": 0.0, "completion_length": 776.9583740234375, "completion_length/correct": 777.6376953125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 835.0, "completion_length/correct/min": 320.0, "completion_length/correct/p25": 569.0, "completion_length/correct/p75": 999.0, "completion_length/correct/var": 51291.55859375, "completion_length/incorrect": 775.2222290039062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 811.0, "completion_length/incorrect/min": 331.0, "completion_length/incorrect/p25": 544.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 63555.79296875, "completion_length/max": 1024.0, "completion_length/median": 833.0, "completion_length/min": 320.0, "completion_length/p25": 549.75, "completion_length/p75": 1018.75, "completion_length/var": 54109.36328125, "epoch": 0.0424, "feature_vector_variance/max_squared_error": 122961.8828125, "feature_vector_variance/metric": 30516.990234375, "generated_tokens/total": 3147325.0, "grad_norm": 0.7335529327392578, "grouped_std_rewards": 0.2561737895011902, "learning_rate": 8.02317355308094e-06, "loss": 0.0, "mean_logprobs": -0.06591796875, "mean_logprobs/var": 0.000659942626953125, "num_completions/total": 5088, "per_sentence_gradient_norm": 5.053387641906738, "per_sentence_gradient_norm/max": 57.8731575012207, "per_sentence_gradient_norm/median": 3.1322920322418213, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 5.980415344238281, "per_sentence_gradient_norm/p85": 8.375177383422852, "per_sentence_gradient_norm/p90": 11.32485294342041, "per_sentence_gradient_norm/p95": 15.990966796875, "per_sentence_gradient_norm/p99": 35.26897430419922, "per_sentence_gradient_norm/var": 66.36739349365234, "per_token_feature_norm": 191.38931274414062, "per_token_feature_norm/max": 294.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 71.0, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 844.6759643554688, "per_token_full_gradient_variance/max_squared_error": 3.231403112411499, "per_token_full_gradient_variance/variance": 0.009075225330889225, "per_token_gradient_norm": 5.573662757873535, "per_token_gradient_norm/max": 802.8698120117188, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1063.11767578125, "per_token_policy_error_norm": 0.037055015563964844, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03233712166547775, "policy_entropy": 0.07364527136087418, "policy_entropy/max": 3.65625, "policy_entropy/median": 1.019798219203949e-07, "policy_entropy/min": 1.1102230246251565e-16, "policy_entropy/p25": 1.4042598195374012e-09, "policy_entropy/p75": 0.00020313262939453125, "policy_entropy/var": 0.05220269784331322, "policy_error_vector_variance/max_squared_error": 2.008629322052002, "policy_error_vector_variance/metric": 0.03701801970601082, "policy_loss": -2.4835269396561444e-09, "policy_loss/max": 2.560988187789917, "policy_loss/median": 0.0, "policy_loss/min": -2.560988187789917, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": 0.36585545539855957, "policy_loss/var": 0.63124018907547, "policy_sharpness": 8.60250473022461, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.377405166625977, "reward": 0.71875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20427633821964264, "rewards/accuracy_reward": 0.71875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20427633821964264, "sentence_full_gradient_variance/max_squared_error": 41751.15625, "sentence_full_gradient_variance/metric": 811.5050048828125, "sentence_full_gradient_variance/p75": 95.3930892944336, "sentence_full_gradient_variance/p90": 472.56243896484375, "sentence_full_gradient_variance/p95": 788.45166015625, "sentence_full_gradient_variance/p99": 27192.408203125, "state_level_variance/metric": 18.542095184326172, "state_level_variance_full_gradient/metric": 6.611824035644531, "step": 53 }, { "accuracy_reward": 0.5729166865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2472587525844574, "action_level_variance/metric": 47.865535736083984, "action_level_variance_full_gradient/metric": 759.2047729492188, "adam_stats/lr_effective_max": 4.5614495320478454e-05, "adam_stats/lr_effective_mean": -1.9978492818428784e-11, "adam_stats/lr_effective_min": -4.512908708420582e-05, "adam_stats/m_t_max": 0.008015596307814121, "adam_stats/m_t_mean": -1.92272222448997e-11, "adam_stats/m_t_min": -0.008259897120296955, "adam_stats/v_t_max": 7.27250226191245e-05, "adam_stats/v_t_mean": 5.263643253899941e-12, "adam_stats/v_t_min": 0.0, "advantages": -6.208817460162663e-09, "advantages/max": 1.6766761541366577, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": -0.5588920712471008, "advantages/p75": 0.0, "advantages/var": 0.4734553396701813, "all_logprobs": -0.07038488239049911, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.4375, "all_logprobs/p1": -1.703125, "all_logprobs/p10": -0.0791015625, "all_logprobs/p25": -4.649162292480469e-05, "all_logprobs/p5": -0.38671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.10381388664245605, "clip_ratio": 0.0, "completion_length": 694.1666870117188, "completion_length/correct": 496.6908874511719, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 433.0, "completion_length/correct/min": 179.0, "completion_length/correct/p25": 376.0, "completion_length/correct/p75": 485.5, "completion_length/correct/var": 53460.328125, "completion_length/incorrect": 959.0731201171875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 317.0, "completion_length/incorrect/p25": 972.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 24693.37109375, "completion_length/max": 1024.0, "completion_length/median": 604.0, "completion_length/min": 179.0, "completion_length/p25": 414.5, "completion_length/p75": 1024.0, "completion_length/var": 93648.453125, "epoch": 0.0432, "feature_vector_variance/max_squared_error": 124304.53125, "feature_vector_variance/metric": 30783.57421875, "generated_tokens/total": 3213965.0, "grad_norm": 0.4885748624801636, "grouped_std_rewards": 0.21124869585037231, "learning_rate": 7.76174622526876e-06, "loss": 0.0, "mean_logprobs": -0.07080078125, "mean_logprobs/var": 0.001007080078125, "num_completions/total": 5184, "per_sentence_gradient_norm": 4.750304698944092, "per_sentence_gradient_norm/max": 43.85743713378906, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 5.134367942810059, "per_sentence_gradient_norm/p85": 10.008591651916504, "per_sentence_gradient_norm/p90": 11.193477630615234, "per_sentence_gradient_norm/p95": 19.16182518005371, "per_sentence_gradient_norm/p99": 43.49713134765625, "per_sentence_gradient_norm/var": 74.407958984375, "per_token_feature_norm": 194.73007202148438, "per_token_feature_norm/max": 292.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 78.5, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 211.0, "per_token_feature_norm/var": 764.6438598632812, "per_token_full_gradient_variance/max_squared_error": 3.922051191329956, "per_token_full_gradient_variance/variance": 0.008051048032939434, "per_token_gradient_norm": 5.315364360809326, "per_token_gradient_norm/max": 792.9860229492188, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 912.7573852539062, "per_token_policy_error_norm": 0.039397116750478745, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.035032469779253006, "policy_entropy": 0.07726877182722092, "policy_entropy/max": 2.515625, "policy_entropy/median": 1.2293457984924316e-07, "policy_entropy/min": 3.0184188481996443e-16, "policy_entropy/p25": 1.1859810911118984e-09, "policy_entropy/p75": 0.000530242919921875, "policy_entropy/var": 0.051641181111335754, "policy_error_vector_variance/max_squared_error": 2.0062313079833984, "policy_error_vector_variance/metric": 0.039358578622341156, "policy_loss": 7.450580596923828e-09, "policy_loss/max": 2.560988187789917, "policy_loss/median": 0.0, "policy_loss/min": -1.6766760349273682, "policy_loss/p25": 0.0, "policy_loss/p75": 0.5588920712471008, "policy_loss/var": 0.4734553396701813, "policy_sharpness": 8.493313789367676, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.825139999389648, "reward": 0.5729166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2472587525844574, "rewards/accuracy_reward": 0.5729166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2472587525844574, "sentence_full_gradient_variance/max_squared_error": 38964.4609375, "sentence_full_gradient_variance/metric": 759.4437866210938, "sentence_full_gradient_variance/p75": 4.84583854675293, "sentence_full_gradient_variance/p90": 872.9195556640625, "sentence_full_gradient_variance/p95": 913.0472412109375, "sentence_full_gradient_variance/p99": 21961.638671875, "state_level_variance/metric": 34.5107307434082, "state_level_variance_full_gradient/metric": 0.23899583518505096, "step": 54 }, { "accuracy_reward": 0.8541666865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.12587720155715942, "action_level_variance/metric": 34.06804275512695, "action_level_variance_full_gradient/metric": 1564.852783203125, "adam_stats/lr_effective_max": 4.216997331241146e-05, "adam_stats/lr_effective_mean": -2.1919410730930622e-10, "adam_stats/lr_effective_min": -4.3852451199200004e-05, "adam_stats/m_t_max": 0.009923998266458511, "adam_stats/m_t_mean": -3.6834361949455996e-11, "adam_stats/m_t_min": -0.009313790127635002, "adam_stats/v_t_max": 7.321311568375677e-05, "adam_stats/v_t_mean": 5.6327997821625786e-12, "adam_stats/v_t_min": 0.0, "advantages": 4.967053879312289e-09, "advantages/max": 1.4358407258987427, "advantages/median": 0.24990005791187286, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.6311343312263489, "all_logprobs": -0.06897444278001785, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.9375, "all_logprobs/p1": -1.5859375, "all_logprobs/p10": -0.07958984375, "all_logprobs/p25": -5.817413330078125e-05, "all_logprobs/p5": -0.38671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.0987255647778511, "clip_ratio": 0.0, "completion_length": 534.0, "completion_length/correct": 539.0975341796875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 512.0, "completion_length/correct/min": 304.0, "completion_length/correct/p25": 420.75, "completion_length/correct/p75": 669.5, "completion_length/correct/var": 24509.59765625, "completion_length/incorrect": 504.14288330078125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 406.0, "completion_length/incorrect/min": 306.0, "completion_length/incorrect/p25": 329.75, "completion_length/incorrect/p75": 522.5, "completion_length/incorrect/var": 60088.75, "completion_length/max": 1024.0, "completion_length/median": 506.0, "completion_length/min": 304.0, "completion_length/p25": 403.0, "completion_length/p75": 662.5, "completion_length/var": 29274.126953125, "epoch": 0.044, "feature_vector_variance/max_squared_error": 122617.2890625, "feature_vector_variance/metric": 31306.30078125, "generated_tokens/total": 3265229.0, "grad_norm": 0.9445539116859436, "grouped_std_rewards": 0.20478558540344238, "learning_rate": 7.5e-06, "loss": 0.0, "mean_logprobs": -0.0712890625, "mean_logprobs/var": 0.00089263916015625, "num_completions/total": 5280, "per_sentence_gradient_norm": 4.378464698791504, "per_sentence_gradient_norm/max": 39.70716857910156, "per_sentence_gradient_norm/median": 2.029109001159668, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.1371521949768066, "per_sentence_gradient_norm/p85": 10.00564193725586, "per_sentence_gradient_norm/p90": 13.26226806640625, "per_sentence_gradient_norm/p95": 20.26576805114746, "per_sentence_gradient_norm/p99": 29.705636978149414, "per_sentence_gradient_norm/var": 52.62914276123047, "per_token_feature_norm": 195.60301208496094, "per_token_feature_norm/max": 290.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 69.5, "per_token_feature_norm/p25": 184.0, "per_token_feature_norm/p75": 211.0, "per_token_feature_norm/var": 695.9092407226562, "per_token_full_gradient_variance/max_squared_error": 6.7843804359436035, "per_token_full_gradient_variance/variance": 0.007764102891087532, "per_token_gradient_norm": 4.049077987670898, "per_token_gradient_norm/max": 1093.7716064453125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 875.8720092773438, "per_token_policy_error_norm": 0.03905872628092766, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.034303296357393265, "policy_entropy": 0.0761248841881752, "policy_entropy/max": 2.609375, "policy_entropy/median": 1.2386590242385864e-07, "policy_entropy/min": 1.0885389811754465e-16, "policy_entropy/p25": 1.2369127944111824e-09, "policy_entropy/p75": 0.0006256103515625, "policy_entropy/var": 0.04844282940030098, "policy_error_vector_variance/max_squared_error": 2.0093560218811035, "policy_error_vector_variance/metric": 0.03903117775917053, "policy_loss": 0.0, "policy_loss/max": 3.7485010623931885, "policy_loss/median": -0.24990004301071167, "policy_loss/min": -1.4358408451080322, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.0, "policy_loss/var": 0.6311343312263489, "policy_sharpness": 8.479247093200684, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.841012001037598, "reward": 0.8541666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.12587720155715942, "rewards/accuracy_reward": 0.8541666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.12587720155715942, "sentence_full_gradient_variance/max_squared_error": 58848.87890625, "sentence_full_gradient_variance/metric": 1567.32080078125, "sentence_full_gradient_variance/p75": 221.73316955566406, "sentence_full_gradient_variance/p90": 2856.3837890625, "sentence_full_gradient_variance/p95": 6268.1875, "sentence_full_gradient_variance/p99": 21557.037109375, "state_level_variance/metric": 24.170557022094727, "state_level_variance_full_gradient/metric": 2.4679532051086426, "step": 55 }, { "accuracy_reward": 0.8229166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14725877344608307, "action_level_variance/metric": 16.61374282836914, "action_level_variance_full_gradient/metric": 481.4031982421875, "adam_stats/lr_effective_max": 4.1285788029199466e-05, "adam_stats/lr_effective_mean": -2.2483494233060952e-10, "adam_stats/lr_effective_min": -4.1850795241771266e-05, "adam_stats/m_t_max": 0.010213336907327175, "adam_stats/m_t_mean": -4.466399472491389e-11, "adam_stats/m_t_min": -0.010091395117342472, "adam_stats/v_t_max": 7.350253144977614e-05, "adam_stats/v_t_mean": 5.6894979179322736e-12, "adam_stats/v_t_min": 0.0, "advantages": 4.967053879312289e-09, "advantages/max": 0.8537459373474121, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.46501490473747253, "advantages/var": 0.6312496662139893, "all_logprobs": -0.052286285907030106, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.0, "all_logprobs/p1": -1.421875, "all_logprobs/p10": -0.023193359375, "all_logprobs/p25": -2.2649765014648438e-06, "all_logprobs/p5": -0.21484375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.07875475287437439, "clip_ratio": 0.0, "completion_length": 718.2396240234375, "completion_length/correct": 682.7215576171875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 608.0, "completion_length/correct/min": 279.0, "completion_length/correct/p25": 478.5, "completion_length/correct/p75": 945.5, "completion_length/correct/var": 71425.9453125, "completion_length/incorrect": 883.2941284179688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 944.0, "completion_length/incorrect/min": 464.0, "completion_length/incorrect/p25": 796.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 32427.470703125, "completion_length/max": 1024.0, "completion_length/median": 787.0, "completion_length/min": 279.0, "completion_length/p25": 493.75, "completion_length/p75": 998.5, "completion_length/var": 70030.0546875, "epoch": 0.0448, "feature_vector_variance/max_squared_error": 122555.828125, "feature_vector_variance/metric": 29267.873046875, "generated_tokens/total": 3334180.0, "grad_norm": 0.4611346125602722, "grouped_std_rewards": 0.277576744556427, "learning_rate": 7.238253774731245e-06, "loss": -0.0, "mean_logprobs": -0.05517578125, "mean_logprobs/var": 0.00049591064453125, "num_completions/total": 5376, "per_sentence_gradient_norm": 3.9324111938476562, "per_sentence_gradient_norm/max": 27.790969848632812, "per_sentence_gradient_norm/median": 2.7235336303710938, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 5.063642978668213, "per_sentence_gradient_norm/p85": 6.658792495727539, "per_sentence_gradient_norm/p90": 10.223159790039062, "per_sentence_gradient_norm/p95": 12.440010070800781, "per_sentence_gradient_norm/p99": 25.56500816345215, "per_sentence_gradient_norm/var": 26.813209533691406, "per_token_feature_norm": 189.8064727783203, "per_token_feature_norm/max": 290.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 71.5, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 206.0, "per_token_feature_norm/var": 747.3753662109375, "per_token_full_gradient_variance/max_squared_error": 7.221906661987305, "per_token_full_gradient_variance/variance": 0.006266288459300995, "per_token_gradient_norm": 4.555848598480225, "per_token_gradient_norm/max": 1019.5336303710938, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 732.0816040039062, "per_token_policy_error_norm": 0.029443293809890747, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.02663196064531803, "policy_entropy": 0.056728340685367584, "policy_entropy/max": 2.5, "policy_entropy/median": 4.330649971961975e-08, "policy_entropy/min": 7.155734338404329e-17, "policy_entropy/p25": 4.420144250616431e-10, "policy_entropy/p75": 3.24249267578125e-05, "policy_entropy/var": 0.038245219737291336, "policy_error_vector_variance/max_squared_error": 1.978337049484253, "policy_error_vector_variance/metric": 0.02939571626484394, "policy_loss": 2.4835269396561444e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -0.8537459373474121, "policy_loss/p25": -0.46501490473747253, "policy_loss/p75": 0.0, "policy_loss/var": 0.6312496662139893, "policy_sharpness": 8.860587120056152, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 7.033814430236816, "reward": 0.8229166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14725877344608307, "rewards/accuracy_reward": 0.8229166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14725877344608307, "sentence_full_gradient_variance/max_squared_error": 16918.26953125, "sentence_full_gradient_variance/metric": 492.28033447265625, "sentence_full_gradient_variance/p75": 150.35653686523438, "sentence_full_gradient_variance/p90": 648.1765747070312, "sentence_full_gradient_variance/p95": 965.765380859375, "sentence_full_gradient_variance/p99": 12780.4365234375, "state_level_variance/metric": 13.150227546691895, "state_level_variance_full_gradient/metric": 10.877055168151855, "step": 56 }, { "accuracy_reward": 0.9375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.059210531413555145, "action_level_variance/metric": 17.179370880126953, "action_level_variance_full_gradient/metric": 84.41325378417969, "adam_stats/lr_effective_max": 3.711895988089964e-05, "adam_stats/lr_effective_mean": -1.8446121019533024e-10, "adam_stats/lr_effective_min": -3.852324152830988e-05, "adam_stats/m_t_max": 0.008682359009981155, "adam_stats/m_t_mean": -3.716435145739716e-11, "adam_stats/m_t_min": -0.008624491281807423, "adam_stats/v_t_max": 7.346159691223875e-05, "adam_stats/v_t_mean": 5.687340789289896e-12, "adam_stats/v_t_min": 0.0, "advantages": 2.4835269396561444e-09, "advantages/max": 0.7498500347137451, "advantages/median": 0.0, "advantages/min": -1.249750018119812, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.157831609249115, "all_logprobs": -0.10454612970352173, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.6875, "all_logprobs/p1": -2.125, "all_logprobs/p10": -0.201171875, "all_logprobs/p25": -0.000560760498046875, "all_logprobs/p5": -0.6796875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1695134937763214, "clip_ratio": 0.0, "completion_length": 607.03125, "completion_length/correct": 585.4888916015625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 570.0, "completion_length/correct/min": 243.0, "completion_length/correct/p25": 470.0, "completion_length/correct/p75": 668.75, "completion_length/correct/var": 30305.2890625, "completion_length/incorrect": 930.1666870117188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 989.0, "completion_length/incorrect/min": 651.0, "completion_length/incorrect/p25": 899.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 22310.16796875, "completion_length/max": 1024.0, "completion_length/median": 587.0, "completion_length/min": 243.0, "completion_length/p25": 475.25, "completion_length/p75": 694.25, "completion_length/var": 36599.859375, "epoch": 0.0456, "feature_vector_variance/max_squared_error": 125618.4140625, "feature_vector_variance/metric": 33631.046875, "generated_tokens/total": 3392455.0, "grad_norm": 0.17381852865219116, "grouped_std_rewards": 0.0833333358168602, "learning_rate": 6.976826446919061e-06, "loss": -0.0, "mean_logprobs": -0.0947265625, "mean_logprobs/var": 0.00738525390625, "num_completions/total": 5472, "per_sentence_gradient_norm": 5.484777450561523, "per_sentence_gradient_norm/max": 50.26990509033203, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 20.300718307495117, "per_sentence_gradient_norm/p90": 26.943897247314453, "per_sentence_gradient_norm/p95": 39.3509521484375, "per_sentence_gradient_norm/p99": 45.19664001464844, "per_sentence_gradient_norm/var": 168.2724151611328, "per_token_feature_norm": 193.0108642578125, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 68.0, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 970.30712890625, "per_token_full_gradient_variance/max_squared_error": 3.0055902004241943, "per_token_full_gradient_variance/variance": 0.008710304275155067, "per_token_gradient_norm": 7.717226505279541, "per_token_gradient_norm/max": 492.67486572265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1275.17724609375, "per_token_policy_error_norm": 0.055452458560466766, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.047567613422870636, "policy_entropy": 0.1167403906583786, "policy_entropy/max": 3.3125, "policy_entropy/median": 2.477318048477173e-07, "policy_entropy/min": 1.0245710529988017e-17, "policy_entropy/p25": 7.203198038041592e-10, "policy_entropy/p75": 0.005035400390625, "policy_entropy/var": 0.09644830971956253, "policy_error_vector_variance/max_squared_error": 2.007772922515869, "policy_error_vector_variance/metric": 0.05535416677594185, "policy_loss": -2.4835269396561444e-09, "policy_loss/max": 1.249750018119812, "policy_loss/median": 0.0, "policy_loss/min": -0.7498500347137451, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.1578315943479538, "policy_sharpness": 8.082418441772461, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.237744331359863, "reward": 0.9375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.059210531413555145, "rewards/accuracy_reward": 0.9375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.059210531413555145, "sentence_full_gradient_variance/max_squared_error": 3054.782958984375, "sentence_full_gradient_variance/metric": 87.13645935058594, "sentence_full_gradient_variance/p75": 0.5446417927742004, "sentence_full_gradient_variance/p90": 1.5456252098083496, "sentence_full_gradient_variance/p95": 540.8387451171875, "sentence_full_gradient_variance/p99": 1716.86083984375, "state_level_variance/metric": 180.4967041015625, "state_level_variance_full_gradient/metric": 2.72320818901062, "step": 57 }, { "accuracy_reward": 0.5625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24868421256542206, "action_level_variance/metric": 44.911529541015625, "action_level_variance_full_gradient/metric": 264.7281494140625, "adam_stats/lr_effective_max": 3.754138379008509e-05, "adam_stats/lr_effective_mean": -2.285751588004814e-10, "adam_stats/lr_effective_min": -3.943185220123269e-05, "adam_stats/m_t_max": 0.006898595951497555, "adam_stats/m_t_mean": -3.943709595000122e-11, "adam_stats/m_t_min": -0.0067000300623476505, "adam_stats/v_t_max": 7.343826291617006e-05, "adam_stats/v_t_mean": 5.705550181617225e-12, "adam_stats/v_t_min": 0.0, "advantages": 8.692344621863413e-09, "advantages/max": 3.7485008239746094, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": -0.46501490473747253, "advantages/p75": 0.24990005791187286, "advantages/var": 0.7889845967292786, "all_logprobs": -0.0652114674448967, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.25, "all_logprobs/p1": -1.59375, "all_logprobs/p10": -0.049072265625, "all_logprobs/p25": -6.198883056640625e-06, "all_logprobs/p5": -0.33984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.09751925617456436, "clip_ratio": 0.0, "completion_length": 829.8229370117188, "completion_length/correct": 724.7777709960938, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 711.0, "completion_length/correct/min": 279.0, "completion_length/correct/p25": 477.25, "completion_length/correct/p75": 936.25, "completion_length/correct/var": 51623.8359375, "completion_length/incorrect": 964.8809814453125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 498.0, "completion_length/incorrect/p25": 966.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 14123.8642578125, "completion_length/max": 1024.0, "completion_length/median": 925.0, "completion_length/min": 279.0, "completion_length/p25": 672.0, "completion_length/p75": 1024.0, "completion_length/var": 49232.76171875, "epoch": 0.0464, "feature_vector_variance/max_squared_error": 123618.46875, "feature_vector_variance/metric": 29811.97265625, "generated_tokens/total": 3472118.0, "grad_norm": 0.4024631083011627, "grouped_std_rewards": 0.2872319221496582, "learning_rate": 6.7160365254926005e-06, "loss": 0.0, "mean_logprobs": -0.06298828125, "mean_logprobs/var": 0.00115966796875, "num_completions/total": 5568, "per_sentence_gradient_norm": 5.63416051864624, "per_sentence_gradient_norm/max": 38.84323501586914, "per_sentence_gradient_norm/median": 2.620377779006958, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.4056215286254883, "per_sentence_gradient_norm/p75": 5.264181613922119, "per_sentence_gradient_norm/p85": 10.689614295959473, "per_sentence_gradient_norm/p90": 17.5048828125, "per_sentence_gradient_norm/p95": 23.491779327392578, "per_sentence_gradient_norm/p99": 32.16628646850586, "per_sentence_gradient_norm/var": 60.861167907714844, "per_token_feature_norm": 190.38978576660156, "per_token_feature_norm/max": 296.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 71.0, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 206.0, "per_token_feature_norm/var": 728.505126953125, "per_token_full_gradient_variance/max_squared_error": 10.663273811340332, "per_token_full_gradient_variance/variance": 0.010968091897666454, "per_token_gradient_norm": 6.150127410888672, "per_token_gradient_norm/max": 1052.3331298828125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1178.39501953125, "per_token_policy_error_norm": 0.03594852611422539, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03112918697297573, "policy_entropy": 0.07257163524627686, "policy_entropy/max": 3.25, "policy_entropy/median": 4.21423465013504e-08, "policy_entropy/min": 5.399326818977812e-17, "policy_entropy/p25": 4.838511813431978e-10, "policy_entropy/p75": 8.106231689453125e-05, "policy_entropy/var": 0.05428687483072281, "policy_error_vector_variance/max_squared_error": 1.9955995082855225, "policy_error_vector_variance/metric": 0.03589720278978348, "policy_loss": 1.2417634920325327e-08, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -3.7485008239746094, "policy_loss/p25": -0.24990007281303406, "policy_loss/p75": 0.46501490473747253, "policy_loss/var": 0.7889845967292786, "policy_sharpness": 8.667529106140137, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.134254455566406, "reward": 0.5625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24868421256542206, "rewards/accuracy_reward": 0.5625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24868421256542206, "sentence_full_gradient_variance/max_squared_error": 14863.970703125, "sentence_full_gradient_variance/metric": 275.19049072265625, "sentence_full_gradient_variance/p75": 47.313262939453125, "sentence_full_gradient_variance/p90": 190.70242309570312, "sentence_full_gradient_variance/p95": 1062.3193359375, "sentence_full_gradient_variance/p99": 3475.707275390625, "state_level_variance/metric": 21.747159957885742, "state_level_variance_full_gradient/metric": 10.462287902832031, "step": 58 }, { "accuracy_reward": 0.7708333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17850877344608307, "action_level_variance/metric": 18.999176025390625, "action_level_variance_full_gradient/metric": 411.853271484375, "adam_stats/lr_effective_max": 3.64180414180737e-05, "adam_stats/lr_effective_mean": -1.695088375219811e-10, "adam_stats/lr_effective_min": -3.621395808295347e-05, "adam_stats/m_t_max": 0.0060294452123343945, "adam_stats/m_t_mean": -3.333659043258663e-11, "adam_stats/m_t_min": -0.006324521731585264, "adam_stats/v_t_max": 7.350502710323781e-05, "adam_stats/v_t_mean": 5.761507590462678e-12, "adam_stats/v_t_min": 0.0, "advantages": 2.4835269396561444e-09, "advantages/max": 0.6526548862457275, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.3155972957611084, "all_logprobs": -0.06765663623809814, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.375, "all_logprobs/p1": -1.6796875, "all_logprobs/p10": -0.06201171875, "all_logprobs/p25": -3.123283386230469e-05, "all_logprobs/p5": -0.34765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.10875194519758224, "clip_ratio": 0.0, "completion_length": 634.9375, "completion_length/correct": 577.5540771484375, "completion_length/correct/max": 973.0, "completion_length/correct/median": 452.0, "completion_length/correct/min": 275.0, "completion_length/correct/p25": 383.5, "completion_length/correct/p75": 841.25, "completion_length/correct/var": 57117.953125, "completion_length/incorrect": 827.95458984375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 808.0, "completion_length/incorrect/min": 468.0, "completion_length/incorrect/p25": 773.0, "completion_length/incorrect/p75": 944.25, "completion_length/incorrect/var": 24562.521484375, "completion_length/max": 1024.0, "completion_length/median": 536.0, "completion_length/min": 275.0, "completion_length/p25": 405.0, "completion_length/p75": 884.5, "completion_length/var": 60512.8203125, "epoch": 0.0472, "feature_vector_variance/max_squared_error": 127944.671875, "feature_vector_variance/metric": 31396.6953125, "generated_tokens/total": 3533072.0, "grad_norm": 0.4446890354156494, "grouped_std_rewards": 0.12145226448774338, "learning_rate": 6.456201742799511e-06, "loss": -0.0, "mean_logprobs": -0.0703125, "mean_logprobs/var": 0.00141143798828125, "num_completions/total": 5664, "per_sentence_gradient_norm": 2.3145923614501953, "per_sentence_gradient_norm/max": 42.622615814208984, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.3640315532684326, "per_sentence_gradient_norm/p85": 5.469529151916504, "per_sentence_gradient_norm/p90": 7.668006896972656, "per_sentence_gradient_norm/p95": 10.928169250488281, "per_sentence_gradient_norm/p99": 17.123458862304688, "per_sentence_gradient_norm/var": 30.290504455566406, "per_token_feature_norm": 194.28082275390625, "per_token_feature_norm/max": 302.0, "per_token_feature_norm/median": 194.0, "per_token_feature_norm/min": 68.5, "per_token_feature_norm/p25": 184.0, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 499.9222717285156, "per_token_full_gradient_variance/max_squared_error": 4.310826301574707, "per_token_full_gradient_variance/variance": 0.004436192102730274, "per_token_gradient_norm": 2.5409483909606934, "per_token_gradient_norm/max": 957.5369873046875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 453.6148986816406, "per_token_policy_error_norm": 0.03709476441144943, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03325973451137543, "policy_entropy": 0.07366973906755447, "policy_entropy/max": 3.40625, "policy_entropy/median": 7.12461769580841e-08, "policy_entropy/min": 2.4936649967166602e-17, "policy_entropy/p25": 5.602487362921238e-10, "policy_entropy/p75": 0.000362396240234375, "policy_entropy/var": 0.051849499344825745, "policy_error_vector_variance/max_squared_error": 2.0064995288848877, "policy_error_vector_variance/metric": 0.036973632872104645, "policy_loss": -3.725290298461914e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -0.6526548862457275, "policy_loss/p25": -0.24990004301071167, "policy_loss/p75": 0.0, "policy_loss/var": 0.3155972957611084, "policy_sharpness": 8.53200626373291, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.677478790283203, "reward": 0.7708333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17850877344608307, "rewards/accuracy_reward": 0.7708333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17850877344608307, "sentence_full_gradient_variance/max_squared_error": 30587.6640625, "sentence_full_gradient_variance/metric": 416.1847839355469, "sentence_full_gradient_variance/p75": 137.32025146484375, "sentence_full_gradient_variance/p90": 303.29351806640625, "sentence_full_gradient_variance/p95": 440.7355041503906, "sentence_full_gradient_variance/p99": 3542.85888671875, "state_level_variance/metric": 14.595904350280762, "state_level_variance_full_gradient/metric": 4.331445693969727, "step": 59 }, { "accuracy_reward": 0.9479166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.049890343099832535, "action_level_variance/metric": 27.11752700805664, "action_level_variance_full_gradient/metric": 212.47274780273438, "adam_stats/lr_effective_max": 3.450466101639904e-05, "adam_stats/lr_effective_mean": -2.470586513148021e-10, "adam_stats/lr_effective_min": -3.566349914763123e-05, "adam_stats/m_t_max": 0.004527126904577017, "adam_stats/m_t_mean": -2.4951443688969732e-11, "adam_stats/m_t_min": -0.005002371966838837, "adam_stats/v_t_max": 7.34451605239883e-05, "adam_stats/v_t_mean": 5.76731024048982e-12, "adam_stats/v_t_min": 0.0, "advantages": -3.725290298461914e-09, "advantages/max": 0.46501490473747253, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": 0.0, "advantages/p75": 0.36585545539855957, "advantages/var": 0.3156188130378723, "all_logprobs": -0.07189148664474487, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.8125, "all_logprobs/p1": -1.7123439311981201, "all_logprobs/p10": -0.07607412338256836, "all_logprobs/p25": -4.649162292480469e-05, "all_logprobs/p5": -0.38671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11304639279842377, "clip_ratio": 0.0, "completion_length": 483.15625, "completion_length/correct": 470.6153869628906, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 418.0, "completion_length/correct/min": 196.0, "completion_length/correct/p25": 326.0, "completion_length/correct/p75": 605.5, "completion_length/correct/var": 33784.55078125, "completion_length/incorrect": 711.4000244140625, "completion_length/incorrect/max": 890.0, "completion_length/incorrect/median": 639.0, "completion_length/incorrect/min": 565.0, "completion_length/incorrect/p25": 609.0, "completion_length/incorrect/p75": 854.0, "completion_length/incorrect/var": 22348.30078125, "completion_length/max": 1024.0, "completion_length/median": 425.0, "completion_length/min": 196.0, "completion_length/p25": 329.0, "completion_length/p75": 625.5, "completion_length/var": 35839.8984375, "epoch": 0.048, "feature_vector_variance/max_squared_error": 114508.21875, "feature_vector_variance/metric": 30388.408203125, "generated_tokens/total": 3579455.0, "grad_norm": 0.4416272044181824, "grouped_std_rewards": 0.12411298602819443, "learning_rate": 6.197638667498023e-06, "loss": -0.0, "mean_logprobs": -0.07421875, "mean_logprobs/var": 0.002685546875, "num_completions/total": 5760, "per_sentence_gradient_norm": 2.9347169399261475, "per_sentence_gradient_norm/max": 44.644927978515625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.778456926345825, "per_sentence_gradient_norm/p85": 6.0082807540893555, "per_sentence_gradient_norm/p90": 8.30929946899414, "per_sentence_gradient_norm/p95": 12.941710472106934, "per_sentence_gradient_norm/p99": 40.30508041381836, "per_sentence_gradient_norm/var": 46.75197982788086, "per_token_feature_norm": 189.2638702392578, "per_token_feature_norm/max": 286.0, "per_token_feature_norm/median": 190.0, "per_token_feature_norm/min": 73.0, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 673.6734008789062, "per_token_full_gradient_variance/max_squared_error": 2.790148973464966, "per_token_full_gradient_variance/variance": 0.0063034179620444775, "per_token_gradient_norm": 3.63046932220459, "per_token_gradient_norm/max": 662.2355346679688, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 665.19287109375, "per_token_policy_error_norm": 0.03923843055963516, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03512398153543472, "policy_entropy": 0.08002487570047379, "policy_entropy/max": 3.765625, "policy_entropy/median": 1.0849907994270325e-07, "policy_entropy/min": 7.15573433840433e-18, "policy_entropy/p25": 4.147295840084553e-10, "policy_entropy/p75": 0.0005245208740234375, "policy_entropy/var": 0.05820406228303909, "policy_error_vector_variance/max_squared_error": 2.0053272247314453, "policy_error_vector_variance/metric": 0.039152100682258606, "policy_loss": -1.2417634698280722e-09, "policy_loss/max": 2.560988187789917, "policy_loss/median": 0.0, "policy_loss/min": -0.46501490473747253, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": 0.0, "policy_loss/var": 0.3156187832355499, "policy_sharpness": 8.461736679077148, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.098854064941406, "reward": 0.9479166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.049890343099832535, "rewards/accuracy_reward": 0.9479166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.049890343099832535, "sentence_full_gradient_variance/max_squared_error": 7844.2734375, "sentence_full_gradient_variance/metric": 225.39195251464844, "sentence_full_gradient_variance/p75": 144.15855407714844, "sentence_full_gradient_variance/p90": 441.005615234375, "sentence_full_gradient_variance/p95": 846.1708374023438, "sentence_full_gradient_variance/p99": 3002.54638671875, "state_level_variance/metric": 25.010757446289062, "state_level_variance_full_gradient/metric": 12.919229507446289, "step": 60 }, { "accuracy_reward": 0.6145833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.23936405777931213, "action_level_variance/metric": 11.237842559814453, "action_level_variance_full_gradient/metric": 125.22926330566406, "adam_stats/lr_effective_max": 3.432554149185307e-05, "adam_stats/lr_effective_mean": -2.4571952805807484e-10, "adam_stats/lr_effective_min": -3.208564885426313e-05, "adam_stats/m_t_max": 0.004169128835201263, "adam_stats/m_t_mean": -3.2200523497616373e-11, "adam_stats/m_t_min": -0.004581098910421133, "adam_stats/v_t_max": 7.34104760340415e-05, "adam_stats/v_t_mean": 5.765161351783954e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 2.0150647163391113, "advantages/median": 0.0, "advantages/min": -0.9680583477020264, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.3156500458717346, "all_logprobs": -0.10525939613580704, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.625, "all_logprobs/p1": -2.140625, "all_logprobs/p10": -0.2021484375, "all_logprobs/p25": -0.000926971435546875, "all_logprobs/p5": -0.671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.16940073668956757, "clip_ratio": 0.0, "completion_length": 545.71875, "completion_length/correct": 428.5762634277344, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 394.0, "completion_length/correct/min": 159.0, "completion_length/correct/p25": 221.5, "completion_length/correct/p75": 487.5, "completion_length/correct/var": 58955.17578125, "completion_length/incorrect": 732.5135498046875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 898.0, "completion_length/incorrect/min": 244.0, "completion_length/incorrect/p25": 426.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 95177.265625, "completion_length/max": 1024.0, "completion_length/median": 454.0, "completion_length/min": 159.0, "completion_length/p25": 287.25, "completion_length/p75": 901.25, "completion_length/var": 94172.796875, "epoch": 0.0488, "feature_vector_variance/max_squared_error": 124618.265625, "feature_vector_variance/metric": 32517.12109375, "generated_tokens/total": 3631844.0, "grad_norm": 0.2390209287405014, "grouped_std_rewards": 0.1532517820596695, "learning_rate": 5.9406623188668065e-06, "loss": 0.0, "mean_logprobs": -0.10693359375, "mean_logprobs/var": 0.0052490234375, "num_completions/total": 5856, "per_sentence_gradient_norm": 3.5091891288757324, "per_sentence_gradient_norm/max": 25.02390480041504, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 6.794560432434082, "per_sentence_gradient_norm/p85": 9.552176475524902, "per_sentence_gradient_norm/p90": 11.410591125488281, "per_sentence_gradient_norm/p95": 16.335227966308594, "per_sentence_gradient_norm/p99": 23.830020904541016, "per_sentence_gradient_norm/var": 36.34288787841797, "per_token_feature_norm": 190.48770141601562, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 74.5, "per_token_feature_norm/p25": 177.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 992.30810546875, "per_token_full_gradient_variance/max_squared_error": 2.69368052482605, "per_token_full_gradient_variance/variance": 0.007137275766581297, "per_token_gradient_norm": 5.8022379875183105, "per_token_gradient_norm/max": 598.2380981445312, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 875.02978515625, "per_token_policy_error_norm": 0.05631616711616516, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04871154576539993, "policy_entropy": 0.11555583029985428, "policy_entropy/max": 3.078125, "policy_entropy/median": 7.040798664093018e-07, "policy_entropy/min": 2.7755575615628914e-17, "policy_entropy/p25": 3.6088749766349792e-09, "policy_entropy/p75": 0.0079345703125, "policy_entropy/var": 0.08873689919710159, "policy_error_vector_variance/max_squared_error": 2.0110044479370117, "policy_error_vector_variance/metric": 0.05625335872173309, "policy_loss": 6.208817460162663e-09, "policy_loss/max": 0.9680584669113159, "policy_loss/median": 0.0, "policy_loss/min": -2.0150644779205322, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.3156500458717346, "policy_sharpness": 8.032607078552246, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.75, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.17674732208252, "reward": 0.6145833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.23936405777931213, "rewards/accuracy_reward": 0.6145833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.23936405777931213, "sentence_full_gradient_variance/max_squared_error": 2938.162353515625, "sentence_full_gradient_variance/metric": 126.72935485839844, "sentence_full_gradient_variance/p75": 0.6415562033653259, "sentence_full_gradient_variance/p90": 152.055419921875, "sentence_full_gradient_variance/p95": 718.7150268554688, "sentence_full_gradient_variance/p99": 2415.384521484375, "state_level_variance/metric": 30.514596939086914, "state_level_variance_full_gradient/metric": 1.5000900030136108, "step": 61 }, { "accuracy_reward": 0.9375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.05921052768826485, "action_level_variance/metric": 58.32433319091797, "action_level_variance_full_gradient/metric": 778.8983154296875, "adam_stats/lr_effective_max": 3.368770921952091e-05, "adam_stats/lr_effective_mean": -2.1466024791028104e-10, "adam_stats/lr_effective_min": -3.267939609941095e-05, "adam_stats/m_t_max": 0.0038452944718301296, "adam_stats/m_t_mean": -3.5842408901975986e-11, "adam_stats/m_t_min": -0.0043488191440701485, "adam_stats/v_t_max": 7.35888970666565e-05, "adam_stats/v_t_mean": 5.780053085463477e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.2417634698280722e-09, "advantages/max": 0.46501490473747253, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.36585545539855957, "advantages/var": 0.47338736057281494, "all_logprobs": -0.06174667179584503, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.28125, "all_logprobs/p1": -1.5, "all_logprobs/p10": -0.054931640625, "all_logprobs/p25": -2.753734588623047e-05, "all_logprobs/p5": -0.3125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.09065210074186325, "clip_ratio": 0.0, "completion_length": 572.7708740234375, "completion_length/correct": 566.3778076171875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 528.0, "completion_length/correct/min": 193.0, "completion_length/correct/p25": 394.5, "completion_length/correct/p75": 749.0, "completion_length/correct/var": 44639.8984375, "completion_length/incorrect": 668.6666870117188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 628.0, "completion_length/incorrect/min": 413.0, "completion_length/incorrect/p25": 514.75, "completion_length/incorrect/p75": 753.5, "completion_length/incorrect/var": 48311.06640625, "completion_length/max": 1024.0, "completion_length/median": 532.0, "completion_length/min": 193.0, "completion_length/p25": 409.5, "completion_length/p75": 749.5, "completion_length/var": 44982.74609375, "epoch": 0.0496, "feature_vector_variance/max_squared_error": 115640.90625, "feature_vector_variance/metric": 30694.29296875, "generated_tokens/total": 3686830.0, "grad_norm": 0.46948301792144775, "grouped_std_rewards": 0.16577965021133423, "learning_rate": 5.685585783002493e-06, "loss": 0.0, "mean_logprobs": -0.064453125, "mean_logprobs/var": 0.000659942626953125, "num_completions/total": 5952, "per_sentence_gradient_norm": 3.5531632900238037, "per_sentence_gradient_norm/max": 53.64925003051758, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.172009229660034, "per_sentence_gradient_norm/p85": 4.8288655281066895, "per_sentence_gradient_norm/p90": 5.857480049133301, "per_sentence_gradient_norm/p95": 21.383543014526367, "per_sentence_gradient_norm/p99": 38.4500846862793, "per_sentence_gradient_norm/var": 69.62008666992188, "per_token_feature_norm": 192.6961212158203, "per_token_feature_norm/max": 290.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 73.0, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 206.0, "per_token_feature_norm/var": 595.517578125, "per_token_full_gradient_variance/max_squared_error": 5.249999046325684, "per_token_full_gradient_variance/variance": 0.008779563941061497, "per_token_gradient_norm": 4.022676944732666, "per_token_gradient_norm/max": 1064.6328125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 953.4523315429688, "per_token_policy_error_norm": 0.034539151936769485, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.030637426301836967, "policy_entropy": 0.06934469938278198, "policy_entropy/max": 3.15625, "policy_entropy/median": 6.007030606269836e-08, "policy_entropy/min": 6.8575787409708155e-18, "policy_entropy/p25": 3.9108272176235914e-10, "policy_entropy/p75": 0.0003185272216796875, "policy_entropy/var": 0.04585527628660202, "policy_error_vector_variance/max_squared_error": 2.008378028869629, "policy_error_vector_variance/metric": 0.034461427479982376, "policy_loss": -4.967053879312289e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -0.4650149345397949, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": 0.0, "policy_loss/var": 0.47338736057281494, "policy_sharpness": 8.565991401672363, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.450480461120605, "reward": 0.9375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.05921052768826485, "rewards/accuracy_reward": 0.9375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.05921052768826485, "sentence_full_gradient_variance/max_squared_error": 24846.53515625, "sentence_full_gradient_variance/metric": 783.005615234375, "sentence_full_gradient_variance/p75": 155.12974548339844, "sentence_full_gradient_variance/p90": 551.0516357421875, "sentence_full_gradient_variance/p95": 1499.57666015625, "sentence_full_gradient_variance/p99": 20453.21484375, "state_level_variance/metric": 17.05897331237793, "state_level_variance_full_gradient/metric": 4.107251167297363, "step": 62 }, { "accuracy_reward": 0.59375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24375000596046448, "action_level_variance/metric": 64.75170135498047, "action_level_variance_full_gradient/metric": 1202.0279541015625, "adam_stats/lr_effective_max": 3.133782956865616e-05, "adam_stats/lr_effective_mean": -9.15791678601785e-11, "adam_stats/lr_effective_min": -3.0992763640824705e-05, "adam_stats/m_t_max": 0.0029525819700211287, "adam_stats/m_t_mean": -4.399683048439407e-11, "adam_stats/m_t_min": -0.002895709592849016, "adam_stats/v_t_max": 7.351782551268116e-05, "adam_stats/v_t_mean": 5.7933150464373195e-12, "adam_stats/v_t_min": 0.0, "advantages": 4.967053879312289e-09, "advantages/max": 1.4358407258987427, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.46501490473747253, "advantages/var": 0.631247341632843, "all_logprobs": -0.0831979513168335, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.125, "all_logprobs/p1": -1.8671875, "all_logprobs/p10": -0.12246084213256836, "all_logprobs/p25": -0.000141143798828125, "all_logprobs/p5": -0.486328125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12650349736213684, "clip_ratio": 0.0, "completion_length": 668.15625, "completion_length/correct": 612.2807006835938, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 659.0, "completion_length/correct/min": 124.0, "completion_length/correct/p25": 261.0, "completion_length/correct/p75": 925.0, "completion_length/correct/var": 102262.703125, "completion_length/incorrect": 749.8204956054688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 759.0, "completion_length/incorrect/min": 280.0, "completion_length/incorrect/p25": 697.0, "completion_length/incorrect/p75": 856.5, "completion_length/incorrect/var": 32331.94140625, "completion_length/max": 1024.0, "completion_length/median": 748.0, "completion_length/min": 124.0, "completion_length/p25": 385.5, "completion_length/p75": 877.5, "completion_length/var": 77825.0234375, "epoch": 0.0504, "feature_vector_variance/max_squared_error": 120846.3046875, "feature_vector_variance/metric": 32566.556640625, "generated_tokens/total": 3750973.0, "grad_norm": 0.433167964220047, "grouped_std_rewards": 0.2747040390968323, "learning_rate": 5.432719831372507e-06, "loss": 0.0, "mean_logprobs": -0.087890625, "mean_logprobs/var": 0.0037384033203125, "num_completions/total": 6048, "per_sentence_gradient_norm": 8.32830810546875, "per_sentence_gradient_norm/max": 58.08980178833008, "per_sentence_gradient_norm/median": 3.376638412475586, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 11.314488410949707, "per_sentence_gradient_norm/p85": 17.715312957763672, "per_sentence_gradient_norm/p90": 21.62726593017578, "per_sentence_gradient_norm/p95": 34.384307861328125, "per_sentence_gradient_norm/p99": 53.35951232910156, "per_sentence_gradient_norm/var": 151.3975067138672, "per_token_feature_norm": 194.54727172851562, "per_token_feature_norm/max": 296.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 77.5, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 211.0, "per_token_feature_norm/var": 781.4314575195312, "per_token_full_gradient_variance/max_squared_error": 4.878806114196777, "per_token_full_gradient_variance/variance": 0.011311388574540615, "per_token_gradient_norm": 8.525108337402344, "per_token_gradient_norm/max": 1191.3204345703125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1481.922119140625, "per_token_policy_error_norm": 0.04590371996164322, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04031484201550484, "policy_entropy": 0.0916915163397789, "policy_entropy/max": 3.171875, "policy_entropy/median": 3.0174851417541504e-07, "policy_entropy/min": 2.949029909160572e-17, "policy_entropy/p25": 2.051820047199726e-09, "policy_entropy/p75": 0.00145721435546875, "policy_entropy/var": 0.06752533465623856, "policy_error_vector_variance/max_squared_error": 2.0053038597106934, "policy_error_vector_variance/metric": 0.04585673660039902, "policy_loss": 0.0, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -1.4358407258987427, "policy_loss/p25": -0.46501487493515015, "policy_loss/p75": 0.0, "policy_loss/var": 0.6312474012374878, "policy_sharpness": 8.329429626464844, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 9.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.801657676696777, "reward": 0.59375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24375000596046448, "rewards/accuracy_reward": 0.59375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24375000596046448, "sentence_full_gradient_variance/max_squared_error": 23708.603515625, "sentence_full_gradient_variance/metric": 1206.040283203125, "sentence_full_gradient_variance/p75": 800.8763427734375, "sentence_full_gradient_variance/p90": 2001.853271484375, "sentence_full_gradient_variance/p95": 5229.4169921875, "sentence_full_gradient_variance/p99": 18972.18359375, "state_level_variance/metric": 106.93888854980469, "state_level_variance_full_gradient/metric": 4.0123138427734375, "step": 63 }, { "accuracy_reward": 0.8854166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.10252193361520767, "action_level_variance/metric": 47.17920684814453, "action_level_variance_full_gradient/metric": 347.48760986328125, "adam_stats/lr_effective_max": 3.110555553575978e-05, "adam_stats/lr_effective_mean": -1.739280386381381e-10, "adam_stats/lr_effective_min": -3.117818778264336e-05, "adam_stats/m_t_max": 0.0030199100729078054, "adam_stats/m_t_mean": -5.086365356676836e-11, "adam_stats/m_t_min": -0.002879232633858919, "adam_stats/v_t_max": 7.351524982368574e-05, "adam_stats/v_t_mean": 5.7959405504182104e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.2417634698280722e-09, "advantages/max": 0.9680583477020264, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.4734044671058655, "all_logprobs": -0.07031944394111633, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.0625, "all_logprobs/p1": -1.690234661102295, "all_logprobs/p10": -0.072265625, "all_logprobs/p25": -4.5299530029296875e-05, "all_logprobs/p5": -0.38671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.10656917840242386, "clip_ratio": 0.0, "completion_length": 631.9375, "completion_length/correct": 625.3294067382812, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 603.0, "completion_length/correct/min": 265.0, "completion_length/correct/p25": 488.0, "completion_length/correct/p75": 750.0, "completion_length/correct/var": 39595.84765625, "completion_length/incorrect": 683.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 679.0, "completion_length/incorrect/min": 484.0, "completion_length/incorrect/p25": 557.0, "completion_length/incorrect/p75": 686.5, "completion_length/incorrect/var": 33862.0, "completion_length/max": 1024.0, "completion_length/median": 618.0, "completion_length/min": 265.0, "completion_length/p25": 489.75, "completion_length/p75": 747.75, "completion_length/var": 38916.45703125, "epoch": 0.0512, "feature_vector_variance/max_squared_error": 129472.828125, "feature_vector_variance/metric": 31775.05859375, "generated_tokens/total": 3811639.0, "grad_norm": 0.32747775316238403, "grouped_std_rewards": 0.18466047942638397, "learning_rate": 5.182372542187895e-06, "loss": -0.0, "mean_logprobs": -0.07421875, "mean_logprobs/var": 0.001983642578125, "num_completions/total": 6144, "per_sentence_gradient_norm": 4.006507873535156, "per_sentence_gradient_norm/max": 55.78498077392578, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 5.763479709625244, "per_sentence_gradient_norm/p85": 9.745882987976074, "per_sentence_gradient_norm/p90": 10.992703437805176, "per_sentence_gradient_norm/p95": 11.764453887939453, "per_sentence_gradient_norm/p99": 48.33412170410156, "per_sentence_gradient_norm/var": 68.87530517578125, "per_token_feature_norm": 197.60226440429688, "per_token_feature_norm/max": 306.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 77.0, "per_token_feature_norm/p25": 184.0, "per_token_feature_norm/p75": 215.0, "per_token_feature_norm/var": 778.1582641601562, "per_token_full_gradient_variance/max_squared_error": 6.418206691741943, "per_token_full_gradient_variance/variance": 0.008128078654408455, "per_token_gradient_norm": 4.12618350982666, "per_token_gradient_norm/max": 949.1907348632812, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 901.64453125, "per_token_policy_error_norm": 0.03907161206007004, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.034552231431007385, "policy_entropy": 0.0767839178442955, "policy_entropy/max": 2.8125, "policy_entropy/median": 1.0663643479347229e-07, "policy_entropy/min": 2.927345865710862e-18, "policy_entropy/p25": 9.1313268058002e-10, "policy_entropy/p75": 0.000499725341796875, "policy_entropy/var": 0.052363455295562744, "policy_error_vector_variance/max_squared_error": 1.9786067008972168, "policy_error_vector_variance/metric": 0.03902676701545715, "policy_loss": -4.967053879312289e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -0.9680584669113159, "policy_loss/p25": -0.24990007281303406, "policy_loss/p75": 0.0, "policy_loss/var": 0.47340449690818787, "policy_sharpness": 8.491350173950195, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.880366325378418, "reward": 0.8854166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.10252193361520767, "rewards/accuracy_reward": 0.8854166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.10252193361520767, "sentence_full_gradient_variance/max_squared_error": 20449.912109375, "sentence_full_gradient_variance/metric": 366.0724792480469, "sentence_full_gradient_variance/p75": 211.63787841796875, "sentence_full_gradient_variance/p90": 501.858642578125, "sentence_full_gradient_variance/p95": 669.2673950195312, "sentence_full_gradient_variance/p99": 2777.70166015625, "state_level_variance/metric": 28.71282958984375, "state_level_variance_full_gradient/metric": 18.58490562438965, "step": 64 }, { "accuracy_reward": 0.5416666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2508772015571594, "action_level_variance/metric": 26.420257568359375, "action_level_variance_full_gradient/metric": 397.06982421875, "adam_stats/lr_effective_max": 2.8396108973538503e-05, "adam_stats/lr_effective_mean": -1.421658207823029e-10, "adam_stats/lr_effective_min": -2.841548848664388e-05, "adam_stats/m_t_max": 0.0029109427705407143, "adam_stats/m_t_mean": -4.408564485691713e-11, "adam_stats/m_t_min": -0.002710327971726656, "adam_stats/v_t_max": 7.344201731029898e-05, "adam_stats/v_t_mean": 5.794628665789503e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.2417634920325327e-08, "advantages/max": 2.0150647163391113, "advantages/median": 0.0, "advantages/min": -1.249750018119812, "advantages/p25": -0.46501490473747253, "advantages/p75": 0.0, "advantages/var": 0.6312887072563171, "all_logprobs": -0.05730311572551727, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.46875, "all_logprobs/p1": -1.530390739440918, "all_logprobs/p10": -0.0233154296875, "all_logprobs/p25": -1.6689300537109375e-06, "all_logprobs/p5": -0.251953125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.09136483073234558, "clip_ratio": 0.0, "completion_length": 749.0833740234375, "completion_length/correct": 622.34619140625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 543.0, "completion_length/correct/min": 191.0, "completion_length/correct/p25": 316.75, "completion_length/correct/p75": 949.75, "completion_length/correct/var": 90094.1171875, "completion_length/incorrect": 898.8636474609375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 979.0, "completion_length/incorrect/min": 432.0, "completion_length/incorrect/p25": 863.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 32929.609375, "completion_length/max": 1024.0, "completion_length/median": 872.0, "completion_length/min": 191.0, "completion_length/p25": 468.5, "completion_length/p75": 1024.0, "completion_length/var": 82453.8515625, "epoch": 0.052, "feature_vector_variance/max_squared_error": 138287.3125, "feature_vector_variance/metric": 29891.51953125, "generated_tokens/total": 3883551.0, "grad_norm": 0.2669786512851715, "grouped_std_rewards": 0.2922399044036865, "learning_rate": 4.934848925057485e-06, "loss": 0.0, "mean_logprobs": -0.057861328125, "mean_logprobs/var": 0.000919342041015625, "num_completions/total": 6240, "per_sentence_gradient_norm": 4.5051164627075195, "per_sentence_gradient_norm/max": 37.70623779296875, "per_sentence_gradient_norm/median": 3.12105655670166, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 5.363354682922363, "per_sentence_gradient_norm/p85": 7.170907497406006, "per_sentence_gradient_norm/p90": 9.243523597717285, "per_sentence_gradient_norm/p95": 16.160837173461914, "per_sentence_gradient_norm/p99": 30.945768356323242, "per_sentence_gradient_norm/var": 40.59858322143555, "per_token_feature_norm": 195.43553161621094, "per_token_feature_norm/max": 302.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 71.0, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 217.0, "per_token_feature_norm/var": 1095.647216796875, "per_token_full_gradient_variance/max_squared_error": 3.382556200027466, "per_token_full_gradient_variance/variance": 0.007803024258464575, "per_token_gradient_norm": 5.323261737823486, "per_token_gradient_norm/max": 630.5420532226562, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 919.48974609375, "per_token_policy_error_norm": 0.031191442161798477, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.027918614447116852, "policy_entropy": 0.06285416334867477, "policy_entropy/max": 2.75, "policy_entropy/median": 5.029141902923584e-08, "policy_entropy/min": 1.2836953722228372e-16, "policy_entropy/p25": 1.1641532182693481e-09, "policy_entropy/p75": 2.491474151611328e-05, "policy_entropy/var": 0.04859425127506256, "policy_error_vector_variance/max_squared_error": 2.0060973167419434, "policy_error_vector_variance/metric": 0.031161773949861526, "policy_loss": 2.4835269396561444e-09, "policy_loss/max": 1.249750018119812, "policy_loss/median": 0.0, "policy_loss/min": -2.0150647163391113, "policy_loss/p25": 0.0, "policy_loss/p75": 0.46501490473747253, "policy_loss/var": 0.6312886476516724, "policy_sharpness": 8.825387001037598, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 7.335035800933838, "reward": 0.5416666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2508772015571594, "rewards/accuracy_reward": 0.5416666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2508772015571594, "sentence_full_gradient_variance/max_squared_error": 14891.0302734375, "sentence_full_gradient_variance/metric": 415.43505859375, "sentence_full_gradient_variance/p75": 58.76430892944336, "sentence_full_gradient_variance/p90": 560.9999389648438, "sentence_full_gradient_variance/p95": 978.0236206054688, "sentence_full_gradient_variance/p99": 12557.232421875, "state_level_variance/metric": 18.48802375793457, "state_level_variance_full_gradient/metric": 18.365232467651367, "step": 65 }, { "accuracy_reward": 0.5104166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.25252193212509155, "action_level_variance/metric": 34.485809326171875, "action_level_variance_full_gradient/metric": 625.6564331054688, "adam_stats/lr_effective_max": 2.7488275009091012e-05, "adam_stats/lr_effective_mean": -7.953181169195744e-11, "adam_stats/lr_effective_min": -2.6368141334387474e-05, "adam_stats/m_t_max": 0.0027648068498820066, "adam_stats/m_t_mean": -2.765132220416966e-11, "adam_stats/m_t_min": -0.0027097498532384634, "adam_stats/v_t_max": 7.395275315502658e-05, "adam_stats/v_t_mean": 5.823628905499145e-12, "adam_stats/v_t_min": 0.0, "advantages": -2.4835269396561444e-09, "advantages/max": 3.7485008239746094, "advantages/median": 0.0, "advantages/min": -2.0150647163391113, "advantages/p25": -0.24990005791187286, "advantages/p75": 0.46501490473747253, "advantages/var": 0.6312422752380371, "all_logprobs": -0.07258925586938858, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.59375, "all_logprobs/p1": -1.703125, "all_logprobs/p10": -0.09047842025756836, "all_logprobs/p25": -0.00012302398681640625, "all_logprobs/p5": -0.404296875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1046781986951828, "clip_ratio": 0.0, "completion_length": 641.75, "completion_length/correct": 607.4898071289062, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 549.0, "completion_length/correct/min": 329.0, "completion_length/correct/p25": 470.0, "completion_length/correct/p75": 699.0, "completion_length/correct/var": 40923.92578125, "completion_length/incorrect": 677.4680786132812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 627.0, "completion_length/incorrect/min": 333.0, "completion_length/incorrect/p25": 501.5, "completion_length/incorrect/p75": 967.5, "completion_length/incorrect/var": 57680.3046875, "completion_length/max": 1024.0, "completion_length/median": 568.0, "completion_length/min": 329.0, "completion_length/p25": 493.75, "completion_length/p75": 777.25, "completion_length/var": 49843.34765625, "epoch": 0.0528, "feature_vector_variance/max_squared_error": 127579.7265625, "feature_vector_variance/metric": 32496.951171875, "generated_tokens/total": 3945159.0, "grad_norm": 0.49080008268356323, "grouped_std_rewards": 0.2687790095806122, "learning_rate": 4.6904505493806595e-06, "loss": 0.0, "mean_logprobs": -0.07080078125, "mean_logprobs/var": 0.00052642822265625, "num_completions/total": 6336, "per_sentence_gradient_norm": 5.333070278167725, "per_sentence_gradient_norm/max": 51.5319938659668, "per_sentence_gradient_norm/median": 3.591240882873535, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 7.351043224334717, "per_sentence_gradient_norm/p85": 10.21470832824707, "per_sentence_gradient_norm/p90": 12.363890647888184, "per_sentence_gradient_norm/p95": 15.379327774047852, "per_sentence_gradient_norm/p99": 23.565876007080078, "per_sentence_gradient_norm/var": 48.743858337402344, "per_token_feature_norm": 196.1174774169922, "per_token_feature_norm/max": 300.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 70.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 214.0, "per_token_feature_norm/var": 888.5029296875, "per_token_full_gradient_variance/max_squared_error": 12.341949462890625, "per_token_full_gradient_variance/variance": 0.009292233735322952, "per_token_gradient_norm": 5.395288467407227, "per_token_gradient_norm/max": 1063.40283203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1009.2740478515625, "per_token_policy_error_norm": 0.04082959145307541, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03609107807278633, "policy_entropy": 0.08079732954502106, "policy_entropy/max": 2.40625, "policy_entropy/median": 6.705522537231445e-07, "policy_entropy/min": 3.4867941867133823e-16, "policy_entropy/p25": 5.820766091346741e-09, "policy_entropy/p75": 0.00124359130859375, "policy_entropy/var": 0.05387681722640991, "policy_error_vector_variance/max_squared_error": 2.0068576335906982, "policy_error_vector_variance/metric": 0.04077881947159767, "policy_loss": 6.208817460162663e-09, "policy_loss/max": 2.0150644779205322, "policy_loss/median": 0.0, "policy_loss/min": -3.7485008239746094, "policy_loss/p25": -0.46501490473747253, "policy_loss/p75": 0.24990007281303406, "policy_loss/var": 0.6312422156333923, "policy_sharpness": 8.405496597290039, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.231813430786133, "reward": 0.5104166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.25252193212509155, "rewards/accuracy_reward": 0.5104166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.25252193212509155, "sentence_full_gradient_variance/max_squared_error": 21334.396484375, "sentence_full_gradient_variance/metric": 645.5206298828125, "sentence_full_gradient_variance/p75": 309.0884704589844, "sentence_full_gradient_variance/p90": 1890.548583984375, "sentence_full_gradient_variance/p95": 2320.495849609375, "sentence_full_gradient_variance/p99": 7791.95263671875, "state_level_variance/metric": 19.08680534362793, "state_level_variance_full_gradient/metric": 19.864158630371094, "step": 66 }, { "accuracy_reward": 0.9166666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.0771929919719696, "action_level_variance/metric": 2.300222873687744, "action_level_variance_full_gradient/metric": 112.72853088378906, "adam_stats/lr_effective_max": 2.5537075998727232e-05, "adam_stats/lr_effective_mean": -4.695963756740085e-11, "adam_stats/lr_effective_min": -2.5282419301220216e-05, "adam_stats/m_t_max": 0.002475832821801305, "adam_stats/m_t_mean": -1.9968001210846076e-11, "adam_stats/m_t_min": -0.002592527074739337, "adam_stats/v_t_max": 7.38869421184063e-05, "adam_stats/v_t_mean": 5.819598275502713e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.9680583477020264, "advantages/median": 0.0, "advantages/min": -0.9680583477020264, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.15783360600471497, "all_logprobs": -0.07275021076202393, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.5, "all_logprobs/p1": -1.6095314025878906, "all_logprobs/p10": -0.10009765625, "all_logprobs/p25": -5.8650970458984375e-05, "all_logprobs/p5": -0.400390625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.10435042530298233, "clip_ratio": 0.0, "completion_length": 452.07293701171875, "completion_length/correct": 412.4659118652344, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 375.0, "completion_length/correct/min": 189.0, "completion_length/correct/p25": 316.75, "completion_length/correct/p75": 469.0, "completion_length/correct/var": 30158.943359375, "completion_length/incorrect": 887.75, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 935.0, "completion_length/incorrect/min": 522.0, "completion_length/incorrect/p25": 817.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 35839.64453125, "completion_length/max": 1024.0, "completion_length/median": 395.0, "completion_length/min": 189.0, "completion_length/p25": 332.0, "completion_length/p75": 490.0, "completion_length/var": 47697.5625, "epoch": 0.0536, "feature_vector_variance/max_squared_error": 137203.25, "feature_vector_variance/metric": 30915.3203125, "generated_tokens/total": 3988558.0, "grad_norm": 0.14274103939533234, "grouped_std_rewards": 0.08606629818677902, "learning_rate": 4.4494751769315e-06, "loss": -0.0, "mean_logprobs": -0.0732421875, "mean_logprobs/var": 0.0006866455078125, "num_completions/total": 6432, "per_sentence_gradient_norm": 1.8094558715820312, "per_sentence_gradient_norm/max": 20.756065368652344, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 6.70276403427124, "per_sentence_gradient_norm/p90": 9.017816543579102, "per_sentence_gradient_norm/p95": 12.371822357177734, "per_sentence_gradient_norm/p99": 15.505943298339844, "per_sentence_gradient_norm/var": 18.722131729125977, "per_token_feature_norm": 193.41653442382812, "per_token_feature_norm/max": 312.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 71.5, "per_token_feature_norm/p25": 176.0, "per_token_feature_norm/p75": 215.0, "per_token_feature_norm/var": 1130.6729736328125, "per_token_full_gradient_variance/max_squared_error": 1.147918701171875, "per_token_full_gradient_variance/variance": 0.0036523137241601944, "per_token_gradient_norm": 3.4056646823883057, "per_token_gradient_norm/max": 354.7328796386719, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 491.8701171875, "per_token_policy_error_norm": 0.04066549614071846, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.034964341670274734, "policy_entropy": 0.08164190500974655, "policy_entropy/max": 2.390625, "policy_entropy/median": 1.5832483768463135e-07, "policy_entropy/min": 8.370040771588094e-17, "policy_entropy/p25": 2.0954757928848267e-09, "policy_entropy/p75": 0.000640869140625, "policy_entropy/var": 0.05281477048993111, "policy_error_vector_variance/max_squared_error": 2.0128729343414307, "policy_error_vector_variance/metric": 0.04062298312783241, "policy_loss": -1.2417634698280722e-09, "policy_loss/max": 0.9680584669113159, "policy_loss/median": 0.0, "policy_loss/min": -0.9680584669113159, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.15783363580703735, "policy_sharpness": 8.425702095031738, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.230774879455566, "reward": 0.9166666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.0771929919719696, "rewards/accuracy_reward": 0.9166666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.0771929919719696, "sentence_full_gradient_variance/max_squared_error": 3204.433837890625, "sentence_full_gradient_variance/metric": 113.49201965332031, "sentence_full_gradient_variance/p75": 0.15269827842712402, "sentence_full_gradient_variance/p90": 44.85177230834961, "sentence_full_gradient_variance/p95": 485.095947265625, "sentence_full_gradient_variance/p99": 2933.875, "state_level_variance/metric": 19.64478302001953, "state_level_variance_full_gradient/metric": 0.7634912729263306, "step": 67 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1894736886024475, "action_level_variance/metric": 19.21941375732422, "action_level_variance_full_gradient/metric": 234.86480712890625, "adam_stats/lr_effective_max": 2.4878381736925803e-05, "adam_stats/lr_effective_mean": -8.199033119105081e-11, "adam_stats/lr_effective_min": -2.4692799343029037e-05, "adam_stats/m_t_max": 0.00233409833163023, "adam_stats/m_t_mean": -2.6701075378499084e-11, "adam_stats/m_t_min": -0.002538143889978528, "adam_stats/v_t_max": 7.387698133243248e-05, "adam_stats/v_t_mean": 5.817570817440165e-12, "adam_stats/v_t_min": 0.0, "advantages": 4.967053879312289e-09, "advantages/max": 3.7485008239746094, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": -0.062475014477968216, "advantages/p75": 0.09146386384963989, "advantages/var": 0.47340402007102966, "all_logprobs": -0.06892933696508408, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -14.75, "all_logprobs/p1": -1.703125, "all_logprobs/p10": -0.068359375, "all_logprobs/p25": -2.4318695068359375e-05, "all_logprobs/p5": -0.384765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1095844954252243, "clip_ratio": 0.0, "completion_length": 663.1771240234375, "completion_length/correct": 603.625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 564.0, "completion_length/correct/min": 224.0, "completion_length/correct/p25": 425.5, "completion_length/correct/p75": 789.75, "completion_length/correct/var": 58048.69140625, "completion_length/incorrect": 841.8333740234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 805.0, "completion_length/incorrect/min": 456.0, "completion_length/incorrect/p25": 768.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 21411.36328125, "completion_length/max": 1024.0, "completion_length/median": 686.0, "completion_length/min": 224.0, "completion_length/p25": 467.5, "completion_length/p75": 847.25, "completion_length/var": 59318.90625, "epoch": 0.0544, "feature_vector_variance/max_squared_error": 144211.296875, "feature_vector_variance/metric": 31391.71484375, "generated_tokens/total": 4052223.0, "grad_norm": 0.1955428123474121, "grouped_std_rewards": 0.18398544192314148, "learning_rate": 4.212216399081919e-06, "loss": 0.0, "mean_logprobs": -0.06884765625, "mean_logprobs/var": 0.000934600830078125, "num_completions/total": 6528, "per_sentence_gradient_norm": 3.9143378734588623, "per_sentence_gradient_norm/max": 29.167213439941406, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.2649457454681396, "per_sentence_gradient_norm/p85": 10.377056121826172, "per_sentence_gradient_norm/p90": 14.522332191467285, "per_sentence_gradient_norm/p95": 19.575687408447266, "per_sentence_gradient_norm/p99": 28.301063537597656, "per_sentence_gradient_norm/var": 45.33161926269531, "per_token_feature_norm": 195.09767150878906, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 199.0, "per_token_feature_norm/min": 71.5, "per_token_feature_norm/p25": 176.0, "per_token_feature_norm/p75": 220.0, "per_token_feature_norm/var": 1280.460693359375, "per_token_full_gradient_variance/max_squared_error": 7.152022838592529, "per_token_full_gradient_variance/variance": 0.007606399245560169, "per_token_gradient_norm": 4.853752136230469, "per_token_gradient_norm/max": 1110.493408203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 897.679443359375, "per_token_policy_error_norm": 0.03790188208222389, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.033712659031152725, "policy_entropy": 0.07600114494562149, "policy_entropy/max": 2.875, "policy_entropy/median": 1.3131648302078247e-07, "policy_entropy/min": 5.811323644522304e-17, "policy_entropy/p25": 1.7535057850182056e-09, "policy_entropy/p75": 0.0002899169921875, "policy_entropy/var": 0.0534132644534111, "policy_error_vector_variance/max_squared_error": 1.996622920036316, "policy_error_vector_variance/metric": 0.03786493092775345, "policy_loss": -1.2417634698280722e-09, "policy_loss/max": 2.560988187789917, "policy_loss/median": 0.0, "policy_loss/min": -3.7485008239746094, "policy_loss/p25": -0.0914638563990593, "policy_loss/p75": 0.06247501075267792, "policy_loss/var": 0.47340402007102966, "policy_sharpness": 8.540515899658203, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.723688125610352, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.1894736886024475, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1894736886024475, "sentence_full_gradient_variance/max_squared_error": 3510.384033203125, "sentence_full_gradient_variance/metric": 274.6795654296875, "sentence_full_gradient_variance/p75": 81.07925415039062, "sentence_full_gradient_variance/p90": 1082.1409912109375, "sentence_full_gradient_variance/p95": 1863.7293701171875, "sentence_full_gradient_variance/p99": 3406.69775390625, "state_level_variance/metric": 32.20945358276367, "state_level_variance_full_gradient/metric": 39.814735412597656, "step": 68 }, { "accuracy_reward": 0.7083333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20877192914485931, "action_level_variance/metric": 29.320804595947266, "action_level_variance_full_gradient/metric": 165.77667236328125, "adam_stats/lr_effective_max": 2.2448932213592343e-05, "adam_stats/lr_effective_mean": -1.1687481538125155e-10, "adam_stats/lr_effective_min": -2.139129901479464e-05, "adam_stats/m_t_max": 0.0022967313416302204, "adam_stats/m_t_mean": -2.6026993060757064e-11, "adam_stats/m_t_min": -0.002294419100508094, "adam_stats/v_t_max": 7.387928781099617e-05, "adam_stats/v_t_mean": 5.820274383977475e-12, "adam_stats/v_t_min": 0.0, "advantages": -2.4835269396561444e-09, "advantages/max": 1.249750018119812, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.31563395261764526, "all_logprobs": -0.06410738825798035, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.375, "all_logprobs/p1": -1.5, "all_logprobs/p10": -0.06220698356628418, "all_logprobs/p25": -6.198883056640625e-05, "all_logprobs/p5": -0.3357419967651367, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.09083027392625809, "clip_ratio": 0.0, "completion_length": 494.6145935058594, "completion_length/correct": 491.4411926269531, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 382.0, "completion_length/correct/min": 128.0, "completion_length/correct/p25": 323.0, "completion_length/correct/p75": 749.0, "completion_length/correct/var": 60604.9375, "completion_length/incorrect": 502.3214416503906, "completion_length/incorrect/max": 934.0, "completion_length/incorrect/median": 398.0, "completion_length/incorrect/min": 227.0, "completion_length/incorrect/p25": 328.5, "completion_length/incorrect/p75": 716.0, "completion_length/incorrect/var": 46675.9296875, "completion_length/max": 1024.0, "completion_length/median": 393.0, "completion_length/min": 128.0, "completion_length/p25": 323.0, "completion_length/p75": 727.25, "completion_length/var": 56032.9375, "epoch": 0.0552, "feature_vector_variance/max_squared_error": 126279.1796875, "feature_vector_variance/metric": 32077.9296875, "generated_tokens/total": 4099706.0, "grad_norm": 0.28833624720573425, "grouped_std_rewards": 0.14026084542274475, "learning_rate": 3.978963279105821e-06, "loss": 0.0, "mean_logprobs": -0.072265625, "mean_logprobs/var": 0.0008697509765625, "num_completions/total": 6624, "per_sentence_gradient_norm": 2.741046667098999, "per_sentence_gradient_norm/max": 46.347900390625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 4.543150901794434, "per_sentence_gradient_norm/p85": 5.507354736328125, "per_sentence_gradient_norm/p90": 6.251715183258057, "per_sentence_gradient_norm/p95": 9.121657371520996, "per_sentence_gradient_norm/p99": 39.86115264892578, "per_sentence_gradient_norm/var": 43.63969802856445, "per_token_feature_norm": 201.25619506835938, "per_token_feature_norm/max": 290.0, "per_token_feature_norm/median": 202.0, "per_token_feature_norm/min": 72.5, "per_token_feature_norm/p25": 187.0, "per_token_feature_norm/p75": 220.0, "per_token_feature_norm/var": 816.7794799804688, "per_token_full_gradient_variance/max_squared_error": 4.021784782409668, "per_token_full_gradient_variance/variance": 0.006372447591274977, "per_token_gradient_norm": 3.505768060684204, "per_token_gradient_norm/max": 752.6904296875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 686.997802734375, "per_token_policy_error_norm": 0.036474280059337616, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03220623359084129, "policy_entropy": 0.07121346145868301, "policy_entropy/max": 2.078125, "policy_entropy/median": 1.2014061212539673e-07, "policy_entropy/min": 2.2724877535296173e-16, "policy_entropy/p25": 1.1859810911118984e-09, "policy_entropy/p75": 0.000701904296875, "policy_entropy/var": 0.043656185269355774, "policy_error_vector_variance/max_squared_error": 2.0093631744384766, "policy_error_vector_variance/metric": 0.03644111379981041, "policy_loss": 3.725290298461914e-09, "policy_loss/max": 2.560988187789917, "policy_loss/median": 0.0, "policy_loss/min": -1.249750018119812, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.31563395261764526, "policy_sharpness": 8.488659858703613, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.666143417358398, "reward": 0.7083333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20877192914485931, "rewards/accuracy_reward": 0.7083333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20877192914485931, "sentence_full_gradient_variance/max_squared_error": 2959.396728515625, "sentence_full_gradient_variance/metric": 174.93685913085938, "sentence_full_gradient_variance/p75": 18.81890106201172, "sentence_full_gradient_variance/p90": 517.8951416015625, "sentence_full_gradient_variance/p95": 829.8175659179688, "sentence_full_gradient_variance/p99": 2718.49609375, "state_level_variance/metric": 18.83623504638672, "state_level_variance_full_gradient/metric": 9.16019058227539, "step": 69 }, { "accuracy_reward": 0.6458333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2311403602361679, "action_level_variance/metric": 251.622314453125, "action_level_variance_full_gradient/metric": 1564.4212646484375, "adam_stats/lr_effective_max": 2.056633456959389e-05, "adam_stats/lr_effective_mean": -1.2149790895588097e-10, "adam_stats/lr_effective_min": -2.1378788005677052e-05, "adam_stats/m_t_max": 0.002979786368086934, "adam_stats/m_t_mean": -1.4077311365212619e-11, "adam_stats/m_t_min": -0.003110560355708003, "adam_stats/v_t_max": 7.423604256473482e-05, "adam_stats/v_t_mean": 5.836970230072014e-12, "adam_stats/v_t_min": 0.0, "advantages": -4.967053879312289e-09, "advantages/max": 2.560988187789917, "advantages/median": 0.0, "advantages/min": -2.0150647163391113, "advantages/p25": -0.36585545539855957, "advantages/p75": 0.46501490473747253, "advantages/var": 0.6312593817710876, "all_logprobs": -0.09600672125816345, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.1875, "all_logprobs/p1": -1.9296875, "all_logprobs/p10": -0.181640625, "all_logprobs/p25": -0.000335693359375, "all_logprobs/p5": -0.59375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.14362844824790955, "clip_ratio": 0.0, "completion_length": 694.6875, "completion_length/correct": 647.6612548828125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 621.0, "completion_length/correct/min": 207.0, "completion_length/correct/p25": 532.0, "completion_length/correct/p75": 722.5, "completion_length/correct/var": 30433.408203125, "completion_length/incorrect": 780.441162109375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 985.0, "completion_length/incorrect/min": 231.0, "completion_length/incorrect/p25": 522.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 81072.1875, "completion_length/max": 1024.0, "completion_length/median": 650.0, "completion_length/min": 207.0, "completion_length/p25": 530.5, "completion_length/p75": 941.5, "completion_length/var": 51778.48828125, "epoch": 0.056, "feature_vector_variance/max_squared_error": 118584.0078125, "feature_vector_variance/metric": 32516.345703125, "generated_tokens/total": 4166396.0, "grad_norm": 0.48194488883018494, "grouped_std_rewards": 0.2658340632915497, "learning_rate": 3.750000000000002e-06, "loss": -0.0, "mean_logprobs": -0.10693359375, "mean_logprobs/var": 0.005645751953125, "num_completions/total": 6720, "per_sentence_gradient_norm": 9.42122745513916, "per_sentence_gradient_norm/max": 121.70040130615234, "per_sentence_gradient_norm/median": 5.079833030700684, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 11.43851375579834, "per_sentence_gradient_norm/p85": 14.645781517028809, "per_sentence_gradient_norm/p90": 16.362342834472656, "per_sentence_gradient_norm/p95": 30.422754287719727, "per_sentence_gradient_norm/p99": 113.7850570678711, "per_sentence_gradient_norm/var": 314.2202453613281, "per_token_feature_norm": 191.8409423828125, "per_token_feature_norm/max": 302.0, "per_token_feature_norm/median": 194.0, "per_token_feature_norm/min": 73.0, "per_token_feature_norm/p25": 170.0, "per_token_feature_norm/p75": 216.0, "per_token_feature_norm/var": 1284.9990234375, "per_token_full_gradient_variance/max_squared_error": 9.114413261413574, "per_token_full_gradient_variance/variance": 0.011312505230307579, "per_token_gradient_norm": 8.236249923706055, "per_token_gradient_norm/max": 884.3812255859375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1549.0943603515625, "per_token_policy_error_norm": 0.05250444635748863, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.044728998094797134, "policy_entropy": 0.10775791108608246, "policy_entropy/max": 3.328125, "policy_entropy/median": 4.0978193283081055e-07, "policy_entropy/min": 1.9984014443252818e-15, "policy_entropy/p25": 4.94765117764473e-09, "policy_entropy/p75": 0.0030975341796875, "policy_entropy/var": 0.08049659430980682, "policy_error_vector_variance/max_squared_error": 2.008110523223877, "policy_error_vector_variance/metric": 0.052476879209280014, "policy_loss": 0.0, "policy_loss/max": 2.0150647163391113, "policy_loss/median": 0.0, "policy_loss/min": -2.560988187789917, "policy_loss/p25": -0.46501487493515015, "policy_loss/p75": 0.36585545539855957, "policy_loss/var": 0.6312593817710876, "policy_sharpness": 8.177010536193848, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.662294387817383, "reward": 0.6458333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2311403602361679, "rewards/accuracy_reward": 0.6458333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2311403602361679, "sentence_full_gradient_variance/max_squared_error": 59680.0625, "sentence_full_gradient_variance/metric": 1608.6175537109375, "sentence_full_gradient_variance/p75": 511.9420166015625, "sentence_full_gradient_variance/p90": 1168.255859375, "sentence_full_gradient_variance/p95": 2268.87109375, "sentence_full_gradient_variance/p99": 41565.1953125, "state_level_variance/metric": 90.06141662597656, "state_level_variance_full_gradient/metric": 44.19615936279297, "step": 70 }, { "accuracy_reward": 0.8229166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14725877344608307, "action_level_variance/metric": 23.543167114257812, "action_level_variance_full_gradient/metric": 4.620649814605713, "adam_stats/lr_effective_max": 1.9322624211781658e-05, "adam_stats/lr_effective_mean": -1.1111808695396519e-10, "adam_stats/lr_effective_min": -1.887339931272436e-05, "adam_stats/m_t_max": 0.0025543966330587864, "adam_stats/m_t_mean": -1.3961201986156802e-11, "adam_stats/m_t_min": -0.0030985765624791384, "adam_stats/v_t_max": 7.416505104629323e-05, "adam_stats/v_t_mean": 5.832537577910024e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.2417634698280722e-09, "advantages/max": 0.24990005791187286, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.15776851773262024, "all_logprobs": -0.08907013386487961, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.9375, "all_logprobs/p1": -1.890625, "all_logprobs/p10": -0.150390625, "all_logprobs/p25": -0.000431060791015625, "all_logprobs/p5": -0.53515625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.13665185868740082, "clip_ratio": 0.0, "completion_length": 540.6354370117188, "completion_length/correct": 444.2025451660156, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 405.0, "completion_length/correct/min": 127.0, "completion_length/correct/p25": 317.5, "completion_length/correct/p75": 609.5, "completion_length/correct/var": 39006.73046875, "completion_length/incorrect": 988.7647094726562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 774.0, "completion_length/incorrect/p25": 992.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 4832.69140625, "completion_length/max": 1024.0, "completion_length/median": 460.0, "completion_length/min": 127.0, "completion_length/p25": 325.0, "completion_length/p75": 698.25, "completion_length/var": 76509.7890625, "epoch": 0.0568, "feature_vector_variance/max_squared_error": 118013.9296875, "feature_vector_variance/metric": 33344.7265625, "generated_tokens/total": 4218297.0, "grad_norm": 0.10656186193227768, "grouped_std_rewards": 0.0416666679084301, "learning_rate": 3.525605518250964e-06, "loss": -0.0, "mean_logprobs": -0.09033203125, "mean_logprobs/var": 0.0010528564453125, "num_completions/total": 6816, "per_sentence_gradient_norm": 0.9625166654586792, "per_sentence_gradient_norm/max": 50.30760192871094, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 2.2265231609344482, "per_sentence_gradient_norm/p90": 2.6687169075012207, "per_sentence_gradient_norm/p95": 3.0000834465026855, "per_sentence_gradient_norm/p99": 6.268429279327393, "per_sentence_gradient_norm/var": 26.985002517700195, "per_token_feature_norm": 193.51922607421875, "per_token_feature_norm/max": 306.0, "per_token_feature_norm/median": 196.0, "per_token_feature_norm/min": 74.0, "per_token_feature_norm/p25": 173.0, "per_token_feature_norm/p75": 217.0, "per_token_feature_norm/var": 1257.1068115234375, "per_token_full_gradient_variance/max_squared_error": 10.801697731018066, "per_token_full_gradient_variance/variance": 0.005096917040646076, "per_token_gradient_norm": 1.2080130577087402, "per_token_gradient_norm/max": 942.2501831054688, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 513.4998779296875, "per_token_policy_error_norm": 0.048923294991254807, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.042470138520002365, "policy_entropy": 0.09802836924791336, "policy_entropy/max": 3.640625, "policy_entropy/median": 6.332993507385254e-07, "policy_entropy/min": 3.7643499428696714e-16, "policy_entropy/p25": 8.440110832452774e-09, "policy_entropy/p75": 0.0038604736328125, "policy_entropy/var": 0.06727822124958038, "policy_error_vector_variance/max_squared_error": 2.0093069076538086, "policy_error_vector_variance/metric": 0.04888000711798668, "policy_loss": -4.967053879312289e-09, "policy_loss/max": 3.7485005855560303, "policy_loss/median": 0.0, "policy_loss/min": -0.24990007281303406, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.15776848793029785, "policy_sharpness": 8.183331489562988, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.75, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.331411361694336, "reward": 0.8229166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14725877344608307, "rewards/accuracy_reward": 0.8229166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14725877344608307, "sentence_full_gradient_variance/max_squared_error": 483.49163818359375, "sentence_full_gradient_variance/metric": 57.997711181640625, "sentence_full_gradient_variance/p75": 10.67541217803955, "sentence_full_gradient_variance/p90": 299.9891357421875, "sentence_full_gradient_variance/p95": 313.7314147949219, "sentence_full_gradient_variance/p99": 360.12066650390625, "state_level_variance/metric": 5.558629035949707, "state_level_variance_full_gradient/metric": 53.37705993652344, "step": 71 }, { "accuracy_reward": 0.8125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1539473831653595, "action_level_variance/metric": 24.3523006439209, "action_level_variance_full_gradient/metric": 1000.2802124023438, "adam_stats/lr_effective_max": 1.722726119623985e-05, "adam_stats/lr_effective_mean": -8.38028119121148e-11, "adam_stats/lr_effective_min": -1.85446442628745e-05, "adam_stats/m_t_max": 0.002781134797260165, "adam_stats/m_t_mean": -2.0930182934031372e-11, "adam_stats/m_t_min": -0.002864249749109149, "adam_stats/v_t_max": 7.409809768432751e-05, "adam_stats/v_t_mean": 5.834947108818156e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.2417634698280722e-09, "advantages/max": 1.6766761541366577, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.47342148423194885, "all_logprobs": -0.06622076779603958, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.8125, "all_logprobs/p1": -1.5625, "all_logprobs/p10": -0.06298828125, "all_logprobs/p25": -4.023313522338867e-05, "all_logprobs/p5": -0.341796875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.09621085226535797, "clip_ratio": 0.0, "completion_length": 448.60418701171875, "completion_length/correct": 416.923095703125, "completion_length/correct/max": 765.0, "completion_length/correct/median": 389.0, "completion_length/correct/min": 152.0, "completion_length/correct/p25": 325.25, "completion_length/correct/p75": 469.75, "completion_length/correct/var": 20543.34765625, "completion_length/incorrect": 585.888916015625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 476.0, "completion_length/incorrect/min": 350.0, "completion_length/incorrect/p25": 415.5, "completion_length/incorrect/p75": 651.0, "completion_length/incorrect/var": 54254.80859375, "completion_length/max": 1024.0, "completion_length/median": 408.0, "completion_length/min": 152.0, "completion_length/p25": 339.75, "completion_length/p75": 497.75, "completion_length/var": 30754.7890625, "epoch": 0.0576, "feature_vector_variance/max_squared_error": 114673.421875, "feature_vector_variance/metric": 32218.8046875, "generated_tokens/total": 4261363.0, "grad_norm": 0.27632826566696167, "grouped_std_rewards": 0.19598786532878876, "learning_rate": 3.3060532239694e-06, "loss": 0.0, "mean_logprobs": -0.0673828125, "mean_logprobs/var": 0.00080108642578125, "num_completions/total": 6912, "per_sentence_gradient_norm": 3.3570821285247803, "per_sentence_gradient_norm/max": 41.374603271484375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 4.682910919189453, "per_sentence_gradient_norm/p85": 6.147737979888916, "per_sentence_gradient_norm/p90": 8.268537521362305, "per_sentence_gradient_norm/p95": 15.676555633544922, "per_sentence_gradient_norm/p99": 21.56622886657715, "per_sentence_gradient_norm/var": 36.069435119628906, "per_token_feature_norm": 195.9417266845703, "per_token_feature_norm/max": 290.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 71.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 216.0, "per_token_feature_norm/var": 973.5928344726562, "per_token_full_gradient_variance/max_squared_error": 11.687819480895996, "per_token_full_gradient_variance/variance": 0.009539477527141571, "per_token_gradient_norm": 4.528555870056152, "per_token_gradient_norm/max": 1173.6029052734375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1098.0439453125, "per_token_policy_error_norm": 0.037639129906892776, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03363784775137901, "policy_entropy": 0.07222054898738861, "policy_entropy/max": 3.796875, "policy_entropy/median": 2.551823854446411e-07, "policy_entropy/min": 2.3071822230491534e-16, "policy_entropy/p25": 3.4779077395796776e-09, "policy_entropy/p75": 0.00045013427734375, "policy_entropy/var": 0.04658317193388939, "policy_error_vector_variance/max_squared_error": 2.001730442047119, "policy_error_vector_variance/metric": 0.03758755326271057, "policy_loss": -1.2417634698280722e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -1.6766761541366577, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.0, "policy_loss/var": 0.47342148423194885, "policy_sharpness": 8.519844055175781, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.724663734436035, "reward": 0.8125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1539473831653595, "rewards/accuracy_reward": 0.8125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1539473831653595, "sentence_full_gradient_variance/max_squared_error": 14711.7802734375, "sentence_full_gradient_variance/metric": 1053.5755615234375, "sentence_full_gradient_variance/p75": 509.05572509765625, "sentence_full_gradient_variance/p90": 2218.34326171875, "sentence_full_gradient_variance/p95": 7114.0, "sentence_full_gradient_variance/p99": 12146.416015625, "state_level_variance/metric": 15.436113357543945, "state_level_variance_full_gradient/metric": 53.29531478881836, "step": 72 }, { "accuracy_reward": 0.9375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.05921052768826485, "action_level_variance/metric": 0.47125983238220215, "action_level_variance_full_gradient/metric": 29.394214630126953, "adam_stats/lr_effective_max": 1.538585092930589e-05, "adam_stats/lr_effective_mean": -6.751780384783856e-11, "adam_stats/lr_effective_min": -1.7452402971684933e-05, "adam_stats/m_t_max": 0.0025854185223579407, "adam_stats/m_t_mean": -1.7776440042194253e-11, "adam_stats/m_t_min": -0.0027693226002156734, "adam_stats/v_t_max": 7.403568451991305e-05, "adam_stats/v_t_mean": 5.829769392923234e-12, "adam_stats/v_t_min": 0.0, "advantages": 2.4835269396561444e-09, "advantages/max": 0.7498500347137451, "advantages/median": 0.0, "advantages/min": -1.249750018119812, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.1578315943479538, "all_logprobs": -0.07909592986106873, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.625, "all_logprobs/p1": -1.734375, "all_logprobs/p10": -0.1103515625, "all_logprobs/p25": -4.553794860839844e-05, "all_logprobs/p5": -0.48046875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11283378303050995, "clip_ratio": 0.0, "completion_length": 509.9895935058594, "completion_length/correct": 475.72222900390625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 387.0, "completion_length/correct/min": 188.0, "completion_length/correct/p25": 330.0, "completion_length/correct/p75": 593.5, "completion_length/correct/var": 45749.6796875, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 398.0, "completion_length/min": 188.0, "completion_length/p25": 332.25, "completion_length/p75": 654.25, "completion_length/var": 60659.4140625, "epoch": 0.0584, "feature_vector_variance/max_squared_error": 138485.375, "feature_vector_variance/metric": 30964.2890625, "generated_tokens/total": 4310322.0, "grad_norm": 0.0888366773724556, "grouped_std_rewards": 0.0833333358168602, "learning_rate": 3.0916106078064522e-06, "loss": -0.0, "mean_logprobs": -0.08349609375, "mean_logprobs/var": 0.00118255615234375, "num_completions/total": 7008, "per_sentence_gradient_norm": 1.17232346534729, "per_sentence_gradient_norm/max": 9.971136093139648, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 4.914816856384277, "per_sentence_gradient_norm/p90": 6.164263725280762, "per_sentence_gradient_norm/p95": 8.05945110321045, "per_sentence_gradient_norm/p99": 9.44347095489502, "per_sentence_gradient_norm/var": 7.390501499176025, "per_token_feature_norm": 193.9481964111328, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 69.5, "per_token_feature_norm/p25": 171.0, "per_token_feature_norm/p75": 221.0, "per_token_feature_norm/var": 1446.860595703125, "per_token_full_gradient_variance/max_squared_error": 1.3462516069412231, "per_token_full_gradient_variance/variance": 0.0024159643799066544, "per_token_gradient_norm": 2.063359022140503, "per_token_gradient_norm/max": 402.1070556640625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 311.9881591796875, "per_token_policy_error_norm": 0.04453929886221886, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.037799060344696045, "policy_entropy": 0.08805005997419357, "policy_entropy/max": 2.5, "policy_entropy/median": 2.253800630569458e-07, "policy_entropy/min": 5.30825383648903e-16, "policy_entropy/p25": 3.812601789832115e-09, "policy_entropy/p75": 0.0005035400390625, "policy_entropy/var": 0.06274548172950745, "policy_error_vector_variance/max_squared_error": 1.996377944946289, "policy_error_vector_variance/metric": 0.044512003660202026, "policy_loss": -1.2417634698280722e-09, "policy_loss/max": 1.249750018119812, "policy_loss/median": 0.0, "policy_loss/min": -0.7498500943183899, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.1578315943479538, "policy_sharpness": 8.428297996520996, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.388504981994629, "reward": 0.9375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.05921052768826485, "rewards/accuracy_reward": 0.9375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.05921052768826485, "sentence_full_gradient_variance/max_squared_error": 1610.2296142578125, "sentence_full_gradient_variance/metric": 47.482704162597656, "sentence_full_gradient_variance/p75": 3.6176981925964355, "sentence_full_gradient_variance/p90": 3.624549150466919, "sentence_full_gradient_variance/p95": 213.00575256347656, "sentence_full_gradient_variance/p99": 962.9436645507812, "state_level_variance/metric": 8.246053695678711, "state_level_variance_full_gradient/metric": 18.08849334716797, "step": 73 }, { "accuracy_reward": 0.8333333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14035087823867798, "action_level_variance/metric": 0.0, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 1.2917248568555806e-05, "adam_stats/lr_effective_mean": -5.667972688416789e-11, "adam_stats/lr_effective_min": -1.4652157005912159e-05, "adam_stats/m_t_max": 0.002326876623556018, "adam_stats/m_t_mean": -1.5998792568527875e-11, "adam_stats/m_t_min": -0.002492390340194106, "adam_stats/v_t_max": 7.396165165118873e-05, "adam_stats/v_t_mean": 5.823938987320476e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.10740403831005096, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -8.8125, "all_logprobs/p1": -2.125, "all_logprobs/p10": -0.2353515625, "all_logprobs/p25": -0.0031890869140625, "all_logprobs/p5": -0.69140625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.15782365202903748, "clip_ratio": 0.0, "completion_length": 417.46875, "completion_length/correct": 376.4250183105469, "completion_length/correct/max": 568.0, "completion_length/correct/median": 391.0, "completion_length/correct/min": 138.0, "completion_length/correct/p25": 279.25, "completion_length/correct/p75": 493.25, "completion_length/correct/var": 15743.5400390625, "completion_length/incorrect": 622.6875, "completion_length/incorrect/max": 966.0, "completion_length/incorrect/median": 625.0, "completion_length/incorrect/min": 334.0, "completion_length/incorrect/p25": 508.75, "completion_length/incorrect/p75": 719.0, "completion_length/incorrect/var": 32925.1640625, "completion_length/max": 966.0, "completion_length/median": 409.0, "completion_length/min": 138.0, "completion_length/p25": 292.5, "completion_length/p75": 528.75, "completion_length/var": 26802.3125, "epoch": 0.0592, "feature_vector_variance/max_squared_error": 110746.40625, "feature_vector_variance/metric": 33774.79296875, "generated_tokens/total": 4350399.0, "grad_norm": 0.0, "grouped_std_rewards": 0.0, "learning_rate": 2.882538935057563e-06, "loss": 0.0, "mean_logprobs": -0.1044921875, "mean_logprobs/var": 0.0027923583984375, "num_completions/total": 7104, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 191.0636444091797, "per_token_feature_norm/max": 302.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 73.5, "per_token_feature_norm/p25": 167.0, "per_token_feature_norm/p75": 217.0, "per_token_feature_norm/var": 1450.484130859375, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.059708692133426666, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.051734790205955505, "policy_entropy": 0.11726308614015579, "policy_entropy/max": 2.4375, "policy_entropy/median": 2.3096799850463867e-06, "policy_entropy/min": 2.0261570199409107e-15, "policy_entropy/p25": 1.2922100722789764e-08, "policy_entropy/p75": 0.0228271484375, "policy_entropy/var": 0.07863841205835342, "policy_error_vector_variance/max_squared_error": 1.9945249557495117, "policy_error_vector_variance/metric": 0.059684354811906815, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 7.812804698944092, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.74212646484375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.670184135437012, "reward": 0.8333333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14035087823867798, "rewards/accuracy_reward": 0.8333333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14035087823867798, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 74 }, { "accuracy_reward": 0.8333333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14035087823867798, "action_level_variance/metric": 38.6370964050293, "action_level_variance_full_gradient/metric": 182.4850311279297, "adam_stats/lr_effective_max": 1.2737757060676813e-05, "adam_stats/lr_effective_mean": -4.018295418628526e-11, "adam_stats/lr_effective_min": -1.2631949175556656e-05, "adam_stats/m_t_max": 0.002298656851053238, "adam_stats/m_t_mean": -1.2249009966047719e-11, "adam_stats/m_t_min": -0.0022219091188162565, "adam_stats/v_t_max": 7.39213137421757e-05, "adam_stats/v_t_mean": 5.822207299610582e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 3.7485008239746094, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.3155370354652405, "all_logprobs": -0.08980529010295868, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.3125, "all_logprobs/p1": -1.9140625, "all_logprobs/p10": -0.1455078125, "all_logprobs/p25": -0.0002574920654296875, "all_logprobs/p5": -0.5677728652954102, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.13700735569000244, "clip_ratio": 0.0, "completion_length": 499.10418701171875, "completion_length/correct": 426.7375183105469, "completion_length/correct/max": 833.0, "completion_length/correct/median": 402.0, "completion_length/correct/min": 150.0, "completion_length/correct/p25": 326.75, "completion_length/correct/p75": 532.25, "completion_length/correct/var": 17962.04296875, "completion_length/incorrect": 860.9375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 892.0, "completion_length/incorrect/min": 398.0, "completion_length/incorrect/p25": 716.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 35899.52734375, "completion_length/max": 1024.0, "completion_length/median": 458.0, "completion_length/min": 150.0, "completion_length/p25": 333.0, "completion_length/p75": 568.25, "completion_length/var": 47065.50390625, "epoch": 0.06, "feature_vector_variance/max_squared_error": 133494.625, "feature_vector_variance/metric": 31708.15625, "generated_tokens/total": 4398313.0, "grad_norm": 0.21433010697364807, "grouped_std_rewards": 0.0833333358168602, "learning_rate": 2.6790929273509547e-06, "loss": 0.0, "mean_logprobs": -0.08984375, "mean_logprobs/var": 0.00122833251953125, "num_completions/total": 7200, "per_sentence_gradient_norm": 2.021716594696045, "per_sentence_gradient_norm/max": 56.71830749511719, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.664463996887207, "per_sentence_gradient_norm/p85": 3.5446975231170654, "per_sentence_gradient_norm/p90": 4.071531772613525, "per_sentence_gradient_norm/p95": 4.842476844787598, "per_sentence_gradient_norm/p99": 33.65367126464844, "per_sentence_gradient_norm/var": 45.07036209106445, "per_token_feature_norm": 189.32286071777344, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 71.5, "per_token_feature_norm/p25": 164.0, "per_token_feature_norm/p75": 216.0, "per_token_feature_norm/var": 1488.234619140625, "per_token_full_gradient_variance/max_squared_error": 9.227781295776367, "per_token_full_gradient_variance/variance": 0.004852759651839733, "per_token_gradient_norm": 2.326951026916504, "per_token_gradient_norm/max": 1374.7626953125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 667.9996948242188, "per_token_policy_error_norm": 0.04942289739847183, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.043119918555021286, "policy_entropy": 0.09794772416353226, "policy_entropy/max": 2.890625, "policy_entropy/median": 7.115304470062256e-07, "policy_entropy/min": 2.373101715136272e-15, "policy_entropy/p25": 1.0710209608078003e-08, "policy_entropy/p75": 0.002410888671875, "policy_entropy/var": 0.06894121319055557, "policy_error_vector_variance/max_squared_error": 2.0086445808410645, "policy_error_vector_variance/metric": 0.04937860369682312, "policy_loss": 1.2417634698280722e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -3.7485008239746094, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.3155370354652405, "policy_sharpness": 8.239885330200195, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.5, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.210759162902832, "reward": 0.8333333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14035087823867798, "rewards/accuracy_reward": 0.8333333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14035087823867798, "sentence_full_gradient_variance/max_squared_error": 9964.0732421875, "sentence_full_gradient_variance/metric": 185.00979614257812, "sentence_full_gradient_variance/p75": 6.714726448059082, "sentence_full_gradient_variance/p90": 97.06320190429688, "sentence_full_gradient_variance/p95": 145.09454345703125, "sentence_full_gradient_variance/p99": 6056.0673828125, "state_level_variance/metric": 10.054317474365234, "state_level_variance_full_gradient/metric": 2.5247812271118164, "step": 75 }, { "accuracy_reward": 0.96875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.03059210255742073, "action_level_variance/metric": 82.72062683105469, "action_level_variance_full_gradient/metric": 200.04330444335938, "adam_stats/lr_effective_max": 1.2326425348874182e-05, "adam_stats/lr_effective_mean": -4.380313473051345e-11, "adam_stats/lr_effective_min": -1.2262534255569335e-05, "adam_stats/m_t_max": 0.001994023099541664, "adam_stats/m_t_mean": -2.5541579268262282e-11, "adam_stats/m_t_min": -0.0016734441742300987, "adam_stats/v_t_max": 7.394515705527738e-05, "adam_stats/v_t_mean": 5.8241406489245584e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.2417634698280722e-09, "advantages/max": 0.36585545539855957, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.3155708611011505, "all_logprobs": -0.09265151619911194, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.0, "all_logprobs/p1": -1.859375, "all_logprobs/p10": -0.171875, "all_logprobs/p25": -0.000553131103515625, "all_logprobs/p5": -0.57421875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.13776010274887085, "clip_ratio": 0.0, "completion_length": 446.90625, "completion_length/correct": 441.8494567871094, "completion_length/correct/max": 959.0, "completion_length/correct/median": 413.0, "completion_length/correct/min": 181.0, "completion_length/correct/p25": 287.0, "completion_length/correct/p75": 556.0, "completion_length/correct/var": 31096.95703125, "completion_length/incorrect": 603.6666870117188, "completion_length/incorrect/max": 769.0, "completion_length/incorrect/median": 555.0, "completion_length/incorrect/min": 487.0, "completion_length/incorrect/p25": 521.0, "completion_length/incorrect/p75": 662.0, "completion_length/incorrect/var": 21657.333984375, "completion_length/max": 959.0, "completion_length/median": 416.0, "completion_length/min": 181.0, "completion_length/p25": 290.0, "completion_length/p75": 558.0, "completion_length/var": 31371.93359375, "epoch": 0.0608, "feature_vector_variance/max_squared_error": 129669.0703125, "feature_vector_variance/metric": 32503.48046875, "generated_tokens/total": 4441216.0, "grad_norm": 0.2976316213607788, "grouped_std_rewards": 0.09859417378902435, "learning_rate": 2.4815204523085656e-06, "loss": -0.0, "mean_logprobs": -0.08837890625, "mean_logprobs/var": 0.0019683837890625, "num_completions/total": 7296, "per_sentence_gradient_norm": 3.1946425437927246, "per_sentence_gradient_norm/max": 84.43208312988281, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 4.044071197509766, "per_sentence_gradient_norm/p85": 5.48164701461792, "per_sentence_gradient_norm/p90": 6.408505916595459, "per_sentence_gradient_norm/p95": 8.788952827453613, "per_sentence_gradient_norm/p99": 38.571067810058594, "per_sentence_gradient_norm/var": 99.09716033935547, "per_token_feature_norm": 190.2322235107422, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 194.0, "per_token_feature_norm/min": 68.0, "per_token_feature_norm/p25": 166.0, "per_token_feature_norm/p75": 216.0, "per_token_feature_norm/var": 1453.1846923828125, "per_token_full_gradient_variance/max_squared_error": 10.037344932556152, "per_token_full_gradient_variance/variance": 0.010020785965025425, "per_token_gradient_norm": 4.285585880279541, "per_token_gradient_norm/max": 1320.175048828125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1308.248046875, "per_token_policy_error_norm": 0.051387861371040344, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.044120293110609055, "policy_entropy": 0.10255327075719833, "policy_entropy/max": 2.75, "policy_entropy/median": 8.046627044677734e-07, "policy_entropy/min": 5.238864897449957e-16, "policy_entropy/p25": 1.0186340659856796e-08, "policy_entropy/p75": 0.0048370361328125, "policy_entropy/var": 0.07048825174570084, "policy_error_vector_variance/max_squared_error": 2.0081846714019775, "policy_error_vector_variance/metric": 0.051355838775634766, "policy_loss": -4.967053879312289e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -0.36585548520088196, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.0, "policy_loss/var": 0.3155708611011505, "policy_sharpness": 8.138237953186035, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.535967826843262, "reward": 0.96875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.03059210255742073, "rewards/accuracy_reward": 0.96875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.03059210255742073, "sentence_full_gradient_variance/max_squared_error": 8070.4697265625, "sentence_full_gradient_variance/metric": 200.5931396484375, "sentence_full_gradient_variance/p75": 9.29835033416748, "sentence_full_gradient_variance/p90": 166.34567260742188, "sentence_full_gradient_variance/p95": 315.46160888671875, "sentence_full_gradient_variance/p99": 6094.3818359375, "state_level_variance/metric": 24.6171875, "state_level_variance_full_gradient/metric": 0.5498355627059937, "step": 76 }, { "accuracy_reward": 0.9270833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.068311408162117, "action_level_variance/metric": 18.97671127319336, "action_level_variance_full_gradient/metric": 876.0944213867188, "adam_stats/lr_effective_max": 1.218535271618748e-05, "adam_stats/lr_effective_mean": -8.004022791774368e-12, "adam_stats/lr_effective_min": -1.2253753084223717e-05, "adam_stats/m_t_max": 0.0023500407114624977, "adam_stats/m_t_mean": -1.9862975847440012e-11, "adam_stats/m_t_min": -0.0017563438741490245, "adam_stats/v_t_max": 7.387305231532082e-05, "adam_stats/v_t_mean": 5.825985093660391e-12, "adam_stats/v_t_min": 0.0, "advantages": 2.4835269396561444e-09, "advantages/max": 0.7498500347137451, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.31560009717941284, "all_logprobs": -0.09249018877744675, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -12.3125, "all_logprobs/p1": -1.9140625, "all_logprobs/p10": -0.1630859375, "all_logprobs/p25": -0.000335693359375, "all_logprobs/p5": -0.57421875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.139400914311409, "clip_ratio": 0.0, "completion_length": 549.65625, "completion_length/correct": 557.2921142578125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 548.0, "completion_length/correct/min": 260.0, "completion_length/correct/p25": 341.0, "completion_length/correct/p75": 724.0, "completion_length/correct/var": 41446.2109375, "completion_length/incorrect": 452.5714416503906, "completion_length/incorrect/max": 625.0, "completion_length/incorrect/median": 466.0, "completion_length/incorrect/min": 341.0, "completion_length/incorrect/p25": 394.5, "completion_length/incorrect/p75": 473.5, "completion_length/incorrect/var": 9030.2861328125, "completion_length/max": 1024.0, "completion_length/median": 537.0, "completion_length/min": 260.0, "completion_length/p25": 346.25, "completion_length/p75": 711.75, "completion_length/var": 39711.74609375, "epoch": 0.0616, "feature_vector_variance/max_squared_error": 116074.8359375, "feature_vector_variance/metric": 31652.419921875, "generated_tokens/total": 4493983.0, "grad_norm": 0.24081386625766754, "grouped_std_rewards": 0.125, "learning_rate": 2.29006222155752e-06, "loss": -0.0, "mean_logprobs": -0.09912109375, "mean_logprobs/var": 0.00189208984375, "num_completions/total": 7392, "per_sentence_gradient_norm": 3.1369714736938477, "per_sentence_gradient_norm/max": 38.739315032958984, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 4.067697525024414, "per_sentence_gradient_norm/p85": 6.879178047180176, "per_sentence_gradient_norm/p90": 9.533927917480469, "per_sentence_gradient_norm/p95": 16.302703857421875, "per_sentence_gradient_norm/p99": 28.619340896606445, "per_sentence_gradient_norm/var": 41.05076217651367, "per_token_feature_norm": 187.55419921875, "per_token_feature_norm/max": 300.0, "per_token_feature_norm/median": 191.0, "per_token_feature_norm/min": 69.5, "per_token_feature_norm/p25": 162.0, "per_token_feature_norm/p75": 215.0, "per_token_feature_norm/var": 1500.42529296875, "per_token_full_gradient_variance/max_squared_error": 9.243400573730469, "per_token_full_gradient_variance/variance": 0.004438750445842743, "per_token_gradient_norm": 2.819239854812622, "per_token_gradient_norm/max": 1034.586181640625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 458.9313659667969, "per_token_policy_error_norm": 0.051097314804792404, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.044013138860464096, "policy_entropy": 0.10175012797117233, "policy_entropy/max": 2.5, "policy_entropy/median": 6.48200511932373e-07, "policy_entropy/min": 6.765421556309548e-16, "policy_entropy/p25": 1.0884832590818405e-08, "policy_entropy/p75": 0.003021240234375, "policy_entropy/var": 0.07262414693832397, "policy_error_vector_variance/max_squared_error": 2.010392904281616, "policy_error_vector_variance/metric": 0.05103909224271774, "policy_loss": -2.4835269396561444e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -0.7498501539230347, "policy_loss/p25": -0.24990004301071167, "policy_loss/p75": 0.0, "policy_loss/var": 0.31560009717941284, "policy_sharpness": 8.202157020568848, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.427827835083008, "reward": 0.9270833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.068311408162117, "rewards/accuracy_reward": 0.9270833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.068311408162117, "sentence_full_gradient_variance/max_squared_error": 39506.3046875, "sentence_full_gradient_variance/metric": 885.3992919921875, "sentence_full_gradient_variance/p75": 154.4759521484375, "sentence_full_gradient_variance/p90": 1438.9554443359375, "sentence_full_gradient_variance/p95": 3994.34326171875, "sentence_full_gradient_variance/p99": 9764.958984375, "state_level_variance/metric": 27.39898109436035, "state_level_variance_full_gradient/metric": 9.304922103881836, "step": 77 }, { "accuracy_reward": 0.7604166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1841009110212326, "action_level_variance/metric": 57.501731872558594, "action_level_variance_full_gradient/metric": 729.7646484375, "adam_stats/lr_effective_max": 1.1989312042715028e-05, "adam_stats/lr_effective_mean": -1.4759277133791215e-11, "adam_stats/lr_effective_min": -1.2084313311788719e-05, "adam_stats/m_t_max": 0.0025056616868823767, "adam_stats/m_t_mean": -3.175333954108517e-11, "adam_stats/m_t_min": -0.0016284671146422625, "adam_stats/v_t_max": 7.405100768664852e-05, "adam_stats/v_t_mean": 5.838603038543777e-12, "adam_stats/v_t_min": 0.0, "advantages": -4.967053879312289e-09, "advantages/max": 1.249750018119812, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": 0.0, "advantages/p75": 0.36585545539855957, "advantages/var": 0.6312694549560547, "all_logprobs": -0.09638282656669617, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -7.28125, "all_logprobs/p1": -1.9375, "all_logprobs/p10": -0.19140625, "all_logprobs/p25": -0.000553131103515625, "all_logprobs/p5": -0.60546875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.14338433742523193, "clip_ratio": 0.0, "completion_length": 608.9479370117188, "completion_length/correct": 587.4657592773438, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 514.0, "completion_length/correct/min": 259.0, "completion_length/correct/p25": 427.0, "completion_length/correct/p75": 725.0, "completion_length/correct/var": 50551.72265625, "completion_length/incorrect": 677.1304321289062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 670.0, "completion_length/incorrect/min": 381.0, "completion_length/incorrect/p25": 504.0, "completion_length/incorrect/p75": 824.0, "completion_length/incorrect/var": 42267.20703125, "completion_length/max": 1024.0, "completion_length/median": 521.0, "completion_length/min": 259.0, "completion_length/p25": 439.5, "completion_length/p75": 763.75, "completion_length/var": 49581.2109375, "epoch": 0.0624, "feature_vector_variance/max_squared_error": 125453.7109375, "feature_vector_variance/metric": 30221.482421875, "generated_tokens/total": 4552442.0, "grad_norm": 0.30284762382507324, "grouped_std_rewards": 0.28257960081100464, "learning_rate": 2.104951497460118e-06, "loss": 0.0, "mean_logprobs": -0.09912109375, "mean_logprobs/var": 0.0014495849609375, "num_completions/total": 7488, "per_sentence_gradient_norm": 7.055398941040039, "per_sentence_gradient_norm/max": 64.78434753417969, "per_sentence_gradient_norm/median": 5.412454128265381, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 9.645576477050781, "per_sentence_gradient_norm/p85": 11.857930183410645, "per_sentence_gradient_norm/p90": 14.67320442199707, "per_sentence_gradient_norm/p95": 23.65625762939453, "per_sentence_gradient_norm/p99": 33.56747055053711, "per_sentence_gradient_norm/var": 81.75474548339844, "per_token_feature_norm": 180.8349609375, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 183.0, "per_token_feature_norm/min": 68.0, "per_token_feature_norm/p25": 150.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 1653.1995849609375, "per_token_full_gradient_variance/max_squared_error": 12.290072441101074, "per_token_full_gradient_variance/variance": 0.011242642998695374, "per_token_gradient_norm": 8.25599479675293, "per_token_gradient_norm/max": 1002.6268310546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1447.704833984375, "per_token_policy_error_norm": 0.05310015007853508, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04545988142490387, "policy_entropy": 0.10626242309808731, "policy_entropy/max": 2.890625, "policy_entropy/median": 2.5331974029541016e-06, "policy_entropy/min": 9.547918011776346e-15, "policy_entropy/p25": 3.91155481338501e-08, "policy_entropy/p75": 0.00469970703125, "policy_entropy/var": 0.07537932693958282, "policy_error_vector_variance/max_squared_error": 2.0094542503356934, "policy_error_vector_variance/metric": 0.053061846643686295, "policy_loss": 4.967053879312289e-09, "policy_loss/max": 2.560988426208496, "policy_loss/median": 0.0, "policy_loss/min": -1.2497501373291016, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": 0.0, "policy_loss/var": 0.6312694549560547, "policy_sharpness": 8.138171195983887, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.5, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.634952545166016, "reward": 0.7604166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1841009110212326, "rewards/accuracy_reward": 0.7604166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1841009110212326, "sentence_full_gradient_variance/max_squared_error": 14676.4619140625, "sentence_full_gradient_variance/metric": 813.6950073242188, "sentence_full_gradient_variance/p75": 850.9986572265625, "sentence_full_gradient_variance/p90": 2589.54052734375, "sentence_full_gradient_variance/p95": 3484.248046875, "sentence_full_gradient_variance/p99": 7680.80322265625, "state_level_variance/metric": 32.394309997558594, "state_level_variance_full_gradient/metric": 83.93026733398438, "step": 78 }, { "accuracy_reward": 0.5520833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24989034235477448, "action_level_variance/metric": 32.13496398925781, "action_level_variance_full_gradient/metric": 862.9713745117188, "adam_stats/lr_effective_max": 1.1251514479226898e-05, "adam_stats/lr_effective_mean": 2.581196541229236e-12, "adam_stats/lr_effective_min": -1.100966619560495e-05, "adam_stats/m_t_max": 0.0016752613009884953, "adam_stats/m_t_mean": -2.5760513513994887e-11, "adam_stats/m_t_min": -0.0014885565033182502, "adam_stats/v_t_max": 7.404384814435616e-05, "adam_stats/v_t_mean": 5.843783356523913e-12, "adam_stats/v_t_min": 0.0, "advantages": 3.725290298461914e-09, "advantages/max": 1.6766761541366577, "advantages/median": 0.0, "advantages/min": -1.0976732969284058, "advantages/p25": -0.8537459373474121, "advantages/p75": 0.8537459373474121, "advantages/var": 0.7891570925712585, "all_logprobs": -0.11113449186086655, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -7.59375, "all_logprobs/p1": -2.15625, "all_logprobs/p10": -0.24609375, "all_logprobs/p25": -0.0020294189453125, "all_logprobs/p5": -0.69921875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.17206302285194397, "clip_ratio": 0.0, "completion_length": 769.0104370117188, "completion_length/correct": 737.8490600585938, "completion_length/correct/max": 1023.0, "completion_length/correct/median": 762.0, "completion_length/correct/min": 431.0, "completion_length/correct/p25": 631.0, "completion_length/correct/p75": 829.0, "completion_length/correct/var": 21153.4375, "completion_length/incorrect": 807.4185791015625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 817.0, "completion_length/incorrect/min": 462.0, "completion_length/incorrect/p25": 637.5, "completion_length/incorrect/p75": 1018.5, "completion_length/incorrect/var": 33101.10546875, "completion_length/max": 1024.0, "completion_length/median": 789.0, "completion_length/min": 431.0, "completion_length/p25": 630.75, "completion_length/p75": 882.0, "completion_length/var": 27422.349609375, "epoch": 0.0632, "feature_vector_variance/max_squared_error": 111745.28125, "feature_vector_variance/metric": 30464.8828125, "generated_tokens/total": 4626267.0, "grad_norm": 0.31981369853019714, "grouped_std_rewards": 0.4167756736278534, "learning_rate": 1.9264138089195424e-06, "loss": -0.0, "mean_logprobs": -0.1123046875, "mean_logprobs/var": 0.002105712890625, "num_completions/total": 7584, "per_sentence_gradient_norm": 12.180791854858398, "per_sentence_gradient_norm/max": 38.54541778564453, "per_sentence_gradient_norm/median": 10.899189949035645, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 7.737975597381592, "per_sentence_gradient_norm/p75": 17.104049682617188, "per_sentence_gradient_norm/p85": 19.481414794921875, "per_sentence_gradient_norm/p90": 23.17566680908203, "per_sentence_gradient_norm/p95": 28.162601470947266, "per_sentence_gradient_norm/p99": 34.928035736083984, "per_sentence_gradient_norm/var": 74.21735382080078, "per_token_feature_norm": 177.1259002685547, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 178.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 145.0, "per_token_feature_norm/p75": 205.0, "per_token_feature_norm/var": 1729.0302734375, "per_token_full_gradient_variance/max_squared_error": 2.246180295944214, "per_token_full_gradient_variance/variance": 0.013849187642335892, "per_token_gradient_norm": 12.380202293395996, "per_token_gradient_norm/max": 595.79638671875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1776.05029296875, "per_token_policy_error_norm": 0.06006825715303421, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05137590691447258, "policy_entropy": 0.12261413037776947, "policy_entropy/max": 2.546875, "policy_entropy/median": 7.033348083496094e-06, "policy_entropy/min": 1.2601031329495527e-14, "policy_entropy/p25": 1.1548399925231934e-07, "policy_entropy/p75": 0.01556396484375, "policy_entropy/var": 0.09166882187128067, "policy_error_vector_variance/max_squared_error": 2.0119736194610596, "policy_error_vector_variance/metric": 0.06002423167228699, "policy_loss": -8.692344621863413e-09, "policy_loss/max": 1.0976734161376953, "policy_loss/median": 0.0, "policy_loss/min": -1.6766762733459473, "policy_loss/p25": -0.8537459373474121, "policy_loss/p75": 0.8537459373474121, "policy_loss/var": 0.7891572117805481, "policy_sharpness": 7.892625331878662, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.49609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.619443893432617, "reward": 0.5520833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24989034235477448, "rewards/accuracy_reward": 0.5520833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24989034235477448, "sentence_full_gradient_variance/max_squared_error": 9054.447265625, "sentence_full_gradient_variance/metric": 868.9112548828125, "sentence_full_gradient_variance/p75": 1068.79443359375, "sentence_full_gradient_variance/p90": 1898.8330078125, "sentence_full_gradient_variance/p95": 2356.373291015625, "sentence_full_gradient_variance/p99": 8868.126953125, "state_level_variance/metric": 51.98127365112305, "state_level_variance_full_gradient/metric": 5.939777374267578, "step": 79 }, { "accuracy_reward": 0.6458333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2311403602361679, "action_level_variance/metric": 19.012649536132812, "action_level_variance_full_gradient/metric": 323.00274658203125, "adam_stats/lr_effective_max": 1.037099355016835e-05, "adam_stats/lr_effective_mean": -3.6849798386306976e-13, "adam_stats/lr_effective_min": -9.36951892072102e-06, "adam_stats/m_t_max": 0.0015184162184596062, "adam_stats/m_t_mean": -2.409245893064682e-11, "adam_stats/m_t_min": -0.0014526158338412642, "adam_stats/v_t_max": 7.397003355436027e-05, "adam_stats/v_t_mean": 5.840187708439082e-12, "adam_stats/v_t_min": 0.0, "advantages": 3.725290298461914e-09, "advantages/max": 1.4358407258987427, "advantages/median": 0.0, "advantages/min": -1.0976732969284058, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.3156619071960449, "all_logprobs": -0.10647137463092804, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -8.75, "all_logprobs/p1": -2.03125, "all_logprobs/p10": -0.23085927963256836, "all_logprobs/p25": -0.00193023681640625, "all_logprobs/p5": -0.69140625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.15927182137966156, "clip_ratio": 0.0, "completion_length": 603.09375, "completion_length/correct": 460.2419128417969, "completion_length/correct/max": 1013.0, "completion_length/correct/median": 381.0, "completion_length/correct/min": 120.0, "completion_length/correct/p25": 259.0, "completion_length/correct/p75": 582.25, "completion_length/correct/var": 67898.484375, "completion_length/incorrect": 863.5882568359375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 871.0, "completion_length/incorrect/min": 400.0, "completion_length/incorrect/p25": 797.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 32399.27734375, "completion_length/max": 1024.0, "completion_length/median": 544.0, "completion_length/min": 120.0, "completion_length/p25": 331.0, "completion_length/p75": 906.0, "completion_length/var": 92456.265625, "epoch": 0.064, "feature_vector_variance/max_squared_error": 106996.7734375, "feature_vector_variance/metric": 29253.248046875, "generated_tokens/total": 4684164.0, "grad_norm": 0.16491732001304626, "grouped_std_rewards": 0.1651768535375595, "learning_rate": 1.7546666766076658e-06, "loss": -0.0, "mean_logprobs": -0.1259765625, "mean_logprobs/var": 0.0037078857421875, "num_completions/total": 7680, "per_sentence_gradient_norm": 3.9655818939208984, "per_sentence_gradient_norm/max": 43.83827590942383, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 6.93417501449585, "per_sentence_gradient_norm/p85": 11.367557525634766, "per_sentence_gradient_norm/p90": 13.709478378295898, "per_sentence_gradient_norm/p95": 17.339645385742188, "per_sentence_gradient_norm/p99": 22.65774917602539, "per_sentence_gradient_norm/var": 51.19227981567383, "per_token_feature_norm": 174.4198455810547, "per_token_feature_norm/max": 312.0, "per_token_feature_norm/median": 174.0, "per_token_feature_norm/min": 70.0, "per_token_feature_norm/p25": 142.0, "per_token_feature_norm/p75": 205.0, "per_token_feature_norm/var": 1732.1416015625, "per_token_full_gradient_variance/max_squared_error": 1.33500075340271, "per_token_full_gradient_variance/variance": 0.0061739226803183556, "per_token_gradient_norm": 5.120096683502197, "per_token_gradient_norm/max": 521.614013671875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 755.4826049804688, "per_token_policy_error_norm": 0.05860002338886261, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0496051125228405, "policy_entropy": 0.11754047125577927, "policy_entropy/max": 2.6875, "policy_entropy/median": 3.874301910400391e-06, "policy_entropy/min": 4.4853010194856324e-14, "policy_entropy/p25": 7.264316082000732e-08, "policy_entropy/p75": 0.01416015625, "policy_entropy/var": 0.08268577605485916, "policy_error_vector_variance/max_squared_error": 2.010331392288208, "policy_error_vector_variance/metric": 0.05857493355870247, "policy_loss": -7.450580596923828e-09, "policy_loss/max": 1.0976734161376953, "policy_loss/median": 0.0, "policy_loss/min": -1.4358408451080322, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.3156619369983673, "policy_sharpness": 7.935632228851318, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.62109375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.368962287902832, "reward": 0.6458333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2311403602361679, "rewards/accuracy_reward": 0.6458333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2311403602361679, "sentence_full_gradient_variance/max_squared_error": 6202.1298828125, "sentence_full_gradient_variance/metric": 331.37054443359375, "sentence_full_gradient_variance/p75": 9.522161483764648, "sentence_full_gradient_variance/p90": 500.30731201171875, "sentence_full_gradient_variance/p95": 2314.7880859375, "sentence_full_gradient_variance/p99": 5734.197265625, "state_level_variance/metric": 39.40160369873047, "state_level_variance_full_gradient/metric": 8.36778736114502, "step": 80 }, { "accuracy_reward": 0.90625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.08585526049137115, "action_level_variance/metric": 86.09934997558594, "action_level_variance_full_gradient/metric": 1310.63720703125, "adam_stats/lr_effective_max": 8.782371878623962e-06, "adam_stats/lr_effective_mean": 3.71429059384254e-12, "adam_stats/lr_effective_min": -8.862257345754188e-06, "adam_stats/m_t_max": 0.0012109348317608237, "adam_stats/m_t_mean": -2.1056134266728144e-11, "adam_stats/m_t_min": -0.0012608148390427232, "adam_stats/v_t_max": 7.39055685698986e-05, "adam_stats/v_t_mean": 5.848032995359187e-12, "adam_stats/v_t_min": 0.0, "advantages": -7.450580596923828e-09, "advantages/max": 0.5588920712471008, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": 0.0, "advantages/p75": 0.46501490473747253, "advantages/var": 0.47344300150871277, "all_logprobs": -0.13455642759799957, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -8.9375, "all_logprobs/p1": -2.34375, "all_logprobs/p10": -0.349609375, "all_logprobs/p25": -0.00860595703125, "all_logprobs/p5": -0.85546875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.2078431099653244, "clip_ratio": 0.0, "completion_length": 464.35418701171875, "completion_length/correct": 450.712646484375, "completion_length/correct/max": 851.0, "completion_length/correct/median": 404.0, "completion_length/correct/min": 196.0, "completion_length/correct/p25": 341.5, "completion_length/correct/p75": 538.5, "completion_length/correct/var": 23369.5078125, "completion_length/incorrect": 596.2222290039062, "completion_length/incorrect/max": 965.0, "completion_length/incorrect/median": 501.0, "completion_length/incorrect/min": 307.0, "completion_length/incorrect/p25": 425.0, "completion_length/incorrect/p75": 887.0, "completion_length/incorrect/var": 71020.703125, "completion_length/max": 965.0, "completion_length/median": 423.0, "completion_length/min": 196.0, "completion_length/p25": 341.75, "completion_length/p75": 542.25, "completion_length/var": 28954.0625, "epoch": 0.0648, "feature_vector_variance/max_squared_error": 118152.921875, "feature_vector_variance/metric": 32787.1484375, "generated_tokens/total": 4728742.0, "grad_norm": 0.38098034262657166, "grouped_std_rewards": 0.1986485719680786, "learning_rate": 1.5899193479495858e-06, "loss": -0.0, "mean_logprobs": -0.140625, "mean_logprobs/var": 0.005615234375, "num_completions/total": 7776, "per_sentence_gradient_norm": 7.166561126708984, "per_sentence_gradient_norm/max": 75.17584228515625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 7.2585883140563965, "per_sentence_gradient_norm/p85": 16.168621063232422, "per_sentence_gradient_norm/p90": 20.30398178100586, "per_sentence_gradient_norm/p95": 31.85223388671875, "per_sentence_gradient_norm/p99": 51.743038177490234, "per_sentence_gradient_norm/var": 169.63900756835938, "per_token_feature_norm": 184.16773986816406, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 186.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 152.0, "per_token_feature_norm/p75": 216.0, "per_token_feature_norm/var": 1838.716064453125, "per_token_full_gradient_variance/max_squared_error": 20.026460647583008, "per_token_full_gradient_variance/variance": 0.012510227039456367, "per_token_gradient_norm": 7.666369438171387, "per_token_gradient_norm/max": 892.50439453125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1509.7459716796875, "per_token_policy_error_norm": 0.07243798673152924, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.061289142817258835, "policy_entropy": 0.14765122532844543, "policy_entropy/max": 3.75, "policy_entropy/median": 4.500150680541992e-06, "policy_entropy/min": 1.942890293094024e-15, "policy_entropy/p25": 4.6566128730773926e-08, "policy_entropy/p75": 0.050537109375, "policy_entropy/var": 0.10847149044275284, "policy_error_vector_variance/max_squared_error": 2.0044000148773193, "policy_error_vector_variance/metric": 0.0723823830485344, "policy_loss": 2.4835269396561444e-09, "policy_loss/max": 2.560988187789917, "policy_loss/median": 0.0, "policy_loss/min": -0.5588920712471008, "policy_loss/p25": -0.46501487493515015, "policy_loss/p75": 0.0, "policy_loss/var": 0.4734429717063904, "policy_sharpness": 7.61336088180542, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.011474609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.84880256652832, "reward": 0.90625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.08585526049137115, "rewards/accuracy_reward": 0.90625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.08585526049137115, "sentence_full_gradient_variance/max_squared_error": 43317.79296875, "sentence_full_gradient_variance/metric": 1318.1976318359375, "sentence_full_gradient_variance/p75": 366.2391357421875, "sentence_full_gradient_variance/p90": 1669.069091796875, "sentence_full_gradient_variance/p95": 4551.74609375, "sentence_full_gradient_variance/p99": 21743.244140625, "state_level_variance/metric": 104.58453369140625, "state_level_variance_full_gradient/metric": 7.5603437423706055, "step": 81 }, { "accuracy_reward": 0.84375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.13322368264198303, "action_level_variance/metric": 49.97574996948242, "action_level_variance_full_gradient/metric": 773.6217651367188, "adam_stats/lr_effective_max": 8.232362233684398e-06, "adam_stats/lr_effective_mean": -2.757477839815392e-12, "adam_stats/lr_effective_min": -7.5967691373080015e-06, "adam_stats/m_t_max": 0.0018822120036929846, "adam_stats/m_t_mean": -3.0133718437186374e-11, "adam_stats/m_t_min": -0.0012043988099321723, "adam_stats/v_t_max": 7.397042645607144e-05, "adam_stats/v_t_mean": 5.848601550978438e-12, "adam_stats/v_t_min": 0.0, "advantages": -1.2417634698280722e-09, "advantages/max": 1.6766761541366577, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.4733950197696686, "all_logprobs": -0.08262322843074799, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.25, "all_logprobs/p1": -1.8515625, "all_logprobs/p10": -0.11279296875, "all_logprobs/p25": -6.580352783203125e-05, "all_logprobs/p5": -0.48095703125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12655556201934814, "clip_ratio": 0.0, "completion_length": 612.875, "completion_length/correct": 557.716064453125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 489.0, "completion_length/correct/min": 143.0, "completion_length/correct/p25": 322.0, "completion_length/correct/p75": 807.0, "completion_length/correct/var": 67591.203125, "completion_length/incorrect": 910.7333984375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 474.0, "completion_length/incorrect/p25": 888.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 28404.498046875, "completion_length/max": 1024.0, "completion_length/median": 606.0, "completion_length/min": 143.0, "completion_length/p25": 351.0, "completion_length/p75": 868.75, "completion_length/var": 77707.3203125, "epoch": 0.0656, "feature_vector_variance/max_squared_error": 132132.65625, "feature_vector_variance/metric": 28819.10546875, "generated_tokens/total": 4787578.0, "grad_norm": 0.20735619962215424, "grouped_std_rewards": 0.17312976717948914, "learning_rate": 1.432372542187895e-06, "loss": 0.0, "mean_logprobs": -0.08056640625, "mean_logprobs/var": 0.000926971435546875, "num_completions/total": 7872, "per_sentence_gradient_norm": 3.561445713043213, "per_sentence_gradient_norm/max": 61.56584548950195, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.755648612976074, "per_sentence_gradient_norm/p85": 5.331235408782959, "per_sentence_gradient_norm/p90": 9.430493354797363, "per_sentence_gradient_norm/p95": 14.326802253723145, "per_sentence_gradient_norm/p99": 29.664974212646484, "per_sentence_gradient_norm/var": 63.99647521972656, "per_token_feature_norm": 179.5327911376953, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 181.0, "per_token_feature_norm/min": 58.75, "per_token_feature_norm/p25": 149.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 1663.2454833984375, "per_token_full_gradient_variance/max_squared_error": 8.630181312561035, "per_token_full_gradient_variance/variance": 0.008147340267896652, "per_token_gradient_norm": 4.8869781494140625, "per_token_gradient_norm/max": 929.9588623046875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1047.1322021484375, "per_token_policy_error_norm": 0.04543687775731087, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03983065113425255, "policy_entropy": 0.09005864709615707, "policy_entropy/max": 2.703125, "policy_entropy/median": 8.866190910339355e-07, "policy_entropy/min": 3.6914915568786455e-15, "policy_entropy/p25": 2.3050233721733093e-08, "policy_entropy/p75": 0.00070953369140625, "policy_entropy/var": 0.06581977754831314, "policy_error_vector_variance/max_squared_error": 2.006133556365967, "policy_error_vector_variance/metric": 0.04541609063744545, "policy_loss": 2.4835269396561444e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -1.6766761541366577, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.0, "policy_loss/var": 0.4733950197696686, "policy_sharpness": 8.399053573608398, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.498790740966797, "reward": 0.84375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.13322368264198303, "rewards/accuracy_reward": 0.84375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.13322368264198303, "sentence_full_gradient_variance/max_squared_error": 58705.5, "sentence_full_gradient_variance/metric": 787.9025268554688, "sentence_full_gradient_variance/p75": 62.37554931640625, "sentence_full_gradient_variance/p90": 278.0942687988281, "sentence_full_gradient_variance/p95": 488.4463195800781, "sentence_full_gradient_variance/p99": 12239.77734375, "state_level_variance/metric": 19.77310562133789, "state_level_variance_full_gradient/metric": 14.280863761901855, "step": 82 }, { "accuracy_reward": 0.90625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.08585526049137115, "action_level_variance/metric": 204.37149047851562, "action_level_variance_full_gradient/metric": 560.0860595703125, "adam_stats/lr_effective_max": 7.6171882028575055e-06, "adam_stats/lr_effective_mean": -1.764480575539018e-11, "adam_stats/lr_effective_min": -7.233104042825289e-06, "adam_stats/m_t_max": 0.00196607643738389, "adam_stats/m_t_mean": -1.2828280972210226e-11, "adam_stats/m_t_min": -0.0013352477690204978, "adam_stats/v_t_max": 7.402121264021844e-05, "adam_stats/v_t_mean": 5.867240287366071e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.6526548862457275, "advantages/median": 0.24990005791187286, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.36585545539855957, "advantages/var": 0.6311681866645813, "all_logprobs": -0.11034843325614929, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -9.0, "all_logprobs/p1": -2.078125, "all_logprobs/p10": -0.251953125, "all_logprobs/p25": -0.0021820068359375, "all_logprobs/p5": -0.69921875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.16634424030780792, "clip_ratio": 0.0, "completion_length": 514.0208740234375, "completion_length/correct": 487.6321716308594, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 467.0, "completion_length/correct/min": 80.0, "completion_length/correct/p25": 334.5, "completion_length/correct/p75": 627.5, "completion_length/correct/var": 47765.35546875, "completion_length/incorrect": 769.1111450195312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 778.0, "completion_length/incorrect/min": 450.0, "completion_length/incorrect/p25": 517.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 54917.859375, "completion_length/max": 1024.0, "completion_length/median": 487.0, "completion_length/min": 80.0, "completion_length/p25": 345.0, "completion_length/p75": 656.5, "completion_length/var": 54667.22265625, "epoch": 0.0664, "feature_vector_variance/max_squared_error": 115595.953125, "feature_vector_variance/metric": 29525.533203125, "generated_tokens/total": 4836924.0, "grad_norm": 0.3192763924598694, "grouped_std_rewards": 0.22004644572734833, "learning_rate": 1.282218205837188e-06, "loss": -0.0, "mean_logprobs": -0.11279296875, "mean_logprobs/var": 0.0014801025390625, "num_completions/total": 7968, "per_sentence_gradient_norm": 7.574288845062256, "per_sentence_gradient_norm/max": 121.98867797851562, "per_sentence_gradient_norm/median": 4.222084045410156, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 6.83791446685791, "per_sentence_gradient_norm/p85": 11.442834854125977, "per_sentence_gradient_norm/p90": 13.911707878112793, "per_sentence_gradient_norm/p95": 26.242799758911133, "per_sentence_gradient_norm/p99": 66.71309661865234, "per_sentence_gradient_norm/var": 229.3517303466797, "per_token_feature_norm": 175.4296875, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 176.0, "per_token_feature_norm/min": 67.5, "per_token_feature_norm/p25": 142.0, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 1767.0931396484375, "per_token_full_gradient_variance/max_squared_error": 26.695133209228516, "per_token_full_gradient_variance/variance": 0.0207962766289711, "per_token_gradient_norm": 9.151453971862793, "per_token_gradient_norm/max": 1252.4678955078125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2987.1611328125, "per_token_policy_error_norm": 0.06047365069389343, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05154283717274666, "policy_entropy": 0.12144826352596283, "policy_entropy/max": 3.828125, "policy_entropy/median": 3.904104232788086e-06, "policy_entropy/min": 1.3877787807814457e-14, "policy_entropy/p25": 5.122274160385132e-08, "policy_entropy/p75": 0.0157470703125, "policy_entropy/var": 0.08811131119728088, "policy_error_vector_variance/max_squared_error": 2.011667251586914, "policy_error_vector_variance/metric": 0.06041615083813667, "policy_loss": -4.967053879312289e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": -0.24990005791187286, "policy_loss/min": -0.6526549458503723, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": 0.0, "policy_loss/var": 0.6311682462692261, "policy_sharpness": 7.908143043518066, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.24609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.590814590454102, "reward": 0.90625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.08585526049137115, "rewards/accuracy_reward": 0.90625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.08585526049137115, "sentence_full_gradient_variance/max_squared_error": 21778.63671875, "sentence_full_gradient_variance/metric": 627.6488037109375, "sentence_full_gradient_variance/p75": 210.48971557617188, "sentence_full_gradient_variance/p90": 489.2619934082031, "sentence_full_gradient_variance/p95": 1014.9803466796875, "sentence_full_gradient_variance/p99": 9469.81640625, "state_level_variance/metric": 42.43730926513672, "state_level_variance_full_gradient/metric": 67.5627212524414, "step": 83 }, { "accuracy_reward": 0.9166666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.0771929919719696, "action_level_variance/metric": 13.196897506713867, "action_level_variance_full_gradient/metric": 7.322600364685059, "adam_stats/lr_effective_max": 6.056993242964381e-06, "adam_stats/lr_effective_mean": -1.5542723358352717e-11, "adam_stats/lr_effective_min": -5.780000265076524e-06, "adam_stats/m_t_max": 0.0015024039894342422, "adam_stats/m_t_mean": -1.470830575389659e-11, "adam_stats/m_t_min": -0.0013318893034011126, "adam_stats/v_t_max": 7.404375355690718e-05, "adam_stats/v_t_mean": 5.863823315799266e-12, "adam_stats/v_t_min": 0.0, "advantages": 4.967053879312289e-09, "advantages/max": 0.5588920712471008, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.46501490473747253, "advantages/var": 0.47340914607048035, "all_logprobs": -0.06331194937229156, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.6875, "all_logprobs/p1": -1.5701565742492676, "all_logprobs/p10": -0.048583984375, "all_logprobs/p25": -5.7220458984375e-06, "all_logprobs/p5": -0.33203125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.09218805283308029, "clip_ratio": 0.0, "completion_length": 666.6979370117188, "completion_length/correct": 634.6818237304688, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 607.0, "completion_length/correct/min": 207.0, "completion_length/correct/p25": 365.75, "completion_length/correct/p75": 909.5, "completion_length/correct/var": 78567.6875, "completion_length/incorrect": 1018.875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 983.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 210.125, "completion_length/max": 1024.0, "completion_length/median": 663.0, "completion_length/min": 207.0, "completion_length/p25": 370.75, "completion_length/p75": 940.75, "completion_length/var": 83360.9765625, "epoch": 0.0672, "feature_vector_variance/max_squared_error": 130513.6484375, "feature_vector_variance/metric": 26351.1328125, "generated_tokens/total": 4900927.0, "grad_norm": 0.12500187754631042, "grouped_std_rewards": 0.18338775634765625, "learning_rate": 1.1396392788268054e-06, "loss": -0.0, "mean_logprobs": -0.06884765625, "mean_logprobs/var": 0.0007476806640625, "num_completions/total": 8064, "per_sentence_gradient_norm": 2.3105568885803223, "per_sentence_gradient_norm/max": 29.757415771484375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.8362996578216553, "per_sentence_gradient_norm/p85": 3.617633581161499, "per_sentence_gradient_norm/p90": 4.268980026245117, "per_sentence_gradient_norm/p95": 10.535747528076172, "per_sentence_gradient_norm/p99": 17.68425941467285, "per_sentence_gradient_norm/var": 18.016658782958984, "per_token_feature_norm": 176.69955444335938, "per_token_feature_norm/max": 308.0, "per_token_feature_norm/median": 178.0, "per_token_feature_norm/min": 64.0, "per_token_feature_norm/p25": 146.0, "per_token_feature_norm/p75": 206.0, "per_token_feature_norm/var": 1630.11572265625, "per_token_full_gradient_variance/max_squared_error": 8.893187522888184, "per_token_full_gradient_variance/variance": 0.005505573004484177, "per_token_gradient_norm": 3.3517138957977295, "per_token_gradient_norm/max": 954.8134155273438, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 639.3663330078125, "per_token_policy_error_norm": 0.03555528074502945, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.031091731041669846, "policy_entropy": 0.06927619129419327, "policy_entropy/max": 2.921875, "policy_entropy/median": 5.327165126800537e-07, "policy_entropy/min": 1.8596235662471372e-15, "policy_entropy/p25": 2.3632310330867767e-08, "policy_entropy/p75": 7.677078247070312e-05, "policy_entropy/var": 0.04810785874724388, "policy_error_vector_variance/max_squared_error": 1.9780309200286865, "policy_error_vector_variance/metric": 0.035539861768484116, "policy_loss": -6.208817460162663e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -0.5588921308517456, "policy_loss/p25": -0.46501487493515015, "policy_loss/p75": 0.0, "policy_loss/var": 0.47340911626815796, "policy_sharpness": 8.708773612976074, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 7.920041084289551, "reward": 0.9166666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.0771929919719696, "rewards/accuracy_reward": 0.9166666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.0771929919719696, "sentence_full_gradient_variance/max_squared_error": 235.18243408203125, "sentence_full_gradient_variance/metric": 13.858904838562012, "sentence_full_gradient_variance/p75": 4.196557998657227, "sentence_full_gradient_variance/p90": 23.950634002685547, "sentence_full_gradient_variance/p95": 67.6270523071289, "sentence_full_gradient_variance/p99": 181.7516326904297, "state_level_variance/metric": 6.548272609710693, "state_level_variance_full_gradient/metric": 6.536305904388428, "step": 84 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1666666567325592, "action_level_variance/metric": 102.4525375366211, "action_level_variance_full_gradient/metric": 286.45111083984375, "adam_stats/lr_effective_max": 5.061713181930827e-06, "adam_stats/lr_effective_mean": -2.0997998478877733e-11, "adam_stats/lr_effective_min": -4.966133019479457e-06, "adam_stats/m_t_max": 0.0030009085312485695, "adam_stats/m_t_mean": -1.1496094007301672e-11, "adam_stats/m_t_min": -0.002351997885853052, "adam_stats/v_t_max": 7.411282422253862e-05, "adam_stats/v_t_mean": 5.872058915501466e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 3.7485008239746094, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.36585545539855957, "advantages/var": 0.6311418414115906, "all_logprobs": -0.10448971390724182, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -8.75, "all_logprobs/p1": -2.046875, "all_logprobs/p10": -0.2255859375, "all_logprobs/p25": -0.0019378662109375, "all_logprobs/p5": -0.66015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.15551096200942993, "clip_ratio": 0.0, "completion_length": 478.125, "completion_length/correct": 389.52630615234375, "completion_length/correct/max": 809.0, "completion_length/correct/median": 403.0, "completion_length/correct/min": 143.0, "completion_length/correct/p25": 266.5, "completion_length/correct/p75": 476.5, "completion_length/correct/var": 23516.013671875, "completion_length/incorrect": 814.7999877929688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 826.0, "completion_length/incorrect/min": 402.0, "completion_length/incorrect/p25": 690.0, "completion_length/incorrect/p75": 976.0, "completion_length/incorrect/var": 37991.84765625, "completion_length/max": 1024.0, "completion_length/median": 439.0, "completion_length/min": 143.0, "completion_length/p25": 304.25, "completion_length/p75": 577.5, "completion_length/var": 56306.59375, "epoch": 0.068, "feature_vector_variance/max_squared_error": 112380.6953125, "feature_vector_variance/metric": 29605.904296875, "generated_tokens/total": 4946827.0, "grad_norm": 0.3054651618003845, "grouped_std_rewards": 0.1971883475780487, "learning_rate": 1.0048094716167097e-06, "loss": -0.0, "mean_logprobs": -0.1015625, "mean_logprobs/var": 0.0015106201171875, "num_completions/total": 8160, "per_sentence_gradient_norm": 5.61672306060791, "per_sentence_gradient_norm/max": 60.47877502441406, "per_sentence_gradient_norm/median": 3.2864162921905518, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 5.155675411224365, "per_sentence_gradient_norm/p85": 6.775668621063232, "per_sentence_gradient_norm/p90": 7.879571914672852, "per_sentence_gradient_norm/p95": 29.907085418701172, "per_sentence_gradient_norm/p99": 53.80642318725586, "per_sentence_gradient_norm/var": 116.00411987304688, "per_token_feature_norm": 175.54849243164062, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 176.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 144.0, "per_token_feature_norm/p75": 206.0, "per_token_feature_norm/var": 1721.0391845703125, "per_token_full_gradient_variance/max_squared_error": 23.311702728271484, "per_token_full_gradient_variance/variance": 0.015555603429675102, "per_token_gradient_norm": 6.834183216094971, "per_token_gradient_norm/max": 1265.1190185546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1782.176025390625, "per_token_policy_error_norm": 0.05729885771870613, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04893827810883522, "policy_entropy": 0.11638940125703812, "policy_entropy/max": 3.84375, "policy_entropy/median": 5.125999450683594e-06, "policy_entropy/min": 2.3314683517128287e-14, "policy_entropy/p25": 8.055940270423889e-08, "policy_entropy/p75": 0.0146484375, "policy_entropy/var": 0.08077014237642288, "policy_error_vector_variance/max_squared_error": 2.0137763023376465, "policy_error_vector_variance/metric": 0.05725209414958954, "policy_loss": 0.0, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -3.7485008239746094, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": 0.0, "policy_loss/var": 0.6311418414115906, "policy_sharpness": 7.917827129364014, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.49609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.38931655883789, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1666666567325592, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1666666567325592, "sentence_full_gradient_variance/max_squared_error": 9071.1611328125, "sentence_full_gradient_variance/metric": 313.11737060546875, "sentence_full_gradient_variance/p75": 127.25379180908203, "sentence_full_gradient_variance/p90": 274.2490234375, "sentence_full_gradient_variance/p95": 393.88623046875, "sentence_full_gradient_variance/p99": 8036.498046875, "state_level_variance/metric": 22.49579620361328, "state_level_variance_full_gradient/metric": 26.666271209716797, "step": 85 }, { "accuracy_reward": 0.84375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.13322368264198303, "action_level_variance/metric": 41.724266052246094, "action_level_variance_full_gradient/metric": 1000.00390625, "adam_stats/lr_effective_max": 4.5438596316671465e-06, "adam_stats/lr_effective_mean": -2.0137761250205166e-11, "adam_stats/lr_effective_min": -4.652609732147539e-06, "adam_stats/m_t_max": 0.004568493925035, "adam_stats/m_t_mean": -1.389021710151983e-11, "adam_stats/m_t_min": -0.0035511243622750044, "adam_stats/v_t_max": 7.409785030176863e-05, "adam_stats/v_t_mean": 5.890038456968227e-12, "adam_stats/v_t_min": 0.0, "advantages": -1.2417634698280722e-09, "advantages/max": 1.249750018119812, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.4734242856502533, "all_logprobs": -0.10496153682470322, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -8.5, "all_logprobs/p1": -2.019218921661377, "all_logprobs/p10": -0.224609375, "all_logprobs/p25": -0.00150299072265625, "all_logprobs/p5": -0.6484375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.162275493144989, "clip_ratio": 0.0, "completion_length": 461.22918701171875, "completion_length/correct": 458.8888854980469, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 380.0, "completion_length/correct/min": 147.0, "completion_length/correct/p25": 300.0, "completion_length/correct/p75": 569.0, "completion_length/correct/var": 42694.17578125, "completion_length/incorrect": 473.86669921875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 291.0, "completion_length/incorrect/min": 251.0, "completion_length/incorrect/p25": 268.5, "completion_length/incorrect/p75": 643.5, "completion_length/incorrect/var": 104757.6953125, "completion_length/max": 1024.0, "completion_length/median": 366.0, "completion_length/min": 147.0, "completion_length/p25": 289.75, "completion_length/p75": 572.25, "completion_length/var": 51420.84765625, "epoch": 0.0688, "feature_vector_variance/max_squared_error": 110531.8359375, "feature_vector_variance/metric": 29741.544921875, "generated_tokens/total": 4991105.0, "grad_norm": 0.49283361434936523, "grouped_std_rewards": 0.19953560829162598, "learning_rate": 8.778930535580476e-07, "loss": -0.0, "mean_logprobs": -0.099609375, "mean_logprobs/var": 0.0023040771484375, "num_completions/total": 8256, "per_sentence_gradient_norm": 5.672934532165527, "per_sentence_gradient_norm/max": 52.233821868896484, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 9.030557632446289, "per_sentence_gradient_norm/p85": 14.027730941772461, "per_sentence_gradient_norm/p90": 16.263151168823242, "per_sentence_gradient_norm/p95": 18.935319900512695, "per_sentence_gradient_norm/p99": 45.479740142822266, "per_sentence_gradient_norm/var": 80.3634033203125, "per_token_feature_norm": 176.1015167236328, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 176.0, "per_token_feature_norm/min": 69.5, "per_token_feature_norm/p25": 143.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 1798.3614501953125, "per_token_full_gradient_variance/max_squared_error": 6.543041229248047, "per_token_full_gradient_variance/variance": 0.010372447781264782, "per_token_gradient_norm": 6.890675067901611, "per_token_gradient_norm/max": 1239.640869140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1296.61669921875, "per_token_policy_error_norm": 0.05696232244372368, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.048646505922079086, "policy_entropy": 0.11596512049436569, "policy_entropy/max": 2.734375, "policy_entropy/median": 4.827976226806641e-06, "policy_entropy/min": 4.729550084903167e-14, "policy_entropy/p25": 7.171183824539185e-08, "policy_entropy/p75": 0.01141357421875, "policy_entropy/var": 0.08353452384471893, "policy_error_vector_variance/max_squared_error": 2.002135753631592, "policy_error_vector_variance/metric": 0.05689976364374161, "policy_loss": -3.1044087300813317e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -1.249750018119812, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.0, "policy_loss/var": 0.4734242558479309, "policy_sharpness": 7.961906433105469, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.87109375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.294313430786133, "reward": 0.84375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.13322368264198303, "rewards/accuracy_reward": 0.84375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.13322368264198303, "sentence_full_gradient_variance/max_squared_error": 15068.76171875, "sentence_full_gradient_variance/metric": 1016.5360107421875, "sentence_full_gradient_variance/p75": 289.00726318359375, "sentence_full_gradient_variance/p90": 4110.03125, "sentence_full_gradient_variance/p95": 4634.2294921875, "sentence_full_gradient_variance/p99": 11875.3349609375, "state_level_variance/metric": 48.49176788330078, "state_level_variance_full_gradient/metric": 16.532115936279297, "step": 86 }, { "accuracy_reward": 0.8229166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14725877344608307, "action_level_variance/metric": 66.1111831665039, "action_level_variance_full_gradient/metric": 864.3575439453125, "adam_stats/lr_effective_max": 3.7836180126760155e-06, "adam_stats/lr_effective_mean": -2.7793364831829592e-11, "adam_stats/lr_effective_min": -3.9346223275060765e-06, "adam_stats/m_t_max": 0.00426118029281497, "adam_stats/m_t_mean": -7.585725450565128e-12, "adam_stats/m_t_min": -0.003304349258542061, "adam_stats/v_t_max": 7.403948984574527e-05, "adam_stats/v_t_mean": 5.8969708456591e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 1.249750018119812, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": 0.0, "advantages/p75": 0.36585545539855957, "advantages/var": 0.6312528252601624, "all_logprobs": -0.10403510183095932, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -7.15625, "all_logprobs/p1": -1.9495313167572021, "all_logprobs/p10": -0.24609375, "all_logprobs/p25": -0.0022735595703125, "all_logprobs/p5": -0.6550779342651367, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.14668050408363342, "clip_ratio": 0.0, "completion_length": 495.28125, "completion_length/correct": 467.7468566894531, "completion_length/correct/max": 878.0, "completion_length/correct/median": 459.0, "completion_length/correct/min": 173.0, "completion_length/correct/p25": 266.0, "completion_length/correct/p75": 607.0, "completion_length/correct/var": 41589.09375, "completion_length/incorrect": 623.2352905273438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 542.0, "completion_length/incorrect/min": 405.0, "completion_length/incorrect/p25": 455.0, "completion_length/incorrect/p75": 821.0, "completion_length/incorrect/var": 43633.6953125, "completion_length/max": 1024.0, "completion_length/median": 477.0, "completion_length/min": 173.0, "completion_length/p25": 319.0, "completion_length/p75": 615.25, "completion_length/var": 45055.890625, "epoch": 0.0696, "feature_vector_variance/max_squared_error": 112373.3515625, "feature_vector_variance/metric": 29906.419921875, "generated_tokens/total": 5038652.0, "grad_norm": 0.3405899703502655, "grouped_std_rewards": 0.2643738389015198, "learning_rate": 7.59044652756249e-07, "loss": 0.0, "mean_logprobs": -0.10400390625, "mean_logprobs/var": 0.00164794921875, "num_completions/total": 8352, "per_sentence_gradient_norm": 8.087825775146484, "per_sentence_gradient_norm/max": 44.47669219970703, "per_sentence_gradient_norm/median": 3.942596673965454, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 8.21049690246582, "per_sentence_gradient_norm/p85": 19.34950828552246, "per_sentence_gradient_norm/p90": 25.932893753051758, "per_sentence_gradient_norm/p95": 34.010765075683594, "per_sentence_gradient_norm/p99": 43.960567474365234, "per_sentence_gradient_norm/var": 120.34471893310547, "per_token_feature_norm": 176.3584747314453, "per_token_feature_norm/max": 312.0, "per_token_feature_norm/median": 177.0, "per_token_feature_norm/min": 70.0, "per_token_feature_norm/p25": 145.0, "per_token_feature_norm/p75": 206.0, "per_token_feature_norm/var": 1636.6378173828125, "per_token_full_gradient_variance/max_squared_error": 4.057288646697998, "per_token_full_gradient_variance/variance": 0.013929763808846474, "per_token_gradient_norm": 9.835514068603516, "per_token_gradient_norm/max": 863.3731079101562, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1732.26220703125, "per_token_policy_error_norm": 0.057427551597356796, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04784907028079033, "policy_entropy": 0.11882589012384415, "policy_entropy/max": 3.109375, "policy_entropy/median": 4.678964614868164e-06, "policy_entropy/min": 2.3869795029440866e-15, "policy_entropy/p25": 4.423782229423523e-08, "policy_entropy/p75": 0.017333984375, "policy_entropy/var": 0.08284786343574524, "policy_error_vector_variance/max_squared_error": 2.00346040725708, "policy_error_vector_variance/metric": 0.057352688163518906, "policy_loss": -4.967053879312289e-09, "policy_loss/max": 2.560988426208496, "policy_loss/median": 0.0, "policy_loss/min": -1.2497501373291016, "policy_loss/p25": -0.36585545539855957, "policy_loss/p75": 0.0, "policy_loss/var": 0.6312527656555176, "policy_sharpness": 7.87502384185791, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.11712646484375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.508332252502441, "reward": 0.8229166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14725877344608307, "rewards/accuracy_reward": 0.8229166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14725877344608307, "sentence_full_gradient_variance/max_squared_error": 18317.380859375, "sentence_full_gradient_variance/metric": 897.3468627929688, "sentence_full_gradient_variance/p75": 439.0664978027344, "sentence_full_gradient_variance/p90": 1996.196044921875, "sentence_full_gradient_variance/p95": 4012.771484375, "sentence_full_gradient_variance/p99": 12535.994140625, "state_level_variance/metric": 68.53427124023438, "state_level_variance_full_gradient/metric": 32.98931884765625, "step": 87 }, { "accuracy_reward": 0.8333333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14035087823867798, "action_level_variance/metric": 0.0, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 2.910370540121221e-06, "adam_stats/lr_effective_mean": -2.13761230938303e-11, "adam_stats/lr_effective_min": -3.0265030090959044e-06, "adam_stats/m_t_max": 0.003835062263533473, "adam_stats/m_t_mean": -6.827167303713466e-12, "adam_stats/m_t_min": -0.00297391414642334, "adam_stats/v_t_max": 7.396544970106333e-05, "adam_stats/v_t_mean": 5.8910732195216475e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.12391869723796844, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -9.625, "all_logprobs/p1": -2.234375, "all_logprobs/p10": -0.3125, "all_logprobs/p25": -0.003711700439453125, "all_logprobs/p5": -0.82421875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1900399774312973, "clip_ratio": 0.0, "completion_length": 493.0833435058594, "completion_length/correct": 404.2375183105469, "completion_length/correct/max": 756.0, "completion_length/correct/median": 415.0, "completion_length/correct/min": 119.0, "completion_length/correct/p25": 329.25, "completion_length/correct/p75": 514.5, "completion_length/correct/var": 26041.2734375, "completion_length/incorrect": 937.3125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 571.0, "completion_length/incorrect/p25": 884.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 19888.49609375, "completion_length/max": 1024.0, "completion_length/median": 427.0, "completion_length/min": 119.0, "completion_length/p25": 355.0, "completion_length/p75": 581.5, "completion_length/var": 64679.02734375, "epoch": 0.0704, "feature_vector_variance/max_squared_error": 113780.5078125, "feature_vector_variance/metric": 29690.470703125, "generated_tokens/total": 5085988.0, "grad_norm": 0.0, "grouped_std_rewards": 0.0, "learning_rate": 6.484090676804927e-07, "loss": 0.0, "mean_logprobs": -0.119140625, "mean_logprobs/var": 0.0023193359375, "num_completions/total": 8448, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 175.4817657470703, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 177.0, "per_token_feature_norm/min": 70.5, "per_token_feature_norm/p25": 141.0, "per_token_feature_norm/p75": 206.0, "per_token_feature_norm/var": 1846.8441162109375, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.066911980509758, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.055507462471723557, "policy_entropy": 0.13739165663719177, "policy_entropy/max": 3.0, "policy_entropy/median": 4.738569259643555e-06, "policy_entropy/min": 2.453592884421596e-14, "policy_entropy/p25": 8.288770914077759e-08, "policy_entropy/p75": 0.0264892578125, "policy_entropy/var": 0.1056230291724205, "policy_error_vector_variance/max_squared_error": 1.9988688230514526, "policy_error_vector_variance/metric": 0.06684797257184982, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 7.7926740646362305, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.5, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.2115478515625, "reward": 0.8333333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14035087823867798, "rewards/accuracy_reward": 0.8333333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14035087823867798, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 88 }, { "accuracy_reward": 0.7604166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18410086631774902, "action_level_variance/metric": 4.069427013397217, "action_level_variance_full_gradient/metric": 404.07257080078125, "adam_stats/lr_effective_max": 2.4096705146803288e-06, "adam_stats/lr_effective_mean": -1.7664611093315408e-11, "adam_stats/lr_effective_min": -2.5171391371259233e-06, "adam_stats/m_t_max": 0.002679461147636175, "adam_stats/m_t_mean": -5.8862116569802225e-12, "adam_stats/m_t_min": -0.001950373756699264, "adam_stats/v_t_max": 7.389213715214282e-05, "adam_stats/v_t_mean": 5.893481015706303e-12, "adam_stats/v_t_min": 0.0, "advantages": 4.967053879312289e-09, "advantages/max": 0.8537459373474121, "advantages/median": 0.0, "advantages/min": -1.0976732969284058, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.15783312916755676, "all_logprobs": -0.1401177942752838, "all_logprobs/max": 0.0, "all_logprobs/median": -9.5367431640625e-07, "all_logprobs/min": -7.625, "all_logprobs/p1": -2.359375, "all_logprobs/p10": -0.384765625, "all_logprobs/p25": -0.01300048828125, "all_logprobs/p5": -0.90234375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.21302449703216553, "clip_ratio": 0.0, "completion_length": 483.22918701171875, "completion_length/correct": 417.6164245605469, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 391.0, "completion_length/correct/min": 120.0, "completion_length/correct/p25": 284.0, "completion_length/correct/p75": 539.0, "completion_length/correct/var": 36898.18359375, "completion_length/incorrect": 691.478271484375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 679.0, "completion_length/incorrect/min": 430.0, "completion_length/incorrect/p25": 558.0, "completion_length/incorrect/p75": 822.0, "completion_length/incorrect/var": 36470.26171875, "completion_length/max": 1024.0, "completion_length/median": 453.0, "completion_length/min": 120.0, "completion_length/p25": 323.25, "completion_length/p75": 618.75, "completion_length/var": 50218.30859375, "epoch": 0.0712, "feature_vector_variance/max_squared_error": 100937.625, "feature_vector_variance/metric": 31459.533203125, "generated_tokens/total": 5132378.0, "grad_norm": 0.24505287408828735, "grouped_std_rewards": 0.08539125323295593, "learning_rate": 5.461210907490952e-07, "loss": -0.0, "mean_logprobs": -0.1298828125, "mean_logprobs/var": 0.004608154296875, "num_completions/total": 8544, "per_sentence_gradient_norm": 3.128107786178589, "per_sentence_gradient_norm/max": 30.249155044555664, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 11.672945976257324, "per_sentence_gradient_norm/p90": 16.838623046875, "per_sentence_gradient_norm/p95": 20.521644592285156, "per_sentence_gradient_norm/p99": 23.94732666015625, "per_sentence_gradient_norm/var": 53.29553985595703, "per_token_feature_norm": 177.8025360107422, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 177.0, "per_token_feature_norm/min": 64.0, "per_token_feature_norm/p25": 144.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 1987.55810546875, "per_token_full_gradient_variance/max_squared_error": 2.0544955730438232, "per_token_full_gradient_variance/variance": 0.004758117720484734, "per_token_gradient_norm": 3.897775173187256, "per_token_gradient_norm/max": 419.96295166015625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 591.7022094726562, "per_token_policy_error_norm": 0.07477652281522751, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.062236860394477844, "policy_entropy": 0.15785335004329681, "policy_entropy/max": 2.71875, "policy_entropy/median": 1.4901161193847656e-05, "policy_entropy/min": 1.532107773982716e-14, "policy_entropy/p25": 1.043081283569336e-07, "policy_entropy/p75": 0.07470703125, "policy_entropy/var": 0.12280318886041641, "policy_error_vector_variance/max_squared_error": 2.0192370414733887, "policy_error_vector_variance/metric": 0.07467030733823776, "policy_loss": -2.4835269396561444e-09, "policy_loss/max": 1.0976734161376953, "policy_loss/median": 0.0, "policy_loss/min": -0.8537459969520569, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.15783311426639557, "policy_sharpness": 7.434512615203857, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.6285400390625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.357269287109375, "reward": 0.7604166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18410086631774902, "rewards/accuracy_reward": 0.7604166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18410086631774902, "sentence_full_gradient_variance/max_squared_error": 5856.84423828125, "sentence_full_gradient_variance/metric": 406.12750244140625, "sentence_full_gradient_variance/p75": 0.41099444031715393, "sentence_full_gradient_variance/p90": 2285.615234375, "sentence_full_gradient_variance/p95": 2801.52587890625, "sentence_full_gradient_variance/p99": 3999.368408203125, "state_level_variance/metric": 58.71034622192383, "state_level_variance_full_gradient/metric": 2.0549724102020264, "step": 89 }, { "accuracy_reward": 0.96875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.03059210442006588, "action_level_variance/metric": 219.53697204589844, "action_level_variance_full_gradient/metric": 888.9208374023438, "adam_stats/lr_effective_max": 2.154289177269675e-06, "adam_stats/lr_effective_mean": -1.1527559289070677e-11, "adam_stats/lr_effective_min": -2.0812417460547294e-06, "adam_stats/m_t_max": 0.0007965589757077396, "adam_stats/m_t_mean": -2.5444420875819773e-11, "adam_stats/m_t_min": -0.0009988759411498904, "adam_stats/v_t_max": 7.391721737803891e-05, "adam_stats/v_t_mean": 5.905267594363828e-12, "adam_stats/v_t_min": 0.0, "advantages": 3.725290298461914e-09, "advantages/max": 0.24990005791187286, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.4733055531978607, "all_logprobs": -0.1360815018415451, "all_logprobs/max": 0.0, "all_logprobs/median": -4.76837158203125e-07, "all_logprobs/min": -8.5, "all_logprobs/p1": -2.328125, "all_logprobs/p10": -0.35546875, "all_logprobs/p25": -0.01080322265625, "all_logprobs/p5": -0.8671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.21286052465438843, "clip_ratio": 0.0, "completion_length": 537.1041870117188, "completion_length/correct": 531.5484008789062, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 522.0, "completion_length/correct/min": 268.0, "completion_length/correct/p25": 402.0, "completion_length/correct/p75": 627.0, "completion_length/correct/var": 29786.86328125, "completion_length/incorrect": 709.3333740234375, "completion_length/incorrect/max": 943.0, "completion_length/incorrect/median": 723.0, "completion_length/incorrect/min": 462.0, "completion_length/incorrect/p25": 592.5, "completion_length/incorrect/p75": 833.0, "completion_length/incorrect/var": 57980.33203125, "completion_length/max": 1024.0, "completion_length/median": 522.0, "completion_length/min": 268.0, "completion_length/p25": 408.75, "completion_length/p75": 631.0, "completion_length/var": 31033.80078125, "epoch": 0.072, "feature_vector_variance/max_squared_error": 117005.7734375, "feature_vector_variance/metric": 33061.9609375, "generated_tokens/total": 5183940.0, "grad_norm": 0.3975130021572113, "grouped_std_rewards": 0.125, "learning_rate": 4.5230534410568764e-07, "loss": -0.0, "mean_logprobs": -0.12890625, "mean_logprobs/var": 0.00543212890625, "num_completions/total": 8640, "per_sentence_gradient_norm": 5.186263084411621, "per_sentence_gradient_norm/max": 114.87460327148438, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 4.137330532073975, "per_sentence_gradient_norm/p85": 6.258356094360352, "per_sentence_gradient_norm/p90": 9.472497940063477, "per_sentence_gradient_norm/p95": 12.080414772033691, "per_sentence_gradient_norm/p99": 86.81304931640625, "per_sentence_gradient_norm/var": 243.00209045410156, "per_token_feature_norm": 183.90281677246094, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 186.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 151.0, "per_token_feature_norm/p75": 215.0, "per_token_feature_norm/var": 1915.2894287109375, "per_token_full_gradient_variance/max_squared_error": 18.280441284179688, "per_token_full_gradient_variance/variance": 0.01647370494902134, "per_token_gradient_norm": 6.252751350402832, "per_token_gradient_norm/max": 1371.01416015625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2250.347900390625, "per_token_policy_error_norm": 0.07285801321268082, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06099780276417732, "policy_entropy": 0.1515396237373352, "policy_entropy/max": 3.8125, "policy_entropy/median": 8.761882781982422e-06, "policy_entropy/min": 1.429412144204889e-15, "policy_entropy/p25": 4.866160452365875e-08, "policy_entropy/p75": 0.0615234375, "policy_entropy/var": 0.11722564697265625, "policy_error_vector_variance/max_squared_error": 2.0161190032958984, "policy_error_vector_variance/metric": 0.07277241349220276, "policy_loss": -4.967053879312289e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -0.24990007281303406, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.0, "policy_loss/var": 0.4733055531978607, "policy_sharpness": 7.544646263122559, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.870849609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.991471290588379, "reward": 0.96875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.03059210442006588, "rewards/accuracy_reward": 0.96875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.03059210442006588, "sentence_full_gradient_variance/max_squared_error": 68157.2109375, "sentence_full_gradient_variance/metric": 924.8200073242188, "sentence_full_gradient_variance/p75": 119.98014831542969, "sentence_full_gradient_variance/p90": 321.34716796875, "sentence_full_gradient_variance/p95": 369.7652587890625, "sentence_full_gradient_variance/p99": 15450.62890625, "state_level_variance/metric": 41.585872650146484, "state_level_variance_full_gradient/metric": 35.89906692504883, "step": 90 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1666666567325592, "action_level_variance/metric": 121.224609375, "action_level_variance_full_gradient/metric": 1076.5604248046875, "adam_stats/lr_effective_max": 1.6735602912376635e-06, "adam_stats/lr_effective_mean": -9.56343858271591e-12, "adam_stats/lr_effective_min": -1.6170376966329059e-06, "adam_stats/m_t_max": 0.0007875208975747228, "adam_stats/m_t_mean": -2.7490657319995115e-11, "adam_stats/m_t_min": -0.0009939742740243673, "adam_stats/v_t_max": 7.385879871435463e-05, "adam_stats/v_t_mean": 5.90624120791472e-12, "adam_stats/v_t_min": 0.0, "advantages": 2.4835269396561444e-09, "advantages/max": 0.36585545539855957, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.47333940863609314, "all_logprobs": -0.10653310269117355, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -13.0625, "all_logprobs/p1": -2.1404690742492676, "all_logprobs/p10": -0.203125, "all_logprobs/p25": -0.000946044921875, "all_logprobs/p5": -0.671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.17341236770153046, "clip_ratio": 0.0, "completion_length": 491.6875, "completion_length/correct": 509.6842041015625, "completion_length/correct/max": 1017.0, "completion_length/correct/median": 464.0, "completion_length/correct/min": 160.0, "completion_length/correct/p25": 354.0, "completion_length/correct/p75": 667.5, "completion_length/correct/var": 40119.8984375, "completion_length/incorrect": 423.3000183105469, "completion_length/incorrect/max": 700.0, "completion_length/incorrect/median": 387.0, "completion_length/incorrect/min": 276.0, "completion_length/incorrect/p25": 346.5, "completion_length/incorrect/p75": 470.75, "completion_length/incorrect/var": 13636.7470703125, "completion_length/max": 1017.0, "completion_length/median": 444.0, "completion_length/min": 160.0, "completion_length/p25": 346.5, "completion_length/p75": 647.75, "completion_length/var": 35644.6640625, "epoch": 0.0728, "feature_vector_variance/max_squared_error": 118329.1953125, "feature_vector_variance/metric": 28715.509765625, "generated_tokens/total": 5231142.0, "grad_norm": 0.27074283361434937, "grouped_std_rewards": 0.14026084542274475, "learning_rate": 3.6707612778634855e-07, "loss": -0.0, "mean_logprobs": -0.11376953125, "mean_logprobs/var": 0.005889892578125, "num_completions/total": 8736, "per_sentence_gradient_norm": 4.049452781677246, "per_sentence_gradient_norm/max": 101.18112182617188, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.2429261207580566, "per_sentence_gradient_norm/p85": 5.647884368896484, "per_sentence_gradient_norm/p90": 7.988031387329102, "per_sentence_gradient_norm/p95": 14.93449878692627, "per_sentence_gradient_norm/p99": 48.663028717041016, "per_sentence_gradient_norm/var": 149.06944274902344, "per_token_feature_norm": 173.00633239746094, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 171.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 140.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 1860.874267578125, "per_token_full_gradient_variance/max_squared_error": 11.587114334106445, "per_token_full_gradient_variance/variance": 0.0073812492191791534, "per_token_gradient_norm": 4.341289520263672, "per_token_gradient_norm/max": 1137.318603515625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1163.7479248046875, "per_token_policy_error_norm": 0.05694127455353737, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.049946870654821396, "policy_entropy": 0.11734656989574432, "policy_entropy/max": 3.296875, "policy_entropy/median": 3.5315752029418945e-06, "policy_entropy/min": 6.439293542825908e-14, "policy_entropy/p25": 9.452924132347107e-08, "policy_entropy/p75": 0.00811767578125, "policy_entropy/var": 0.0915648564696312, "policy_error_vector_variance/max_squared_error": 1.9990299940109253, "policy_error_vector_variance/metric": 0.0568976104259491, "policy_loss": 0.0, "policy_loss/max": 3.7485010623931885, "policy_loss/median": 0.0, "policy_loss/min": -0.36585548520088196, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.0, "policy_loss/var": 0.47333940863609314, "policy_sharpness": 8.01905632019043, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.272265434265137, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1666666567325592, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1666666567325592, "sentence_full_gradient_variance/max_squared_error": 40539.0703125, "sentence_full_gradient_variance/metric": 1077.6240234375, "sentence_full_gradient_variance/p75": 114.7816390991211, "sentence_full_gradient_variance/p90": 781.9444580078125, "sentence_full_gradient_variance/p95": 904.2613525390625, "sentence_full_gradient_variance/p99": 27184.65234375, "state_level_variance/metric": 40.642295837402344, "state_level_variance_full_gradient/metric": 1.0636564493179321, "step": 91 }, { "accuracy_reward": 0.9583333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.04035087674856186, "action_level_variance/metric": 22.57013511657715, "action_level_variance_full_gradient/metric": 51.90283203125, "adam_stats/lr_effective_max": 1.3176696711525437e-06, "adam_stats/lr_effective_mean": -5.6473862045103296e-12, "adam_stats/lr_effective_min": -1.2853574844484683e-06, "adam_stats/m_t_max": 0.002023676410317421, "adam_stats/m_t_mean": -1.3104551298270817e-11, "adam_stats/m_t_min": -0.0025563701055943966, "adam_stats/v_t_max": 7.435154839185998e-05, "adam_stats/v_t_mean": 5.909119114161365e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.46501490473747253, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.3155849874019623, "all_logprobs": -0.11647724360227585, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -8.9375, "all_logprobs/p1": -2.1875, "all_logprobs/p10": -0.259765625, "all_logprobs/p25": -0.0021820068359375, "all_logprobs/p5": -0.73828125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1783943921327591, "clip_ratio": 0.0, "completion_length": 468.35418701171875, "completion_length/correct": 453.6521911621094, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 366.0, "completion_length/correct/min": 230.0, "completion_length/correct/p25": 307.75, "completion_length/correct/p75": 567.75, "completion_length/correct/var": 38295.55078125, "completion_length/incorrect": 806.5, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 726.0, "completion_length/incorrect/min": 722.0, "completion_length/incorrect/p25": 725.0, "completion_length/incorrect/p75": 821.5, "completion_length/incorrect/var": 21227.666015625, "completion_length/max": 1024.0, "completion_length/median": 370.0, "completion_length/min": 230.0, "completion_length/p25": 308.75, "completion_length/p75": 620.0, "completion_length/var": 42377.20703125, "epoch": 0.0736, "feature_vector_variance/max_squared_error": 125425.1171875, "feature_vector_variance/metric": 29920.150390625, "generated_tokens/total": 5276104.0, "grad_norm": 0.18476714193820953, "grouped_std_rewards": 0.10885214805603027, "learning_rate": 2.905372804626083e-07, "loss": -0.0, "mean_logprobs": -0.11279296875, "mean_logprobs/var": 0.0017242431640625, "num_completions/total": 8832, "per_sentence_gradient_norm": 2.6278533935546875, "per_sentence_gradient_norm/max": 44.218711853027344, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 4.331484794616699, "per_sentence_gradient_norm/p85": 5.707844257354736, "per_sentence_gradient_norm/p90": 6.73617696762085, "per_sentence_gradient_norm/p95": 8.336371421813965, "per_sentence_gradient_norm/p99": 28.493507385253906, "per_sentence_gradient_norm/var": 35.33885192871094, "per_token_feature_norm": 178.2223358154297, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 179.0, "per_token_feature_norm/min": 68.0, "per_token_feature_norm/p25": 144.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 1877.3778076171875, "per_token_full_gradient_variance/max_squared_error": 13.167150497436523, "per_token_full_gradient_variance/variance": 0.008068462833762169, "per_token_gradient_norm": 4.305417060852051, "per_token_gradient_norm/max": 1309.046875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 922.7870483398438, "per_token_policy_error_norm": 0.06282362341880798, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.053527336567640305, "policy_entropy": 0.1285138577222824, "policy_entropy/max": 2.78125, "policy_entropy/median": 2.6226043701171875e-06, "policy_entropy/min": 1.404432126150823e-14, "policy_entropy/p25": 4.0512531995773315e-08, "policy_entropy/p75": 0.016357421875, "policy_entropy/var": 0.09746929258108139, "policy_error_vector_variance/max_squared_error": 2.000140905380249, "policy_error_vector_variance/metric": 0.06277898699045181, "policy_loss": -2.4835269396561444e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -0.4650149345397949, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.0, "policy_loss/var": 0.3155849277973175, "policy_sharpness": 7.890390872955322, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.24609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.807579040527344, "reward": 0.9583333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.04035087674856186, "rewards/accuracy_reward": 0.9583333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.04035087674856186, "sentence_full_gradient_variance/max_squared_error": 1116.6800537109375, "sentence_full_gradient_variance/metric": 61.09144592285156, "sentence_full_gradient_variance/p75": 9.960736274719238, "sentence_full_gradient_variance/p90": 138.314453125, "sentence_full_gradient_variance/p95": 277.9475402832031, "sentence_full_gradient_variance/p99": 1043.36767578125, "state_level_variance/metric": 16.573484420776367, "state_level_variance_full_gradient/metric": 9.188621520996094, "step": 92 }, { "accuracy_reward": 0.8333333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14035087823867798, "action_level_variance/metric": 0.0, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 9.097918223233137e-07, "adam_stats/lr_effective_mean": -3.898859533835175e-12, "adam_stats/lr_effective_min": -8.874844752426725e-07, "adam_stats/m_t_max": 0.0018213087460026145, "adam_stats/m_t_mean": -1.1794082203919753e-11, "adam_stats/m_t_min": -0.0023007330019026995, "adam_stats/v_t_max": 7.427719538100064e-05, "adam_stats/v_t_mean": 5.903209344959581e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.10121078044176102, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -8.0, "all_logprobs/p1": -2.046875, "all_logprobs/p10": -0.201171875, "all_logprobs/p25": -0.0006122589111328125, "all_logprobs/p5": -0.6472654342651367, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.15784227848052979, "clip_ratio": 0.0, "completion_length": 409.65625, "completion_length/correct": 314.7749938964844, "completion_length/correct/max": 805.0, "completion_length/correct/median": 229.0, "completion_length/correct/min": 113.0, "completion_length/correct/p25": 177.25, "completion_length/correct/p75": 340.5, "completion_length/correct/var": 39000.9609375, "completion_length/incorrect": 884.0625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 476.0, "completion_length/incorrect/p25": 796.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 37781.52734375, "completion_length/max": 1024.0, "completion_length/median": 292.0, "completion_length/min": 113.0, "completion_length/p25": 187.0, "completion_length/p75": 647.25, "completion_length/var": 83883.9609375, "epoch": 0.0744, "feature_vector_variance/max_squared_error": 118521.5078125, "feature_vector_variance/metric": 28479.259765625, "generated_tokens/total": 5315431.0, "grad_norm": 0.0, "grouped_std_rewards": 0.0, "learning_rate": 2.2278205293002645e-07, "loss": 0.0, "mean_logprobs": -0.09912109375, "mean_logprobs/var": 0.0026702880859375, "num_completions/total": 8928, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 172.63864135742188, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 171.0, "per_token_feature_norm/min": 65.0, "per_token_feature_norm/p25": 137.0, "per_token_feature_norm/p75": 205.0, "per_token_feature_norm/var": 1932.8436279296875, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.05470354110002518, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04630258306860924, "policy_entropy": 0.11141905933618546, "policy_entropy/max": 2.90625, "policy_entropy/median": 4.4405460357666016e-06, "policy_entropy/min": 1.3933298959045715e-14, "policy_entropy/p25": 8.987262845039368e-08, "policy_entropy/p75": 0.0052490234375, "policy_entropy/var": 0.08433079719543457, "policy_error_vector_variance/max_squared_error": 2.0092878341674805, "policy_error_vector_variance/metric": 0.05467792972922325, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 8.099824905395508, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.25, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.902295112609863, "reward": 0.8333333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14035087823867798, "rewards/accuracy_reward": 0.8333333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14035087823867798, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 93 }, { "accuracy_reward": 0.7708333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17850880324840546, "action_level_variance/metric": 86.86936950683594, "action_level_variance_full_gradient/metric": 566.459716796875, "adam_stats/lr_effective_max": 6.889392807352124e-07, "adam_stats/lr_effective_mean": -1.6500586704285003e-12, "adam_stats/lr_effective_min": -7.064573992465739e-07, "adam_stats/m_t_max": 0.002164080273360014, "adam_stats/m_t_mean": 1.4645237063001804e-12, "adam_stats/m_t_min": -0.002870220225304365, "adam_stats/v_t_max": 7.426684896927327e-05, "adam_stats/v_t_mean": 5.903605729273842e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.6526548862457275, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.3155972957611084, "all_logprobs": -0.10824889689683914, "all_logprobs/max": 0.0, "all_logprobs/median": -4.76837158203125e-07, "all_logprobs/min": -8.8125, "all_logprobs/p1": -2.078125, "all_logprobs/p10": -0.236328125, "all_logprobs/p25": -0.002227783203125, "all_logprobs/p5": -0.69140625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1668001264333725, "clip_ratio": 0.0, "completion_length": 529.5104370117188, "completion_length/correct": 465.75677490234375, "completion_length/correct/max": 950.0, "completion_length/correct/median": 389.0, "completion_length/correct/min": 213.0, "completion_length/correct/p25": 303.5, "completion_length/correct/p75": 613.5, "completion_length/correct/var": 39462.734375, "completion_length/incorrect": 743.95458984375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 789.0, "completion_length/incorrect/min": 293.0, "completion_length/incorrect/p25": 510.5, "completion_length/incorrect/p75": 1021.0, "completion_length/incorrect/var": 70732.90625, "completion_length/max": 1024.0, "completion_length/median": 461.0, "completion_length/min": 213.0, "completion_length/p25": 337.75, "completion_length/p75": 662.5, "completion_length/var": 59775.19921875, "epoch": 0.0752, "feature_vector_variance/max_squared_error": 106065.609375, "feature_vector_variance/metric": 28076.505859375, "generated_tokens/total": 5366264.0, "grad_norm": 0.28293663263320923, "grouped_std_rewards": 0.12145226448774338, "learning_rate": 1.6389299449645734e-07, "loss": -0.0, "mean_logprobs": -0.11328125, "mean_logprobs/var": 0.0022125244140625, "num_completions/total": 9024, "per_sentence_gradient_norm": 3.4103221893310547, "per_sentence_gradient_norm/max": 95.09271240234375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 5.593703269958496, "per_sentence_gradient_norm/p85": 5.986087322235107, "per_sentence_gradient_norm/p90": 7.41318416595459, "per_sentence_gradient_norm/p95": 13.63443660736084, "per_sentence_gradient_norm/p99": 24.477378845214844, "per_sentence_gradient_norm/var": 106.26470947265625, "per_token_feature_norm": 170.413818359375, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 168.0, "per_token_feature_norm/min": 63.5, "per_token_feature_norm/p25": 137.0, "per_token_feature_norm/p75": 201.0, "per_token_feature_norm/var": 1770.1348876953125, "per_token_full_gradient_variance/max_squared_error": 7.412241458892822, "per_token_full_gradient_variance/variance": 0.0057543376460671425, "per_token_gradient_norm": 3.55979061126709, "per_token_gradient_norm/max": 1211.644287109375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 721.099365234375, "per_token_policy_error_norm": 0.05930871143937111, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.051217131316661835, "policy_entropy": 0.11832603812217712, "policy_entropy/max": 2.59375, "policy_entropy/median": 7.987022399902344e-06, "policy_entropy/min": 1.8118839761882555e-13, "policy_entropy/p25": 1.9837170839309692e-07, "policy_entropy/p75": 0.01708984375, "policy_entropy/var": 0.08247100561857224, "policy_error_vector_variance/max_squared_error": 2.0116806030273438, "policy_error_vector_variance/metric": 0.05926761031150818, "policy_loss": 0.0, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -0.6526549458503723, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.0, "policy_loss/var": 0.3155972957611084, "policy_sharpness": 7.896174430847168, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.24609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.477204322814941, "reward": 0.7708333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17850880324840546, "rewards/accuracy_reward": 0.7708333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17850880324840546, "sentence_full_gradient_variance/max_squared_error": 33245.390625, "sentence_full_gradient_variance/metric": 567.4153442382812, "sentence_full_gradient_variance/p75": 115.32603454589844, "sentence_full_gradient_variance/p90": 410.19488525390625, "sentence_full_gradient_variance/p95": 946.8447265625, "sentence_full_gradient_variance/p99": 6416.8916015625, "state_level_variance/metric": 28.461313247680664, "state_level_variance_full_gradient/metric": 0.9556469321250916, "step": 94 }, { "accuracy_reward": 0.6979166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21304823458194733, "action_level_variance/metric": 146.86695861816406, "action_level_variance_full_gradient/metric": 542.6378173828125, "adam_stats/lr_effective_max": 4.583508257383073e-07, "adam_stats/lr_effective_mean": -1.5209192412435346e-12, "adam_stats/lr_effective_min": -4.6823683419461304e-07, "adam_stats/m_t_max": 0.0018469642382115126, "adam_stats/m_t_mean": -1.7209305811990983e-12, "adam_stats/m_t_min": -0.002485542092472315, "adam_stats/v_t_max": 7.419353642035276e-05, "adam_stats/v_t_mean": 5.9004997068901055e-12, "adam_stats/v_t_min": 0.0, "advantages": -1.2417634698280722e-09, "advantages/max": 3.7485008239746094, "advantages/median": 0.0, "advantages/min": -0.36585545539855957, "advantages/p25": -0.24990005791187286, "advantages/p75": 0.0, "advantages/var": 0.3155708611011505, "all_logprobs": -0.09788767993450165, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -7.875, "all_logprobs/p1": -2.015625, "all_logprobs/p10": -0.181640625, "all_logprobs/p25": -0.000335693359375, "all_logprobs/p5": -0.59765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1527339518070221, "clip_ratio": 0.0, "completion_length": 506.3333435058594, "completion_length/correct": 475.64178466796875, "completion_length/correct/max": 1000.0, "completion_length/correct/median": 465.0, "completion_length/correct/min": 80.0, "completion_length/correct/p25": 295.5, "completion_length/correct/p75": 663.5, "completion_length/correct/var": 63519.50390625, "completion_length/incorrect": 577.2413940429688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 506.0, "completion_length/incorrect/min": 339.0, "completion_length/incorrect/p25": 405.0, "completion_length/incorrect/p75": 745.0, "completion_length/incorrect/var": 45631.04296875, "completion_length/max": 1024.0, "completion_length/median": 477.0, "completion_length/min": 80.0, "completion_length/p25": 370.0, "completion_length/p75": 674.5, "completion_length/var": 59777.671875, "epoch": 0.076, "feature_vector_variance/max_squared_error": 127664.484375, "feature_vector_variance/metric": 31251.595703125, "generated_tokens/total": 5414872.0, "grad_norm": 0.22164799273014069, "grouped_std_rewards": 0.09859417378902435, "learning_rate": 1.1394185240843985e-07, "loss": 0.0, "mean_logprobs": -0.11279296875, "mean_logprobs/var": 0.005615234375, "num_completions/total": 9120, "per_sentence_gradient_norm": 4.0457329750061035, "per_sentence_gradient_norm/max": 108.39566802978516, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 5.222991943359375, "per_sentence_gradient_norm/p85": 6.548784255981445, "per_sentence_gradient_norm/p90": 7.109450340270996, "per_sentence_gradient_norm/p95": 8.182926177978516, "per_sentence_gradient_norm/p99": 52.82124328613281, "per_sentence_gradient_norm/var": 172.24533081054688, "per_token_feature_norm": 182.96405029296875, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 184.0, "per_token_feature_norm/min": 70.5, "per_token_feature_norm/p25": 150.0, "per_token_feature_norm/p75": 214.0, "per_token_feature_norm/var": 1842.7659912109375, "per_token_full_gradient_variance/max_squared_error": 14.271785736083984, "per_token_full_gradient_variance/variance": 0.010366754606366158, "per_token_gradient_norm": 4.612199783325195, "per_token_gradient_norm/max": 1523.4140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1523.99072265625, "per_token_policy_error_norm": 0.05292611941695213, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04549932852387428, "policy_entropy": 0.10836236923933029, "policy_entropy/max": 2.8125, "policy_entropy/median": 2.0712614059448242e-06, "policy_entropy/min": 3.372302437298913e-15, "policy_entropy/p25": 3.3993273973464966e-08, "policy_entropy/p75": 0.003021240234375, "policy_entropy/var": 0.08351211249828339, "policy_error_vector_variance/max_squared_error": 2.012315034866333, "policy_error_vector_variance/metric": 0.052882738411426544, "policy_loss": 7.450580596923828e-09, "policy_loss/max": 0.36585548520088196, "policy_loss/median": 0.0, "policy_loss/min": -3.7485008239746094, "policy_loss/p25": 0.0, "policy_loss/p75": 0.24990005791187286, "policy_loss/var": 0.3155708611011505, "policy_sharpness": 8.187520027160645, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.59365463256836, "reward": 0.6979166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21304823458194733, "rewards/accuracy_reward": 0.6979166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21304823458194733, "sentence_full_gradient_variance/max_squared_error": 22632.033203125, "sentence_full_gradient_variance/metric": 545.6326904296875, "sentence_full_gradient_variance/p75": 48.104209899902344, "sentence_full_gradient_variance/p90": 455.7867736816406, "sentence_full_gradient_variance/p95": 657.8095703125, "sentence_full_gradient_variance/p99": 17549.8203125, "state_level_variance/metric": 39.31602096557617, "state_level_variance_full_gradient/metric": 2.9949443340301514, "step": 95 }, { "accuracy_reward": 0.53125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.25164473056793213, "action_level_variance/metric": 25.20264434814453, "action_level_variance_full_gradient/metric": 342.9860534667969, "adam_stats/lr_effective_max": 2.9009351010245155e-07, "adam_stats/lr_effective_mean": -1.1063480860606933e-12, "adam_stats/lr_effective_min": -2.89664114916377e-07, "adam_stats/m_t_max": 0.001197397941723466, "adam_stats/m_t_mean": -1.4585596619376417e-11, "adam_stats/m_t_min": -0.0012665287358686328, "adam_stats/v_t_max": 7.421352347591892e-05, "adam_stats/v_t_mean": 5.897553712747028e-12, "adam_stats/v_t_min": 0.0, "advantages": 3.725290298461914e-09, "advantages/max": 2.0150647163391113, "advantages/median": 0.0, "advantages/min": -0.46501490473747253, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.15781643986701965, "all_logprobs": -0.1317906677722931, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-06, "all_logprobs/min": -12.0, "all_logprobs/p1": -2.34375, "all_logprobs/p10": -0.328125, "all_logprobs/p25": -0.009765625, "all_logprobs/p5": -0.83203125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.20553182065486908, "clip_ratio": 0.0, "completion_length": 484.25, "completion_length/correct": 313.29412841796875, "completion_length/correct/max": 706.0, "completion_length/correct/median": 327.0, "completion_length/correct/min": 159.0, "completion_length/correct/p25": 230.5, "completion_length/correct/p75": 370.0, "completion_length/correct/var": 12676.1708984375, "completion_length/incorrect": 678.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 670.0, "completion_length/incorrect/min": 369.0, "completion_length/incorrect/p25": 560.0, "completion_length/incorrect/p75": 793.0, "completion_length/incorrect/var": 26380.8671875, "completion_length/max": 1024.0, "completion_length/median": 409.0, "completion_length/min": 159.0, "completion_length/p25": 318.0, "completion_length/p75": 652.75, "completion_length/var": 52361.53515625, "epoch": 0.0768, "feature_vector_variance/max_squared_error": 101954.234375, "feature_vector_variance/metric": 29303.77734375, "generated_tokens/total": 5461360.0, "grad_norm": 0.15886259078979492, "grouped_std_rewards": 0.06718548387289047, "learning_rate": 7.298948443822229e-08, "loss": -0.0, "mean_logprobs": -0.1220703125, "mean_logprobs/var": 0.002685546875, "num_completions/total": 9216, "per_sentence_gradient_norm": 2.592862606048584, "per_sentence_gradient_norm/max": 42.01898956298828, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 7.747104644775391, "per_sentence_gradient_norm/p90": 8.914228439331055, "per_sentence_gradient_norm/p95": 11.757105827331543, "per_sentence_gradient_norm/p99": 40.314144134521484, "per_sentence_gradient_norm/var": 57.84470748901367, "per_token_feature_norm": 171.81243896484375, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 169.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 138.0, "per_token_feature_norm/p75": 202.0, "per_token_feature_norm/var": 1861.4974365234375, "per_token_full_gradient_variance/max_squared_error": 4.401504039764404, "per_token_full_gradient_variance/variance": 0.003937258385121822, "per_token_gradient_norm": 3.226529359817505, "per_token_gradient_norm/max": 696.4567260742188, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 597.78662109375, "per_token_policy_error_norm": 0.07111474871635437, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.060175828635692596, "policy_entropy": 0.1449536681175232, "policy_entropy/max": 2.953125, "policy_entropy/median": 1.800060272216797e-05, "policy_entropy/min": 1.4210854715202004e-14, "policy_entropy/p25": 2.086162567138672e-07, "policy_entropy/p75": 0.05889892578125, "policy_entropy/var": 0.10417590290307999, "policy_error_vector_variance/max_squared_error": 2.0019805431365967, "policy_error_vector_variance/metric": 0.07104333490133286, "policy_loss": -2.4835269396561444e-09, "policy_loss/max": 0.4650149345397949, "policy_loss/median": 0.0, "policy_loss/min": -2.0150647163391113, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.15781643986701965, "policy_sharpness": 7.539304256439209, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.8826141357421875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.78913688659668, "reward": 0.53125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.25164473056793213, "rewards/accuracy_reward": 0.53125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.25164473056793213, "sentence_full_gradient_variance/max_squared_error": 14155.48046875, "sentence_full_gradient_variance/metric": 343.0116882324219, "sentence_full_gradient_variance/p75": 0.005129336379468441, "sentence_full_gradient_variance/p90": 537.6143798828125, "sentence_full_gradient_variance/p95": 704.8291015625, "sentence_full_gradient_variance/p99": 8311.6318359375, "state_level_variance/metric": 40.33761215209961, "state_level_variance_full_gradient/metric": 0.025646716356277466, "step": 96 }, { "accuracy_reward": 0.65625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2279605269432068, "action_level_variance/metric": 14.944669723510742, "action_level_variance_full_gradient/metric": 1011.344970703125, "adam_stats/lr_effective_max": 1.7985381361995678e-07, "adam_stats/lr_effective_mean": -5.847604843921272e-13, "adam_stats/lr_effective_min": -1.8976051308072783e-07, "adam_stats/m_t_max": 0.002138980198651552, "adam_stats/m_t_mean": -8.230250682361717e-12, "adam_stats/m_t_min": -0.002726790262386203, "adam_stats/v_t_max": 7.439113687723875e-05, "adam_stats/v_t_mean": 5.913324083867133e-12, "adam_stats/v_t_min": 0.0, "advantages": 1.2417634698280722e-09, "advantages/max": 1.4358407258987427, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.47342610359191895, "all_logprobs": -0.10858000069856644, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -12.75, "all_logprobs/p1": -2.21875, "all_logprobs/p10": -0.2041015625, "all_logprobs/p25": -0.0008392333984375, "all_logprobs/p5": -0.69140625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1794813722372055, "clip_ratio": 0.0, "completion_length": 518.78125, "completion_length/correct": 425.0158996582031, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 369.0, "completion_length/correct/min": 237.0, "completion_length/correct/p25": 324.5, "completion_length/correct/p75": 443.5, "completion_length/correct/var": 34132.2109375, "completion_length/incorrect": 697.7879028320312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 677.0, "completion_length/incorrect/min": 279.0, "completion_length/incorrect/p25": 433.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 70607.921875, "completion_length/max": 1024.0, "completion_length/median": 417.0, "completion_length/min": 237.0, "completion_length/p25": 338.75, "completion_length/p75": 625.5, "completion_length/var": 63020.77734375, "epoch": 0.0776, "feature_vector_variance/max_squared_error": 120401.5, "feature_vector_variance/metric": 30413.0390625, "generated_tokens/total": 5511163.0, "grad_norm": 0.2681891620159149, "grouped_std_rewards": 0.20123785734176636, "learning_rate": 4.108578473795033e-08, "loss": -0.0, "mean_logprobs": -0.1162109375, "mean_logprobs/var": 0.00860595703125, "num_completions/total": 9312, "per_sentence_gradient_norm": 3.4935874938964844, "per_sentence_gradient_norm/max": 35.939208984375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 5.336626052856445, "per_sentence_gradient_norm/p85": 8.793315887451172, "per_sentence_gradient_norm/p90": 9.67199993133545, "per_sentence_gradient_norm/p95": 10.511285781860352, "per_sentence_gradient_norm/p99": 25.255535125732422, "per_sentence_gradient_norm/var": 29.833444595336914, "per_token_feature_norm": 178.7731170654297, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 179.0, "per_token_feature_norm/min": 70.5, "per_token_feature_norm/p25": 145.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 1950.3587646484375, "per_token_full_gradient_variance/max_squared_error": 6.3515305519104, "per_token_full_gradient_variance/variance": 0.0062136342748999596, "per_token_gradient_norm": 4.463866710662842, "per_token_gradient_norm/max": 935.9537963867188, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 680.1786499023438, "per_token_policy_error_norm": 0.057010479271411896, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04897770285606384, "policy_entropy": 0.1205039769411087, "policy_entropy/max": 3.609375, "policy_entropy/median": 4.351139068603516e-06, "policy_entropy/min": 4.829470157119431e-15, "policy_entropy/p25": 6.426125764846802e-08, "policy_entropy/p75": 0.007232666015625, "policy_entropy/var": 0.1010272428393364, "policy_error_vector_variance/max_squared_error": 2.0143706798553467, "policy_error_vector_variance/metric": 0.05695895850658417, "policy_loss": 6.208817349140361e-10, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -1.4358408451080322, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.0, "policy_loss/var": 0.47342613339424133, "policy_sharpness": 8.041159629821777, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.28819751739502, "reward": 0.65625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2279605269432068, "rewards/accuracy_reward": 0.65625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2279605269432068, "sentence_full_gradient_variance/max_squared_error": 78071.5703125, "sentence_full_gradient_variance/metric": 1087.87109375, "sentence_full_gradient_variance/p75": 230.87936401367188, "sentence_full_gradient_variance/p90": 993.46435546875, "sentence_full_gradient_variance/p95": 1691.5816650390625, "sentence_full_gradient_variance/p99": 8719.220703125, "state_level_variance/metric": 18.614459991455078, "state_level_variance_full_gradient/metric": 76.52603149414062, "step": 97 }, { "accuracy_reward": 0.875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.11052631586790085, "action_level_variance/metric": 59.75761032104492, "action_level_variance_full_gradient/metric": 836.2178955078125, "adam_stats/lr_effective_max": 7.789888911702292e-08, "adam_stats/lr_effective_mean": -2.453555479446645e-13, "adam_stats/lr_effective_min": -8.003116391819276e-08, "adam_stats/m_t_max": 0.0022699309047311544, "adam_stats/m_t_mean": -1.4360551810199684e-11, "adam_stats/m_t_min": -0.0023409570567309856, "adam_stats/v_t_max": 7.451039709849283e-05, "adam_stats/v_t_mean": 5.920443389012542e-12, "adam_stats/v_t_min": 0.0, "advantages": -2.4835269396561444e-09, "advantages/max": 1.0976732969284058, "advantages/median": 0.0, "advantages/min": -3.7485008239746094, "advantages/p25": 0.0, "advantages/p75": 0.24990005791187286, "advantages/var": 0.47340402007102966, "all_logprobs": -0.09554890543222427, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -8.0625, "all_logprobs/p1": -1.921875, "all_logprobs/p10": -0.201171875, "all_logprobs/p25": -0.0009307861328125, "all_logprobs/p5": -0.59375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1328725814819336, "clip_ratio": 0.0, "completion_length": 418.4270935058594, "completion_length/correct": 398.2976379394531, "completion_length/correct/max": 850.0, "completion_length/correct/median": 378.0, "completion_length/correct/min": 113.0, "completion_length/correct/p25": 331.5, "completion_length/correct/p75": 465.25, "completion_length/correct/var": 13313.6806640625, "completion_length/incorrect": 559.3333740234375, "completion_length/incorrect/max": 801.0, "completion_length/incorrect/median": 530.0, "completion_length/incorrect/min": 375.0, "completion_length/incorrect/p25": 513.0, "completion_length/incorrect/p75": 606.0, "completion_length/incorrect/var": 12459.697265625, "completion_length/max": 850.0, "completion_length/median": 385.0, "completion_length/min": 113.0, "completion_length/p25": 337.25, "completion_length/p75": 497.0, "completion_length/var": 15940.8798828125, "epoch": 0.0784, "feature_vector_variance/max_squared_error": 106567.875, "feature_vector_variance/metric": 29249.87890625, "generated_tokens/total": 5551332.0, "grad_norm": 0.3149195909500122, "grouped_std_rewards": 0.18398544192314148, "learning_rate": 1.8269623051318517e-08, "loss": 0.0, "mean_logprobs": -0.09814453125, "mean_logprobs/var": 0.0010223388671875, "num_completions/total": 9408, "per_sentence_gradient_norm": 4.6071696281433105, "per_sentence_gradient_norm/max": 56.63595962524414, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.794079065322876, "per_sentence_gradient_norm/p85": 10.207409858703613, "per_sentence_gradient_norm/p90": 11.333030700683594, "per_sentence_gradient_norm/p95": 19.26260757446289, "per_sentence_gradient_norm/p99": 48.23843765258789, "per_sentence_gradient_norm/var": 82.4687271118164, "per_token_feature_norm": 174.79258728027344, "per_token_feature_norm/max": 306.0, "per_token_feature_norm/median": 176.0, "per_token_feature_norm/min": 71.0, "per_token_feature_norm/p25": 144.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 1594.288330078125, "per_token_full_gradient_variance/max_squared_error": 24.13389778137207, "per_token_full_gradient_variance/variance": 0.010541883297264576, "per_token_gradient_norm": 5.7211408615112305, "per_token_gradient_norm/max": 1143.87841796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1304.262939453125, "per_token_policy_error_norm": 0.05387141928076744, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04585402086377144, "policy_entropy": 0.10556153953075409, "policy_entropy/max": 2.65625, "policy_entropy/median": 5.3942203521728516e-06, "policy_entropy/min": 3.1530333899354446e-14, "policy_entropy/p25": 1.2014061212539673e-07, "policy_entropy/p75": 0.008056640625, "policy_entropy/var": 0.06992591917514801, "policy_error_vector_variance/max_squared_error": 2.0051167011260986, "policy_error_vector_variance/metric": 0.05385502427816391, "policy_loss": 7.450580596923828e-09, "policy_loss/max": 3.7485008239746094, "policy_loss/median": 0.0, "policy_loss/min": -1.0976732969284058, "policy_loss/p25": -0.24990005791187286, "policy_loss/p75": 0.0, "policy_loss/var": 0.47340402007102966, "policy_sharpness": 8.065792083740234, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.77175521850586, "reward": 0.875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.11052631586790085, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.11052631586790085, "sentence_full_gradient_variance/max_squared_error": 45918.0234375, "sentence_full_gradient_variance/metric": 852.9476318359375, "sentence_full_gradient_variance/p75": 234.62863159179688, "sentence_full_gradient_variance/p90": 1577.424560546875, "sentence_full_gradient_variance/p95": 2792.336181640625, "sentence_full_gradient_variance/p99": 7038.21337890625, "state_level_variance/metric": 30.70429039001465, "state_level_variance_full_gradient/metric": 16.72984504699707, "step": 98 }, { "accuracy_reward": 0.8125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1539473682641983, "action_level_variance/metric": 113.51354217529297, "action_level_variance_full_gradient/metric": 799.2705078125, "adam_stats/lr_effective_max": 1.966709639589226e-08, "adam_stats/lr_effective_mean": -7.695927906449865e-14, "adam_stats/lr_effective_min": -1.984812136868186e-08, "adam_stats/m_t_max": 0.001923156320117414, "adam_stats/m_t_mean": -1.4037497726715475e-11, "adam_stats/m_t_min": -0.002145961858332157, "adam_stats/v_t_max": 7.443603681167588e-05, "adam_stats/v_t_mean": 5.917816583989044e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.36585545539855957, "advantages/median": 0.0, "advantages/min": -2.560988187789917, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.15780235826969147, "all_logprobs": -0.10946911573410034, "all_logprobs/max": 0.0, "all_logprobs/median": -4.76837158203125e-07, "all_logprobs/min": -9.5, "all_logprobs/p1": -2.140625, "all_logprobs/p10": -0.23710918426513672, "all_logprobs/p25": -0.00193023681640625, "all_logprobs/p5": -0.6953125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.16845455765724182, "clip_ratio": 0.0, "completion_length": 450.96875, "completion_length/correct": 359.6282043457031, "completion_length/correct/max": 839.0, "completion_length/correct/median": 345.0, "completion_length/correct/min": 191.0, "completion_length/correct/p25": 265.5, "completion_length/correct/p75": 409.5, "completion_length/correct/var": 16435.431640625, "completion_length/incorrect": 846.7777709960938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 188.0, "completion_length/incorrect/p25": 727.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 66370.0625, "completion_length/max": 1024.0, "completion_length/median": 367.0, "completion_length/min": 188.0, "completion_length/p25": 272.5, "completion_length/p75": 496.5, "completion_length/var": 61732.07421875, "epoch": 0.0792, "feature_vector_variance/max_squared_error": 116964.8984375, "feature_vector_variance/metric": 28769.939453125, "generated_tokens/total": 5594625.0, "grad_norm": 0.24533694982528687, "grouped_std_rewards": 0.056927502155303955, "learning_rate": 4.568797356781784e-09, "loss": 0.0, "mean_logprobs": -0.107421875, "mean_logprobs/var": 0.002960205078125, "num_completions/total": 9504, "per_sentence_gradient_norm": 3.0273685455322266, "per_sentence_gradient_norm/max": 91.39048767089844, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 5.85847806930542, "per_sentence_gradient_norm/p90": 7.893001556396484, "per_sentence_gradient_norm/p95": 9.640658378601074, "per_sentence_gradient_norm/p99": 78.067626953125, "per_sentence_gradient_norm/var": 153.84632873535156, "per_token_feature_norm": 172.2561798095703, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 170.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 138.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 1889.1142578125, "per_token_full_gradient_variance/max_squared_error": 9.446722984313965, "per_token_full_gradient_variance/variance": 0.006885753478854895, "per_token_gradient_norm": 3.1094558238983154, "per_token_gradient_norm/max": 883.5409545898438, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1114.78515625, "per_token_policy_error_norm": 0.059330329298973083, "per_token_policy_error_norm/max": 1.9921875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05049590766429901, "policy_entropy": 0.12063916027545929, "policy_entropy/max": 2.703125, "policy_entropy/median": 7.331371307373047e-06, "policy_entropy/min": 1.6431300764452317e-14, "policy_entropy/p25": 1.0710209608078003e-07, "policy_entropy/p75": 0.0140380859375, "policy_entropy/var": 0.08825153857469559, "policy_error_vector_variance/max_squared_error": 2.000406265258789, "policy_error_vector_variance/metric": 0.0592949241399765, "policy_loss": 2.4835269396561444e-09, "policy_loss/max": 2.560988426208496, "policy_loss/median": 0.0, "policy_loss/min": -0.36585548520088196, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.15780238807201385, "policy_sharpness": 7.901586055755615, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.37109375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.50390625, "reward": 0.8125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1539473682641983, "rewards/accuracy_reward": 0.8125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1539473682641983, "sentence_full_gradient_variance/max_squared_error": 66144.078125, "sentence_full_gradient_variance/metric": 815.9407958984375, "sentence_full_gradient_variance/p75": 3.3340342044830322, "sentence_full_gradient_variance/p90": 773.0806884765625, "sentence_full_gradient_variance/p95": 984.8402709960938, "sentence_full_gradient_variance/p99": 4408.4296875, "state_level_variance/metric": 54.98976516723633, "state_level_variance_full_gradient/metric": 16.6701717376709, "step": 99 }, { "accuracy_reward": 1.0, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 1.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": 0.0, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 0.0, "adam_stats/lr_effective_mean": 0.0, "adam_stats/lr_effective_min": 0.0, "adam_stats/m_t_max": 0.001730840653181076, "adam_stats/m_t_mean": -1.263375255106114e-11, "adam_stats/m_t_min": -0.001931365579366684, "adam_stats/v_t_max": 7.436160376528278e-05, "adam_stats/v_t_mean": 5.91189814116988e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0853414237499237, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.1875, "all_logprobs/p1": -1.71875, "all_logprobs/p10": -0.16015625, "all_logprobs/p25": -0.0006103515625, "all_logprobs/p5": -0.52734375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1165725514292717, "clip_ratio": 0.0, "completion_length": 380.5520935058594, "completion_length/correct": 380.5520935058594, "completion_length/correct/max": 823.0, "completion_length/correct/median": 366.0, "completion_length/correct/min": 149.0, "completion_length/correct/p25": 283.0, "completion_length/correct/p75": 468.25, "completion_length/correct/var": 18071.703125, "completion_length/max": 823.0, "completion_length/median": 366.0, "completion_length/min": 149.0, "completion_length/p25": 283.0, "completion_length/p75": 468.25, "completion_length/var": 18071.703125, "epoch": 0.08, "feature_vector_variance/max_squared_error": 116126.765625, "feature_vector_variance/metric": 28922.58984375, "generated_tokens/total": 5631158.0, "grad_norm": 0.0, "grouped_std_rewards": 0.0, "learning_rate": 0.0, "loss": 0.0, "mean_logprobs": -0.08544921875, "mean_logprobs/var": 0.00075531005859375, "num_completions/total": 9600, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 178.38275146484375, "per_token_feature_norm/max": 304.0, "per_token_feature_norm/median": 180.0, "per_token_feature_norm/min": 71.0, "per_token_feature_norm/p25": 146.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 1743.7589111328125, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.048326276242733, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.040729016065597534, "policy_entropy": 0.09632078558206558, "policy_entropy/max": 3.21875, "policy_entropy/median": 9.760260581970215e-07, "policy_entropy/min": 1.0824674490095276e-14, "policy_entropy/p25": 1.816079020500183e-08, "policy_entropy/p75": 0.0052490234375, "policy_entropy/var": 0.061243027448654175, "policy_error_vector_variance/max_squared_error": 2.001683235168457, "policy_error_vector_variance/metric": 0.04827412590384483, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 8.149609565734863, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.411141395568848, "reward": 1.0, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 1.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.0, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 1.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 100 }, { "adam_stats/lr_effective_max": 0.0, "adam_stats/lr_effective_mean": 0.0, "adam_stats/lr_effective_min": 0.0, "adam_stats/m_t_max": 0.001730840653181076, "adam_stats/m_t_mean": -1.263375255106114e-11, "adam_stats/m_t_min": -0.001931365579366684, "adam_stats/v_t_max": 7.436160376528278e-05, "adam_stats/v_t_mean": 5.91189814116988e-12, "adam_stats/v_t_min": 0.0, "epoch": 0.08, "step": 100, "total_flos": 0.0, "train_loss": -1.1951973089008306e-09, "train_runtime": 16563.3449, "train_samples_per_second": 0.58, "train_steps_per_second": 0.006 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 24, "trial_name": null, "trial_params": null }