{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.2816, "eval_steps": 10, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "accuracy_reward": 0.5729166865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24500218033790588, "action_level_variance/metric": 8754.80078125, "action_level_variance_full_gradient/metric": 25899.75, "adam_stats/lr_effective_max": 4.743423687614268e-06, "adam_stats/lr_effective_mean": 3.506309478873426e-11, "adam_stats/lr_effective_min": -4.7434286898351274e-06, "adam_stats/m_t_max": 0.006127931177616119, "adam_stats/m_t_mean": 7.014685360351436e-11, "adam_stats/m_t_min": -0.007470705080777407, "adam_stats/v_t_max": 5.5810687626944855e-06, "adam_stats/v_t_mean": 1.635226649183627e-13, "adam_stats/v_t_min": 0.0, "advantages": -0.10479652881622314, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 6.805868625640869, "all_logprobs": -0.16672870516777039, "all_logprobs/max": 0.0, "all_logprobs/median": -3.790855407714844e-05, "all_logprobs/min": -13.25, "all_logprobs/p1": -2.796875, "all_logprobs/p10": -0.4607391357421875, "all_logprobs/p25": -0.02001953125, "all_logprobs/p5": -1.0546875, "all_logprobs/p75": -5.960464477539062e-07, "all_logprobs/var": 0.2965088486671448, "clip_ratio": 0.0, "completion_length": 613.0364990234375, "completion_length/correct": 548.9613647460938, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 488.0, "completion_length/correct/min": 17.0, "completion_length/correct/p25": 346.0, "completion_length/correct/p75": 700.25, "completion_length/correct/var": 67447.625, "completion_length/incorrect": 698.9908447265625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 867.0, "completion_length/incorrect/min": 4.0, "completion_length/incorrect/p25": 383.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 130685.453125, "completion_length/max": 1024.0, "completion_length/median": 565.0, "completion_length/min": 4.0, "completion_length/p25": 349.0, "completion_length/p75": 1024.0, "completion_length/var": 99834.9921875, "epoch": 0.0128, "feature_vector_variance/max_squared_error": 100151.40625, "feature_vector_variance/metric": 24885.6484375, "generated_tokens/total": 470812.0, "grad_norm": 0.5779036283493042, "grouped_std_rewards": 0.35420259833335876, "learning_rate": 1.5e-06, "loss": 0.1048, "mean_logprobs": -0.19921875, "mean_logprobs/var": 0.041259765625, "num_completions/total": 768, "per_sentence_gradient_norm": 20.232633590698242, "per_sentence_gradient_norm/max": 992.27294921875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 106.70056915283203, "per_sentence_gradient_norm/p99": 563.565673828125, "per_sentence_gradient_norm/var": 8356.322265625, "per_token_feature_norm": 163.11874389648438, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 61.25, "per_token_feature_norm/p25": 123.5, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 2471.679443359375, "per_token_full_gradient_variance/max_squared_error": 426.4719543457031, "per_token_full_gradient_variance/variance": 0.17625731229782104, "per_token_gradient_norm": 14.109644889831543, "per_token_gradient_norm/max": 7949.52099609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 24384.296875, "per_token_policy_error_norm": 0.08435660600662231, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0704844743013382, "policy_entropy": 0.18533243238925934, "policy_entropy/max": 3.796875, "policy_entropy/median": 0.00045013427734375, "policy_entropy/min": 2.445960101127298e-16, "policy_entropy/p25": 1.0907649993896484e-05, "policy_entropy/p75": 0.10546875, "policy_entropy/var": 0.16659922897815704, "policy_error_vector_variance/max_squared_error": 2.022780656814575, "policy_error_vector_variance/metric": 0.08379873633384705, "policy_loss": 0.10479654371738434, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958683013916016, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 6.805868625640869, "policy_sharpness": 7.026472568511963, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.8238282203674316, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.58240032196045, "reward": 0.5729166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24500218033790588, "rewards/accuracy_reward": 0.5729166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24500218033790588, "sentence_full_gradient_variance/max_squared_error": 3469146.75, "sentence_full_gradient_variance/metric": 29256.037109375, "sentence_full_gradient_variance/p75": 426.2807922363281, "sentence_full_gradient_variance/p90": 1265.6865234375, "sentence_full_gradient_variance/p95": 55624.4609375, "sentence_full_gradient_variance/p99": 890698.875, "state_level_variance/metric": 692.201171875, "state_level_variance_full_gradient/metric": 3356.28515625, "step": 1 }, { "accuracy_reward": 0.6393229365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2308897227048874, "action_level_variance/metric": 5493.80810546875, "action_level_variance_full_gradient/metric": 20620.984375, "adam_stats/lr_effective_max": 1.2765834981109947e-05, "adam_stats/lr_effective_mean": 1.7656162643042705e-10, "adam_stats/lr_effective_min": -1.2766078725690022e-05, "adam_stats/m_t_max": 0.009450685232877731, "adam_stats/m_t_mean": 1.2722060904746257e-10, "adam_stats/m_t_min": -0.013266604393720627, "adam_stats/v_t_max": 9.856476935965475e-06, "adam_stats/v_t_mean": 2.4247306094384025e-13, "adam_stats/v_t_min": 0.0, "advantages": 0.0035905889235436916, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 5.302395343780518, "all_logprobs": -0.14882002770900726, "all_logprobs/max": 0.0, "all_logprobs/median": -1.7642974853515625e-05, "all_logprobs/min": -15.0625, "all_logprobs/p1": -2.59375, "all_logprobs/p10": -0.38671875, "all_logprobs/p25": -0.01123046875, "all_logprobs/p5": -0.94921875, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.2605057954788208, "clip_ratio": 0.0, "completion_length": 611.18359375, "completion_length/correct": 549.9837036132812, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 480.0, "completion_length/correct/min": 14.0, "completion_length/correct/p25": 361.0, "completion_length/correct/p75": 713.5, "completion_length/correct/var": 65279.52734375, "completion_length/incorrect": 719.6642456054688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 339.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 149030.5625, "completion_length/max": 1024.0, "completion_length/median": 544.0, "completion_length/min": 2.0, "completion_length/p25": 358.5, "completion_length/p75": 1024.0, "completion_length/var": 101979.359375, "epoch": 0.0256, "feature_vector_variance/max_squared_error": 99323.7890625, "feature_vector_variance/metric": 24809.1640625, "generated_tokens/total": 940201.0, "grad_norm": 0.4130246937274933, "grouped_std_rewards": 0.3320194482803345, "learning_rate": 3e-06, "loss": -0.0036, "mean_logprobs": -0.1943359375, "mean_logprobs/var": 0.08837890625, "num_completions/total": 1536, "per_sentence_gradient_norm": 15.766677856445312, "per_sentence_gradient_norm/max": 1005.9931030273438, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 93.22473907470703, "per_sentence_gradient_norm/p99": 324.1252136230469, "per_sentence_gradient_norm/var": 5252.05859375, "per_token_feature_norm": 160.8798065185547, "per_token_feature_norm/max": 334.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 58.5, "per_token_feature_norm/p25": 122.5, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 2342.988037109375, "per_token_full_gradient_variance/max_squared_error": 1201.07470703125, "per_token_full_gradient_variance/variance": 0.1310078650712967, "per_token_gradient_norm": 11.766345024108887, "per_token_gradient_norm/max": 6728.51611328125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 17982.685546875, "per_token_policy_error_norm": 0.07631509006023407, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06432756781578064, "policy_entropy": 0.16514869034290314, "policy_entropy/max": 3.75, "policy_entropy/median": 0.00022411346435546875, "policy_entropy/min": 1.2045919817182948e-14, "policy_entropy/p25": 7.12275505065918e-06, "policy_entropy/p75": 0.0673828125, "policy_entropy/var": 0.1417994350194931, "policy_error_vector_variance/max_squared_error": 2.0218377113342285, "policy_error_vector_variance/metric": 0.07592758536338806, "policy_loss": -0.0035905814729630947, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 5.302395343780518, "policy_sharpness": 7.278046131134033, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.370361328125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.814764022827148, "reward": 0.6393229365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2308897227048874, "rewards/accuracy_reward": 0.6393229365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2308897227048874, "sentence_full_gradient_variance/max_squared_error": 4729521.5, "sentence_full_gradient_variance/metric": 23399.62109375, "sentence_full_gradient_variance/p75": 297.9320373535156, "sentence_full_gradient_variance/p90": 647.3416137695312, "sentence_full_gradient_variance/p95": 40727.3203125, "sentence_full_gradient_variance/p99": 479672.5625, "state_level_variance/metric": 442.7498474121094, "state_level_variance_full_gradient/metric": 2778.637451171875, "step": 2 }, { "accuracy_reward": 0.5729166865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2450021356344223, "action_level_variance/metric": 4257.02001953125, "action_level_variance_full_gradient/metric": 21712.84375, "adam_stats/lr_effective_max": 2.235242209280841e-05, "adam_stats/lr_effective_mean": 1.3455098146764044e-10, "adam_stats/lr_effective_min": -2.2354270186042413e-05, "adam_stats/m_t_max": 0.004477295093238354, "adam_stats/m_t_mean": 4.537547074190407e-11, "adam_stats/m_t_min": -0.0042250980623066425, "adam_stats/v_t_max": 1.5798425010871142e-05, "adam_stats/v_t_mean": 5.770263281947019e-13, "adam_stats/v_t_min": 0.0, "advantages": 0.09690238535404205, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 5.056778430938721, "all_logprobs": -0.15951746702194214, "all_logprobs/max": 0.0, "all_logprobs/median": -1.9311904907226562e-05, "all_logprobs/min": -14.25, "all_logprobs/p1": -2.703125, "all_logprobs/p10": -0.427734375, "all_logprobs/p25": -0.0150146484375, "all_logprobs/p5": -1.015625, "all_logprobs/p75": -4.76837158203125e-07, "all_logprobs/var": 0.2846689820289612, "clip_ratio": 0.0, "completion_length": 621.24609375, "completion_length/correct": 550.0272827148438, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 495.0, "completion_length/correct/min": 90.0, "completion_length/correct/p25": 345.0, "completion_length/correct/p75": 726.75, "completion_length/correct/var": 67183.734375, "completion_length/incorrect": 716.7835083007812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1002.0, "completion_length/incorrect/min": 10.0, "completion_length/incorrect/p25": 379.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 136925.984375, "completion_length/max": 1024.0, "completion_length/median": 565.0, "completion_length/min": 10.0, "completion_length/p25": 350.0, "completion_length/p75": 1024.0, "completion_length/var": 103642.734375, "epoch": 0.0384, "feature_vector_variance/max_squared_error": 103264.765625, "feature_vector_variance/metric": 24990.90234375, "generated_tokens/total": 1417318.0, "grad_norm": 0.8177697658538818, "grouped_std_rewards": 0.37970590591430664, "learning_rate": 4.5e-06, "loss": -0.0969, "mean_logprobs": -0.18359375, "mean_logprobs/var": 0.0286865234375, "num_completions/total": 2304, "per_sentence_gradient_norm": 14.960780143737793, "per_sentence_gradient_norm/max": 944.2427368164062, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 29.26137924194336, "per_sentence_gradient_norm/p95": 92.72657012939453, "per_sentence_gradient_norm/p99": 303.35528564453125, "per_sentence_gradient_norm/var": 4038.453369140625, "per_token_feature_norm": 161.6126708984375, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 50.5, "per_token_feature_norm/p25": 123.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 2389.63427734375, "per_token_full_gradient_variance/max_squared_error": 1023411.3125, "per_token_full_gradient_variance/variance": 2.259514093399048, "per_token_gradient_norm": 12.039739608764648, "per_token_gradient_norm/max": 5753.0732421875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 16172.8271484375, "per_token_policy_error_norm": 0.08102539926767349, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06790424138307571, "policy_entropy": 0.17634351551532745, "policy_entropy/max": 3.765625, "policy_entropy/median": 0.00024127960205078125, "policy_entropy/min": 8.992806499463768e-15, "policy_entropy/p25": 7.4803829193115234e-06, "policy_entropy/p75": 0.08447265625, "policy_entropy/var": 0.1569308638572693, "policy_error_vector_variance/max_squared_error": 2.0209827423095703, "policy_error_vector_variance/metric": 0.08052829653024673, "policy_loss": -0.09690239280462265, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 5.056778430938721, "policy_sharpness": 7.1983747482299805, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.1240234375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.148073196411133, "reward": 0.5729166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2450021356344223, "rewards/accuracy_reward": 0.5729166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2450021356344223, "sentence_full_gradient_variance/max_squared_error": 5269693.5, "sentence_full_gradient_variance/metric": 24457.490234375, "sentence_full_gradient_variance/p75": 722.6155395507812, "sentence_full_gradient_variance/p90": 890.5147705078125, "sentence_full_gradient_variance/p95": 56123.140625, "sentence_full_gradient_variance/p99": 541862.0625, "state_level_variance/metric": 311.5478515625, "state_level_variance_full_gradient/metric": 2744.6513671875, "step": 3 }, { "accuracy_reward": 0.6497396230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2278747707605362, "action_level_variance/metric": 3358.212890625, "action_level_variance_full_gradient/metric": 15920.14453125, "adam_stats/lr_effective_max": 3.2781776099000126e-05, "adam_stats/lr_effective_mean": -2.056287501384091e-10, "adam_stats/lr_effective_min": -3.2782529160613194e-05, "adam_stats/m_t_max": 0.0033407905139029026, "adam_stats/m_t_mean": -1.7983120201292557e-11, "adam_stats/m_t_min": -0.0032194145023822784, "adam_stats/v_t_max": 1.8641288988874294e-05, "adam_stats/v_t_mean": 6.569367987964425e-13, "adam_stats/v_t_min": 0.0, "advantages": 0.11476191878318787, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 5.138956546783447, "all_logprobs": -0.15539240837097168, "all_logprobs/max": 0.0, "all_logprobs/median": -1.9550323486328125e-05, "all_logprobs/min": -11.125, "all_logprobs/p1": -2.625, "all_logprobs/p10": -0.419921875, "all_logprobs/p25": -0.0150146484375, "all_logprobs/p5": -0.98828125, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.26549050211906433, "clip_ratio": 0.0, "completion_length": 608.09375, "completion_length/correct": 557.1663208007812, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 495.0, "completion_length/correct/min": 63.0, "completion_length/correct/p25": 353.0, "completion_length/correct/p75": 717.0, "completion_length/correct/var": 68635.046875, "completion_length/incorrect": 702.5650634765625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 820.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 423.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 124678.15625, "completion_length/max": 1024.0, "completion_length/median": 550.0, "completion_length/min": 2.0, "completion_length/p25": 361.75, "completion_length/p75": 991.25, "completion_length/var": 92945.2109375, "epoch": 0.0512, "feature_vector_variance/max_squared_error": 100995.9609375, "feature_vector_variance/metric": 24895.23828125, "generated_tokens/total": 1884334.0, "grad_norm": 0.4124968349933624, "grouped_std_rewards": 0.3266811668872833, "learning_rate": 6e-06, "loss": -0.1148, "mean_logprobs": -0.1796875, "mean_logprobs/var": 0.04638671875, "num_completions/total": 3072, "per_sentence_gradient_norm": 12.595849990844727, "per_sentence_gradient_norm/max": 696.9605712890625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 64.67691802978516, "per_sentence_gradient_norm/p99": 296.32867431640625, "per_sentence_gradient_norm/var": 3203.729248046875, "per_token_feature_norm": 161.44064331054688, "per_token_feature_norm/max": 336.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 61.75, "per_token_feature_norm/p25": 123.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 2362.899658203125, "per_token_full_gradient_variance/max_squared_error": 594.0691528320312, "per_token_full_gradient_variance/variance": 0.1358940452337265, "per_token_gradient_norm": 11.511700630187988, "per_token_gradient_norm/max": 6835.21533203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 18142.93359375, "per_token_policy_error_norm": 0.08020640164613724, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06718453019857407, "policy_entropy": 0.1725679337978363, "policy_entropy/max": 3.796875, "policy_entropy/median": 0.000244140625, "policy_entropy/min": 3.372302437298913e-15, "policy_entropy/p25": 7.0035457611083984e-06, "policy_entropy/p75": 0.08447265625, "policy_entropy/var": 0.14889539778232574, "policy_error_vector_variance/max_squared_error": 2.0185723304748535, "policy_error_vector_variance/metric": 0.07981272786855698, "policy_loss": -0.11476191878318787, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 5.138956546783447, "policy_sharpness": 7.212281227111816, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.1454315185546875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.035271644592285, "reward": 0.6497396230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2278747707605362, "rewards/accuracy_reward": 0.6497396230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2278747707605362, "sentence_full_gradient_variance/max_squared_error": 3264974.5, "sentence_full_gradient_variance/metric": 18005.5859375, "sentence_full_gradient_variance/p75": 417.2225036621094, "sentence_full_gradient_variance/p90": 470.8259582519531, "sentence_full_gradient_variance/p95": 39348.93359375, "sentence_full_gradient_variance/p99": 289111.34375, "state_level_variance/metric": 263.869873046875, "state_level_variance_full_gradient/metric": 2085.439453125, "step": 4 }, { "accuracy_reward": 0.6666666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.22251193225383759, "action_level_variance/metric": 7688.12109375, "action_level_variance_full_gradient/metric": 18817.62109375, "adam_stats/lr_effective_max": 4.376009019324556e-05, "adam_stats/lr_effective_mean": -1.0385807491797294e-10, "adam_stats/lr_effective_min": -4.36666960013099e-05, "adam_stats/m_t_max": 0.012574083171784878, "adam_stats/m_t_mean": 1.1973060043413142e-10, "adam_stats/m_t_min": -0.014242063276469707, "adam_stats/v_t_max": 4.305747279431671e-05, "adam_stats/v_t_mean": 1.1802454629936121e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.04860624298453331, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 6.091475486755371, "all_logprobs": -0.14740192890167236, "all_logprobs/max": 0.0, "all_logprobs/median": -1.2159347534179688e-05, "all_logprobs/min": -13.75, "all_logprobs/p1": -2.546875, "all_logprobs/p10": -0.38671875, "all_logprobs/p25": -0.01104736328125, "all_logprobs/p5": -0.9609375, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.25256574153900146, "clip_ratio": 0.0, "completion_length": 634.2721557617188, "completion_length/correct": 551.466796875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 494.0, "completion_length/correct/min": 10.0, "completion_length/correct/p25": 362.75, "completion_length/correct/p75": 686.75, "completion_length/correct/var": 65309.73828125, "completion_length/incorrect": 799.8828125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 581.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 115514.3203125, "completion_length/max": 1024.0, "completion_length/median": 566.0, "completion_length/min": 2.0, "completion_length/p25": 381.75, "completion_length/p75": 1024.0, "completion_length/var": 95647.1328125, "epoch": 0.064, "feature_vector_variance/max_squared_error": 96508.21875, "feature_vector_variance/metric": 24849.333984375, "generated_tokens/total": 2371455.0, "grad_norm": 8.046526908874512, "grouped_std_rewards": 0.3271579146385193, "learning_rate": 7.5e-06, "loss": -0.0486, "mean_logprobs": -0.1689453125, "mean_logprobs/var": 0.0308837890625, "num_completions/total": 3840, "per_sentence_gradient_norm": 14.520159721374512, "per_sentence_gradient_norm/max": 1334.259521484375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 82.82683563232422, "per_sentence_gradient_norm/p99": 241.61439514160156, "per_sentence_gradient_norm/var": 7487.03515625, "per_token_feature_norm": 160.0344696044922, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 60.5, "per_token_feature_norm/p25": 123.0, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 2257.99853515625, "per_token_full_gradient_variance/max_squared_error": 1782037.0, "per_token_full_gradient_variance/variance": 3.863419532775879, "per_token_gradient_norm": 13.178288459777832, "per_token_gradient_norm/max": 7480.046875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 27979.759765625, "per_token_policy_error_norm": 0.07612510025501251, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06371481716632843, "policy_entropy": 0.16386938095092773, "policy_entropy/max": 3.84375, "policy_entropy/median": 0.0001583099365234375, "policy_entropy/min": 2.0872192862952943e-14, "policy_entropy/p25": 6.139278411865234e-06, "policy_entropy/p75": 0.064453125, "policy_entropy/var": 0.14017869532108307, "policy_error_vector_variance/max_squared_error": 2.018622875213623, "policy_error_vector_variance/metric": 0.07579529285430908, "policy_loss": -0.048606231808662415, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 6.091475486755371, "policy_sharpness": 7.347497940063477, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.49951171875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.6792573928833, "reward": 0.6666666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.22251193225383759, "rewards/accuracy_reward": 0.6666666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.22251193225383759, "sentence_full_gradient_variance/max_squared_error": 4123015.0, "sentence_full_gradient_variance/metric": 21340.3984375, "sentence_full_gradient_variance/p75": 305.5281677246094, "sentence_full_gradient_variance/p90": 308.5117492675781, "sentence_full_gradient_variance/p95": 22766.822265625, "sentence_full_gradient_variance/p99": 344124.40625, "state_level_variance/metric": 758.0767211914062, "state_level_variance_full_gradient/metric": 2522.77783203125, "step": 5 }, { "accuracy_reward": 0.6783854365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21846310794353485, "action_level_variance/metric": 2165.313232421875, "action_level_variance_full_gradient/metric": 6675.28076171875, "adam_stats/lr_effective_max": 5.454982238006778e-05, "adam_stats/lr_effective_mean": -2.654132469359638e-10, "adam_stats/lr_effective_min": -5.4448581067845225e-05, "adam_stats/m_t_max": 0.007947533391416073, "adam_stats/m_t_mean": 6.434344029804251e-11, "adam_stats/m_t_min": -0.008715683594346046, "adam_stats/v_t_max": 4.584703128784895e-05, "adam_stats/v_t_mean": 1.2319532330037908e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.008318047039210796, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 4.286035537719727, "all_logprobs": -0.1500413566827774, "all_logprobs/max": 0.0, "all_logprobs/median": -1.3947486877441406e-05, "all_logprobs/min": -12.25, "all_logprobs/p1": -2.578125, "all_logprobs/p10": -0.3984375, "all_logprobs/p25": -0.013427734375, "all_logprobs/p5": -0.97265625, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.25116825103759766, "clip_ratio": 0.0, "completion_length": 605.0091552734375, "completion_length/correct": 517.5585327148438, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 461.0, "completion_length/correct/min": 28.0, "completion_length/correct/p25": 332.0, "completion_length/correct/p75": 660.0, "completion_length/correct/var": 57708.58984375, "completion_length/incorrect": 789.4696655273438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 52.0, "completion_length/incorrect/p25": 589.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 94435.2734375, "completion_length/max": 1024.0, "completion_length/median": 528.0, "completion_length/min": 28.0, "completion_length/p25": 360.75, "completion_length/p75": 954.25, "completion_length/var": 85564.9140625, "epoch": 0.0768, "feature_vector_variance/max_squared_error": 103609.09375, "feature_vector_variance/metric": 24981.416015625, "generated_tokens/total": 2836102.0, "grad_norm": 0.32702207565307617, "grouped_std_rewards": 0.2862210273742676, "learning_rate": 9e-06, "loss": -0.0083, "mean_logprobs": -0.16015625, "mean_logprobs/var": 0.01336669921875, "num_completions/total": 4608, "per_sentence_gradient_norm": 9.628893852233887, "per_sentence_gradient_norm/max": 560.6012573242188, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 61.07594680786133, "per_sentence_gradient_norm/p99": 194.6278839111328, "per_sentence_gradient_norm/var": 2075.2998046875, "per_token_feature_norm": 161.2283935546875, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 54.75, "per_token_feature_norm/p25": 123.5, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 2270.7841796875, "per_token_full_gradient_variance/max_squared_error": 333.3014831542969, "per_token_full_gradient_variance/variance": 0.1252726912498474, "per_token_gradient_norm": 10.74965763092041, "per_token_gradient_norm/max": 8293.1220703125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 16641.869140625, "per_token_policy_error_norm": 0.07802923023700714, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0653560683131218, "policy_entropy": 0.16665053367614746, "policy_entropy/max": 3.78125, "policy_entropy/median": 0.00018024444580078125, "policy_entropy/min": 5.204170427930421e-18, "policy_entropy/p25": 5.8710575103759766e-06, "policy_entropy/p75": 0.076171875, "policy_entropy/var": 0.13824407756328583, "policy_error_vector_variance/max_squared_error": 2.0226285457611084, "policy_error_vector_variance/metric": 0.07774118334054947, "policy_loss": -0.008318042382597923, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 4.286035537719727, "policy_sharpness": 7.292131423950195, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.37451171875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.727907180786133, "reward": 0.6783854365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21846310794353485, "rewards/accuracy_reward": 0.6783854365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21846310794353485, "sentence_full_gradient_variance/max_squared_error": 2290566.25, "sentence_full_gradient_variance/metric": 7475.5888671875, "sentence_full_gradient_variance/p75": 228.83409118652344, "sentence_full_gradient_variance/p90": 506.2820129394531, "sentence_full_gradient_variance/p95": 20278.33984375, "sentence_full_gradient_variance/p99": 109355.0390625, "state_level_variance/metric": 179.82171630859375, "state_level_variance_full_gradient/metric": 800.3077392578125, "step": 6 }, { "accuracy_reward": 0.7408854365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19222448766231537, "action_level_variance/metric": 7249.6044921875, "action_level_variance_full_gradient/metric": 14744.3232421875, "adam_stats/lr_effective_max": 6.491185195045546e-05, "adam_stats/lr_effective_mean": -3.68113844961826e-10, "adam_stats/lr_effective_min": -6.47457709419541e-05, "adam_stats/m_t_max": 0.0077143036760389805, "adam_stats/m_t_mean": 4.991367735796004e-11, "adam_stats/m_t_min": -0.008307982236146927, "adam_stats/v_t_max": 4.5804106775904074e-05, "adam_stats/v_t_mean": 1.2383237881288811e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.010315751656889915, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 4.417995929718018, "all_logprobs": -0.13359519839286804, "all_logprobs/max": 0.0, "all_logprobs/median": -7.271766662597656e-06, "all_logprobs/min": -10.1875, "all_logprobs/p1": -2.34375, "all_logprobs/p10": -0.33984375, "all_logprobs/p25": -0.00885009765625, "all_logprobs/p5": -0.8515625, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.21171128749847412, "clip_ratio": 0.0, "completion_length": 609.3060302734375, "completion_length/correct": 529.8611450195312, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 475.0, "completion_length/correct/min": 72.0, "completion_length/correct/p25": 346.0, "completion_length/correct/p75": 659.0, "completion_length/correct/var": 59234.31640625, "completion_length/incorrect": 836.4622802734375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 637.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 69964.625, "completion_length/max": 1024.0, "completion_length/median": 548.0, "completion_length/min": 2.0, "completion_length/p25": 368.5, "completion_length/p75": 901.0, "completion_length/var": 79997.03125, "epoch": 0.0896, "feature_vector_variance/max_squared_error": 103706.3046875, "feature_vector_variance/metric": 24715.994140625, "generated_tokens/total": 3304049.0, "grad_norm": 0.1466267853975296, "grouped_std_rewards": 0.25351768732070923, "learning_rate": 1.05e-05, "loss": 0.0103, "mean_logprobs": -0.142578125, "mean_logprobs/var": 0.017578125, "num_completions/total": 5376, "per_sentence_gradient_norm": 10.525237083435059, "per_sentence_gradient_norm/max": 2027.90234375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 52.98207473754883, "per_sentence_gradient_norm/p99": 172.88809204101562, "per_sentence_gradient_norm/var": 7148.13037109375, "per_token_feature_norm": 159.22125244140625, "per_token_feature_norm/max": 334.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 58.25, "per_token_feature_norm/p25": 124.0, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 2011.0621337890625, "per_token_full_gradient_variance/max_squared_error": 457.335205078125, "per_token_full_gradient_variance/variance": 0.09727247059345245, "per_token_gradient_norm": 8.180116653442383, "per_token_gradient_norm/max": 7849.31689453125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 12383.490234375, "per_token_policy_error_norm": 0.07117193937301636, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.059721674770116806, "policy_entropy": 0.1487431526184082, "policy_entropy/max": 3.78125, "policy_entropy/median": 9.870529174804688e-05, "policy_entropy/min": 4.2847669856627135e-16, "policy_entropy/p25": 3.5017728805541992e-06, "policy_entropy/p75": 0.05517578125, "policy_entropy/var": 0.11422356963157654, "policy_error_vector_variance/max_squared_error": 2.01414155960083, "policy_error_vector_variance/metric": 0.07098657637834549, "policy_loss": 0.010315751656889915, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 4.417995929718018, "policy_sharpness": 7.471322536468506, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.870849609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.006998062133789, "reward": 0.7408854365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19222448766231537, "rewards/accuracy_reward": 0.7408854365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19222448766231537, "sentence_full_gradient_variance/max_squared_error": 5449199.5, "sentence_full_gradient_variance/metric": 16579.73046875, "sentence_full_gradient_variance/p75": 454.64715576171875, "sentence_full_gradient_variance/p90": 1172.07763671875, "sentence_full_gradient_variance/p95": 8216.0, "sentence_full_gradient_variance/p99": 127943.8984375, "state_level_variance/metric": 803.7927856445312, "state_level_variance_full_gradient/metric": 1835.405517578125, "step": 7 }, { "accuracy_reward": 0.7213541865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2012643963098526, "action_level_variance/metric": 1246.616943359375, "action_level_variance_full_gradient/metric": 5874.89990234375, "adam_stats/lr_effective_max": 7.628063758602366e-05, "adam_stats/lr_effective_mean": -4.839916800669641e-10, "adam_stats/lr_effective_min": -7.526211265940219e-05, "adam_stats/m_t_max": 0.006430177949368954, "adam_stats/m_t_mean": 2.8796798279273617e-11, "adam_stats/m_t_min": -0.007446666248142719, "adam_stats/v_t_max": 4.5872391638113186e-05, "adam_stats/v_t_mean": 1.2436094905601824e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.02561859041452408, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.263084888458252, "all_logprobs": -0.13051970303058624, "all_logprobs/max": 0.0, "all_logprobs/median": -5.125999450683594e-06, "all_logprobs/min": -14.0, "all_logprobs/p1": -2.359375, "all_logprobs/p10": -0.31640625, "all_logprobs/p25": -0.00592041015625, "all_logprobs/p5": -0.83203125, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.21129557490348816, "clip_ratio": 0.0, "completion_length": 627.6119995117188, "completion_length/correct": 559.9025268554688, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 536.0, "completion_length/correct/min": 67.0, "completion_length/correct/p25": 357.25, "completion_length/correct/p75": 742.0, "completion_length/correct/var": 65358.7109375, "completion_length/incorrect": 802.8971557617188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 7.0, "completion_length/incorrect/p25": 624.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 80366.2421875, "completion_length/max": 1024.0, "completion_length/median": 600.0, "completion_length/min": 7.0, "completion_length/p25": 386.0, "completion_length/p75": 907.25, "completion_length/var": 81325.109375, "epoch": 0.1024, "feature_vector_variance/max_squared_error": 106020.4921875, "feature_vector_variance/metric": 24613.373046875, "generated_tokens/total": 3786055.0, "grad_norm": 0.14188651740550995, "grouped_std_rewards": 0.2025415599346161, "learning_rate": 1.2e-05, "loss": 0.0256, "mean_logprobs": -0.13671875, "mean_logprobs/var": 0.0125732421875, "num_completions/total": 6144, "per_sentence_gradient_norm": 6.091675758361816, "per_sentence_gradient_norm/max": 463.3869934082031, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 32.5858268737793, "per_sentence_gradient_norm/p99": 163.0586395263672, "per_sentence_gradient_norm/var": 1211.085205078125, "per_token_feature_norm": 158.29147338867188, "per_token_feature_norm/max": 340.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 61.5, "per_token_feature_norm/p25": 123.5, "per_token_feature_norm/p75": 187.0, "per_token_feature_norm/var": 1991.43896484375, "per_token_full_gradient_variance/max_squared_error": 1421.0225830078125, "per_token_full_gradient_variance/variance": 0.08799613267183304, "per_token_gradient_norm": 6.761094093322754, "per_token_gradient_norm/max": 6792.22607421875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 11575.4775390625, "per_token_policy_error_norm": 0.06921156495809555, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05850296840071678, "policy_entropy": 0.14440448582172394, "policy_entropy/max": 3.75, "policy_entropy/median": 7.152557373046875e-05, "policy_entropy/min": 3.625572064791527e-16, "policy_entropy/p25": 2.905726432800293e-06, "policy_entropy/p75": 0.038330078125, "policy_entropy/var": 0.11502765864133835, "policy_error_vector_variance/max_squared_error": 2.0138602256774902, "policy_error_vector_variance/metric": 0.0690445527434349, "policy_loss": 0.025618605315685272, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.2630856037139893, "policy_sharpness": 7.591964244842529, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.1405029296875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.721616744995117, "reward": 0.7213541865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2012643963098526, "rewards/accuracy_reward": 0.7213541865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2012643963098526, "sentence_full_gradient_variance/max_squared_error": 1954375.375, "sentence_full_gradient_variance/metric": 6621.18359375, "sentence_full_gradient_variance/p75": 118.16004943847656, "sentence_full_gradient_variance/p90": 455.58447265625, "sentence_full_gradient_variance/p95": 455.58447265625, "sentence_full_gradient_variance/p99": 98068.75, "state_level_variance/metric": 119.96825408935547, "state_level_variance_full_gradient/metric": 746.2825927734375, "step": 8 }, { "accuracy_reward": 0.6809896230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21752598881721497, "action_level_variance/metric": 2562.12646484375, "action_level_variance_full_gradient/metric": 10077.517578125, "adam_stats/lr_effective_max": 8.696987788425758e-05, "adam_stats/lr_effective_mean": -6.069160729538225e-10, "adam_stats/lr_effective_min": -8.700922626303509e-05, "adam_stats/m_t_max": 0.00937602762132883, "adam_stats/m_t_mean": 7.864790069200822e-11, "adam_stats/m_t_min": -0.01074492558836937, "adam_stats/v_t_max": 4.955176336807199e-05, "adam_stats/v_t_mean": 1.3152382828673925e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.002042613923549652, "advantages/max": 12.9586820602417, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 4.717120170593262, "all_logprobs": -0.12629927694797516, "all_logprobs/max": 0.0, "all_logprobs/median": -4.649162292480469e-06, "all_logprobs/min": -11.5, "all_logprobs/p1": -2.3125, "all_logprobs/p10": -0.3125, "all_logprobs/p25": -0.00592041015625, "all_logprobs/p5": -0.81640625, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.20200251042842865, "clip_ratio": 0.0, "completion_length": 633.6185302734375, "completion_length/correct": 557.5908203125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 519.0, "completion_length/correct/min": 86.0, "completion_length/correct/p25": 375.0, "completion_length/correct/p75": 695.5, "completion_length/correct/var": 57608.93359375, "completion_length/incorrect": 795.9142456054688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 950.0, "completion_length/incorrect/min": 25.0, "completion_length/incorrect/p25": 589.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 74075.140625, "completion_length/max": 1024.0, "completion_length/median": 594.0, "completion_length/min": 25.0, "completion_length/p25": 413.75, "completion_length/p75": 907.5, "completion_length/var": 75127.1484375, "epoch": 0.1152, "feature_vector_variance/max_squared_error": 101668.3828125, "feature_vector_variance/metric": 24701.486328125, "generated_tokens/total": 4272674.0, "grad_norm": 0.41004034876823425, "grouped_std_rewards": 0.263298898935318, "learning_rate": 1.3500000000000001e-05, "loss": 0.002, "mean_logprobs": -0.1298828125, "mean_logprobs/var": 0.0069580078125, "num_completions/total": 6912, "per_sentence_gradient_norm": 9.268218040466309, "per_sentence_gradient_norm/max": 685.0384521484375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 43.98069381713867, "per_sentence_gradient_norm/p99": 202.07614135742188, "per_sentence_gradient_norm/var": 2479.455078125, "per_token_feature_norm": 158.4069061279297, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 61.25, "per_token_feature_norm/p25": 123.5, "per_token_feature_norm/p75": 187.0, "per_token_feature_norm/var": 1943.1268310546875, "per_token_full_gradient_variance/max_squared_error": 27985236.0, "per_token_full_gradient_variance/variance": 57.626102447509766, "per_token_gradient_norm": 9.465322494506836, "per_token_gradient_norm/max": 7134.28076171875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 15897.0361328125, "per_token_policy_error_norm": 0.06728873401880264, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.057027384638786316, "policy_entropy": 0.1400676965713501, "policy_entropy/max": 3.765625, "policy_entropy/median": 6.437301635742188e-05, "policy_entropy/min": 1.2934098236883074e-14, "policy_entropy/p25": 2.682209014892578e-06, "policy_entropy/p75": 0.037841796875, "policy_entropy/var": 0.10756752640008926, "policy_error_vector_variance/max_squared_error": 2.0207560062408447, "policy_error_vector_variance/metric": 0.06715485453605652, "policy_loss": 0.0020426164846867323, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 4.71712064743042, "policy_sharpness": 7.597768306732178, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.1405029296875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.629859924316406, "reward": 0.6809896230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21752598881721497, "rewards/accuracy_reward": 0.6809896230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21752598881721497, "sentence_full_gradient_variance/max_squared_error": 2228788.0, "sentence_full_gradient_variance/metric": 11483.2734375, "sentence_full_gradient_variance/p75": 87.2176284790039, "sentence_full_gradient_variance/p90": 131.0458221435547, "sentence_full_gradient_variance/p95": 9415.2880859375, "sentence_full_gradient_variance/p99": 269018.75, "state_level_variance/metric": 236.8329620361328, "state_level_variance_full_gradient/metric": 1405.7548828125, "step": 9 }, { "accuracy_reward": 0.77734375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17330610752105713, "action_level_variance/metric": 1384.1956787109375, "action_level_variance_full_gradient/metric": 6842.55419921875, "adam_stats/lr_effective_max": 9.7336494945921e-05, "adam_stats/lr_effective_mean": -3.769005940679193e-10, "adam_stats/lr_effective_min": -9.508724178886041e-05, "adam_stats/m_t_max": 0.008171395398676395, "adam_stats/m_t_mean": 7.824542402889989e-11, "adam_stats/m_t_min": -0.009215720929205418, "adam_stats/v_t_max": 4.9522885092301294e-05, "adam_stats/v_t_mean": 1.3214284267509813e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.04047080874443054, "advantages/max": 9.659051895141602, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.7603976726531982, "all_logprobs": -0.1261378824710846, "all_logprobs/max": 0.0, "all_logprobs/median": -2.9802322387695312e-06, "all_logprobs/min": -12.125, "all_logprobs/p1": -2.34375, "all_logprobs/p10": -0.302734375, "all_logprobs/p25": -0.005279541015625, "all_logprobs/p5": -0.81640625, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.20254525542259216, "clip_ratio": 0.0, "completion_length": 574.5494995117188, "completion_length/correct": 500.01507568359375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 466.0, "completion_length/correct/min": 18.0, "completion_length/correct/p25": 325.0, "completion_length/correct/p75": 648.0, "completion_length/correct/var": 52876.7734375, "completion_length/incorrect": 834.76611328125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 15.0, "completion_length/incorrect/p25": 673.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 88831.5625, "completion_length/max": 1024.0, "completion_length/median": 530.0, "completion_length/min": 15.0, "completion_length/p25": 347.75, "completion_length/p75": 787.5, "completion_length/var": 80197.3203125, "epoch": 0.128, "feature_vector_variance/max_squared_error": 102876.234375, "feature_vector_variance/metric": 25057.08203125, "generated_tokens/total": 4713928.0, "grad_norm": 0.14605292677879333, "grouped_std_rewards": 0.24497459828853607, "learning_rate": 1.5e-05, "loss": -0.0405, "mean_logprobs": -0.1298828125, "mean_logprobs/var": 0.006500244140625, "num_completions/total": 7680, "per_sentence_gradient_norm": 6.936122894287109, "per_sentence_gradient_norm/max": 517.7893676757812, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 47.605247497558594, "per_sentence_gradient_norm/p99": 158.64451599121094, "per_sentence_gradient_norm/var": 1337.8277587890625, "per_token_feature_norm": 159.0756072998047, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 55.75, "per_token_feature_norm/p25": 125.5, "per_token_feature_norm/p75": 188.0, "per_token_feature_norm/var": 1846.22998046875, "per_token_full_gradient_variance/max_squared_error": 702.6654663085938, "per_token_full_gradient_variance/variance": 0.09528613090515137, "per_token_gradient_norm": 8.191883087158203, "per_token_gradient_norm/max": 7609.01318359375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 13674.3056640625, "per_token_policy_error_norm": 0.0672375038266182, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.057200558483600616, "policy_entropy": 0.138671413064003, "policy_entropy/max": 3.765625, "policy_entropy/median": 4.3392181396484375e-05, "policy_entropy/min": 4.551914400963142e-15, "policy_entropy/p25": 1.7136335372924805e-06, "policy_entropy/p75": 0.035888671875, "policy_entropy/var": 0.10730815678834915, "policy_error_vector_variance/max_squared_error": 2.0180554389953613, "policy_error_vector_variance/metric": 0.0670994371175766, "policy_loss": -0.040470805019140244, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.7603981494903564, "policy_sharpness": 7.651146411895752, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.2381591796875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.469995498657227, "reward": 0.77734375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17330610752105713, "rewards/accuracy_reward": 0.77734375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17330610752105713, "sentence_full_gradient_variance/max_squared_error": 1245195.25, "sentence_full_gradient_variance/metric": 7719.5166015625, "sentence_full_gradient_variance/p75": 199.4789581298828, "sentence_full_gradient_variance/p90": 256.07806396484375, "sentence_full_gradient_variance/p95": 13278.107421875, "sentence_full_gradient_variance/p99": 159740.671875, "state_level_variance/metric": 126.22954559326172, "state_level_variance_full_gradient/metric": 876.962158203125, "step": 10 }, { "accuracy_reward": 0.7239583730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20010320842266083, "action_level_variance/metric": 2034.343505859375, "action_level_variance_full_gradient/metric": 12726.166015625, "adam_stats/lr_effective_max": 9.746137948241085e-05, "adam_stats/lr_effective_mean": -7.493770048938586e-10, "adam_stats/lr_effective_min": -9.421776485396549e-05, "adam_stats/m_t_max": 0.006133552175015211, "adam_stats/m_t_mean": 4.1964525604454295e-11, "adam_stats/m_t_min": -0.0067631835117936134, "adam_stats/v_t_max": 4.998887379770167e-05, "adam_stats/v_t_mean": 1.3553733873086782e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.09432855248451233, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 4.9590559005737305, "all_logprobs": -0.10978163778781891, "all_logprobs/max": 0.0, "all_logprobs/median": -1.7881393432617188e-06, "all_logprobs/min": -10.5625, "all_logprobs/p1": -2.125, "all_logprobs/p10": -0.2412109375, "all_logprobs/p25": -0.002471923828125, "all_logprobs/p5": -0.69140625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.17062285542488098, "clip_ratio": 0.0, "completion_length": 616.4453125, "completion_length/correct": 535.3651123046875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 504.0, "completion_length/correct/min": 94.0, "completion_length/correct/p25": 379.0, "completion_length/correct/p75": 667.75, "completion_length/correct/var": 45257.359375, "completion_length/incorrect": 829.0896606445312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 31.0, "completion_length/incorrect/p25": 646.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 85066.671875, "completion_length/max": 1024.0, "completion_length/median": 554.0, "completion_length/min": 31.0, "completion_length/p25": 410.75, "completion_length/p75": 862.0, "completion_length/var": 73413.53125, "epoch": 0.1408, "feature_vector_variance/max_squared_error": 114237.0234375, "feature_vector_variance/metric": 24480.193359375, "generated_tokens/total": 5187358.0, "grad_norm": 0.3110400438308716, "grouped_std_rewards": 0.23720571398735046, "learning_rate": 1.4995431202643219e-05, "loss": 0.0943, "mean_logprobs": -0.1142578125, "mean_logprobs/var": 0.0038604736328125, "num_completions/total": 8448, "per_sentence_gradient_norm": 7.968535423278809, "per_sentence_gradient_norm/max": 562.98193359375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 35.83443832397461, "per_sentence_gradient_norm/p99": 255.11509704589844, "per_sentence_gradient_norm/var": 1973.415283203125, "per_token_feature_norm": 157.5878143310547, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 61.0, "per_token_feature_norm/p25": 125.0, "per_token_feature_norm/p75": 185.0, "per_token_feature_norm/var": 1730.7393798828125, "per_token_full_gradient_variance/max_squared_error": 550.0415649414062, "per_token_full_gradient_variance/variance": 0.0890432745218277, "per_token_gradient_norm": 7.252300262451172, "per_token_gradient_norm/max": 6819.4423828125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 11731.89453125, "per_token_policy_error_norm": 0.059420496225357056, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05079279839992523, "policy_entropy": 0.1219135969877243, "policy_entropy/max": 3.8125, "policy_entropy/median": 2.6106834411621094e-05, "policy_entropy/min": 1.0451708942760263e-16, "policy_entropy/p25": 1.2516975402832031e-06, "policy_entropy/p75": 0.0179443359375, "policy_entropy/var": 0.0888872891664505, "policy_error_vector_variance/max_squared_error": 2.016801357269287, "policy_error_vector_variance/metric": 0.059320494532585144, "policy_loss": 0.09432855248451233, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 4.9590559005737305, "policy_sharpness": 7.840445041656494, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.86712646484375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.723231315612793, "reward": 0.7239583730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20010320842266083, "rewards/accuracy_reward": 0.7239583730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20010320842266083, "sentence_full_gradient_variance/max_squared_error": 2320621.25, "sentence_full_gradient_variance/metric": 14420.1201171875, "sentence_full_gradient_variance/p75": 242.27984619140625, "sentence_full_gradient_variance/p90": 308.4582824707031, "sentence_full_gradient_variance/p95": 308.4582824707031, "sentence_full_gradient_variance/p99": 254780.421875, "state_level_variance/metric": 192.80374145507812, "state_level_variance_full_gradient/metric": 1693.955078125, "step": 11 }, { "accuracy_reward": 0.7200521230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.201839879155159, "action_level_variance/metric": 2512.820068359375, "action_level_variance_full_gradient/metric": 12636.5810546875, "adam_stats/lr_effective_max": 9.475930710323155e-05, "adam_stats/lr_effective_mean": -4.3579273523164375e-10, "adam_stats/lr_effective_min": -9.29262678255327e-05, "adam_stats/m_t_max": 0.006301446817815304, "adam_stats/m_t_mean": 5.1659426736350156e-11, "adam_stats/m_t_min": -0.006590405013412237, "adam_stats/v_t_max": 5.007336585549638e-05, "adam_stats/v_t_mean": 1.3660077843175022e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.09494742006063461, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 5.11945915222168, "all_logprobs": -0.10224185883998871, "all_logprobs/max": 0.0, "all_logprobs/median": -1.430511474609375e-06, "all_logprobs/min": -11.875, "all_logprobs/p1": -2.046875, "all_logprobs/p10": -0.2021484375, "all_logprobs/p25": -0.00160980224609375, "all_logprobs/p5": -0.6328125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.16117055714130402, "clip_ratio": 0.0, "completion_length": 610.7330932617188, "completion_length/correct": 530.4593505859375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 482.0, "completion_length/correct/min": 83.0, "completion_length/correct/p25": 370.0, "completion_length/correct/p75": 674.0, "completion_length/correct/var": 55625.78125, "completion_length/incorrect": 817.2046508789062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1017.0, "completion_length/incorrect/min": 28.0, "completion_length/incorrect/p25": 620.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 77435.65625, "completion_length/max": 1024.0, "completion_length/median": 558.0, "completion_length/min": 28.0, "completion_length/p25": 400.0, "completion_length/p75": 892.25, "completion_length/var": 78234.2734375, "epoch": 0.1536, "feature_vector_variance/max_squared_error": 110656.8125, "feature_vector_variance/metric": 24956.36328125, "generated_tokens/total": 5656401.0, "grad_norm": 0.1655702292919159, "grouped_std_rewards": 0.24614903330802917, "learning_rate": 1.4981730376948682e-05, "loss": 0.0949, "mean_logprobs": -0.10546875, "mean_logprobs/var": 0.0037689208984375, "num_completions/total": 9216, "per_sentence_gradient_norm": 8.74372673034668, "per_sentence_gradient_norm/max": 670.9297485351562, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 36.7520637512207, "per_sentence_gradient_norm/p99": 245.6404571533203, "per_sentence_gradient_norm/var": 2439.543701171875, "per_token_feature_norm": 159.02723693847656, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 57.5, "per_token_feature_norm/p25": 126.5, "per_token_feature_norm/p75": 188.0, "per_token_feature_norm/var": 1685.735595703125, "per_token_full_gradient_variance/max_squared_error": 603.975830078125, "per_token_full_gradient_variance/variance": 0.12463963776826859, "per_token_gradient_norm": 9.358736991882324, "per_token_gradient_norm/max": 7042.11767578125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 16849.07421875, "per_token_policy_error_norm": 0.05525927618145943, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04759718105196953, "policy_entropy": 0.11362714320421219, "policy_entropy/max": 3.78125, "policy_entropy/median": 2.2172927856445312e-05, "policy_entropy/min": 6.966649479522857e-15, "policy_entropy/p25": 9.611248970031738e-07, "policy_entropy/p75": 0.01263427734375, "policy_entropy/var": 0.08295250684022903, "policy_error_vector_variance/max_squared_error": 2.015789747238159, "policy_error_vector_variance/metric": 0.05515187606215477, "policy_loss": 0.09494742006063461, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 5.11945915222168, "policy_sharpness": 7.93206787109375, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.62109375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.341979026794434, "reward": 0.7200521230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.201839879155159, "rewards/accuracy_reward": 0.7200521230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.201839879155159, "sentence_full_gradient_variance/max_squared_error": 3362230.75, "sentence_full_gradient_variance/metric": 14397.3310546875, "sentence_full_gradient_variance/p75": 106.11592102050781, "sentence_full_gradient_variance/p90": 168.011962890625, "sentence_full_gradient_variance/p95": 3162.99853515625, "sentence_full_gradient_variance/p99": 168330.96875, "state_level_variance/metric": 240.15135192871094, "state_level_variance_full_gradient/metric": 1760.7532958984375, "step": 12 }, { "accuracy_reward": 0.7083333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20686659216880798, "action_level_variance/metric": 2277.30029296875, "action_level_variance_full_gradient/metric": 9898.6162109375, "adam_stats/lr_effective_max": 9.535790741210803e-05, "adam_stats/lr_effective_mean": -2.2184679093761872e-10, "adam_stats/lr_effective_min": -9.352914639748633e-05, "adam_stats/m_t_max": 0.005834121722728014, "adam_stats/m_t_mean": 3.513768442875431e-11, "adam_stats/m_t_min": -0.004053083714097738, "adam_stats/v_t_max": 5.153013989911415e-05, "adam_stats/v_t_mean": 1.4647995273328607e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.20572912693023682, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 5.306407928466797, "all_logprobs": -0.09773648530244827, "all_logprobs/max": 0.0, "all_logprobs/median": -9.5367431640625e-07, "all_logprobs/min": -11.5, "all_logprobs/p1": -1.984375, "all_logprobs/p10": -0.1904296875, "all_logprobs/p25": -0.001495361328125, "all_logprobs/p5": -0.59375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.14662222564220428, "clip_ratio": 0.0, "completion_length": 594.4114990234375, "completion_length/correct": 501.9007263183594, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 476.0, "completion_length/correct/min": 97.0, "completion_length/correct/p25": 349.75, "completion_length/correct/p75": 617.5, "completion_length/correct/var": 42910.734375, "completion_length/incorrect": 819.0803833007812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 981.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 635.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 63705.359375, "completion_length/max": 1024.0, "completion_length/median": 542.0, "completion_length/min": 2.0, "completion_length/p25": 388.75, "completion_length/p75": 802.75, "completion_length/var": 69712.0546875, "epoch": 0.1664, "feature_vector_variance/max_squared_error": 107314.5625, "feature_vector_variance/metric": 24853.09765625, "generated_tokens/total": 6112909.0, "grad_norm": 0.45937228202819824, "grouped_std_rewards": 0.17272385954856873, "learning_rate": 1.495891421526205e-05, "loss": 0.2057, "mean_logprobs": -0.103515625, "mean_logprobs/var": 0.035400390625, "num_completions/total": 9984, "per_sentence_gradient_norm": 7.1450419425964355, "per_sentence_gradient_norm/max": 809.2319946289062, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 199.5471649169922, "per_sentence_gradient_norm/var": 2229.151123046875, "per_token_feature_norm": 159.0430450439453, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 154.0, "per_token_feature_norm/min": 63.25, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 186.0, "per_token_feature_norm/var": 1589.3426513671875, "per_token_full_gradient_variance/max_squared_error": 465.614501953125, "per_token_full_gradient_variance/variance": 0.12097517400979996, "per_token_gradient_norm": 7.853297233581543, "per_token_gradient_norm/max": 6879.44091796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 15845.05859375, "per_token_policy_error_norm": 0.053847476840019226, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.046744346618652344, "policy_entropy": 0.10812567174434662, "policy_entropy/max": 3.734375, "policy_entropy/median": 1.5735626220703125e-05, "policy_entropy/min": 1.8984813721090177e-14, "policy_entropy/p25": 6.631016731262207e-07, "policy_entropy/p75": 0.01129150390625, "policy_entropy/var": 0.07401320338249207, "policy_error_vector_variance/max_squared_error": 2.014446973800659, "policy_error_vector_variance/metric": 0.0538005530834198, "policy_loss": 0.20572912693023682, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 5.306407928466797, "policy_sharpness": 7.974755764007568, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.99609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.083597183227539, "reward": 0.7083333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20686659216880798, "rewards/accuracy_reward": 0.7083333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20686659216880798, "sentence_full_gradient_variance/max_squared_error": 2117976.0, "sentence_full_gradient_variance/metric": 11179.037109375, "sentence_full_gradient_variance/p75": 217.61671447753906, "sentence_full_gradient_variance/p90": 739.407958984375, "sentence_full_gradient_variance/p95": 739.407958984375, "sentence_full_gradient_variance/p99": 241629.484375, "state_level_variance/metric": 236.06996154785156, "state_level_variance_full_gradient/metric": 1280.418701171875, "step": 13 }, { "accuracy_reward": 0.7057291865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2079462856054306, "action_level_variance/metric": 647.327880859375, "action_level_variance_full_gradient/metric": 8219.830078125, "adam_stats/lr_effective_max": 9.405719174537808e-05, "adam_stats/lr_effective_mean": -4.371146777870649e-10, "adam_stats/lr_effective_min": -9.431433136342093e-05, "adam_stats/m_t_max": 0.004567115567624569, "adam_stats/m_t_mean": -7.195566191497971e-12, "adam_stats/m_t_min": -0.004699639044702053, "adam_stats/v_t_max": 5.4389725846704096e-05, "adam_stats/v_t_mean": 1.528582273778445e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.03333970159292221, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.8810927867889404, "all_logprobs": -0.09442400932312012, "all_logprobs/max": 0.0, "all_logprobs/median": -5.960464477539062e-07, "all_logprobs/min": -13.125, "all_logprobs/p1": -1.9609375, "all_logprobs/p10": -0.1689453125, "all_logprobs/p25": -0.000911712646484375, "all_logprobs/p5": -0.57421875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1443999707698822, "clip_ratio": 0.0, "completion_length": 599.7083740234375, "completion_length/correct": 477.3985290527344, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 433.0, "completion_length/correct/min": 158.0, "completion_length/correct/p25": 313.0, "completion_length/correct/p75": 598.75, "completion_length/correct/var": 42827.1171875, "completion_length/incorrect": 893.035400390625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 196.0, "completion_length/incorrect/p25": 818.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 48648.828125, "completion_length/max": 1024.0, "completion_length/median": 527.0, "completion_length/min": 158.0, "completion_length/p25": 362.75, "completion_length/p75": 870.25, "completion_length/var": 80402.625, "epoch": 0.1792, "feature_vector_variance/max_squared_error": 113906.1640625, "feature_vector_variance/metric": 25602.79296875, "generated_tokens/total": 6573485.0, "grad_norm": 0.3568934500217438, "grouped_std_rewards": 0.1785198152065277, "learning_rate": 1.4927010515561777e-05, "loss": -0.0333, "mean_logprobs": -0.0927734375, "mean_logprobs/var": 0.0017547607421875, "num_completions/total": 10752, "per_sentence_gradient_norm": 4.253023147583008, "per_sentence_gradient_norm/max": 327.8563232421875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 21.978181838989258, "per_sentence_gradient_norm/p99": 114.3359603881836, "per_sentence_gradient_norm/var": 630.0599975585938, "per_token_feature_norm": 161.84910583496094, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 158.0, "per_token_feature_norm/min": 57.75, "per_token_feature_norm/p25": 131.0, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 1564.0615234375, "per_token_full_gradient_variance/max_squared_error": 390.7252502441406, "per_token_full_gradient_variance/variance": 0.06702303886413574, "per_token_gradient_norm": 5.061619281768799, "per_token_gradient_norm/max": 7150.36279296875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 8429.287109375, "per_token_policy_error_norm": 0.05174247920513153, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04492051526904106, "policy_entropy": 0.10389435291290283, "policy_entropy/max": 3.734375, "policy_entropy/median": 9.059906005859375e-06, "policy_entropy/min": 5.551115123125783e-15, "policy_entropy/p25": 3.9301812648773193e-07, "policy_entropy/p75": 0.007415771484375, "policy_entropy/var": 0.07316475361585617, "policy_error_vector_variance/max_squared_error": 2.0084826946258545, "policy_error_vector_variance/metric": 0.05171297490596771, "policy_loss": -0.03333970159292221, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.8810927867889404, "policy_sharpness": 8.067859649658203, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.75, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.732125282287598, "reward": 0.7057291865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2079462856054306, "rewards/accuracy_reward": 0.7057291865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2079462856054306, "sentence_full_gradient_variance/max_squared_error": 3104511.5, "sentence_full_gradient_variance/metric": 9348.201171875, "sentence_full_gradient_variance/p75": 71.96272277832031, "sentence_full_gradient_variance/p90": 153.04617309570312, "sentence_full_gradient_variance/p95": 153.04617309570312, "sentence_full_gradient_variance/p99": 92319.890625, "state_level_variance/metric": 63.48912048339844, "state_level_variance_full_gradient/metric": 1128.3719482421875, "step": 14 }, { "accuracy_reward": 0.7981771230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16130046546459198, "action_level_variance/metric": 718.511962890625, "action_level_variance_full_gradient/metric": 4672.46826171875, "adam_stats/lr_effective_max": 9.176127787213773e-05, "adam_stats/lr_effective_mean": -3.0518415572444724e-10, "adam_stats/lr_effective_min": -9.096426219912246e-05, "adam_stats/m_t_max": 0.004256888292729855, "adam_stats/m_t_mean": -3.973547836252922e-13, "adam_stats/m_t_min": -0.003747497219592333, "adam_stats/v_t_max": 5.440123277367093e-05, "adam_stats/v_t_mean": 1.529419386275821e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.09300766885280609, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.0374696254730225, "all_logprobs": -0.08615214377641678, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -11.875, "all_logprobs/p1": -1.8203125, "all_logprobs/p10": -0.1455078125, "all_logprobs/p25": -0.000804901123046875, "all_logprobs/p5": -0.51953125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12647207081317902, "clip_ratio": 0.0, "completion_length": 527.94921875, "completion_length/correct": 453.9184265136719, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 422.0, "completion_length/correct/min": 162.0, "completion_length/correct/p25": 307.0, "completion_length/correct/p75": 552.0, "completion_length/correct/var": 35489.09375, "completion_length/incorrect": 820.72900390625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 962.0, "completion_length/incorrect/min": 167.0, "completion_length/incorrect/p25": 612.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 56801.16796875, "completion_length/max": 1024.0, "completion_length/median": 472.0, "completion_length/min": 162.0, "completion_length/p25": 333.75, "completion_length/p75": 662.75, "completion_length/var": 61424.890625, "epoch": 0.192, "feature_vector_variance/max_squared_error": 113979.3046875, "feature_vector_variance/metric": 25876.8359375, "generated_tokens/total": 6978950.0, "grad_norm": 0.11144553869962692, "grouped_std_rewards": 0.13086046278476715, "learning_rate": 1.488605814759156e-05, "loss": 0.093, "mean_logprobs": -0.0859375, "mean_logprobs/var": 0.0010833740234375, "num_completions/total": 11520, "per_sentence_gradient_norm": 3.6296048164367676, "per_sentence_gradient_norm/max": 469.5066833496094, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 126.22389221191406, "per_sentence_gradient_norm/var": 706.257568359375, "per_token_feature_norm": 163.17433166503906, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 160.0, "per_token_feature_norm/min": 61.25, "per_token_feature_norm/p25": 133.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 1493.1964111328125, "per_token_full_gradient_variance/max_squared_error": 521.201904296875, "per_token_full_gradient_variance/variance": 0.06229571998119354, "per_token_gradient_norm": 4.55739688873291, "per_token_gradient_norm/max": 6378.72998046875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 7885.541015625, "per_token_policy_error_norm": 0.04805756360292435, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.041903503239154816, "policy_entropy": 0.09557190537452698, "policy_entropy/max": 3.75, "policy_entropy/median": 6.645917892456055e-06, "policy_entropy/min": 5.169475958410885e-16, "policy_entropy/p25": 2.4028122425079346e-07, "policy_entropy/p75": 0.006591796875, "policy_entropy/var": 0.06142299249768257, "policy_error_vector_variance/max_squared_error": 2.011495590209961, "policy_error_vector_variance/metric": 0.04803352430462837, "policy_loss": 0.09300766885280609, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.0374701023101807, "policy_sharpness": 8.109843254089355, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.41024112701416, "reward": 0.7981771230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16130046546459198, "rewards/accuracy_reward": 0.7981771230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16130046546459198, "sentence_full_gradient_variance/max_squared_error": 2158529.25, "sentence_full_gradient_variance/metric": 5289.8134765625, "sentence_full_gradient_variance/p75": 86.2373275756836, "sentence_full_gradient_variance/p90": 239.1721954345703, "sentence_full_gradient_variance/p95": 239.1721954345703, "sentence_full_gradient_variance/p99": 50767.62890625, "state_level_variance/metric": 77.44670104980469, "state_level_variance_full_gradient/metric": 617.34521484375, "step": 15 }, { "accuracy_reward": 0.7942708730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16361771523952484, "action_level_variance/metric": 477.77374267578125, "action_level_variance_full_gradient/metric": 4949.20654296875, "adam_stats/lr_effective_max": 9.233026503352448e-05, "adam_stats/lr_effective_mean": -3.543464688338105e-10, "adam_stats/lr_effective_min": -8.679964957991615e-05, "adam_stats/m_t_max": 0.003830436384305358, "adam_stats/m_t_mean": 1.6004376643397045e-12, "adam_stats/m_t_min": -0.003445226699113846, "adam_stats/v_t_max": 5.435139246401377e-05, "adam_stats/v_t_mean": 1.5289794170342264e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.004254058003425598, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.784937858581543, "all_logprobs": -0.0862140879034996, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -11.0625, "all_logprobs/p1": -1.8046875, "all_logprobs/p10": -0.1455078125, "all_logprobs/p25": -0.000640869140625, "all_logprobs/p5": -0.5234375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12607944011688232, "clip_ratio": 0.0, "completion_length": 565.1979370117188, "completion_length/correct": 486.18853759765625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 436.0, "completion_length/correct/min": 142.0, "completion_length/correct/p25": 330.25, "completion_length/correct/p75": 613.0, "completion_length/correct/var": 42857.76171875, "completion_length/incorrect": 870.2341918945312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 719.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 53397.46875, "completion_length/max": 1024.0, "completion_length/median": 505.0, "completion_length/min": 2.0, "completion_length/p25": 354.0, "completion_length/p75": 731.5, "completion_length/var": 69091.453125, "epoch": 0.2048, "feature_vector_variance/max_squared_error": 114933.0, "feature_vector_variance/metric": 25307.19921875, "generated_tokens/total": 7413022.0, "grad_norm": 0.07192010432481766, "grouped_std_rewards": 0.14260295033454895, "learning_rate": 1.4836107005503543e-05, "loss": -0.0043, "mean_logprobs": -0.09228515625, "mean_logprobs/var": 0.032958984375, "num_completions/total": 12288, "per_sentence_gradient_norm": 3.450500011444092, "per_sentence_gradient_norm/max": 351.8014221191406, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 84.666259765625, "per_sentence_gradient_norm/var": 466.47515869140625, "per_token_feature_norm": 161.8207550048828, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 159.0, "per_token_feature_norm/min": 65.0, "per_token_feature_norm/p25": 132.0, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 1488.57861328125, "per_token_full_gradient_variance/max_squared_error": 270.2794494628906, "per_token_full_gradient_variance/variance": 0.04426335543394089, "per_token_gradient_norm": 4.14409065246582, "per_token_gradient_norm/max": 6278.216796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5417.18994140625, "per_token_policy_error_norm": 0.04805272817611694, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.041553471237421036, "policy_entropy": 0.09584508091211319, "policy_entropy/max": 3.71875, "policy_entropy/median": 6.884336471557617e-06, "policy_entropy/min": 1.6542323066914832e-14, "policy_entropy/p25": 2.849847078323364e-07, "policy_entropy/p75": 0.005767822265625, "policy_entropy/var": 0.06318018585443497, "policy_error_vector_variance/max_squared_error": 2.014927864074707, "policy_error_vector_variance/metric": 0.048014089465141296, "policy_loss": -0.004254058003425598, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.784937858581543, "policy_sharpness": 8.137941360473633, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.25, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.336207389831543, "reward": 0.7942708730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16361771523952484, "rewards/accuracy_reward": 0.7942708730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16361771523952484, "sentence_full_gradient_variance/max_squared_error": 1589212.0, "sentence_full_gradient_variance/metric": 5597.36328125, "sentence_full_gradient_variance/p75": 127.84939575195312, "sentence_full_gradient_variance/p90": 132.42434692382812, "sentence_full_gradient_variance/p95": 132.42434692382812, "sentence_full_gradient_variance/p99": 86743.2265625, "state_level_variance/metric": 48.319087982177734, "state_level_variance_full_gradient/metric": 648.15673828125, "step": 16 }, { "accuracy_reward": 0.7291666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1977400928735733, "action_level_variance/metric": 780.9533081054688, "action_level_variance_full_gradient/metric": 9456.44921875, "adam_stats/lr_effective_max": 9.401211718795821e-05, "adam_stats/lr_effective_mean": -1.5211942816506507e-10, "adam_stats/lr_effective_min": -8.942277781898156e-05, "adam_stats/m_t_max": 0.003804448526352644, "adam_stats/m_t_mean": 2.043802974083242e-11, "adam_stats/m_t_min": -0.003241084748879075, "adam_stats/v_t_max": 5.429717566585168e-05, "adam_stats/v_t_mean": 1.5342726004605178e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.14788685739040375, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 4.732564926147461, "all_logprobs": -0.08702138811349869, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -9.6875, "all_logprobs/p1": -1.828125, "all_logprobs/p10": -0.1474609375, "all_logprobs/p25": -0.000812530517578125, "all_logprobs/p5": -0.5234375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12715034186840057, "clip_ratio": 0.0, "completion_length": 575.1732177734375, "completion_length/correct": 479.63037109375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 435.0, "completion_length/correct/min": 134.0, "completion_length/correct/p25": 320.0, "completion_length/correct/p75": 585.25, "completion_length/correct/var": 44202.66796875, "completion_length/incorrect": 832.4038696289062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 215.0, "completion_length/incorrect/p25": 624.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 57928.2109375, "completion_length/max": 1024.0, "completion_length/median": 511.0, "completion_length/min": 134.0, "completion_length/p25": 360.5, "completion_length/p75": 787.25, "completion_length/var": 72457.9140625, "epoch": 0.2176, "feature_vector_variance/max_squared_error": 115630.53125, "feature_vector_variance/metric": 26062.185546875, "generated_tokens/total": 7854755.0, "grad_norm": 0.14722543954849243, "grouped_std_rewards": 0.18013732135295868, "learning_rate": 1.4777217947069972e-05, "loss": 0.1479, "mean_logprobs": -0.08642578125, "mean_logprobs/var": 0.0013427734375, "num_completions/total": 13056, "per_sentence_gradient_norm": 4.869413375854492, "per_sentence_gradient_norm/max": 303.892333984375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 10.731490135192871, "per_sentence_gradient_norm/p99": 139.65553283691406, "per_sentence_gradient_norm/var": 758.2293701171875, "per_token_feature_norm": 164.31736755371094, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 162.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 134.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 1494.9405517578125, "per_token_full_gradient_variance/max_squared_error": 1173.7208251953125, "per_token_full_gradient_variance/variance": 0.08335836976766586, "per_token_gradient_norm": 6.121516227722168, "per_token_gradient_norm/max": 6392.6474609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 10372.484375, "per_token_policy_error_norm": 0.04841633513569832, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04197237640619278, "policy_entropy": 0.0972265675663948, "policy_entropy/max": 3.75, "policy_entropy/median": 6.735324859619141e-06, "policy_entropy/min": 1.63202784619898e-14, "policy_entropy/p25": 2.421438694000244e-07, "policy_entropy/p75": 0.00701904296875, "policy_entropy/var": 0.06447292119264603, "policy_error_vector_variance/max_squared_error": 2.0138278007507324, "policy_error_vector_variance/metric": 0.04838401824235916, "policy_loss": 0.14788685739040375, "policy_loss/max": 19.79339599609375, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 4.732564926147461, "policy_sharpness": 8.099199295043945, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.480610847473145, "reward": 0.7291666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.1977400928735733, "rewards/accuracy_reward": 0.7291666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1977400928735733, "sentence_full_gradient_variance/max_squared_error": 3431040.0, "sentence_full_gradient_variance/metric": 10691.611328125, "sentence_full_gradient_variance/p75": 198.65994262695312, "sentence_full_gradient_variance/p90": 410.3616638183594, "sentence_full_gradient_variance/p95": 410.3616638183594, "sentence_full_gradient_variance/p99": 106367.6796875, "state_level_variance/metric": 74.68598175048828, "state_level_variance_full_gradient/metric": 1235.1602783203125, "step": 17 }, { "accuracy_reward": 0.734375, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1953226923942566, "action_level_variance/metric": 486.978515625, "action_level_variance_full_gradient/metric": 5412.33154296875, "adam_stats/lr_effective_max": 9.283226245315745e-05, "adam_stats/lr_effective_mean": -1.8646625910001546e-10, "adam_stats/lr_effective_min": -8.788380364421755e-05, "adam_stats/m_t_max": 0.003970268182456493, "adam_stats/m_t_mean": 3.2721221565612524e-11, "adam_stats/m_t_min": -0.0024249819107353687, "adam_stats/v_t_max": 5.433705518953502e-05, "adam_stats/v_t_mean": 1.5375781160439916e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0220370851457119, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 3.2931742668151855, "all_logprobs": -0.08015189319849014, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -12.8125, "all_logprobs/p1": -1.7734375, "all_logprobs/p10": -0.119140625, "all_logprobs/p25": -0.0003509521484375, "all_logprobs/p5": -0.474609375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11960121989250183, "clip_ratio": 0.0, "completion_length": 554.97265625, "completion_length/correct": 487.3776550292969, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 430.0, "completion_length/correct/min": 94.0, "completion_length/correct/p25": 323.5, "completion_length/correct/p75": 609.75, "completion_length/correct/var": 47373.6953125, "completion_length/incorrect": 741.8529663085938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 754.0, "completion_length/incorrect/min": 195.0, "completion_length/incorrect/p25": 530.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 67071.9140625, "completion_length/max": 1024.0, "completion_length/median": 494.0, "completion_length/min": 94.0, "completion_length/p25": 342.0, "completion_length/p75": 743.0, "completion_length/var": 65174.05078125, "epoch": 0.2304, "feature_vector_variance/max_squared_error": 117131.1328125, "feature_vector_variance/metric": 25992.38671875, "generated_tokens/total": 8280974.0, "grad_norm": 0.12729549407958984, "grouped_std_rewards": 0.21875157952308655, "learning_rate": 1.4709462719537392e-05, "loss": -0.022, "mean_logprobs": -0.07958984375, "mean_logprobs/var": 0.00128173828125, "num_completions/total": 13824, "per_sentence_gradient_norm": 3.8734092712402344, "per_sentence_gradient_norm/max": 287.99761962890625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 20.72108268737793, "per_sentence_gradient_norm/p99": 91.65910339355469, "per_sentence_gradient_norm/var": 472.5906066894531, "per_token_feature_norm": 164.73727416992188, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 163.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 135.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 1448.4061279296875, "per_token_full_gradient_variance/max_squared_error": 293.95477294921875, "per_token_full_gradient_variance/variance": 0.057421308010816574, "per_token_gradient_norm": 4.844062805175781, "per_token_gradient_norm/max": 6197.033203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6596.12744140625, "per_token_policy_error_norm": 0.044483691453933716, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03912469372153282, "policy_entropy": 0.08876212686300278, "policy_entropy/max": 3.703125, "policy_entropy/median": 4.738569259643555e-06, "policy_entropy/min": 3.730349362740526e-14, "policy_entropy/p25": 1.909211277961731e-07, "policy_entropy/p75": 0.0033416748046875, "policy_entropy/var": 0.05866153910756111, "policy_error_vector_variance/max_squared_error": 2.01519775390625, "policy_error_vector_variance/metric": 0.04445592314004898, "policy_loss": -0.022037073969841003, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.2931747436523438, "policy_sharpness": 8.24030590057373, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.898195266723633, "reward": 0.734375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.1953226923942566, "rewards/accuracy_reward": 0.734375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1953226923942566, "sentence_full_gradient_variance/max_squared_error": 1131582.75, "sentence_full_gradient_variance/metric": 6112.802734375, "sentence_full_gradient_variance/p75": 162.56578063964844, "sentence_full_gradient_variance/p90": 248.5601043701172, "sentence_full_gradient_variance/p95": 4091.829345703125, "sentence_full_gradient_variance/p99": 105589.5234375, "state_level_variance/metric": 46.35184860229492, "state_level_variance_full_gradient/metric": 700.470703125, "step": 18 }, { "accuracy_reward": 0.7708333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17687959969043732, "action_level_variance/metric": 567.1600341796875, "action_level_variance_full_gradient/metric": 3863.455078125, "adam_stats/lr_effective_max": 8.891116158338264e-05, "adam_stats/lr_effective_mean": 9.834472819436613e-11, "adam_stats/lr_effective_min": -8.732415153644979e-05, "adam_stats/m_t_max": 0.0035301351454108953, "adam_stats/m_t_mean": 4.0024414338368786e-11, "adam_stats/m_t_min": -0.0023442269302904606, "adam_stats/v_t_max": 5.430995224742219e-05, "adam_stats/v_t_mean": 1.5466494187807434e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.12748144567012787, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.837531328201294, "all_logprobs": -0.07786169648170471, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -14.0, "all_logprobs/p1": -1.71875, "all_logprobs/p10": -0.1064453125, "all_logprobs/p25": -0.0003108978271484375, "all_logprobs/p5": -0.44921875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11793261766433716, "clip_ratio": 0.0, "completion_length": 563.35546875, "completion_length/correct": 475.5287170410156, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 442.0, "completion_length/correct/min": 101.0, "completion_length/correct/p25": 315.5, "completion_length/correct/p75": 611.75, "completion_length/correct/var": 45530.11328125, "completion_length/incorrect": 858.7727661132812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 217.0, "completion_length/incorrect/p25": 713.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 54231.5390625, "completion_length/max": 1024.0, "completion_length/median": 507.0, "completion_length/min": 101.0, "completion_length/p25": 340.0, "completion_length/p75": 758.0, "completion_length/var": 73435.4453125, "epoch": 0.2432, "feature_vector_variance/max_squared_error": 119138.9921875, "feature_vector_variance/metric": 26064.640625, "generated_tokens/total": 8713631.0, "grad_norm": 0.1719832420349121, "grouped_std_rewards": 0.17444249987602234, "learning_rate": 1.4632923872213653e-05, "loss": 0.1275, "mean_logprobs": -0.0771484375, "mean_logprobs/var": 0.001434326171875, "num_completions/total": 14592, "per_sentence_gradient_norm": 3.903792142868042, "per_sentence_gradient_norm/max": 354.509521484375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 9.020330429077148, "per_sentence_gradient_norm/p99": 110.20333099365234, "per_sentence_gradient_norm/var": 552.6399536132812, "per_token_feature_norm": 165.4812469482422, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 164.0, "per_token_feature_norm/min": 65.0, "per_token_feature_norm/p25": 136.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 1423.04052734375, "per_token_full_gradient_variance/max_squared_error": 739.2334594726562, "per_token_full_gradient_variance/variance": 0.08232639729976654, "per_token_gradient_norm": 5.443235397338867, "per_token_gradient_norm/max": 7495.51025390625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 9186.697265625, "per_token_policy_error_norm": 0.04321831464767456, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.038074687123298645, "policy_entropy": 0.08591876178979874, "policy_entropy/max": 3.734375, "policy_entropy/median": 4.1425228118896484e-06, "policy_entropy/min": 5.245803791353865e-15, "policy_entropy/p25": 1.5739351511001587e-07, "policy_entropy/p75": 0.003021240234375, "policy_entropy/var": 0.05667960271239281, "policy_error_vector_variance/max_squared_error": 2.012930154800415, "policy_error_vector_variance/metric": 0.04318055510520935, "policy_loss": 0.12748144567012787, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.837531566619873, "policy_sharpness": 8.270211219787598, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.7135648727417, "reward": 0.7708333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17687959969043732, "rewards/accuracy_reward": 0.7708333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17687959969043732, "sentence_full_gradient_variance/max_squared_error": 1262865.125, "sentence_full_gradient_variance/metric": 4392.2626953125, "sentence_full_gradient_variance/p75": 29.361785888671875, "sentence_full_gradient_variance/p90": 89.393798828125, "sentence_full_gradient_variance/p95": 89.393798828125, "sentence_full_gradient_variance/p99": 62878.76953125, "state_level_variance/metric": 56.24125671386719, "state_level_variance_full_gradient/metric": 528.807373046875, "step": 19 }, { "accuracy_reward": 0.7421875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1915946900844574, "action_level_variance/metric": 630.0311279296875, "action_level_variance_full_gradient/metric": 7791.5732421875, "adam_stats/lr_effective_max": 8.807251288089901e-05, "adam_stats/lr_effective_mean": 2.473377058720416e-10, "adam_stats/lr_effective_min": -8.754617010708898e-05, "adam_stats/m_t_max": 0.0030535252299159765, "adam_stats/m_t_mean": 3.221465455505168e-11, "adam_stats/m_t_min": -0.0029097325168550014, "adam_stats/v_t_max": 5.449409218272194e-05, "adam_stats/v_t_mean": 1.5531229731122198e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.12436811625957489, "advantages/max": 7.48191499710083, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.604832410812378, "all_logprobs": -0.07710966467857361, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -12.875, "all_logprobs/p1": -1.7109375, "all_logprobs/p10": -0.10708045959472656, "all_logprobs/p25": -0.0002956390380859375, "all_logprobs/p5": -0.4375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11401350051164627, "clip_ratio": 0.0, "completion_length": 556.7552490234375, "completion_length/correct": 473.310546875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 441.0, "completion_length/correct/min": 104.0, "completion_length/correct/p25": 318.0, "completion_length/correct/p75": 594.75, "completion_length/correct/var": 43867.48046875, "completion_length/incorrect": 796.9747314453125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 913.0, "completion_length/incorrect/min": 210.0, "completion_length/incorrect/p25": 565.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 60726.74609375, "completion_length/max": 1024.0, "completion_length/median": 504.0, "completion_length/min": 104.0, "completion_length/p25": 352.75, "completion_length/p75": 722.25, "completion_length/var": 68211.671875, "epoch": 0.256, "feature_vector_variance/max_squared_error": 126495.5390625, "feature_vector_variance/metric": 25478.185546875, "generated_tokens/total": 9141219.0, "grad_norm": 0.16854345798492432, "grouped_std_rewards": 0.2054763287305832, "learning_rate": 1.4547694655894313e-05, "loss": 0.1244, "mean_logprobs": -0.0771484375, "mean_logprobs/var": 0.0010833740234375, "num_completions/total": 15360, "per_sentence_gradient_norm": 4.169229507446289, "per_sentence_gradient_norm/max": 391.0950927734375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 22.388137817382812, "per_sentence_gradient_norm/p99": 102.28868103027344, "per_sentence_gradient_norm/var": 613.4474487304688, "per_token_feature_norm": 163.0010986328125, "per_token_feature_norm/max": 342.0, "per_token_feature_norm/median": 160.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 134.0, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 1424.08056640625, "per_token_full_gradient_variance/max_squared_error": 561.5095825195312, "per_token_full_gradient_variance/variance": 0.05626508593559265, "per_token_gradient_norm": 4.701107025146484, "per_token_gradient_norm/max": 6334.50390625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 7148.6220703125, "per_token_policy_error_norm": 0.043003786355257034, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.037903398275375366, "policy_entropy": 0.08558017015457153, "policy_entropy/max": 3.609375, "policy_entropy/median": 5.036592483520508e-06, "policy_entropy/min": 3.885780586188048e-14, "policy_entropy/p25": 2.1979212760925293e-07, "policy_entropy/p75": 0.002777099609375, "policy_entropy/var": 0.055771809071302414, "policy_error_vector_variance/max_squared_error": 2.011392831802368, "policy_error_vector_variance/metric": 0.04298108071088791, "policy_loss": 0.12436811625957489, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -7.481915473937988, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.604832410812378, "policy_sharpness": 8.274653434753418, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.25, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.695826530456543, "reward": 0.7421875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.1915946900844574, "rewards/accuracy_reward": 0.7421875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1915946900844574, "sentence_full_gradient_variance/max_squared_error": 1201605.375, "sentence_full_gradient_variance/metric": 8852.216796875, "sentence_full_gradient_variance/p75": 82.58521270751953, "sentence_full_gradient_variance/p90": 212.2599334716797, "sentence_full_gradient_variance/p95": 212.2599334716797, "sentence_full_gradient_variance/p99": 235991.875, "state_level_variance/metric": 62.01744079589844, "state_level_variance_full_gradient/metric": 1060.644287109375, "step": 20 }, { "accuracy_reward": 0.7838541865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1696477085351944, "action_level_variance/metric": 213.923583984375, "action_level_variance_full_gradient/metric": 3613.00830078125, "adam_stats/lr_effective_max": 8.555575186619535e-05, "adam_stats/lr_effective_mean": 3.824625616211108e-10, "adam_stats/lr_effective_min": -8.663701737532392e-05, "adam_stats/m_t_max": 0.0029160194098949432, "adam_stats/m_t_mean": 3.287818628461281e-11, "adam_stats/m_t_min": -0.002832382218912244, "adam_stats/v_t_max": 5.4475374781759456e-05, "adam_stats/v_t_mean": 1.5550290005314493e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.06457933783531189, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.6473037004470825, "all_logprobs": -0.07641423493623734, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -13.0625, "all_logprobs/p1": -1.703125, "all_logprobs/p10": -0.10595703125, "all_logprobs/p25": -0.000255584716796875, "all_logprobs/p5": -0.43398284912109375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1119837537407875, "clip_ratio": 0.0, "completion_length": 562.0794677734375, "completion_length/correct": 483.84051513671875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 442.0, "completion_length/correct/min": 97.0, "completion_length/correct/p25": 288.25, "completion_length/correct/p75": 616.0, "completion_length/correct/var": 54541.328125, "completion_length/incorrect": 845.813232421875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 250.0, "completion_length/incorrect/p25": 662.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 56749.625, "completion_length/max": 1024.0, "completion_length/median": 514.0, "completion_length/min": 97.0, "completion_length/p25": 323.0, "completion_length/p75": 798.0, "completion_length/var": 77173.2421875, "epoch": 0.2688, "feature_vector_variance/max_squared_error": 116290.40625, "feature_vector_variance/metric": 24717.23828125, "generated_tokens/total": 9572896.0, "grad_norm": 0.11956389993429184, "grouped_std_rewards": 0.17236730456352234, "learning_rate": 1.4453878909250906e-05, "loss": -0.0646, "mean_logprobs": -0.07666015625, "mean_logprobs/var": 0.000881195068359375, "num_completions/total": 16128, "per_sentence_gradient_norm": 2.6764533519744873, "per_sentence_gradient_norm/max": 166.0570526123047, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 71.40968322753906, "per_sentence_gradient_norm/var": 207.02972412109375, "per_token_feature_norm": 160.61293029785156, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 157.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 131.0, "per_token_feature_norm/p75": 187.0, "per_token_feature_norm/var": 1406.595703125, "per_token_full_gradient_variance/max_squared_error": 57.69134521484375, "per_token_full_gradient_variance/variance": 0.0255429744720459, "per_token_gradient_norm": 3.347323179244995, "per_token_gradient_norm/max": 3407.23046875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2947.28271484375, "per_token_policy_error_norm": 0.04282170534133911, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03761514276266098, "policy_entropy": 0.08455166220664978, "policy_entropy/max": 3.703125, "policy_entropy/median": 5.781650543212891e-06, "policy_entropy/min": 2.8727020762175925e-15, "policy_entropy/p25": 2.775341272354126e-07, "policy_entropy/p75": 0.002410888671875, "policy_entropy/var": 0.05465830862522125, "policy_error_vector_variance/max_squared_error": 2.012085199356079, "policy_error_vector_variance/metric": 0.04280102252960205, "policy_loss": -0.06457935273647308, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -7.481915473937988, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.6473037004470825, "policy_sharpness": 8.30087947845459, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.59520435333252, "reward": 0.7838541865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1696477085351944, "rewards/accuracy_reward": 0.7838541865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1696477085351944, "sentence_full_gradient_variance/max_squared_error": 1119237.375, "sentence_full_gradient_variance/metric": 4046.573974609375, "sentence_full_gradient_variance/p75": 187.9166717529297, "sentence_full_gradient_variance/p90": 235.00270080566406, "sentence_full_gradient_variance/p95": 235.00270080566406, "sentence_full_gradient_variance/p99": 84604.7109375, "state_level_variance/metric": 19.783119201660156, "state_level_variance_full_gradient/metric": 433.56585693359375, "step": 21 }, { "accuracy_reward": 0.7265625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19892846047878265, "action_level_variance/metric": 463.03411865234375, "action_level_variance_full_gradient/metric": 3913.83447265625, "adam_stats/lr_effective_max": 8.588493801653385e-05, "adam_stats/lr_effective_mean": 4.122659158500852e-10, "adam_stats/lr_effective_min": -8.748234540689737e-05, "adam_stats/m_t_max": 0.0026236544363200665, "adam_stats/m_t_mean": 3.335567932971628e-11, "adam_stats/m_t_min": -0.002396556083112955, "adam_stats/v_t_max": 5.442132169264369e-05, "adam_stats/v_t_mean": 1.5551497806534642e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.03260820358991623, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.9639792442321777, "all_logprobs": -0.08075728267431259, "all_logprobs/max": 0.0, "all_logprobs/median": -4.76837158203125e-07, "all_logprobs/min": -9.875, "all_logprobs/p1": -1.7493743896484375, "all_logprobs/p10": -0.126953125, "all_logprobs/p25": -0.0004405975341796875, "all_logprobs/p5": -0.474609375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1200653538107872, "clip_ratio": 0.0, "completion_length": 592.3294677734375, "completion_length/correct": 501.4981994628906, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 454.0, "completion_length/correct/min": 129.0, "completion_length/correct/p25": 331.0, "completion_length/correct/p75": 620.75, "completion_length/correct/var": 46327.08984375, "completion_length/incorrect": 833.6809692382812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 245.0, "completion_length/incorrect/p25": 607.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 57304.8359375, "completion_length/max": 1024.0, "completion_length/median": 540.0, "completion_length/min": 129.0, "completion_length/p25": 372.5, "completion_length/p75": 801.0, "completion_length/var": 71208.8515625, "epoch": 0.2816, "feature_vector_variance/max_squared_error": 119272.109375, "feature_vector_variance/metric": 24804.82421875, "generated_tokens/total": 10027805.0, "grad_norm": 0.10264931619167328, "grouped_std_rewards": 0.2200184315443039, "learning_rate": 1.4351590932319506e-05, "loss": 0.0326, "mean_logprobs": -0.0810546875, "mean_logprobs/var": 0.001251220703125, "num_completions/total": 16896, "per_sentence_gradient_norm": 4.240811824798584, "per_sentence_gradient_norm/max": 255.12991333007812, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 31.39674186706543, "per_sentence_gradient_norm/p99": 103.51464080810547, "per_sentence_gradient_norm/var": 445.6298828125, "per_token_feature_norm": 160.5012664794922, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 157.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 131.0, "per_token_feature_norm/p75": 188.0, "per_token_feature_norm/var": 1451.365478515625, "per_token_full_gradient_variance/max_squared_error": 1474.9925537109375, "per_token_full_gradient_variance/variance": 0.05915175378322601, "per_token_gradient_norm": 4.947407245635986, "per_token_gradient_norm/max": 6644.39404296875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6117.779296875, "per_token_policy_error_norm": 0.044888194650411606, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.039182890206575394, "policy_entropy": 0.08979986608028412, "policy_entropy/max": 3.734375, "policy_entropy/median": 7.808208465576172e-06, "policy_entropy/min": 2.864375403532904e-14, "policy_entropy/p25": 3.2223761081695557e-07, "policy_entropy/p75": 0.004058837890625, "policy_entropy/var": 0.05851036682724953, "policy_error_vector_variance/max_squared_error": 2.0156824588775635, "policy_error_vector_variance/metric": 0.04486484453082085, "policy_loss": 0.03260820358991623, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.9639792442321777, "policy_sharpness": 8.205936431884766, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.75, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.9988374710083, "reward": 0.7265625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19892846047878265, "rewards/accuracy_reward": 0.7265625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19892846047878265, "sentence_full_gradient_variance/max_squared_error": 411102.15625, "sentence_full_gradient_variance/metric": 4453.685546875, "sentence_full_gradient_variance/p75": 29.548782348632812, "sentence_full_gradient_variance/p90": 86.1899185180664, "sentence_full_gradient_variance/p95": 86.1899185180664, "sentence_full_gradient_variance/p99": 117194.9765625, "state_level_variance/metric": 40.31473159790039, "state_level_variance_full_gradient/metric": 539.85107421875, "step": 22 }, { "accuracy_reward": 0.75390625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18577350676059723, "action_level_variance/metric": 353.20111083984375, "action_level_variance_full_gradient/metric": 3221.205078125, "adam_stats/lr_effective_max": 8.445360435871407e-05, "adam_stats/lr_effective_mean": 3.643459145497019e-10, "adam_stats/lr_effective_min": -8.290039841085672e-05, "adam_stats/m_t_max": 0.0024658117908984423, "adam_stats/m_t_mean": 2.8704557827885502e-11, "adam_stats/m_t_min": -0.002272867364808917, "adam_stats/v_t_max": 5.4375130275730044e-05, "adam_stats/v_t_mean": 1.5550023291580062e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.011302519589662552, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.0071260929107666, "all_logprobs": -0.08052019029855728, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -10.6875, "all_logprobs/p1": -1.7414054870605469, "all_logprobs/p10": -0.1259765625, "all_logprobs/p25": -0.00043487548828125, "all_logprobs/p5": -0.474609375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11938571184873581, "clip_ratio": 0.0, "completion_length": 544.28515625, "completion_length/correct": 453.3229675292969, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 412.0, "completion_length/correct/min": 121.0, "completion_length/correct/p25": 283.0, "completion_length/correct/p75": 605.0, "completion_length/correct/var": 46176.95703125, "completion_length/incorrect": 822.9470825195312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 965.0, "completion_length/incorrect/min": 131.0, "completion_length/incorrect/p25": 650.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 63929.23046875, "completion_length/max": 1024.0, "completion_length/median": 494.0, "completion_length/min": 121.0, "completion_length/p25": 312.0, "completion_length/p75": 738.75, "completion_length/var": 75848.7734375, "epoch": 0.2944, "feature_vector_variance/max_squared_error": 112725.3203125, "feature_vector_variance/metric": 24948.154296875, "generated_tokens/total": 10445816.0, "grad_norm": 0.08245325088500977, "grouped_std_rewards": 0.16688963770866394, "learning_rate": 1.4240955347243754e-05, "loss": -0.0113, "mean_logprobs": -0.08203125, "mean_logprobs/var": 0.00119781494140625, "num_completions/total": 17664, "per_sentence_gradient_norm": 3.0681376457214355, "per_sentence_gradient_norm/max": 255.74600219726562, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 95.56351470947266, "per_sentence_gradient_norm/var": 344.2358093261719, "per_token_feature_norm": 160.99571228027344, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 157.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 131.0, "per_token_feature_norm/p75": 188.0, "per_token_feature_norm/var": 1459.017822265625, "per_token_full_gradient_variance/max_squared_error": 280.20819091796875, "per_token_full_gradient_variance/variance": 0.03572273999452591, "per_token_gradient_norm": 3.6068456172943115, "per_token_gradient_norm/max": 6206.46533203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4397.1220703125, "per_token_policy_error_norm": 0.044897496700286865, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03933065012097359, "policy_entropy": 0.08942686021327972, "policy_entropy/max": 3.671875, "policy_entropy/median": 7.033348083496094e-06, "policy_entropy/min": 6.5052130349130266e-18, "policy_entropy/p25": 3.03611159324646e-07, "policy_entropy/p75": 0.00396728515625, "policy_entropy/var": 0.05824930593371391, "policy_error_vector_variance/max_squared_error": 2.0122783184051514, "policy_error_vector_variance/metric": 0.0448734425008297, "policy_loss": -0.011302514933049679, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.0071260929107666, "policy_sharpness": 8.207305908203125, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.75, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.004049301147461, "reward": 0.75390625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18577350676059723, "rewards/accuracy_reward": 0.75390625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18577350676059723, "sentence_full_gradient_variance/max_squared_error": 1063592.875, "sentence_full_gradient_variance/metric": 3638.56787109375, "sentence_full_gradient_variance/p75": 75.70476531982422, "sentence_full_gradient_variance/p90": 153.33810424804688, "sentence_full_gradient_variance/p95": 153.33810424804688, "sentence_full_gradient_variance/p99": 78265.9765625, "state_level_variance/metric": 35.10231399536133, "state_level_variance_full_gradient/metric": 417.3626403808594, "step": 23 }, { "accuracy_reward": 0.8111979365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15335553884506226, "action_level_variance/metric": 298.0766296386719, "action_level_variance_full_gradient/metric": 2931.533935546875, "adam_stats/lr_effective_max": 8.281297778012231e-05, "adam_stats/lr_effective_mean": 4.424555444249023e-10, "adam_stats/lr_effective_min": -8.523878932464868e-05, "adam_stats/m_t_max": 0.0020453270990401506, "adam_stats/m_t_mean": 1.759332263207014e-11, "adam_stats/m_t_min": -0.0024423091672360897, "adam_stats/v_t_max": 5.4325319069903344e-05, "adam_stats/v_t_mean": 1.5752192302279089e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.011052966117858887, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.288644790649414, "all_logprobs": -0.08658254891633987, "all_logprobs/max": 0.0, "all_logprobs/median": -4.76837158203125e-07, "all_logprobs/min": -9.375, "all_logprobs/p1": -1.859375, "all_logprobs/p10": -0.142578125, "all_logprobs/p25": -0.00070953369140625, "all_logprobs/p5": -0.5234375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.13043783605098724, "clip_ratio": 0.0, "completion_length": 511.40625, "completion_length/correct": 447.226318359375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 393.0, "completion_length/correct/min": 99.0, "completion_length/correct/p25": 285.0, "completion_length/correct/p75": 561.0, "completion_length/correct/var": 44276.81640625, "completion_length/incorrect": 787.1586303710938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 976.0, "completion_length/incorrect/min": 133.0, "completion_length/incorrect/p25": 534.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 75548.6484375, "completion_length/max": 1024.0, "completion_length/median": 437.0, "completion_length/min": 99.0, "completion_length/p25": 304.0, "completion_length/p75": 648.0, "completion_length/var": 67811.0390625, "epoch": 0.3072, "feature_vector_variance/max_squared_error": 111633.8046875, "feature_vector_variance/metric": 25427.873046875, "generated_tokens/total": 10838576.0, "grad_norm": 0.21855346858501434, "grouped_std_rewards": 0.16491426527500153, "learning_rate": 1.4122106946441953e-05, "loss": 0.0111, "mean_logprobs": -0.087890625, "mean_logprobs/var": 0.00153350830078125, "num_completions/total": 18432, "per_sentence_gradient_norm": 3.0630385875701904, "per_sentence_gradient_norm/max": 192.36422729492188, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 87.28022003173828, "per_sentence_gradient_norm/var": 289.07080078125, "per_token_feature_norm": 161.55897521972656, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 158.0, "per_token_feature_norm/min": 62.75, "per_token_feature_norm/p25": 132.0, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 1472.1947021484375, "per_token_full_gradient_variance/max_squared_error": 259.271240234375, "per_token_full_gradient_variance/variance": 0.04404004290699959, "per_token_gradient_norm": 4.214842319488525, "per_token_gradient_norm/max": 4228.98486328125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4692.416015625, "per_token_policy_error_norm": 0.04793000593781471, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.041989702731370926, "policy_entropy": 0.09544478356838226, "policy_entropy/max": 3.734375, "policy_entropy/median": 8.761882781982422e-06, "policy_entropy/min": 2.7422508708241367e-14, "policy_entropy/p25": 3.6135315895080566e-07, "policy_entropy/p75": 0.00604248046875, "policy_entropy/var": 0.06306741386651993, "policy_error_vector_variance/max_squared_error": 2.0124123096466064, "policy_error_vector_variance/metric": 0.047898534685373306, "policy_loss": 0.011052973568439484, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.288644313812256, "policy_sharpness": 8.123533248901367, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.372154235839844, "reward": 0.8111979365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15335553884506226, "rewards/accuracy_reward": 0.8111979365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15335553884506226, "sentence_full_gradient_variance/max_squared_error": 609412.0625, "sentence_full_gradient_variance/metric": 3283.938232421875, "sentence_full_gradient_variance/p75": 171.44461059570312, "sentence_full_gradient_variance/p90": 207.09222412109375, "sentence_full_gradient_variance/p95": 207.09222412109375, "sentence_full_gradient_variance/p99": 83735.265625, "state_level_variance/metric": 28.170818328857422, "state_level_variance_full_gradient/metric": 352.404052734375, "step": 24 }, { "accuracy_reward": 0.7161458730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20354601740837097, "action_level_variance/metric": 297.64892578125, "action_level_variance_full_gradient/metric": 3549.240234375, "adam_stats/lr_effective_max": 8.405085100093856e-05, "adam_stats/lr_effective_mean": 6.384713868712311e-10, "adam_stats/lr_effective_min": -8.690409595146775e-05, "adam_stats/m_t_max": 0.0020780216436833143, "adam_stats/m_t_mean": 7.509281391149258e-12, "adam_stats/m_t_min": -0.002164693782106042, "adam_stats/v_t_max": 5.433888509287499e-05, "adam_stats/v_t_mean": 1.6152852709908072e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.05844786390662193, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.8418819904327393, "all_logprobs": -0.08255746215581894, "all_logprobs/max": 0.0, "all_logprobs/median": -5.960464477539062e-07, "all_logprobs/min": -11.25, "all_logprobs/p1": -1.8046875, "all_logprobs/p10": -0.126953125, "all_logprobs/p25": -0.000553131103515625, "all_logprobs/p5": -0.4765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12448722869157791, "clip_ratio": 0.0, "completion_length": 554.3255615234375, "completion_length/correct": 447.0909118652344, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 405.0, "completion_length/correct/min": 94.0, "completion_length/correct/p25": 297.5, "completion_length/correct/p75": 566.75, "completion_length/correct/var": 41858.1875, "completion_length/incorrect": 824.8715209960938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 175.0, "completion_length/incorrect/p25": 584.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 69430.25, "completion_length/max": 1024.0, "completion_length/median": 461.0, "completion_length/min": 94.0, "completion_length/p25": 329.25, "completion_length/p75": 776.25, "completion_length/var": 78654.0390625, "epoch": 0.32, "feature_vector_variance/max_squared_error": 106790.0703125, "feature_vector_variance/metric": 24905.51171875, "generated_tokens/total": 11264298.0, "grad_norm": 0.29710653424263, "grouped_std_rewards": 0.18872015178203583, "learning_rate": 1.3995190528383292e-05, "loss": -0.0584, "mean_logprobs": -0.08154296875, "mean_logprobs/var": 0.00112152099609375, "num_completions/total": 19200, "per_sentence_gradient_norm": 2.8992886543273926, "per_sentence_gradient_norm/max": 317.825927734375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 19.692533493041992, "per_sentence_gradient_norm/p99": 71.89387512207031, "per_sentence_gradient_norm/var": 289.62017822265625, "per_token_feature_norm": 160.08168029785156, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 156.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 131.0, "per_token_feature_norm/p75": 187.0, "per_token_feature_norm/var": 1431.606689453125, "per_token_full_gradient_variance/max_squared_error": 219.64175415039062, "per_token_full_gradient_variance/variance": 0.031805187463760376, "per_token_gradient_norm": 3.400400161743164, "per_token_gradient_norm/max": 7221.18603515625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4241.68408203125, "per_token_policy_error_norm": 0.04575395956635475, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.040135595947504044, "policy_entropy": 0.09126657247543335, "policy_entropy/max": 3.765625, "policy_entropy/median": 9.894371032714844e-06, "policy_entropy/min": 8.465450562766819e-16, "policy_entropy/p25": 4.153698682785034e-07, "policy_entropy/p75": 0.004791259765625, "policy_entropy/var": 0.06025973707437515, "policy_error_vector_variance/max_squared_error": 2.0122363567352295, "policy_error_vector_variance/metric": 0.045723188668489456, "policy_loss": -0.05844786390662193, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.8418819904327393, "policy_sharpness": 8.173782348632812, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.5, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.109564781188965, "reward": 0.7161458730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20354601740837097, "rewards/accuracy_reward": 0.7161458730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20354601740837097, "sentence_full_gradient_variance/max_squared_error": 576967.8125, "sentence_full_gradient_variance/metric": 3996.99169921875, "sentence_full_gradient_variance/p75": 91.88014221191406, "sentence_full_gradient_variance/p90": 221.68032836914062, "sentence_full_gradient_variance/p95": 221.68032836914062, "sentence_full_gradient_variance/p99": 78402.2578125, "state_level_variance/metric": 29.103404998779297, "state_level_variance_full_gradient/metric": 447.752197265625, "step": 25 }, { "accuracy_reward": 0.765625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17967729270458221, "action_level_variance/metric": 567.2557983398438, "action_level_variance_full_gradient/metric": 3313.78857421875, "adam_stats/lr_effective_max": 8.388121204916388e-05, "adam_stats/lr_effective_mean": 4.222991123459252e-10, "adam_stats/lr_effective_min": -8.736194286029786e-05, "adam_stats/m_t_max": 0.0024070811923593283, "adam_stats/m_t_mean": 1.4333547369849153e-11, "adam_stats/m_t_min": -0.002889753319323063, "adam_stats/v_t_max": 5.453637641039677e-05, "adam_stats/v_t_mean": 1.62708767900005e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.05440530180931091, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.171022415161133, "all_logprobs": -0.08623038977384567, "all_logprobs/max": 0.0, "all_logprobs/median": -7.152557373046875e-07, "all_logprobs/min": -13.375, "all_logprobs/p1": -1.859375, "all_logprobs/p10": -0.1376953125, "all_logprobs/p25": -0.000644683837890625, "all_logprobs/p5": -0.51953125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1305643767118454, "clip_ratio": 0.0, "completion_length": 525.2526245117188, "completion_length/correct": 437.31121826171875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 390.0, "completion_length/correct/min": 53.0, "completion_length/correct/p25": 279.0, "completion_length/correct/p75": 543.0, "completion_length/correct/var": 40221.8515625, "completion_length/incorrect": 812.5277709960938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 936.0, "completion_length/incorrect/min": 260.0, "completion_length/incorrect/p25": 584.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 58255.078125, "completion_length/max": 1024.0, "completion_length/median": 473.0, "completion_length/min": 53.0, "completion_length/p25": 307.75, "completion_length/p75": 684.25, "completion_length/var": 69674.2578125, "epoch": 0.3328, "feature_vector_variance/max_squared_error": 113807.046875, "feature_vector_variance/metric": 24591.318359375, "generated_tokens/total": 11667692.0, "grad_norm": 0.1997358798980713, "grouped_std_rewards": 0.15080569684505463, "learning_rate": 1.3860360721173195e-05, "loss": -0.0544, "mean_logprobs": -0.08740234375, "mean_logprobs/var": 0.00182342529296875, "num_completions/total": 19968, "per_sentence_gradient_norm": 3.5933022499084473, "per_sentence_gradient_norm/max": 402.6860046386719, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 100.99783325195312, "per_sentence_gradient_norm/var": 555.0667114257812, "per_token_feature_norm": 158.9940948486328, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 154.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 129.0, "per_token_feature_norm/p75": 186.0, "per_token_feature_norm/var": 1503.53369140625, "per_token_full_gradient_variance/max_squared_error": 135.31405639648438, "per_token_full_gradient_variance/variance": 0.04465876892209053, "per_token_gradient_norm": 4.43649435043335, "per_token_gradient_norm/max": 6245.8974609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5231.865234375, "per_token_policy_error_norm": 0.04761578515172005, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04153374582529068, "policy_entropy": 0.09561610221862793, "policy_entropy/max": 3.71875, "policy_entropy/median": 1.1086463928222656e-05, "policy_entropy/min": 6.418476861114186e-17, "policy_entropy/p25": 4.842877388000488e-07, "policy_entropy/p75": 0.0057373046875, "policy_entropy/var": 0.0654681921005249, "policy_error_vector_variance/max_squared_error": 2.014439105987549, "policy_error_vector_variance/metric": 0.047585517168045044, "policy_loss": -0.05440531298518181, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.171022415161133, "policy_sharpness": 8.132502555847168, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.25, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.360862731933594, "reward": 0.765625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17967729270458221, "rewards/accuracy_reward": 0.765625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17967729270458221, "sentence_full_gradient_variance/max_squared_error": 1018829.375, "sentence_full_gradient_variance/metric": 3730.08984375, "sentence_full_gradient_variance/p75": 114.88516235351562, "sentence_full_gradient_variance/p90": 124.19505310058594, "sentence_full_gradient_variance/p95": 124.19505310058594, "sentence_full_gradient_variance/p99": 94809.2578125, "state_level_variance/metric": 58.605628967285156, "state_level_variance_full_gradient/metric": 416.30120849609375, "step": 26 }, { "accuracy_reward": 0.734375, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1953226923942566, "action_level_variance/metric": 470.97723388671875, "action_level_variance_full_gradient/metric": 5078.24365234375, "adam_stats/lr_effective_max": 8.304851508000866e-05, "adam_stats/lr_effective_mean": 3.384927615979194e-10, "adam_stats/lr_effective_min": -8.719865581952035e-05, "adam_stats/m_t_max": 0.0019649569876492023, "adam_stats/m_t_mean": 1.2699622603529193e-11, "adam_stats/m_t_min": -0.002173531800508499, "adam_stats/v_t_max": 5.448430965770967e-05, "adam_stats/v_t_mean": 1.6279887594256026e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.037727177143096924, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 3.284745216369629, "all_logprobs": -0.0824662446975708, "all_logprobs/max": 0.0, "all_logprobs/median": -5.960464477539062e-07, "all_logprobs/min": -9.5, "all_logprobs/p1": -1.8125, "all_logprobs/p10": -0.126953125, "all_logprobs/p25": -0.00048828125, "all_logprobs/p5": -0.478515625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12278429418802261, "clip_ratio": 0.0, "completion_length": 554.1432495117188, "completion_length/correct": 456.0992736816406, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 420.0, "completion_length/correct/min": 62.0, "completion_length/correct/p25": 301.0, "completion_length/correct/p75": 568.0, "completion_length/correct/var": 46655.9453125, "completion_length/incorrect": 825.2059326171875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 994.0, "completion_length/incorrect/min": 102.0, "completion_length/incorrect/p25": 658.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 67972.4296875, "completion_length/max": 1024.0, "completion_length/median": 494.0, "completion_length/min": 62.0, "completion_length/p25": 322.75, "completion_length/p75": 773.25, "completion_length/var": 78847.59375, "epoch": 0.3456, "feature_vector_variance/max_squared_error": 114123.9453125, "feature_vector_variance/metric": 23678.96875, "generated_tokens/total": 12093274.0, "grad_norm": 0.10888662934303284, "grouped_std_rewards": 0.20033685863018036, "learning_rate": 1.3717781794162813e-05, "loss": -0.0377, "mean_logprobs": -0.0859375, "mean_logprobs/var": 0.0018157958984375, "num_completions/total": 20736, "per_sentence_gradient_norm": 4.21009635925293, "per_sentence_gradient_norm/max": 235.13516235351562, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 27.773733139038086, "per_sentence_gradient_norm/p99": 126.2285385131836, "per_sentence_gradient_norm/var": 453.8432922363281, "per_token_feature_norm": 156.3706512451172, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 127.5, "per_token_feature_norm/p75": 182.0, "per_token_feature_norm/var": 1429.1351318359375, "per_token_full_gradient_variance/max_squared_error": 276.66162109375, "per_token_full_gradient_variance/variance": 0.057169295847415924, "per_token_gradient_norm": 5.038876056671143, "per_token_gradient_norm/max": 5775.4951171875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6572.51123046875, "per_token_policy_error_norm": 0.045666325837373734, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03980404511094093, "policy_entropy": 0.09171939641237259, "policy_entropy/max": 3.734375, "policy_entropy/median": 9.298324584960938e-06, "policy_entropy/min": 9.880984919163893e-15, "policy_entropy/p25": 4.76837158203125e-07, "policy_entropy/p75": 0.004364013671875, "policy_entropy/var": 0.061785489320755005, "policy_error_vector_variance/max_squared_error": 2.01910662651062, "policy_error_vector_variance/metric": 0.04563435539603233, "policy_loss": -0.03772719204425812, "policy_loss/max": 12.958683013916016, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.28474497795105, "policy_sharpness": 8.189786911010742, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.076543807983398, "reward": 0.734375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.1953226923942566, "rewards/accuracy_reward": 0.734375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1953226923942566, "sentence_full_gradient_variance/max_squared_error": 1228966.0, "sentence_full_gradient_variance/metric": 5765.91943359375, "sentence_full_gradient_variance/p75": 71.82518005371094, "sentence_full_gradient_variance/p90": 111.54096221923828, "sentence_full_gradient_variance/p95": 111.54096221923828, "sentence_full_gradient_variance/p99": 131933.53125, "state_level_variance/metric": 41.58037567138672, "state_level_variance_full_gradient/metric": 687.67626953125, "step": 27 }, { "accuracy_reward": 0.7643229365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18036825954914093, "action_level_variance/metric": 380.3666687011719, "action_level_variance_full_gradient/metric": 4656.8603515625, "adam_stats/lr_effective_max": 8.495258225593716e-05, "adam_stats/lr_effective_mean": 2.645352548125146e-10, "adam_stats/lr_effective_min": -8.049921598285437e-05, "adam_stats/m_t_max": 0.0020085300784558058, "adam_stats/m_t_mean": 1.2228299102468032e-11, "adam_stats/m_t_min": -0.0023407002445310354, "adam_stats/v_t_max": 5.4429892770713195e-05, "adam_stats/v_t_mean": 1.6279376935032785e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.08424042910337448, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.3727807998657227, "all_logprobs": -0.08683392405509949, "all_logprobs/max": 0.0, "all_logprobs/median": -9.5367431640625e-07, "all_logprobs/min": -10.875, "all_logprobs/p1": -1.859375, "all_logprobs/p10": -0.142578125, "all_logprobs/p25": -0.000804901123046875, "all_logprobs/p5": -0.5234375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12966671586036682, "clip_ratio": 0.0, "completion_length": 558.2291870117188, "completion_length/correct": 482.6354064941406, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 442.0, "completion_length/correct/min": 105.0, "completion_length/correct/p25": 326.0, "completion_length/correct/p75": 601.0, "completion_length/correct/var": 43482.64453125, "completion_length/incorrect": 803.3867797851562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 960.0, "completion_length/incorrect/min": 247.0, "completion_length/incorrect/p25": 601.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 70239.7421875, "completion_length/max": 1024.0, "completion_length/median": 498.0, "completion_length/min": 105.0, "completion_length/p25": 341.5, "completion_length/p75": 748.5, "completion_length/var": 68261.8671875, "epoch": 0.3584, "feature_vector_variance/max_squared_error": 111621.3828125, "feature_vector_variance/metric": 23986.2109375, "generated_tokens/total": 12521994.0, "grad_norm": 0.07409513741731644, "grouped_std_rewards": 0.20155511796474457, "learning_rate": 1.3567627457812107e-05, "loss": 0.0842, "mean_logprobs": -0.08740234375, "mean_logprobs/var": 0.00189208984375, "num_completions/total": 21504, "per_sentence_gradient_norm": 3.743016242980957, "per_sentence_gradient_norm/max": 262.0980529785156, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 19.531038284301758, "per_sentence_gradient_norm/p99": 96.04953002929688, "per_sentence_gradient_norm/var": 366.8341369628906, "per_token_feature_norm": 156.87083435058594, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 127.0, "per_token_feature_norm/p75": 183.0, "per_token_feature_norm/var": 1483.16015625, "per_token_full_gradient_variance/max_squared_error": 141.14620971679688, "per_token_full_gradient_variance/variance": 0.038909491151571274, "per_token_gradient_norm": 4.501586437225342, "per_token_gradient_norm/max": 3814.104736328125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4481.1494140625, "per_token_policy_error_norm": 0.048013877123594284, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.041963450610637665, "policy_entropy": 0.09662900865077972, "policy_entropy/max": 3.734375, "policy_entropy/median": 1.5616416931152344e-05, "policy_entropy/min": 1.2545520178264269e-14, "policy_entropy/p25": 7.078051567077637e-07, "policy_entropy/p75": 0.006683349609375, "policy_entropy/var": 0.06532222032546997, "policy_error_vector_variance/max_squared_error": 2.0161917209625244, "policy_error_vector_variance/metric": 0.047984778881073, "policy_loss": 0.08424042165279388, "policy_loss/max": 12.958683013916016, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.3727807998657227, "policy_sharpness": 8.09409236907959, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.489670753479004, "reward": 0.7643229365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18036825954914093, "rewards/accuracy_reward": 0.7643229365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18036825954914093, "sentence_full_gradient_variance/max_squared_error": 657118.3125, "sentence_full_gradient_variance/metric": 5298.21630859375, "sentence_full_gradient_variance/p75": 53.89039993286133, "sentence_full_gradient_variance/p90": 55.335479736328125, "sentence_full_gradient_variance/p95": 55.335479736328125, "sentence_full_gradient_variance/p99": 141515.671875, "state_level_variance/metric": 33.888668060302734, "state_level_variance_full_gradient/metric": 641.3560180664062, "step": 28 }, { "accuracy_reward": 0.7447916865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1903248429298401, "action_level_variance/metric": 351.31207275390625, "action_level_variance_full_gradient/metric": 3672.920654296875, "adam_stats/lr_effective_max": 7.673248910577968e-05, "adam_stats/lr_effective_mean": 2.32496355501155e-10, "adam_stats/lr_effective_min": -7.501464278902858e-05, "adam_stats/m_t_max": 0.001752745360136032, "adam_stats/m_t_mean": 1.1684807635359284e-11, "adam_stats/m_t_min": -0.0019235247746109962, "adam_stats/v_t_max": 5.4376956541091204e-05, "adam_stats/v_t_mean": 1.6273159035573581e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.08260151743888855, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.280410885810852, "all_logprobs": -0.09646634012460709, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-06, "all_logprobs/min": -13.0625, "all_logprobs/p1": -1.984375, "all_logprobs/p10": -0.1728515625, "all_logprobs/p25": -0.00144195556640625, "all_logprobs/p5": -0.578125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1530655175447464, "clip_ratio": 0.0, "completion_length": 532.97265625, "completion_length/correct": 433.2779541015625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 390.0, "completion_length/correct/min": 48.0, "completion_length/correct/p25": 280.0, "completion_length/correct/p75": 546.5, "completion_length/correct/var": 42222.7890625, "completion_length/incorrect": 823.9183349609375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 964.0, "completion_length/incorrect/min": 234.0, "completion_length/incorrect/p25": 650.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 59437.421875, "completion_length/max": 1024.0, "completion_length/median": 457.0, "completion_length/min": 48.0, "completion_length/p25": 307.75, "completion_length/p75": 726.75, "completion_length/var": 75587.8984375, "epoch": 0.3712, "feature_vector_variance/max_squared_error": 104938.8046875, "feature_vector_variance/metric": 24834.751953125, "generated_tokens/total": 12931317.0, "grad_norm": 0.05292295664548874, "grouped_std_rewards": 0.15948569774627686, "learning_rate": 1.3410080652050414e-05, "loss": -0.0826, "mean_logprobs": -0.09619140625, "mean_logprobs/var": 0.0022735595703125, "num_completions/total": 22272, "per_sentence_gradient_norm": 2.7156715393066406, "per_sentence_gradient_norm/max": 401.9851379394531, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 62.55886459350586, "per_sentence_gradient_norm/var": 344.3856201171875, "per_token_feature_norm": 159.02479553222656, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 186.0, "per_token_feature_norm/var": 1607.2679443359375, "per_token_full_gradient_variance/max_squared_error": 97.37405395507812, "per_token_full_gradient_variance/variance": 0.01662498526275158, "per_token_gradient_norm": 2.6956946849823, "per_token_gradient_norm/max": 3695.046875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2135.619384765625, "per_token_policy_error_norm": 0.0523994117975235, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.045914288610219955, "policy_entropy": 0.10636550188064575, "policy_entropy/max": 3.75, "policy_entropy/median": 1.9073486328125e-05, "policy_entropy/min": 1.2975731600306517e-15, "policy_entropy/p25": 8.009374141693115e-07, "policy_entropy/p75": 0.01129150390625, "policy_entropy/var": 0.07499762624502182, "policy_error_vector_variance/max_squared_error": 2.0140469074249268, "policy_error_vector_variance/metric": 0.052335720509290695, "policy_loss": -0.08260151743888855, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -9.659052848815918, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.280410885810852, "policy_sharpness": 7.960327625274658, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.87109375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.118243217468262, "reward": 0.7447916865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.1903248429298401, "rewards/accuracy_reward": 0.7447916865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1903248429298401, "sentence_full_gradient_variance/max_squared_error": 473517.78125, "sentence_full_gradient_variance/metric": 4112.681640625, "sentence_full_gradient_variance/p75": 172.623046875, "sentence_full_gradient_variance/p90": 194.48043823242188, "sentence_full_gradient_variance/p95": 194.48043823242188, "sentence_full_gradient_variance/p99": 98298.859375, "state_level_variance/metric": 36.92375946044922, "state_level_variance_full_gradient/metric": 439.7615661621094, "step": 29 }, { "accuracy_reward": 0.7903646230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1659044325351715, "action_level_variance/metric": 272.54132080078125, "action_level_variance_full_gradient/metric": 3902.34375, "adam_stats/lr_effective_max": 7.522061059717089e-05, "adam_stats/lr_effective_mean": 2.0531538968970864e-10, "adam_stats/lr_effective_min": -7.433043356286362e-05, "adam_stats/m_t_max": 0.0027859671972692013, "adam_stats/m_t_mean": 1.6936796474846527e-13, "adam_stats/m_t_min": -0.002890840405598283, "adam_stats/v_t_max": 5.432323450804688e-05, "adam_stats/v_t_mean": 1.6374526517690113e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.03530282527208328, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.6224199533462524, "all_logprobs": -0.0852339118719101, "all_logprobs/max": 0.0, "all_logprobs/median": -8.344650268554688e-07, "all_logprobs/min": -10.0625, "all_logprobs/p1": -1.8501567840576172, "all_logprobs/p10": -0.13496017456054688, "all_logprobs/p25": -0.0006256103515625, "all_logprobs/p5": -0.5078125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12811371684074402, "clip_ratio": 0.0, "completion_length": 492.6028747558594, "completion_length/correct": 419.5057678222656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 371.0, "completion_length/correct/min": 19.0, "completion_length/correct/p25": 268.5, "completion_length/correct/p75": 518.0, "completion_length/correct/var": 43306.48046875, "completion_length/incorrect": 768.1925659179688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 886.0, "completion_length/incorrect/min": 125.0, "completion_length/incorrect/p25": 503.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 84348.8671875, "completion_length/max": 1024.0, "completion_length/median": 406.0, "completion_length/min": 19.0, "completion_length/p25": 288.0, "completion_length/p75": 645.0, "completion_length/var": 71982.734375, "epoch": 0.384, "feature_vector_variance/max_squared_error": 107394.1796875, "feature_vector_variance/metric": 23618.7578125, "generated_tokens/total": 13309636.0, "grad_norm": 0.16939003765583038, "grouped_std_rewards": 0.1780642718076706, "learning_rate": 1.3245333323392335e-05, "loss": -0.0353, "mean_logprobs": -0.09033203125, "mean_logprobs/var": 0.001983642578125, "num_completions/total": 23040, "per_sentence_gradient_norm": 2.945283889770508, "per_sentence_gradient_norm/max": 196.9078369140625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 78.71530151367188, "per_sentence_gradient_norm/var": 264.2106018066406, "per_token_feature_norm": 155.8738555908203, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 59.5, "per_token_feature_norm/p25": 126.0, "per_token_feature_norm/p75": 182.0, "per_token_feature_norm/var": 1481.417236328125, "per_token_full_gradient_variance/max_squared_error": 96.47038269042969, "per_token_full_gradient_variance/variance": 0.027004975825548172, "per_token_gradient_norm": 3.442305326461792, "per_token_gradient_norm/max": 4244.1708984375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3255.8046875, "per_token_policy_error_norm": 0.04728316888213158, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04153095930814743, "policy_entropy": 0.09390737861394882, "policy_entropy/max": 3.6875, "policy_entropy/median": 1.2755393981933594e-05, "policy_entropy/min": 6.17284001691587e-14, "policy_entropy/p25": 6.51925802230835e-07, "policy_entropy/p75": 0.00531005859375, "policy_entropy/var": 0.06246257945895195, "policy_error_vector_variance/max_squared_error": 2.011223554611206, "policy_error_vector_variance/metric": 0.04725022614002228, "policy_loss": -0.03530282527208328, "policy_loss/max": 12.958683013916016, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.6224199533462524, "policy_sharpness": 8.149191856384277, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.25, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.252279281616211, "reward": 0.7903646230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1659044325351715, "rewards/accuracy_reward": 0.7903646230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1659044325351715, "sentence_full_gradient_variance/max_squared_error": 715826.0625, "sentence_full_gradient_variance/metric": 4422.080078125, "sentence_full_gradient_variance/p75": 110.38963317871094, "sentence_full_gradient_variance/p90": 132.39271545410156, "sentence_full_gradient_variance/p95": 132.39271545410156, "sentence_full_gradient_variance/p99": 136935.796875, "state_level_variance/metric": 25.66025733947754, "state_level_variance_full_gradient/metric": 519.7362670898438, "step": 30 }, { "accuracy_reward": 0.7330729365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19593213498592377, "action_level_variance/metric": 463.2901306152344, "action_level_variance_full_gradient/metric": 8051.677734375, "adam_stats/lr_effective_max": 7.699152047280222e-05, "adam_stats/lr_effective_mean": 4.730912883665894e-10, "adam_stats/lr_effective_min": -7.429332617903128e-05, "adam_stats/m_t_max": 0.00281407218426466, "adam_stats/m_t_mean": -9.47390429995032e-12, "adam_stats/m_t_min": -0.0018790820613503456, "adam_stats/v_t_max": 5.42713823961094e-05, "adam_stats/v_t_mean": 1.6836351106677694e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.010077735409140587, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.743187665939331, "all_logprobs": -0.08461254090070724, "all_logprobs/max": 0.0, "all_logprobs/median": -8.344650268554688e-07, "all_logprobs/min": -10.875, "all_logprobs/p1": -1.8125, "all_logprobs/p10": -0.130859375, "all_logprobs/p25": -0.00063323974609375, "all_logprobs/p5": -0.494140625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1289374679327011, "clip_ratio": 0.0, "completion_length": 545.0651245117188, "completion_length/correct": 449.7069396972656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 407.0, "completion_length/correct/min": 92.0, "completion_length/correct/p25": 305.0, "completion_length/correct/p75": 572.0, "completion_length/correct/var": 41270.50390625, "completion_length/incorrect": 806.9512329101562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 904.0, "completion_length/incorrect/min": 224.0, "completion_length/incorrect/p25": 600.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 56635.57421875, "completion_length/max": 1024.0, "completion_length/median": 486.0, "completion_length/min": 92.0, "completion_length/p25": 334.75, "completion_length/p75": 726.5, "completion_length/var": 70308.8984375, "epoch": 0.3968, "feature_vector_variance/max_squared_error": 105279.671875, "feature_vector_variance/metric": 23125.66796875, "generated_tokens/total": 13728246.0, "grad_norm": 0.32845285534858704, "grouped_std_rewards": 0.198020339012146, "learning_rate": 1.3073586191080456e-05, "loss": -0.0101, "mean_logprobs": -0.0869140625, "mean_logprobs/var": 0.00156402587890625, "num_completions/total": 23808, "per_sentence_gradient_norm": 4.065764427185059, "per_sentence_gradient_norm/max": 242.64930725097656, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 20.236604690551758, "per_sentence_gradient_norm/p99": 125.82696533203125, "per_sentence_gradient_norm/var": 447.3421936035156, "per_token_feature_norm": 154.5934600830078, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 149.0, "per_token_feature_norm/min": 63.0, "per_token_feature_norm/p25": 125.5, "per_token_feature_norm/p75": 180.0, "per_token_feature_norm/var": 1442.4560546875, "per_token_full_gradient_variance/max_squared_error": 245.0066680908203, "per_token_full_gradient_variance/variance": 0.05186804383993149, "per_token_gradient_norm": 4.453553199768066, "per_token_gradient_norm/max": 7006.86083984375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5632.20654296875, "per_token_policy_error_norm": 0.04681161418557167, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.041243232786655426, "policy_entropy": 0.09330933541059494, "policy_entropy/max": 3.71875, "policy_entropy/median": 1.4007091522216797e-05, "policy_entropy/min": 1.163513729807164e-13, "policy_entropy/p25": 7.897615432739258e-07, "policy_entropy/p75": 0.00555419921875, "policy_entropy/var": 0.061391327530145645, "policy_error_vector_variance/max_squared_error": 2.0096092224121094, "policy_error_vector_variance/metric": 0.04678512364625931, "policy_loss": -0.010077735409140587, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.743187665939331, "policy_sharpness": 8.143779754638672, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.25, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.236384391784668, "reward": 0.7330729365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19593213498592377, "rewards/accuracy_reward": 0.7330729365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19593213498592377, "sentence_full_gradient_variance/max_squared_error": 2329536.25, "sentence_full_gradient_variance/metric": 9160.8095703125, "sentence_full_gradient_variance/p75": 93.09904479980469, "sentence_full_gradient_variance/p90": 108.38052368164062, "sentence_full_gradient_variance/p95": 108.51089477539062, "sentence_full_gradient_variance/p99": 163688.8125, "state_level_variance/metric": 41.8164176940918, "state_level_variance_full_gradient/metric": 1109.131591796875, "step": 31 }, { "accuracy_reward": 0.7369791865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19409359991550446, "action_level_variance/metric": 730.75537109375, "action_level_variance_full_gradient/metric": 5256.6435546875, "adam_stats/lr_effective_max": 7.402417395496741e-05, "adam_stats/lr_effective_mean": 3.965486827794962e-10, "adam_stats/lr_effective_min": -7.591295434394851e-05, "adam_stats/m_t_max": 0.00303925690241158, "adam_stats/m_t_mean": -7.666692801444608e-12, "adam_stats/m_t_min": -0.00200134445913136, "adam_stats/v_t_max": 5.424036135082133e-05, "adam_stats/v_t_mean": 1.6852771348579987e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0897996574640274, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.223161458969116, "all_logprobs": -0.08478353917598724, "all_logprobs/max": 0.0, "all_logprobs/median": -4.76837158203125e-07, "all_logprobs/min": -16.625, "all_logprobs/p1": -1.828125, "all_logprobs/p10": -0.1328125, "all_logprobs/p25": -0.0005979537963867188, "all_logprobs/p5": -0.50390625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12727564573287964, "clip_ratio": 0.0, "completion_length": 540.3046875, "completion_length/correct": 463.13427734375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 414.0, "completion_length/correct/min": 98.0, "completion_length/correct/p25": 330.0, "completion_length/correct/p75": 568.5, "completion_length/correct/var": 36528.375, "completion_length/incorrect": 756.53466796875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 784.0, "completion_length/incorrect/min": 215.0, "completion_length/incorrect/p25": 517.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 72516.40625, "completion_length/max": 1024.0, "completion_length/median": 466.0, "completion_length/min": 98.0, "completion_length/p25": 346.0, "completion_length/p75": 686.25, "completion_length/var": 62620.08984375, "epoch": 0.4096, "feature_vector_variance/max_squared_error": 111000.6484375, "feature_vector_variance/metric": 24669.2265625, "generated_tokens/total": 14143200.0, "grad_norm": 0.1046162098646164, "grouped_std_rewards": 0.217790886759758, "learning_rate": 1.2895048502539883e-05, "loss": -0.0898, "mean_logprobs": -0.08544921875, "mean_logprobs/var": 0.0019683837890625, "num_completions/total": 24576, "per_sentence_gradient_norm": 4.7073211669921875, "per_sentence_gradient_norm/max": 345.4830627441406, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 24.066560745239258, "per_sentence_gradient_norm/p99": 115.75321197509766, "per_sentence_gradient_norm/var": 709.520263671875, "per_token_feature_norm": 159.6202392578125, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 156.0, "per_token_feature_norm/min": 66.5, "per_token_feature_norm/p25": 130.0, "per_token_feature_norm/p75": 186.0, "per_token_feature_norm/var": 1441.317138671875, "per_token_full_gradient_variance/max_squared_error": 583.262451171875, "per_token_full_gradient_variance/variance": 0.07023821771144867, "per_token_gradient_norm": 5.921971797943115, "per_token_gradient_norm/max": 6596.302734375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 8193.953125, "per_token_policy_error_norm": 0.0471288301050663, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.041301026940345764, "policy_entropy": 0.09347239881753922, "policy_entropy/max": 3.75, "policy_entropy/median": 8.225440979003906e-06, "policy_entropy/min": 1.0269562977782698e-14, "policy_entropy/p25": 4.0978193283081055e-07, "policy_entropy/p75": 0.0052490234375, "policy_entropy/var": 0.06220415607094765, "policy_error_vector_variance/max_squared_error": 2.0118319988250732, "policy_error_vector_variance/metric": 0.04709995165467262, "policy_loss": -0.0897996574640274, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.223161458969116, "policy_sharpness": 8.154157638549805, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.265625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.249259948730469, "reward": 0.7369791865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19409359991550446, "rewards/accuracy_reward": 0.7369791865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19409359991550446, "sentence_full_gradient_variance/max_squared_error": 1054249.625, "sentence_full_gradient_variance/metric": 5909.1005859375, "sentence_full_gradient_variance/p75": 172.78709411621094, "sentence_full_gradient_variance/p90": 301.76171875, "sentence_full_gradient_variance/p95": 3750.75244140625, "sentence_full_gradient_variance/p99": 134124.5625, "state_level_variance/metric": 69.913818359375, "state_level_variance_full_gradient/metric": 652.45654296875, "step": 32 }, { "accuracy_reward": 0.7473958730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18904145061969757, "action_level_variance/metric": 359.7130126953125, "action_level_variance_full_gradient/metric": 4633.11328125, "adam_stats/lr_effective_max": 7.677717803744599e-05, "adam_stats/lr_effective_mean": 2.8720409384064283e-10, "adam_stats/lr_effective_min": -7.75144508224912e-05, "adam_stats/m_t_max": 0.0030069376807659864, "adam_stats/m_t_mean": -1.6798262433836975e-11, "adam_stats/m_t_min": -0.002020936692133546, "adam_stats/v_t_max": 5.4193173127714545e-05, "adam_stats/v_t_mean": 1.6894636731268342e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.10946601629257202, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.3276007175445557, "all_logprobs": -0.07631447166204453, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -10.375, "all_logprobs/p1": -1.703125, "all_logprobs/p10": -0.11279296875, "all_logprobs/p25": -0.0003833770751953125, "all_logprobs/p5": -0.44921875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.10843832045793533, "clip_ratio": 0.0, "completion_length": 556.9609375, "completion_length/correct": 485.627197265625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 466.0, "completion_length/correct/min": 70.0, "completion_length/correct/p25": 330.25, "completion_length/correct/p75": 614.75, "completion_length/correct/var": 37319.4765625, "completion_length/incorrect": 768.0205688476562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 901.0, "completion_length/incorrect/min": 200.0, "completion_length/incorrect/p25": 529.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 80787.5546875, "completion_length/max": 1024.0, "completion_length/median": 517.0, "completion_length/min": 70.0, "completion_length/p25": 353.0, "completion_length/p75": 720.25, "completion_length/var": 63283.98828125, "epoch": 0.4224, "feature_vector_variance/max_squared_error": 113885.359375, "feature_vector_variance/metric": 25210.8203125, "generated_tokens/total": 14570946.0, "grad_norm": 0.13145633041858673, "grouped_std_rewards": 0.19416150450706482, "learning_rate": 1.270993777844248e-05, "loss": -0.1095, "mean_logprobs": -0.076171875, "mean_logprobs/var": 0.00119781494140625, "num_completions/total": 25344, "per_sentence_gradient_norm": 3.380929946899414, "per_sentence_gradient_norm/max": 232.8399200439453, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 19.681289672851562, "per_sentence_gradient_norm/p99": 91.05326843261719, "per_sentence_gradient_norm/var": 348.7364196777344, "per_token_feature_norm": 162.0227508544922, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 159.0, "per_token_feature_norm/min": 65.0, "per_token_feature_norm/p25": 132.0, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 1430.1773681640625, "per_token_full_gradient_variance/max_squared_error": 332.0225830078125, "per_token_full_gradient_variance/variance": 0.049576953053474426, "per_token_gradient_norm": 4.233713150024414, "per_token_gradient_norm/max": 6259.66015625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5714.25, "per_token_policy_error_norm": 0.042790234088897705, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.037011101841926575, "policy_entropy": 0.08612877875566483, "policy_entropy/max": 3.703125, "policy_entropy/median": 6.735324859619141e-06, "policy_entropy/min": 4.3021142204224816e-15, "policy_entropy/p25": 2.9243528842926025e-07, "policy_entropy/p75": 0.0035400390625, "policy_entropy/var": 0.05544307827949524, "policy_error_vector_variance/max_squared_error": 2.016385316848755, "policy_error_vector_variance/metric": 0.042757730931043625, "policy_loss": -0.10946601629257202, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.3276007175445557, "policy_sharpness": 8.242579460144043, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.787405014038086, "reward": 0.7473958730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.18904145061969757, "rewards/accuracy_reward": 0.7473958730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18904145061969757, "sentence_full_gradient_variance/max_squared_error": 1104099.625, "sentence_full_gradient_variance/metric": 5163.798828125, "sentence_full_gradient_variance/p75": 121.48905181884766, "sentence_full_gradient_variance/p90": 754.9013061523438, "sentence_full_gradient_variance/p95": 754.9013061523438, "sentence_full_gradient_variance/p99": 109728.765625, "state_level_variance/metric": 33.88642501831055, "state_level_variance_full_gradient/metric": 530.6860961914062, "step": 33 }, { "accuracy_reward": 0.7799479365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17185290157794952, "action_level_variance/metric": 357.942626953125, "action_level_variance_full_gradient/metric": 7992.578125, "adam_stats/lr_effective_max": 7.739298598608002e-05, "adam_stats/lr_effective_mean": 2.787730601916394e-10, "adam_stats/lr_effective_min": -7.819238817319274e-05, "adam_stats/m_t_max": 0.0025834105908870697, "adam_stats/m_t_mean": -1.1361505486418011e-11, "adam_stats/m_t_min": -0.0017524672439321876, "adam_stats/v_t_max": 5.415723717305809e-05, "adam_stats/v_t_mean": 1.6975448824396722e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.05115506052970886, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.674027919769287, "all_logprobs": -0.0739566907286644, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -10.75, "all_logprobs/p1": -1.703125, "all_logprobs/p10": -0.10009765625, "all_logprobs/p25": -0.0002803802490234375, "all_logprobs/p5": -0.416015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.10926146060228348, "clip_ratio": 0.0, "completion_length": 517.8294677734375, "completion_length/correct": 452.5926513671875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 419.0, "completion_length/correct/min": 127.0, "completion_length/correct/p25": 313.5, "completion_length/correct/p75": 544.0, "completion_length/correct/var": 35643.6640625, "completion_length/incorrect": 749.0532836914062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 805.0, "completion_length/incorrect/min": 187.0, "completion_length/incorrect/p25": 507.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 80149.9375, "completion_length/max": 1024.0, "completion_length/median": 444.0, "completion_length/min": 127.0, "completion_length/p25": 330.0, "completion_length/p75": 652.25, "completion_length/var": 60449.6015625, "epoch": 0.4352, "feature_vector_variance/max_squared_error": 114029.7734375, "feature_vector_variance/metric": 25602.060546875, "generated_tokens/total": 14968639.0, "grad_norm": 0.17401042580604553, "grouped_std_rewards": 0.17447006702423096, "learning_rate": 1.2518479547691437e-05, "loss": 0.0512, "mean_logprobs": -0.07470703125, "mean_logprobs/var": 0.00109100341796875, "num_completions/total": 26112, "per_sentence_gradient_norm": 3.309917449951172, "per_sentence_gradient_norm/max": 198.06680297851562, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 107.65679168701172, "per_sentence_gradient_norm/var": 347.4394836425781, "per_token_feature_norm": 163.6276397705078, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 161.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 134.0, "per_token_feature_norm/p75": 191.0, "per_token_feature_norm/var": 1414.8035888671875, "per_token_full_gradient_variance/max_squared_error": 563.0778198242188, "per_token_full_gradient_variance/variance": 0.05237468704581261, "per_token_gradient_norm": 3.9570436477661133, "per_token_gradient_norm/max": 6223.78466796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5725.09765625, "per_token_policy_error_norm": 0.04141676053404808, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.036518678069114685, "policy_entropy": 0.08190963417291641, "policy_entropy/max": 3.75, "policy_entropy/median": 5.3942203521728516e-06, "policy_entropy/min": 1.4988010832439613e-14, "policy_entropy/p25": 2.0954757928848267e-07, "policy_entropy/p75": 0.002685546875, "policy_entropy/var": 0.05219331011176109, "policy_error_vector_variance/max_squared_error": 2.0079407691955566, "policy_error_vector_variance/metric": 0.04139622300863266, "policy_loss": 0.05115504562854767, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.674027919769287, "policy_sharpness": 8.296935081481934, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.25, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.521611213684082, "reward": 0.7799479365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17185290157794952, "rewards/accuracy_reward": 0.7799479365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17185290157794952, "sentence_full_gradient_variance/max_squared_error": 2164533.0, "sentence_full_gradient_variance/metric": 9078.58984375, "sentence_full_gradient_variance/p75": 119.64117431640625, "sentence_full_gradient_variance/p90": 240.90890502929688, "sentence_full_gradient_variance/p95": 240.90890502929688, "sentence_full_gradient_variance/p99": 144451.09375, "state_level_variance/metric": 34.14292907714844, "state_level_variance_full_gradient/metric": 1086.011474609375, "step": 34 }, { "accuracy_reward": 0.7669271230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17898298799991608, "action_level_variance/metric": 469.0629577636719, "action_level_variance_full_gradient/metric": 13978.9482421875, "adam_stats/lr_effective_max": 7.482450746465474e-05, "adam_stats/lr_effective_mean": 2.4093166350880324e-10, "adam_stats/lr_effective_min": -7.353230466833338e-05, "adam_stats/m_t_max": 0.0020900839008390903, "adam_stats/m_t_mean": -1.149678442724511e-11, "adam_stats/m_t_min": -0.0018169443355873227, "adam_stats/v_t_max": 5.413741382653825e-05, "adam_stats/v_t_mean": 1.7103550563682401e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.0050521716475486755, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 4.842082500457764, "all_logprobs": -0.07631585747003555, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -10.5, "all_logprobs/p1": -1.703125, "all_logprobs/p10": -0.10205078125, "all_logprobs/p25": -0.000324249267578125, "all_logprobs/p5": -0.4296875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11278564482927322, "clip_ratio": 0.0, "completion_length": 501.76824951171875, "completion_length/correct": 435.3854064941406, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 393.0, "completion_length/correct/min": 127.0, "completion_length/correct/p25": 287.0, "completion_length/correct/p75": 551.0, "completion_length/correct/var": 41415.40625, "completion_length/incorrect": 720.2011108398438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 778.0, "completion_length/incorrect/min": 149.0, "completion_length/incorrect/p25": 440.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 88902.0078125, "completion_length/max": 1024.0, "completion_length/median": 432.0, "completion_length/min": 127.0, "completion_length/p25": 304.0, "completion_length/p75": 646.0, "completion_length/var": 66900.8671875, "epoch": 0.448, "feature_vector_variance/max_squared_error": 120823.21875, "feature_vector_variance/metric": 25625.08203125, "generated_tokens/total": 15353997.0, "grad_norm": 0.21395769715309143, "grouped_std_rewards": 0.21256375312805176, "learning_rate": 1.2320907072649045e-05, "loss": 0.0051, "mean_logprobs": -0.07568359375, "mean_logprobs/var": 0.00104522705078125, "num_completions/total": 26880, "per_sentence_gradient_norm": 4.242238521575928, "per_sentence_gradient_norm/max": 244.1145782470703, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 26.361309051513672, "per_sentence_gradient_norm/p99": 110.15399169921875, "per_sentence_gradient_norm/var": 451.6544189453125, "per_token_feature_norm": 163.8485870361328, "per_token_feature_norm/max": 334.0, "per_token_feature_norm/median": 161.0, "per_token_feature_norm/min": 63.5, "per_token_feature_norm/p25": 135.0, "per_token_feature_norm/p75": 191.0, "per_token_feature_norm/var": 1393.656494140625, "per_token_full_gradient_variance/max_squared_error": 183.16787719726562, "per_token_full_gradient_variance/variance": 0.05961519479751587, "per_token_gradient_norm": 4.8653364181518555, "per_token_gradient_norm/max": 5818.1748046875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6666.51806640625, "per_token_policy_error_norm": 0.04272853583097458, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03777167573571205, "policy_entropy": 0.08414041250944138, "policy_entropy/max": 3.75, "policy_entropy/median": 4.9173831939697266e-06, "policy_entropy/min": 6.6058269965196814e-15, "policy_entropy/p25": 2.0023435354232788e-07, "policy_entropy/p75": 0.003021240234375, "policy_entropy/var": 0.05425067991018295, "policy_error_vector_variance/max_squared_error": 2.009451389312744, "policy_error_vector_variance/metric": 0.04269975796341896, "policy_loss": 0.0050521655939519405, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 4.842082500457764, "policy_sharpness": 8.273734092712402, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.652257919311523, "reward": 0.7669271230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17898298799991608, "rewards/accuracy_reward": 0.7669271230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17898298799991608, "sentence_full_gradient_variance/max_squared_error": 2867663.5, "sentence_full_gradient_variance/metric": 15802.5654296875, "sentence_full_gradient_variance/p75": 404.4891357421875, "sentence_full_gradient_variance/p90": 670.9896850585938, "sentence_full_gradient_variance/p95": 9094.291015625, "sentence_full_gradient_variance/p99": 285341.96875, "state_level_variance/metric": 41.06403350830078, "state_level_variance_full_gradient/metric": 1823.6165771484375, "step": 35 }, { "accuracy_reward": 0.7109375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20577330887317657, "action_level_variance/metric": 690.6270751953125, "action_level_variance_full_gradient/metric": 7275.61083984375, "adam_stats/lr_effective_max": 7.070103310979903e-05, "adam_stats/lr_effective_mean": 2.7397750734792226e-10, "adam_stats/lr_effective_min": -7.417572487611324e-05, "adam_stats/m_t_max": 0.001478243269957602, "adam_stats/m_t_mean": -4.701143987984047e-12, "adam_stats/m_t_min": -0.0015875009121373296, "adam_stats/v_t_max": 5.416487329057418e-05, "adam_stats/v_t_mean": 1.714688287191013e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.03873549401760101, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.320577383041382, "all_logprobs": -0.0748029351234436, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -9.9375, "all_logprobs/p1": -1.703125, "all_logprobs/p10": -0.10009765625, "all_logprobs/p25": -0.00028228759765625, "all_logprobs/p5": -0.427734375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.10919731855392456, "clip_ratio": 0.0, "completion_length": 543.0807495117188, "completion_length/correct": 466.8992614746094, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 421.0, "completion_length/correct/min": 98.0, "completion_length/correct/p25": 297.0, "completion_length/correct/p75": 599.75, "completion_length/correct/var": 47824.3671875, "completion_length/incorrect": 730.4459838867188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 775.0, "completion_length/incorrect/min": 182.0, "completion_length/incorrect/p25": 448.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 81295.15625, "completion_length/max": 1024.0, "completion_length/median": 473.0, "completion_length/min": 98.0, "completion_length/p25": 325.75, "completion_length/p75": 737.5, "completion_length/var": 71698.5, "epoch": 0.4608, "feature_vector_variance/max_squared_error": 110096.1953125, "feature_vector_variance/metric": 25953.7421875, "generated_tokens/total": 15771083.0, "grad_norm": 0.1876211017370224, "grouped_std_rewards": 0.19236505031585693, "learning_rate": 1.2117461064942437e-05, "loss": -0.0387, "mean_logprobs": -0.0751953125, "mean_logprobs/var": 0.001373291015625, "num_completions/total": 27648, "per_sentence_gradient_norm": 3.998006582260132, "per_sentence_gradient_norm/max": 500.0654602050781, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 17.319183349609375, "per_sentence_gradient_norm/p99": 105.36637115478516, "per_sentence_gradient_norm/var": 675.5225830078125, "per_token_feature_norm": 164.9193115234375, "per_token_feature_norm/max": 308.0, "per_token_feature_norm/median": 163.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 136.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 1376.866455078125, "per_token_full_gradient_variance/max_squared_error": 1168.7415771484375, "per_token_full_gradient_variance/variance": 0.06083284690976143, "per_token_gradient_norm": 4.487252235412598, "per_token_gradient_norm/max": 7160.56884765625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 7113.1435546875, "per_token_policy_error_norm": 0.041730307042598724, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.036531586199998856, "policy_entropy": 0.08348337560892105, "policy_entropy/max": 3.734375, "policy_entropy/median": 5.0067901611328125e-06, "policy_entropy/min": 2.6367796834847468e-15, "policy_entropy/p25": 2.0582228899002075e-07, "policy_entropy/p75": 0.0027008056640625, "policy_entropy/var": 0.054477158933877945, "policy_error_vector_variance/max_squared_error": 2.0100510120391846, "policy_error_vector_variance/metric": 0.04170135036110878, "policy_loss": -0.038735490292310715, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.320577383041382, "policy_sharpness": 8.292058944702148, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.25, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.57288646697998, "reward": 0.7109375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20577330887317657, "rewards/accuracy_reward": 0.7109375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20577330887317657, "sentence_full_gradient_variance/max_squared_error": 2089777.625, "sentence_full_gradient_variance/metric": 8200.9169921875, "sentence_full_gradient_variance/p75": 197.97720336914062, "sentence_full_gradient_variance/p90": 279.3753662109375, "sentence_full_gradient_variance/p95": 279.6175842285156, "sentence_full_gradient_variance/p99": 108291.75, "state_level_variance/metric": 71.08479309082031, "state_level_variance_full_gradient/metric": 925.3060302734375, "step": 36 }, { "accuracy_reward": 0.7174479365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20298071205615997, "action_level_variance/metric": 232.541015625, "action_level_variance_full_gradient/metric": 5058.0888671875, "adam_stats/lr_effective_max": 7.092605665093288e-05, "adam_stats/lr_effective_mean": 1.9967776043738894e-10, "adam_stats/lr_effective_min": -7.23342236597091e-05, "adam_stats/m_t_max": 0.0020415023900568485, "adam_stats/m_t_mean": 1.7502060564722477e-11, "adam_stats/m_t_min": -0.0021468254271894693, "adam_stats/v_t_max": 5.411172242020257e-05, "adam_stats/v_t_mean": 1.7248552846432785e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.022277135401964188, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.187551975250244, "all_logprobs": -0.0723840594291687, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -11.5, "all_logprobs/p1": -1.703125, "all_logprobs/p10": -0.0888671875, "all_logprobs/p25": -0.00022220611572265625, "all_logprobs/p5": -0.390625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1086052730679512, "clip_ratio": 0.0, "completion_length": 541.4583740234375, "completion_length/correct": 473.8747863769531, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 404.0, "completion_length/correct/min": 136.0, "completion_length/correct/p25": 304.0, "completion_length/correct/p75": 629.0, "completion_length/correct/var": 49158.26171875, "completion_length/incorrect": 713.0645141601562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 757.0, "completion_length/incorrect/min": 182.0, "completion_length/incorrect/p25": 454.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 87826.5625, "completion_length/max": 1024.0, "completion_length/median": 472.0, "completion_length/min": 136.0, "completion_length/p25": 316.75, "completion_length/p75": 732.75, "completion_length/var": 71596.6953125, "epoch": 0.4736, "feature_vector_variance/max_squared_error": 109050.40625, "feature_vector_variance/metric": 25714.40234375, "generated_tokens/total": 16186923.0, "grad_norm": 0.17120932042598724, "grouped_std_rewards": 0.21699373424053192, "learning_rate": 1.1908389392193549e-05, "loss": -0.0223, "mean_logprobs": -0.0712890625, "mean_logprobs/var": 0.00107574462890625, "num_completions/total": 28416, "per_sentence_gradient_norm": 3.048130750656128, "per_sentence_gradient_norm/max": 157.18540954589844, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 21.31668472290039, "per_sentence_gradient_norm/p99": 76.72222137451172, "per_sentence_gradient_norm/var": 223.5409698486328, "per_token_feature_norm": 164.92593383789062, "per_token_feature_norm/max": 312.0, "per_token_feature_norm/median": 163.0, "per_token_feature_norm/min": 68.0, "per_token_feature_norm/p25": 136.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 1387.71435546875, "per_token_full_gradient_variance/max_squared_error": 87.39837646484375, "per_token_full_gradient_variance/variance": 0.02646881341934204, "per_token_gradient_norm": 3.3614604473114014, "per_token_gradient_norm/max": 4104.763671875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3180.75048828125, "per_token_policy_error_norm": 0.04032586142420769, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03583613783121109, "policy_entropy": 0.08006220310926437, "policy_entropy/max": 3.78125, "policy_entropy/median": 4.6193599700927734e-06, "policy_entropy/min": 2.222614453595284e-17, "policy_entropy/p25": 1.8440186977386475e-07, "policy_entropy/p75": 0.002166748046875, "policy_entropy/var": 0.05109802633523941, "policy_error_vector_variance/max_squared_error": 2.0115723609924316, "policy_error_vector_variance/metric": 0.04029830917716026, "policy_loss": -0.022277137264609337, "policy_loss/max": 12.958683013916016, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.187551975250244, "policy_sharpness": 8.333541870117188, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.5, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.381317138671875, "reward": 0.7174479365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20298071205615997, "rewards/accuracy_reward": 0.7174479365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20298071205615997, "sentence_full_gradient_variance/max_squared_error": 757911.8125, "sentence_full_gradient_variance/metric": 5746.052734375, "sentence_full_gradient_variance/p75": 66.033203125, "sentence_full_gradient_variance/p90": 91.11864471435547, "sentence_full_gradient_variance/p95": 569.10595703125, "sentence_full_gradient_variance/p99": 144401.90625, "state_level_variance/metric": 19.984699249267578, "state_level_variance_full_gradient/metric": 687.9642333984375, "step": 37 }, { "accuracy_reward": 0.7252604365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19951753318309784, "action_level_variance/metric": 856.8121948242188, "action_level_variance_full_gradient/metric": 5581.84326171875, "adam_stats/lr_effective_max": 6.814200605731457e-05, "adam_stats/lr_effective_mean": 5.743065051300178e-11, "adam_stats/lr_effective_min": -6.979361205594614e-05, "adam_stats/m_t_max": 0.007832027040421963, "adam_stats/m_t_mean": -8.994623101887811e-11, "adam_stats/m_t_min": -0.005819193087518215, "adam_stats/v_t_max": 6.070678136893548e-05, "adam_stats/v_t_mean": 1.926474388000421e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.050736259669065475, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.7435553073883057, "all_logprobs": -0.07538418471813202, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -10.9375, "all_logprobs/p1": -1.703125, "all_logprobs/p10": -0.1005859375, "all_logprobs/p25": -0.00031280517578125, "all_logprobs/p5": -0.4296875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11271993815898895, "clip_ratio": 0.0, "completion_length": 525.4609375, "completion_length/correct": 445.8061218261719, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 387.0, "completion_length/correct/min": 61.0, "completion_length/correct/p25": 277.0, "completion_length/correct/p75": 575.0, "completion_length/correct/var": 52670.9765625, "completion_length/incorrect": 735.734619140625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 813.0, "completion_length/incorrect/min": 118.0, "completion_length/incorrect/p25": 471.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 90334.859375, "completion_length/max": 1024.0, "completion_length/median": 435.0, "completion_length/min": 61.0, "completion_length/p25": 300.0, "completion_length/p75": 753.25, "completion_length/var": 79685.609375, "epoch": 0.4864, "feature_vector_variance/max_squared_error": 116047.65625, "feature_vector_variance/metric": 25631.27734375, "generated_tokens/total": 16590477.0, "grad_norm": 0.6322877407073975, "grouped_std_rewards": 0.1983994096517563, "learning_rate": 1.1693946776030601e-05, "loss": -0.0507, "mean_logprobs": -0.0771484375, "mean_logprobs/var": 0.0012969970703125, "num_completions/total": 29184, "per_sentence_gradient_norm": 4.078022003173828, "per_sentence_gradient_norm/max": 546.8335571289062, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 15.768340110778809, "per_sentence_gradient_norm/p99": 92.24496459960938, "per_sentence_gradient_norm/var": 841.2774047851562, "per_token_feature_norm": 163.9058380126953, "per_token_feature_norm/max": 312.0, "per_token_feature_norm/median": 161.0, "per_token_feature_norm/min": 68.5, "per_token_feature_norm/p25": 135.0, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 1385.68115234375, "per_token_full_gradient_variance/max_squared_error": 639.7415771484375, "per_token_full_gradient_variance/variance": 0.05874931067228317, "per_token_gradient_norm": 4.740694046020508, "per_token_gradient_norm/max": 7501.38623046875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 8050.412109375, "per_token_policy_error_norm": 0.0420563742518425, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03714839741587639, "policy_entropy": 0.08356662839651108, "policy_entropy/max": 3.734375, "policy_entropy/median": 5.751848220825195e-06, "policy_entropy/min": 4.107825191113079e-15, "policy_entropy/p25": 2.3189932107925415e-07, "policy_entropy/p75": 0.003021240234375, "policy_entropy/var": 0.053566861897706985, "policy_error_vector_variance/max_squared_error": 2.0153167247772217, "policy_error_vector_variance/metric": 0.04202587902545929, "policy_loss": -0.050736259669065475, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.7435553073883057, "policy_sharpness": 8.27878475189209, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.618725776672363, "reward": 0.7252604365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19951753318309784, "rewards/accuracy_reward": 0.7252604365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19951753318309784, "sentence_full_gradient_variance/max_squared_error": 905108.5625, "sentence_full_gradient_variance/metric": 6300.685546875, "sentence_full_gradient_variance/p75": 213.1720428466797, "sentence_full_gradient_variance/p90": 293.6150817871094, "sentence_full_gradient_variance/p95": 293.6150817871094, "sentence_full_gradient_variance/p99": 196085.921875, "state_level_variance/metric": 91.42359161376953, "state_level_variance_full_gradient/metric": 718.8421630859375, "step": 38 }, { "accuracy_reward": 0.765625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17967729270458221, "action_level_variance/metric": 299.7020263671875, "action_level_variance_full_gradient/metric": 4865.2529296875, "adam_stats/lr_effective_max": 6.77527132211253e-05, "adam_stats/lr_effective_mean": 4.88036555612581e-11, "adam_stats/lr_effective_min": -6.723921251250431e-05, "adam_stats/m_t_max": 0.007243006955832243, "adam_stats/m_t_mean": -8.311892840673352e-11, "adam_stats/m_t_min": -0.0055470275692641735, "adam_stats/v_t_max": 6.064945409889333e-05, "adam_stats/v_t_mean": 1.9311609603112068e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.14907360076904297, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.622377395629883, "all_logprobs": -0.07731962203979492, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -13.5625, "all_logprobs/p1": -1.7265625, "all_logprobs/p10": -0.1005859375, "all_logprobs/p25": -0.0003452301025390625, "all_logprobs/p5": -0.4296875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12043814361095428, "clip_ratio": 0.0, "completion_length": 546.4453125, "completion_length/correct": 467.2278747558594, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 425.0, "completion_length/correct/min": 100.0, "completion_length/correct/p25": 293.5, "completion_length/correct/p75": 591.0, "completion_length/correct/var": 45529.57421875, "completion_length/incorrect": 805.2222290039062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 924.0, "completion_length/incorrect/min": 103.0, "completion_length/incorrect/p25": 616.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 67467.7265625, "completion_length/max": 1024.0, "completion_length/median": 488.0, "completion_length/min": 100.0, "completion_length/p25": 334.75, "completion_length/p75": 746.5, "completion_length/var": 71116.4375, "epoch": 0.4992, "feature_vector_variance/max_squared_error": 122464.8828125, "feature_vector_variance/metric": 25646.18359375, "generated_tokens/total": 17010148.0, "grad_norm": 0.14689718186855316, "grouped_std_rewards": 0.20604334771633148, "learning_rate": 1.1474394481749037e-05, "loss": -0.1491, "mean_logprobs": -0.07666015625, "mean_logprobs/var": 0.003265380859375, "num_completions/total": 29952, "per_sentence_gradient_norm": 3.3326680660247803, "per_sentence_gradient_norm/max": 179.16978454589844, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 17.33236312866211, "per_sentence_gradient_norm/p99": 104.2822494506836, "per_sentence_gradient_norm/var": 288.97161865234375, "per_token_feature_norm": 163.30410766601562, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 160.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 134.0, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 1436.6636962890625, "per_token_full_gradient_variance/max_squared_error": 200.82223510742188, "per_token_full_gradient_variance/variance": 0.03729608654975891, "per_token_gradient_norm": 4.014011383056641, "per_token_gradient_norm/max": 5196.63427734375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4212.375, "per_token_policy_error_norm": 0.0424354150891304, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03742720186710358, "policy_entropy": 0.08610326796770096, "policy_entropy/max": 3.65625, "policy_entropy/median": 6.884336471557617e-06, "policy_entropy/min": 5.2735593669694936e-15, "policy_entropy/p25": 2.477318048477173e-07, "policy_entropy/p75": 0.0032958984375, "policy_entropy/var": 0.05890160799026489, "policy_error_vector_variance/max_squared_error": 2.012725830078125, "policy_error_vector_variance/metric": 0.04236995056271553, "policy_loss": -0.14907360076904297, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.622377395629883, "policy_sharpness": 8.250244140625, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.836928367614746, "reward": 0.765625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17967729270458221, "rewards/accuracy_reward": 0.765625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17967729270458221, "sentence_full_gradient_variance/max_squared_error": 1067826.75, "sentence_full_gradient_variance/metric": 5397.236328125, "sentence_full_gradient_variance/p75": 225.1034698486328, "sentence_full_gradient_variance/p90": 680.1786499023438, "sentence_full_gradient_variance/p95": 680.1786499023438, "sentence_full_gradient_variance/p99": 144650.703125, "state_level_variance/metric": 26.63351058959961, "state_level_variance_full_gradient/metric": 531.9833984375, "step": 39 }, { "accuracy_reward": 0.7395833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19285091757774353, "action_level_variance/metric": 586.8591918945312, "action_level_variance_full_gradient/metric": 9674.330078125, "adam_stats/lr_effective_max": 6.666801346000284e-05, "adam_stats/lr_effective_mean": 1.646511954556118e-10, "adam_stats/lr_effective_min": -6.63427053950727e-05, "adam_stats/m_t_max": 0.006430205423384905, "adam_stats/m_t_mean": -5.629133617568449e-11, "adam_stats/m_t_min": -0.0050411531701684, "adam_stats/v_t_max": 6.060920350137167e-05, "adam_stats/v_t_mean": 1.953589200132111e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.1089930385351181, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.560396909713745, "all_logprobs": -0.07201091945171356, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -11.1875, "all_logprobs/p1": -1.6796875, "all_logprobs/p10": -0.0888671875, "all_logprobs/p25": -0.00019931793212890625, "all_logprobs/p5": -0.388671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.10865484923124313, "clip_ratio": 0.0, "completion_length": 533.5612182617188, "completion_length/correct": 452.283447265625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 407.0, "completion_length/correct/min": 40.0, "completion_length/correct/p25": 292.75, "completion_length/correct/p75": 569.25, "completion_length/correct/var": 43943.73046875, "completion_length/incorrect": 764.3899536132812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 839.0, "completion_length/incorrect/min": 134.0, "completion_length/incorrect/p25": 516.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 73384.828125, "completion_length/max": 1024.0, "completion_length/median": 462.0, "completion_length/min": 40.0, "completion_length/p25": 315.75, "completion_length/p75": 708.5, "completion_length/var": 70310.703125, "epoch": 0.512, "feature_vector_variance/max_squared_error": 123626.6171875, "feature_vector_variance/metric": 25900.61328125, "generated_tokens/total": 17419922.0, "grad_norm": 0.2722166180610657, "grouped_std_rewards": 0.19578325748443604, "learning_rate": 1.125e-05, "loss": 0.109, "mean_logprobs": -0.07177734375, "mean_logprobs/var": 0.00179290771484375, "num_completions/total": 30720, "per_sentence_gradient_norm": 3.645425796508789, "per_sentence_gradient_norm/max": 356.65863037109375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 18.026052474975586, "per_sentence_gradient_norm/p99": 78.83082580566406, "per_sentence_gradient_norm/var": 574.3178100585938, "per_token_feature_norm": 164.71774291992188, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 163.0, "per_token_feature_norm/min": 67.5, "per_token_feature_norm/p25": 135.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 1417.5350341796875, "per_token_full_gradient_variance/max_squared_error": 1083.5225830078125, "per_token_full_gradient_variance/variance": 0.06813215464353561, "per_token_gradient_norm": 4.4477219581604, "per_token_gradient_norm/max": 7098.4052734375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 8423.8779296875, "per_token_policy_error_norm": 0.04005664587020874, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03550058603286743, "policy_entropy": 0.07976201176643372, "policy_entropy/max": 3.71875, "policy_entropy/median": 4.5299530029296875e-06, "policy_entropy/min": 1.240327285323417e-16, "policy_entropy/p25": 1.7881393432617188e-07, "policy_entropy/p75": 0.00193023681640625, "policy_entropy/var": 0.05224980413913727, "policy_error_vector_variance/max_squared_error": 2.0083725452423096, "policy_error_vector_variance/metric": 0.04003271088004112, "policy_loss": 0.1089930385351181, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -7.481915473937988, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.560396909713745, "policy_sharpness": 8.350808143615723, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.75, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.339042663574219, "reward": 0.7395833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19285091757774353, "rewards/accuracy_reward": 0.7395833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19285091757774353, "sentence_full_gradient_variance/max_squared_error": 2993643.5, "sentence_full_gradient_variance/metric": 11004.8427734375, "sentence_full_gradient_variance/p75": 85.22161102294922, "sentence_full_gradient_variance/p90": 147.51101684570312, "sentence_full_gradient_variance/p95": 147.51101684570312, "sentence_full_gradient_variance/p99": 164714.90625, "state_level_variance/metric": 60.700565338134766, "state_level_variance_full_gradient/metric": 1330.51318359375, "step": 40 }, { "accuracy_reward": 0.6901041865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21413923799991608, "action_level_variance/metric": 415.2188720703125, "action_level_variance_full_gradient/metric": 6249.5859375, "adam_stats/lr_effective_max": 6.792345811845735e-05, "adam_stats/lr_effective_mean": 2.7331123475526908e-11, "adam_stats/lr_effective_min": -6.746972212567925e-05, "adam_stats/m_t_max": 0.00869245920330286, "adam_stats/m_t_mean": -8.138992257933353e-11, "adam_stats/m_t_min": -0.006685475818812847, "adam_stats/v_t_max": 6.0719197790604085e-05, "adam_stats/v_t_mean": 1.9863095543359854e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.09790630638599396, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.8115737438201904, "all_logprobs": -0.07475362718105316, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -12.5, "all_logprobs/p1": -1.703125, "all_logprobs/p10": -0.09619140625, "all_logprobs/p25": -0.00026702880859375, "all_logprobs/p5": -0.412109375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11424585431814194, "clip_ratio": 0.0, "completion_length": 557.1041870117188, "completion_length/correct": 459.3358459472656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 425.0, "completion_length/correct/min": 73.0, "completion_length/correct/p25": 308.0, "completion_length/correct/p75": 572.0, "completion_length/correct/var": 43864.17578125, "completion_length/incorrect": 774.8235473632812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 843.0, "completion_length/incorrect/min": 229.0, "completion_length/incorrect/p25": 564.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 68679.4453125, "completion_length/max": 1024.0, "completion_length/median": 496.0, "completion_length/min": 73.0, "completion_length/p25": 344.0, "completion_length/p75": 746.25, "completion_length/var": 72788.6171875, "epoch": 0.5248, "feature_vector_variance/max_squared_error": 114791.4921875, "feature_vector_variance/metric": 25323.443359375, "generated_tokens/total": 17847778.0, "grad_norm": 0.3053728938102722, "grouped_std_rewards": 0.21120858192443848, "learning_rate": 1.1021036720894182e-05, "loss": 0.0979, "mean_logprobs": -0.07275390625, "mean_logprobs/var": 0.00186920166015625, "num_completions/total": 31488, "per_sentence_gradient_norm": 3.7719645500183105, "per_sentence_gradient_norm/max": 238.1903076171875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 19.76502799987793, "per_sentence_gradient_norm/p99": 130.37669372558594, "per_sentence_gradient_norm/var": 401.51397705078125, "per_token_feature_norm": 162.645263671875, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 160.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 133.0, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 1404.2520751953125, "per_token_full_gradient_variance/max_squared_error": 173.99765014648438, "per_token_full_gradient_variance/variance": 0.04272661358118057, "per_token_gradient_norm": 4.4769792556762695, "per_token_gradient_norm/max": 5252.36181640625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4920.90869140625, "per_token_policy_error_norm": 0.04140646383166313, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03667021170258522, "policy_entropy": 0.08282987028360367, "policy_entropy/max": 3.71875, "policy_entropy/median": 6.645917892456055e-06, "policy_entropy/min": 9.43689570931383e-15, "policy_entropy/p25": 2.5890767574310303e-07, "policy_entropy/p75": 0.0026092529296875, "policy_entropy/var": 0.05532119795680046, "policy_error_vector_variance/max_squared_error": 2.012716293334961, "policy_error_vector_variance/metric": 0.041372232139110565, "policy_loss": 0.09790630638599396, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -7.481915473937988, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.8115737438201904, "policy_sharpness": 8.29857349395752, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.584142684936523, "reward": 0.6901041865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21413923799991608, "rewards/accuracy_reward": 0.6901041865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21413923799991608, "sentence_full_gradient_variance/max_squared_error": 2622675.0, "sentence_full_gradient_variance/metric": 7089.88427734375, "sentence_full_gradient_variance/p75": 119.12202453613281, "sentence_full_gradient_variance/p90": 190.14297485351562, "sentence_full_gradient_variance/p95": 190.14297485351562, "sentence_full_gradient_variance/p99": 100198.015625, "state_level_variance/metric": 38.07121658325195, "state_level_variance_full_gradient/metric": 840.2978515625, "step": 41 }, { "accuracy_reward": 0.6875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21512383222579956, "action_level_variance/metric": 492.9267578125, "action_level_variance_full_gradient/metric": 8244.462890625, "adam_stats/lr_effective_max": 6.738019874319434e-05, "adam_stats/lr_effective_mean": 5.671680139429647e-11, "adam_stats/lr_effective_min": -6.839602428954095e-05, "adam_stats/m_t_max": 0.008421357721090317, "adam_stats/m_t_mean": -7.27186366678012e-11, "adam_stats/m_t_min": -0.006428915541619062, "adam_stats/v_t_max": 6.076362114981748e-05, "adam_stats/v_t_mean": 1.990001479573733e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.10464338958263397, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.7239906787872314, "all_logprobs": -0.07538238167762756, "all_logprobs/max": 0.0, "all_logprobs/median": -4.76837158203125e-07, "all_logprobs/min": -12.875, "all_logprobs/p1": -1.703125, "all_logprobs/p10": -0.10107421875, "all_logprobs/p25": -0.0003795623779296875, "all_logprobs/p5": -0.427734375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11251504719257355, "clip_ratio": 0.0, "completion_length": 577.0911865234375, "completion_length/correct": 483.35986328125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 423.0, "completion_length/correct/min": 130.0, "completion_length/correct/p25": 319.0, "completion_length/correct/p75": 621.75, "completion_length/correct/var": 47784.41796875, "completion_length/incorrect": 783.300048828125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 964.0, "completion_length/incorrect/min": 188.0, "completion_length/incorrect/p25": 492.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 84806.3515625, "completion_length/max": 1024.0, "completion_length/median": 479.0, "completion_length/min": 130.0, "completion_length/p25": 341.0, "completion_length/p75": 828.25, "completion_length/var": 78611.7109375, "epoch": 0.5376, "feature_vector_variance/max_squared_error": 109334.7890625, "feature_vector_variance/metric": 25418.236328125, "generated_tokens/total": 18290984.0, "grad_norm": 0.17331956326961517, "grouped_std_rewards": 0.18407446146011353, "learning_rate": 1.078778360091808e-05, "loss": 0.1046, "mean_logprobs": -0.0751953125, "mean_logprobs/var": 0.001220703125, "num_completions/total": 32256, "per_sentence_gradient_norm": 3.7990036010742188, "per_sentence_gradient_norm/max": 315.1883850097656, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 13.917869567871094, "per_sentence_gradient_norm/p99": 107.11406707763672, "per_sentence_gradient_norm/var": 479.1181945800781, "per_token_feature_norm": 162.84263610839844, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 160.0, "per_token_feature_norm/min": 69.0, "per_token_feature_norm/p25": 134.0, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 1342.2945556640625, "per_token_full_gradient_variance/max_squared_error": 565.8198852539062, "per_token_full_gradient_variance/variance": 0.06714049726724625, "per_token_gradient_norm": 4.757879734039307, "per_token_gradient_norm/max": 6514.19091796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 7044.8701171875, "per_token_policy_error_norm": 0.04197748750448227, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03694973140954971, "policy_entropy": 0.08399967849254608, "policy_entropy/max": 3.71875, "policy_entropy/median": 8.106231689453125e-06, "policy_entropy/min": 1.2823075934420558e-14, "policy_entropy/p25": 3.725290298461914e-07, "policy_entropy/p75": 0.003448486328125, "policy_entropy/var": 0.05350395292043686, "policy_error_vector_variance/max_squared_error": 2.01420521736145, "policy_error_vector_variance/metric": 0.041951265186071396, "policy_loss": 0.10464337468147278, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.7239906787872314, "policy_sharpness": 8.250329971313477, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.717951774597168, "reward": 0.6875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21512383222579956, "rewards/accuracy_reward": 0.6875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21512383222579956, "sentence_full_gradient_variance/max_squared_error": 2103188.0, "sentence_full_gradient_variance/metric": 9292.4794921875, "sentence_full_gradient_variance/p75": 251.99234008789062, "sentence_full_gradient_variance/p90": 529.2525634765625, "sentence_full_gradient_variance/p95": 529.2525634765625, "sentence_full_gradient_variance/p99": 156765.6875, "state_level_variance/metric": 47.680091857910156, "state_level_variance_full_gradient/metric": 1048.016357421875, "step": 42 }, { "accuracy_reward": 0.73046875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19714082777500153, "action_level_variance/metric": 368.3166198730469, "action_level_variance_full_gradient/metric": 6647.31005859375, "adam_stats/lr_effective_max": 6.576696614501998e-05, "adam_stats/lr_effective_mean": 7.074823366037819e-12, "adam_stats/lr_effective_min": -6.268042488954961e-05, "adam_stats/m_t_max": 0.006163205951452255, "adam_stats/m_t_mean": -4.9554419595532195e-11, "adam_stats/m_t_min": -0.0049376352690160275, "adam_stats/v_t_max": 6.07088950346224e-05, "adam_stats/v_t_mean": 1.995090290890511e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.04179834574460983, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.8721935749053955, "all_logprobs": -0.06825864315032959, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -9.0, "all_logprobs/p1": -1.6015625, "all_logprobs/p10": -0.07275390625, "all_logprobs/p25": -0.00012302398681640625, "all_logprobs/p5": -0.38671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.10109040886163712, "clip_ratio": 0.0, "completion_length": 533.6237182617188, "completion_length/correct": 443.3671875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 395.0, "completion_length/correct/min": 101.0, "completion_length/correct/p25": 284.0, "completion_length/correct/p75": 553.0, "completion_length/correct/var": 44579.703125, "completion_length/incorrect": 778.2318725585938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 863.0, "completion_length/incorrect/min": 238.0, "completion_length/incorrect/p25": 547.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 71545.9765625, "completion_length/max": 1024.0, "completion_length/median": 456.0, "completion_length/min": 101.0, "completion_length/p25": 311.0, "completion_length/p75": 718.5, "completion_length/var": 73870.40625, "epoch": 0.5504, "feature_vector_variance/max_squared_error": 108900.984375, "feature_vector_variance/metric": 25252.58984375, "generated_tokens/total": 18700808.0, "grad_norm": 0.1640872210264206, "grouped_std_rewards": 0.19284343719482422, "learning_rate": 1.0550524823068504e-05, "loss": 0.0418, "mean_logprobs": -0.06982421875, "mean_logprobs/var": 0.00142669677734375, "num_completions/total": 33024, "per_sentence_gradient_norm": 3.6459262371063232, "per_sentence_gradient_norm/max": 193.859619140625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 14.943937301635742, "per_sentence_gradient_norm/p99": 100.13721466064453, "per_sentence_gradient_norm/var": 355.4866638183594, "per_token_feature_norm": 163.32191467285156, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 161.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 134.0, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 1359.1907958984375, "per_token_full_gradient_variance/max_squared_error": 161.89291381835938, "per_token_full_gradient_variance/variance": 0.041962046176195145, "per_token_gradient_norm": 4.141676902770996, "per_token_gradient_norm/max": 6009.14990234375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4944.75244140625, "per_token_policy_error_norm": 0.03831296041607857, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03394940122961998, "policy_entropy": 0.0755065307021141, "policy_entropy/max": 3.703125, "policy_entropy/median": 4.76837158203125e-06, "policy_entropy/min": 9.769962616701378e-15, "policy_entropy/p25": 2.2724270820617676e-07, "policy_entropy/p75": 0.0012359619140625, "policy_entropy/var": 0.04912232235074043, "policy_error_vector_variance/max_squared_error": 2.009974956512451, "policy_error_vector_variance/metric": 0.03829321265220642, "policy_loss": 0.041798338294029236, "policy_loss/max": 12.958683013916016, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.8721935749053955, "policy_sharpness": 8.430354118347168, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.975343704223633, "reward": 0.73046875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19714082777500153, "rewards/accuracy_reward": 0.73046875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19714082777500153, "sentence_full_gradient_variance/max_squared_error": 1281925.375, "sentence_full_gradient_variance/metric": 7503.1767578125, "sentence_full_gradient_variance/p75": 170.3673553466797, "sentence_full_gradient_variance/p90": 309.8918151855469, "sentence_full_gradient_variance/p95": 310.2286071777344, "sentence_full_gradient_variance/p99": 201888.921875, "state_level_variance/metric": 33.09150314331055, "state_level_variance_full_gradient/metric": 855.865478515625, "step": 43 }, { "accuracy_reward": 0.7643229365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18036822974681854, "action_level_variance/metric": 384.18408203125, "action_level_variance_full_gradient/metric": 6994.005859375, "adam_stats/lr_effective_max": 6.180913624120876e-05, "adam_stats/lr_effective_mean": 2.0633944552983507e-10, "adam_stats/lr_effective_min": -6.135425792308524e-05, "adam_stats/m_t_max": 0.002450030529871583, "adam_stats/m_t_mean": -8.156648967361235e-12, "adam_stats/m_t_min": -0.0025439022574573755, "adam_stats/v_t_max": 6.204345845617354e-05, "adam_stats/v_t_mean": 2.0313627082518826e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.036274686455726624, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.7092883586883545, "all_logprobs": -0.0757652148604393, "all_logprobs/max": 0.0, "all_logprobs/median": -4.76837158203125e-07, "all_logprobs/min": -10.0625, "all_logprobs/p1": -1.703125, "all_logprobs/p10": -0.10009765625, "all_logprobs/p25": -0.0003376007080078125, "all_logprobs/p5": -0.41796875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11754626035690308, "clip_ratio": 0.0, "completion_length": 522.0065307617188, "completion_length/correct": 429.7018737792969, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 399.0, "completion_length/correct/min": 81.0, "completion_length/correct/p25": 279.5, "completion_length/correct/p75": 532.0, "completion_length/correct/var": 39853.2890625, "completion_length/incorrect": 821.359130859375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 204.0, "completion_length/incorrect/p25": 608.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 66395.078125, "completion_length/max": 1024.0, "completion_length/median": 453.0, "completion_length/min": 81.0, "completion_length/p25": 312.0, "completion_length/p75": 691.0, "completion_length/var": 73697.828125, "epoch": 0.5632, "feature_vector_variance/max_squared_error": 113670.859375, "feature_vector_variance/metric": 25981.33984375, "generated_tokens/total": 19101708.0, "grad_norm": 0.31436407566070557, "grouped_std_rewards": 0.15215662121772766, "learning_rate": 1.0309549450619342e-05, "loss": 0.0363, "mean_logprobs": -0.0732421875, "mean_logprobs/var": 0.00174713134765625, "num_completions/total": 33792, "per_sentence_gradient_norm": 3.1530141830444336, "per_sentence_gradient_norm/max": 273.8856506347656, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 93.14714050292969, "per_sentence_gradient_norm/var": 374.7304992675781, "per_token_feature_norm": 164.03958129882812, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 161.0, "per_token_feature_norm/min": 68.5, "per_token_feature_norm/p25": 135.0, "per_token_feature_norm/p75": 191.0, "per_token_feature_norm/var": 1410.371826171875, "per_token_full_gradient_variance/max_squared_error": 272.6882019042969, "per_token_full_gradient_variance/variance": 0.04402695596218109, "per_token_gradient_norm": 4.0563740730285645, "per_token_gradient_norm/max": 6137.498046875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5488.0927734375, "per_token_policy_error_norm": 0.04172685742378235, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0372152253985405, "policy_entropy": 0.08382240682840347, "policy_entropy/max": 3.71875, "policy_entropy/median": 7.450580596923828e-06, "policy_entropy/min": 4.052314039881821e-15, "policy_entropy/p25": 2.551823854446411e-07, "policy_entropy/p75": 0.0031890869140625, "policy_entropy/var": 0.056017421185970306, "policy_error_vector_variance/max_squared_error": 2.010915756225586, "policy_error_vector_variance/metric": 0.04167867451906204, "policy_loss": 0.03627469018101692, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.7092883586883545, "policy_sharpness": 8.264795303344727, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.68116569519043, "reward": 0.7643229365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18036822974681854, "rewards/accuracy_reward": 0.7643229365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18036822974681854, "sentence_full_gradient_variance/max_squared_error": 2305530.5, "sentence_full_gradient_variance/metric": 7938.3544921875, "sentence_full_gradient_variance/p75": 123.14542388916016, "sentence_full_gradient_variance/p90": 154.89431762695312, "sentence_full_gradient_variance/p95": 154.89431762695312, "sentence_full_gradient_variance/p99": 134766.765625, "state_level_variance/metric": 38.48236846923828, "state_level_variance_full_gradient/metric": 944.3480834960938, "step": 44 }, { "accuracy_reward": 0.7213541865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.201264426112175, "action_level_variance/metric": 340.2701416015625, "action_level_variance_full_gradient/metric": 3477.61474609375, "adam_stats/lr_effective_max": 6.055034100427292e-05, "adam_stats/lr_effective_mean": 9.814875301383807e-11, "adam_stats/lr_effective_min": -5.9608646552078426e-05, "adam_stats/m_t_max": 0.0022244080901145935, "adam_stats/m_t_mean": 4.680678154095341e-12, "adam_stats/m_t_min": -0.0022045604418963194, "adam_stats/v_t_max": 6.226108962437138e-05, "adam_stats/v_t_mean": 2.0747585507269184e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.13380646705627441, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.5467381477355957, "all_logprobs": -0.08027710020542145, "all_logprobs/max": 0.0, "all_logprobs/median": -4.76837158203125e-07, "all_logprobs/min": -9.75, "all_logprobs/p1": -1.765625, "all_logprobs/p10": -0.1142578125, "all_logprobs/p25": -0.000453948974609375, "all_logprobs/p5": -0.474609375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12097962200641632, "clip_ratio": 0.0, "completion_length": 552.2747802734375, "completion_length/correct": 458.7310485839844, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 408.0, "completion_length/correct/min": 93.0, "completion_length/correct/p25": 316.25, "completion_length/correct/p75": 556.75, "completion_length/correct/var": 43113.875, "completion_length/incorrect": 794.439208984375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 930.0, "completion_length/incorrect/min": 190.0, "completion_length/incorrect/p25": 570.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 73077.734375, "completion_length/max": 1024.0, "completion_length/median": 454.0, "completion_length/min": 93.0, "completion_length/p25": 345.0, "completion_length/p75": 769.75, "completion_length/var": 74061.296875, "epoch": 0.576, "feature_vector_variance/max_squared_error": 111153.9921875, "feature_vector_variance/metric": 25536.203125, "generated_tokens/total": 19525856.0, "grad_norm": 0.3334077298641205, "grouped_std_rewards": 0.20042598247528076, "learning_rate": 1.0065151074942516e-05, "loss": 0.1338, "mean_logprobs": -0.07666015625, "mean_logprobs/var": 0.00139617919921875, "num_completions/total": 34560, "per_sentence_gradient_norm": 3.4254262447357178, "per_sentence_gradient_norm/max": 215.66481018066406, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 21.461034774780273, "per_sentence_gradient_norm/p99": 77.66612243652344, "per_sentence_gradient_norm/var": 328.9649353027344, "per_token_feature_norm": 162.69654846191406, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 159.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 133.0, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 1416.596923828125, "per_token_full_gradient_variance/max_squared_error": 1236.5745849609375, "per_token_full_gradient_variance/variance": 0.06673867255449295, "per_token_gradient_norm": 4.68178653717041, "per_token_gradient_norm/max": 6494.70703125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 7213.77978515625, "per_token_policy_error_norm": 0.044674720615148544, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03948443755507469, "policy_entropy": 0.08861511200666428, "policy_entropy/max": 3.75, "policy_entropy/median": 8.404254913330078e-06, "policy_entropy/min": 2.095545958979983e-15, "policy_entropy/p25": 3.073364496231079e-07, "policy_entropy/p75": 0.00421142578125, "policy_entropy/var": 0.05840463936328888, "policy_error_vector_variance/max_squared_error": 2.0124359130859375, "policy_error_vector_variance/metric": 0.04464898258447647, "policy_loss": 0.1338064819574356, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.546738862991333, "policy_sharpness": 8.208320617675781, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.6875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.95722484588623, "reward": 0.7213541865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.201264426112175, "rewards/accuracy_reward": 0.7213541865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.201264426112175, "sentence_full_gradient_variance/max_squared_error": 810995.25, "sentence_full_gradient_variance/metric": 3928.00537109375, "sentence_full_gradient_variance/p75": 86.7933120727539, "sentence_full_gradient_variance/p90": 234.2397918701172, "sentence_full_gradient_variance/p95": 234.2397918701172, "sentence_full_gradient_variance/p99": 79952.3125, "state_level_variance/metric": 31.124439239501953, "state_level_variance_full_gradient/metric": 450.3900146484375, "step": 45 }, { "accuracy_reward": 0.8033854365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15816323459148407, "action_level_variance/metric": 408.71270751953125, "action_level_variance_full_gradient/metric": 3859.26708984375, "adam_stats/lr_effective_max": 5.654064079863019e-05, "adam_stats/lr_effective_mean": 1.312512459827886e-10, "adam_stats/lr_effective_min": -5.690287798643112e-05, "adam_stats/m_t_max": 0.002067580120638013, "adam_stats/m_t_mean": -2.266005091899892e-12, "adam_stats/m_t_min": -0.0020451394375413656, "adam_stats/v_t_max": 6.222767842700705e-05, "adam_stats/v_t_mean": 2.074487066502928e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.09352624416351318, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.6872403621673584, "all_logprobs": -0.07624539732933044, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -11.0625, "all_logprobs/p1": -1.71875, "all_logprobs/p10": -0.10009765625, "all_logprobs/p25": -0.0003032684326171875, "all_logprobs/p5": -0.4296875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1158742606639862, "clip_ratio": 0.0, "completion_length": 516.3919677734375, "completion_length/correct": 451.14910888671875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 394.0, "completion_length/correct/min": 136.0, "completion_length/correct/p25": 281.0, "completion_length/correct/p75": 595.0, "completion_length/correct/var": 43158.6875, "completion_length/incorrect": 782.9801025390625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 900.0, "completion_length/incorrect/min": 203.0, "completion_length/incorrect/p25": 511.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 73187.0859375, "completion_length/max": 1024.0, "completion_length/median": 443.0, "completion_length/min": 136.0, "completion_length/p25": 300.0, "completion_length/p75": 679.5, "completion_length/var": 66390.6328125, "epoch": 0.5888, "feature_vector_variance/max_squared_error": 113476.6015625, "feature_vector_variance/metric": 25795.12109375, "generated_tokens/total": 19922444.0, "grad_norm": 0.06928649544715881, "grouped_std_rewards": 0.15697859227657318, "learning_rate": 9.817627457812105e-06, "loss": 0.0935, "mean_logprobs": -0.0732421875, "mean_logprobs/var": 0.00154876708984375, "num_completions/total": 35328, "per_sentence_gradient_norm": 3.331531047821045, "per_sentence_gradient_norm/max": 215.5148162841797, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 122.03267669677734, "per_sentence_gradient_norm/var": 398.1320495605469, "per_token_feature_norm": 163.84092712402344, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 161.0, "per_token_feature_norm/min": 67.5, "per_token_feature_norm/p25": 135.0, "per_token_feature_norm/p75": 191.0, "per_token_feature_norm/var": 1394.514892578125, "per_token_full_gradient_variance/max_squared_error": 250.1844940185547, "per_token_full_gradient_variance/variance": 0.05210690200328827, "per_token_gradient_norm": 4.911154270172119, "per_token_gradient_norm/max": 6275.27880859375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6220.76611328125, "per_token_policy_error_norm": 0.042356643825769424, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03764744848012924, "policy_entropy": 0.08394535630941391, "policy_entropy/max": 3.578125, "policy_entropy/median": 6.794929504394531e-06, "policy_entropy/min": 4.228388472693467e-17, "policy_entropy/p25": 2.421438694000244e-07, "policy_entropy/p75": 0.0029449462890625, "policy_entropy/var": 0.055497024208307266, "policy_error_vector_variance/max_squared_error": 2.009747266769409, "policy_error_vector_variance/metric": 0.042336929589509964, "policy_loss": 0.09352624416351318, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -7.481915473937988, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.6872403621673584, "policy_sharpness": 8.279706954956055, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.25, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.65841293334961, "reward": 0.8033854365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15816323459148407, "rewards/accuracy_reward": 0.8033854365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15816323459148407, "sentence_full_gradient_variance/max_squared_error": 1080504.625, "sentence_full_gradient_variance/metric": 4382.6748046875, "sentence_full_gradient_variance/p75": 65.17340850830078, "sentence_full_gradient_variance/p90": 94.072998046875, "sentence_full_gradient_variance/p95": 94.072998046875, "sentence_full_gradient_variance/p99": 46281.99609375, "state_level_variance/metric": 40.41093444824219, "state_level_variance_full_gradient/metric": 523.4078979492188, "step": 46 }, { "accuracy_reward": 0.6888021230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2146332561969757, "action_level_variance/metric": 714.6956787109375, "action_level_variance_full_gradient/metric": 4705.498046875, "adam_stats/lr_effective_max": 5.7666798966238275e-05, "adam_stats/lr_effective_mean": 2.8364197102170863e-10, "adam_stats/lr_effective_min": -5.770711868535727e-05, "adam_stats/m_t_max": 0.0035673086531460285, "adam_stats/m_t_mean": 3.186998928650375e-11, "adam_stats/m_t_min": -0.0031458488665521145, "adam_stats/v_t_max": 6.226082041393965e-05, "adam_stats/v_t_mean": 2.16916914118348e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.01934683322906494, "advantages/max": 12.9586820602417, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.6046411991119385, "all_logprobs": -0.07363256812095642, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -10.125, "all_logprobs/p1": -1.703125, "all_logprobs/p10": -0.08984375, "all_logprobs/p25": -0.0002307891845703125, "all_logprobs/p5": -0.404296875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11081662029027939, "clip_ratio": 0.0, "completion_length": 601.15234375, "completion_length/correct": 501.28546142578125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 456.0, "completion_length/correct/min": 144.0, "completion_length/correct/p25": 340.0, "completion_length/correct/p75": 643.0, "completion_length/correct/var": 45457.72265625, "completion_length/incorrect": 822.1966552734375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 995.0, "completion_length/incorrect/min": 225.0, "completion_length/incorrect/p25": 613.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 63608.65625, "completion_length/max": 1024.0, "completion_length/median": 534.0, "completion_length/min": 144.0, "completion_length/p25": 372.0, "completion_length/p75": 839.25, "completion_length/var": 73134.484375, "epoch": 0.6016, "feature_vector_variance/max_squared_error": 112735.046875, "feature_vector_variance/metric": 25735.533203125, "generated_tokens/total": 20384128.0, "grad_norm": 0.4684304893016815, "grouped_std_rewards": 0.22204077243804932, "learning_rate": 9.567280168627493e-06, "loss": -0.0193, "mean_logprobs": -0.0712890625, "mean_logprobs/var": 0.00150299072265625, "num_completions/total": 36096, "per_sentence_gradient_norm": 3.904222011566162, "per_sentence_gradient_norm/max": 564.2070922851562, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 22.901750564575195, "per_sentence_gradient_norm/p99": 84.7280044555664, "per_sentence_gradient_norm/var": 700.3645629882812, "per_token_feature_norm": 163.81936645507812, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 161.0, "per_token_feature_norm/min": 69.0, "per_token_feature_norm/p25": 135.0, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 1345.720458984375, "per_token_full_gradient_variance/max_squared_error": 466.7949523925781, "per_token_full_gradient_variance/variance": 0.05163038149476051, "per_token_gradient_norm": 4.486845970153809, "per_token_gradient_norm/max": 6817.5869140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6265.560546875, "per_token_policy_error_norm": 0.04079706594347954, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03605085238814354, "policy_entropy": 0.08210396766662598, "policy_entropy/max": 3.65625, "policy_entropy/median": 6.616115570068359e-06, "policy_entropy/min": 7.105427357601002e-15, "policy_entropy/p25": 2.4586915969848633e-07, "policy_entropy/p75": 0.0022125244140625, "policy_entropy/var": 0.05462074279785156, "policy_error_vector_variance/max_squared_error": 2.0132594108581543, "policy_error_vector_variance/metric": 0.04077170044183731, "policy_loss": -0.019346829503774643, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.6046409606933594, "policy_sharpness": 8.32507610321045, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.5, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.471248626708984, "reward": 0.6888021230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2146332561969757, "rewards/accuracy_reward": 0.6888021230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2146332561969757, "sentence_full_gradient_variance/max_squared_error": 1413802.25, "sentence_full_gradient_variance/metric": 5319.77490234375, "sentence_full_gradient_variance/p75": 122.37194061279297, "sentence_full_gradient_variance/p90": 127.2782211303711, "sentence_full_gradient_variance/p95": 127.2782211303711, "sentence_full_gradient_variance/p99": 69759.125, "state_level_variance/metric": 74.87394714355469, "state_level_variance_full_gradient/metric": 614.276611328125, "step": 47 }, { "accuracy_reward": 0.74609375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18968485295772552, "action_level_variance/metric": 260.58984375, "action_level_variance_full_gradient/metric": 8037.41162109375, "adam_stats/lr_effective_max": 5.608424180536531e-05, "adam_stats/lr_effective_mean": 2.4442600721208407e-10, "adam_stats/lr_effective_min": -5.602983583230525e-05, "adam_stats/m_t_max": 0.004504523240029812, "adam_stats/m_t_mean": 4.55808273069902e-11, "adam_stats/m_t_min": -0.0049552880227565765, "adam_stats/v_t_max": 6.231659062905237e-05, "adam_stats/v_t_mean": 2.1735337054490378e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.009517719969153404, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.0441653728485107, "all_logprobs": -0.07447545230388641, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -11.0, "all_logprobs/p1": -1.703125, "all_logprobs/p10": -0.09619140625, "all_logprobs/p25": -0.000278472900390625, "all_logprobs/p5": -0.412109375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11280837655067444, "clip_ratio": 0.0, "completion_length": 567.7734375, "completion_length/correct": 496.19024658203125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 468.0, "completion_length/correct/min": 128.0, "completion_length/correct/p25": 342.0, "completion_length/correct/p75": 621.0, "completion_length/correct/var": 41702.515625, "completion_length/incorrect": 778.1179809570312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 964.0, "completion_length/incorrect/min": 159.0, "completion_length/incorrect/p25": 497.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 77278.3984375, "completion_length/max": 1024.0, "completion_length/median": 516.0, "completion_length/min": 128.0, "completion_length/p25": 369.25, "completion_length/p75": 736.0, "completion_length/var": 65723.25, "epoch": 0.6144, "feature_vector_variance/max_squared_error": 113066.75, "feature_vector_variance/metric": 25503.525390625, "generated_tokens/total": 20820180.0, "grad_norm": 0.15419188141822815, "grouped_std_rewards": 0.1859624981880188, "learning_rate": 9.314414216997507e-06, "loss": -0.0095, "mean_logprobs": -0.07275390625, "mean_logprobs/var": 0.0013580322265625, "num_completions/total": 36864, "per_sentence_gradient_norm": 2.884183406829834, "per_sentence_gradient_norm/max": 200.3319091796875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 14.122300148010254, "per_sentence_gradient_norm/p99": 68.30805969238281, "per_sentence_gradient_norm/var": 252.60025024414062, "per_token_feature_norm": 162.61338806152344, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 159.0, "per_token_feature_norm/min": 68.0, "per_token_feature_norm/p25": 134.0, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 1379.0472412109375, "per_token_full_gradient_variance/max_squared_error": 240.37246704101562, "per_token_full_gradient_variance/variance": 0.03658173978328705, "per_token_gradient_norm": 3.3046715259552, "per_token_gradient_norm/max": 5176.12646484375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4223.0400390625, "per_token_policy_error_norm": 0.041231125593185425, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03669360280036926, "policy_entropy": 0.08271314203739166, "policy_entropy/max": 3.75, "policy_entropy/median": 6.735324859619141e-06, "policy_entropy/min": 8.014422459012849e-16, "policy_entropy/p25": 2.4400651454925537e-07, "policy_entropy/p75": 0.002685546875, "policy_entropy/var": 0.0545598529279232, "policy_error_vector_variance/max_squared_error": 2.0115480422973633, "policy_error_vector_variance/metric": 0.04120509326457977, "policy_loss": -0.00951772928237915, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.0441653728485107, "policy_sharpness": 8.298487663269043, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.25, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.54694938659668, "reward": 0.74609375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.18968485295772552, "rewards/accuracy_reward": 0.74609375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18968485295772552, "sentence_full_gradient_variance/max_squared_error": 2477488.25, "sentence_full_gradient_variance/metric": 9049.0537109375, "sentence_full_gradient_variance/p75": 287.4967956542969, "sentence_full_gradient_variance/p90": 370.5413513183594, "sentence_full_gradient_variance/p95": 370.5413513183594, "sentence_full_gradient_variance/p99": 169260.1875, "state_level_variance/metric": 24.510534286499023, "state_level_variance_full_gradient/metric": 1011.6412963867188, "step": 48 }, { "accuracy_reward": 0.73828125, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1934739649295807, "action_level_variance/metric": 382.41259765625, "action_level_variance_full_gradient/metric": 3060.25146484375, "adam_stats/lr_effective_max": 5.3206160373520106e-05, "adam_stats/lr_effective_mean": 1.3889507599618156e-10, "adam_stats/lr_effective_min": -5.664415220962837e-05, "adam_stats/m_t_max": 0.005024529993534088, "adam_stats/m_t_mean": 4.984740051283687e-11, "adam_stats/m_t_min": -0.005625531077384949, "adam_stats/v_t_max": 6.22697698418051e-05, "adam_stats/v_t_mean": 2.186395812661668e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.07064298540353775, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.8723459243774414, "all_logprobs": -0.07887832075357437, "all_logprobs/max": 0.0, "all_logprobs/median": -5.960464477539062e-07, "all_logprobs/min": -9.6875, "all_logprobs/p1": -1.75, "all_logprobs/p10": -0.11279296875, "all_logprobs/p25": -0.0004024505615234375, "all_logprobs/p5": -0.46484375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11949963122606277, "clip_ratio": 0.0, "completion_length": 562.8021240234375, "completion_length/correct": 466.80950927734375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 429.0, "completion_length/correct/min": 119.0, "completion_length/correct/p25": 309.5, "completion_length/correct/p75": 586.0, "completion_length/correct/var": 45015.85546875, "completion_length/incorrect": 833.5870361328125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 951.0, "completion_length/incorrect/min": 210.0, "completion_length/incorrect/p25": 663.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 54611.21484375, "completion_length/max": 1024.0, "completion_length/median": 504.0, "completion_length/min": 119.0, "completion_length/p25": 340.0, "completion_length/p75": 786.75, "completion_length/var": 73486.4453125, "epoch": 0.6272, "feature_vector_variance/max_squared_error": 112647.6640625, "feature_vector_variance/metric": 25484.2265625, "generated_tokens/total": 21252412.0, "grad_norm": 0.19211286306381226, "grouped_std_rewards": 0.17039570212364197, "learning_rate": 9.059337681133194e-06, "loss": -0.0706, "mean_logprobs": -0.076171875, "mean_logprobs/var": 0.0017547607421875, "num_completions/total": 37632, "per_sentence_gradient_norm": 3.1210219860076904, "per_sentence_gradient_norm/max": 309.1222839355469, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 88.84415435791016, "per_sentence_gradient_norm/var": 373.15771484375, "per_token_feature_norm": 162.5839080810547, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 159.0, "per_token_feature_norm/min": 64.0, "per_token_feature_norm/p25": 133.0, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 1425.9388427734375, "per_token_full_gradient_variance/max_squared_error": 128.82516479492188, "per_token_full_gradient_variance/variance": 0.03602980822324753, "per_token_gradient_norm": 3.879498243331909, "per_token_gradient_norm/max": 4100.8154296875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4475.93212890625, "per_token_policy_error_norm": 0.04355846345424652, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.038404662162065506, "policy_entropy": 0.08812227100133896, "policy_entropy/max": 3.71875, "policy_entropy/median": 9.894371032714844e-06, "policy_entropy/min": 1.033895191682177e-15, "policy_entropy/p25": 3.818422555923462e-07, "policy_entropy/p75": 0.0037689208984375, "policy_entropy/var": 0.05893491581082344, "policy_error_vector_variance/max_squared_error": 2.0131916999816895, "policy_error_vector_variance/metric": 0.04353116452693939, "policy_loss": -0.07064298540353775, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.8723458051681519, "policy_sharpness": 8.22271728515625, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.959177017211914, "reward": 0.73828125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.1934739649295807, "rewards/accuracy_reward": 0.73828125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1934739649295807, "sentence_full_gradient_variance/max_squared_error": 678861.5625, "sentence_full_gradient_variance/metric": 3449.9658203125, "sentence_full_gradient_variance/p75": 94.37642669677734, "sentence_full_gradient_variance/p90": 134.6579132080078, "sentence_full_gradient_variance/p95": 134.6579132080078, "sentence_full_gradient_variance/p99": 69181.09375, "state_level_variance/metric": 38.4614372253418, "state_level_variance_full_gradient/metric": 389.71429443359375, "step": 49 }, { "accuracy_reward": 0.7109375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20577330887317657, "action_level_variance/metric": 588.1844482421875, "action_level_variance_full_gradient/metric": 3599.42919921875, "adam_stats/lr_effective_max": 5.3178406233200803e-05, "adam_stats/lr_effective_mean": 1.9297191622413834e-10, "adam_stats/lr_effective_min": -4.987414649804123e-05, "adam_stats/m_t_max": 0.004921857267618179, "adam_stats/m_t_mean": 5.214710260825761e-11, "adam_stats/m_t_min": -0.005609242711216211, "adam_stats/v_t_max": 6.225055403774604e-05, "adam_stats/v_t_mean": 2.1872241431214468e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.054255545139312744, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.039064645767212, "all_logprobs": -0.08762093633413315, "all_logprobs/max": 0.0, "all_logprobs/median": -9.5367431640625e-07, "all_logprobs/min": -12.8125, "all_logprobs/p1": -1.9140625, "all_logprobs/p10": -0.13671875, "all_logprobs/p25": -0.000759124755859375, "all_logprobs/p5": -0.51171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.13944999873638153, "clip_ratio": 0.0, "completion_length": 548.85546875, "completion_length/correct": 465.474365234375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 433.0, "completion_length/correct/min": 148.0, "completion_length/correct/p25": 304.0, "completion_length/correct/p75": 585.75, "completion_length/correct/var": 41460.4921875, "completion_length/incorrect": 753.9279174804688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 800.0, "completion_length/incorrect/min": 172.0, "completion_length/incorrect/p25": 518.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 71486.125, "completion_length/max": 1024.0, "completion_length/median": 485.0, "completion_length/min": 148.0, "completion_length/p25": 339.75, "completion_length/p75": 730.75, "completion_length/var": 67179.3515625, "epoch": 0.64, "feature_vector_variance/max_squared_error": 110019.9375, "feature_vector_variance/metric": 25119.49609375, "generated_tokens/total": 21673932.0, "grad_norm": 0.136672243475914, "grouped_std_rewards": 0.19660541415214539, "learning_rate": 8.80236133250198e-06, "loss": -0.0543, "mean_logprobs": -0.0869140625, "mean_logprobs/var": 0.0037078857421875, "num_completions/total": 38400, "per_sentence_gradient_norm": 3.868976593017578, "per_sentence_gradient_norm/max": 442.8173828125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 17.55919075012207, "per_sentence_gradient_norm/p99": 92.66617584228516, "per_sentence_gradient_norm/var": 573.9628295898438, "per_token_feature_norm": 160.3924102783203, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 155.0, "per_token_feature_norm/min": 63.75, "per_token_feature_norm/p25": 130.0, "per_token_feature_norm/p75": 187.0, "per_token_feature_norm/var": 1502.440673828125, "per_token_full_gradient_variance/max_squared_error": 279.7747802734375, "per_token_full_gradient_variance/variance": 0.051306094974279404, "per_token_gradient_norm": 4.862302780151367, "per_token_gradient_norm/max": 7005.0048828125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6873.17626953125, "per_token_policy_error_norm": 0.04766125604510307, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04199332743883133, "policy_entropy": 0.0971093624830246, "policy_entropy/max": 3.671875, "policy_entropy/median": 1.4603137969970703e-05, "policy_entropy/min": 4.496403249731884e-15, "policy_entropy/p25": 6.07222318649292e-07, "policy_entropy/p75": 0.006561279296875, "policy_entropy/var": 0.06831200420856476, "policy_error_vector_variance/max_squared_error": 2.01349139213562, "policy_error_vector_variance/metric": 0.04760384187102318, "policy_loss": -0.054255545139312744, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958683013916016, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.039064645767212, "policy_sharpness": 8.097326278686523, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.543551445007324, "reward": 0.7109375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20577330887317657, "rewards/accuracy_reward": 0.7109375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20577330887317657, "sentence_full_gradient_variance/max_squared_error": 762991.5, "sentence_full_gradient_variance/metric": 4032.15087890625, "sentence_full_gradient_variance/p75": 123.28894805908203, "sentence_full_gradient_variance/p90": 363.38958740234375, "sentence_full_gradient_variance/p95": 363.38958740234375, "sentence_full_gradient_variance/p99": 98747.90625, "state_level_variance/metric": 59.17042922973633, "state_level_variance_full_gradient/metric": 432.7215270996094, "step": 50 }, { "accuracy_reward": 0.7356771230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19470982253551483, "action_level_variance/metric": 491.4731140136719, "action_level_variance_full_gradient/metric": 2928.974365234375, "adam_stats/lr_effective_max": 5.233651972957887e-05, "adam_stats/lr_effective_mean": 1.8009355118309145e-10, "adam_stats/lr_effective_min": -4.8462057748110965e-05, "adam_stats/m_t_max": 0.004460189025849104, "adam_stats/m_t_mean": 4.3520031328680986e-11, "adam_stats/m_t_min": -0.0049476101994514465, "adam_stats/v_t_max": 6.220502109499648e-05, "adam_stats/v_t_mean": 2.189193271107115e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.01715516671538353, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.422661781311035, "all_logprobs": -0.09145214408636093, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-06, "all_logprobs/min": -12.3125, "all_logprobs/p1": -1.9140625, "all_logprobs/p10": -0.16015625, "all_logprobs/p25": -0.00112152099609375, "all_logprobs/p5": -0.55078125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.14326134324073792, "clip_ratio": 0.0, "completion_length": 523.671875, "completion_length/correct": 436.95574951171875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 387.0, "completion_length/correct/min": 52.0, "completion_length/correct/p25": 254.0, "completion_length/correct/p75": 559.0, "completion_length/correct/var": 51456.46484375, "completion_length/incorrect": 765.0245971679688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 812.0, "completion_length/incorrect/min": 187.0, "completion_length/incorrect/p25": 538.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 61817.33984375, "completion_length/max": 1024.0, "completion_length/median": 461.0, "completion_length/min": 52.0, "completion_length/p25": 291.0, "completion_length/p75": 711.25, "completion_length/var": 75074.515625, "epoch": 0.6528, "feature_vector_variance/max_squared_error": 107774.59375, "feature_vector_variance/metric": 25016.826171875, "generated_tokens/total": 22076112.0, "grad_norm": 0.11064387112855911, "grouped_std_rewards": 0.18201503157615662, "learning_rate": 8.543798257200491e-06, "loss": -0.0172, "mean_logprobs": -0.08935546875, "mean_logprobs/var": 0.0025634765625, "num_completions/total": 39168, "per_sentence_gradient_norm": 3.7194724082946777, "per_sentence_gradient_norm/max": 345.97357177734375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 10.621417999267578, "per_sentence_gradient_norm/p99": 107.30968475341797, "per_sentence_gradient_norm/var": 478.2613525390625, "per_token_feature_norm": 160.01451110839844, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 154.0, "per_token_feature_norm/min": 64.0, "per_token_feature_norm/p25": 129.0, "per_token_feature_norm/p75": 187.0, "per_token_feature_norm/var": 1583.0491943359375, "per_token_full_gradient_variance/max_squared_error": 145.31961059570312, "per_token_full_gradient_variance/variance": 0.048145513981580734, "per_token_gradient_norm": 4.834490776062012, "per_token_gradient_norm/max": 4282.439453125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5979.60400390625, "per_token_policy_error_norm": 0.049778614193201065, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.043599873781204224, "policy_entropy": 0.10183645039796829, "policy_entropy/max": 3.71875, "policy_entropy/median": 1.895427703857422e-05, "policy_entropy/min": 2.3245294578089215e-16, "policy_entropy/p25": 7.264316082000732e-07, "policy_entropy/p75": 0.00909423828125, "policy_entropy/var": 0.07194077223539352, "policy_error_vector_variance/max_squared_error": 2.0138003826141357, "policy_error_vector_variance/metric": 0.04971139878034592, "policy_loss": -0.01715516671538353, "policy_loss/max": 12.958683013916016, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.422661781311035, "policy_sharpness": 8.017487525939941, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.861753463745117, "reward": 0.7356771230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19470982253551483, "rewards/accuracy_reward": 0.7356771230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19470982253551483, "sentence_full_gradient_variance/max_squared_error": 557799.0, "sentence_full_gradient_variance/metric": 3309.873779296875, "sentence_full_gradient_variance/p75": 91.85572052001953, "sentence_full_gradient_variance/p90": 99.5676498413086, "sentence_full_gradient_variance/p95": 99.5676498413086, "sentence_full_gradient_variance/p99": 74037.1015625, "state_level_variance/metric": 48.10071563720703, "state_level_variance_full_gradient/metric": 380.8995056152344, "step": 51 }, { "accuracy_reward": 0.8177083730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14925578236579895, "action_level_variance/metric": 679.3177490234375, "action_level_variance_full_gradient/metric": 1195.9522705078125, "adam_stats/lr_effective_max": 4.3701827962649986e-05, "adam_stats/lr_effective_mean": 1.184488757077773e-10, "adam_stats/lr_effective_min": -4.433424328453839e-05, "adam_stats/m_t_max": 0.006101572420448065, "adam_stats/m_t_mean": 5.5235364893047034e-11, "adam_stats/m_t_min": -0.006344939582049847, "adam_stats/v_t_max": 6.214392487891018e-05, "adam_stats/v_t_mean": 2.2451021076952404e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.005421492271125317, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.3761560916900635, "all_logprobs": -0.08937282860279083, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-06, "all_logprobs/min": -10.875, "all_logprobs/p1": -1.9140625, "all_logprobs/p10": -0.146484375, "all_logprobs/p25": -0.000911712646484375, "all_logprobs/p5": -0.52734375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.13729149103164673, "clip_ratio": 0.0, "completion_length": 488.76953125, "completion_length/correct": 415.8280334472656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 366.0, "completion_length/correct/min": 75.0, "completion_length/correct/p25": 280.0, "completion_length/correct/p75": 531.25, "completion_length/correct/var": 39994.27734375, "completion_length/incorrect": 815.9642944335938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 237.0, "completion_length/incorrect/p25": 587.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 64897.46875, "completion_length/max": 1024.0, "completion_length/median": 405.0, "completion_length/min": 75.0, "completion_length/p25": 299.0, "completion_length/p75": 625.0, "completion_length/var": 68352.4140625, "epoch": 0.6656, "feature_vector_variance/max_squared_error": 107800.7421875, "feature_vector_variance/metric": 24294.453125, "generated_tokens/total": 22451488.0, "grad_norm": 0.3430834114551544, "grouped_std_rewards": 0.12046842277050018, "learning_rate": 8.283963474507402e-06, "loss": 0.0054, "mean_logprobs": -0.0869140625, "mean_logprobs/var": 0.0022735595703125, "num_completions/total": 39936, "per_sentence_gradient_norm": 3.335904121398926, "per_sentence_gradient_norm/max": 349.56903076171875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 72.74858856201172, "per_sentence_gradient_norm/var": 669.0606689453125, "per_token_feature_norm": 157.65249633789062, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 127.5, "per_token_feature_norm/p75": 183.0, "per_token_feature_norm/var": 1554.92431640625, "per_token_full_gradient_variance/max_squared_error": 26504936.0, "per_token_full_gradient_variance/variance": 70.69451904296875, "per_token_gradient_norm": 5.511551856994629, "per_token_gradient_norm/max": 7239.43310546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 11041.16796875, "per_token_policy_error_norm": 0.04915384203195572, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.043405674397945404, "policy_entropy": 0.09891501814126968, "policy_entropy/max": 3.390625, "policy_entropy/median": 1.8715858459472656e-05, "policy_entropy/min": 3.419486915845482e-14, "policy_entropy/p25": 8.381903171539307e-07, "policy_entropy/p75": 0.00762939453125, "policy_entropy/var": 0.06806162744760513, "policy_error_vector_variance/max_squared_error": 2.0096514225006104, "policy_error_vector_variance/metric": 0.049121081829071045, "policy_loss": 0.0054214997217059135, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.3761565685272217, "policy_sharpness": 8.06180477142334, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.664135932922363, "reward": 0.8177083730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14925578236579895, "rewards/accuracy_reward": 0.8177083730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14925578236579895, "sentence_full_gradient_variance/max_squared_error": 274266.375, "sentence_full_gradient_variance/metric": 1340.497802734375, "sentence_full_gradient_variance/p75": 64.12623596191406, "sentence_full_gradient_variance/p90": 67.47527313232422, "sentence_full_gradient_variance/p95": 67.47527313232422, "sentence_full_gradient_variance/p99": 44981.91796875, "state_level_variance/metric": 74.56317138671875, "state_level_variance_full_gradient/metric": 144.54559326171875, "step": 52 }, { "accuracy_reward": 0.7395833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19285091757774353, "action_level_variance/metric": 666.8619384765625, "action_level_variance_full_gradient/metric": 3153.761474609375, "adam_stats/lr_effective_max": 4.164857818977907e-05, "adam_stats/lr_effective_mean": 7.116378319960148e-11, "adam_stats/lr_effective_min": -4.170238389633596e-05, "adam_stats/m_t_max": 0.00533272372558713, "adam_stats/m_t_mean": 4.8568982169427954e-11, "adam_stats/m_t_min": -0.005548702087253332, "adam_stats/v_t_max": 6.208178092492744e-05, "adam_stats/v_t_mean": 2.2436124139102454e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.012801647186279297, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.1503477096557617, "all_logprobs": -0.08771117776632309, "all_logprobs/max": 0.0, "all_logprobs/median": -1.6689300537109375e-06, "all_logprobs/min": -13.625, "all_logprobs/p1": -1.8984375, "all_logprobs/p10": -0.142578125, "all_logprobs/p25": -0.00095367431640625, "all_logprobs/p5": -0.51953125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.13465774059295654, "clip_ratio": 0.0, "completion_length": 569.8268432617188, "completion_length/correct": 484.3080749511719, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 427.0, "completion_length/correct/min": 147.0, "completion_length/correct/p25": 327.0, "completion_length/correct/p75": 619.5, "completion_length/correct/var": 43642.87109375, "completion_length/incorrect": 812.7000122070312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 914.0, "completion_length/incorrect/min": 242.0, "completion_length/incorrect/p25": 619.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 58769.15625, "completion_length/max": 1024.0, "completion_length/median": 495.0, "completion_length/min": 147.0, "completion_length/p25": 366.75, "completion_length/p75": 780.0, "completion_length/var": 68307.8046875, "epoch": 0.6784, "feature_vector_variance/max_squared_error": 109190.9375, "feature_vector_variance/metric": 23741.619140625, "generated_tokens/total": 22889114.0, "grad_norm": 0.08085574954748154, "grouped_std_rewards": 0.18635272979736328, "learning_rate": 8.02317355308094e-06, "loss": 0.0128, "mean_logprobs": -0.087890625, "mean_logprobs/var": 0.0019683837890625, "num_completions/total": 40704, "per_sentence_gradient_norm": 4.27262020111084, "per_sentence_gradient_norm/max": 382.7228088378906, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 15.594141006469727, "per_sentence_gradient_norm/p99": 117.641845703125, "per_sentence_gradient_norm/var": 649.4522705078125, "per_token_feature_norm": 156.37339782714844, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 68.0, "per_token_feature_norm/p25": 126.0, "per_token_feature_norm/p75": 182.0, "per_token_feature_norm/var": 1550.710693359375, "per_token_full_gradient_variance/max_squared_error": 143.37550354003906, "per_token_full_gradient_variance/variance": 0.04799637198448181, "per_token_gradient_norm": 5.165068626403809, "per_token_gradient_norm/max": 5102.48095703125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5909.142578125, "per_token_policy_error_norm": 0.04795365035533905, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0420403927564621, "policy_entropy": 0.09830660372972488, "policy_entropy/max": 3.71875, "policy_entropy/median": 2.5510787963867188e-05, "policy_entropy/min": 1.3589129821411916e-13, "policy_entropy/p25": 1.3262033462524414e-06, "policy_entropy/p75": 0.00811767578125, "policy_entropy/var": 0.06836280226707458, "policy_error_vector_variance/max_squared_error": 2.015634775161743, "policy_error_vector_variance/metric": 0.04789847880601883, "policy_loss": 0.0128016397356987, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -12.958683013916016, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.1503477096557617, "policy_sharpness": 8.05063247680664, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.64551830291748, "reward": 0.7395833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19285091757774353, "rewards/accuracy_reward": 0.7395833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19285091757774353, "sentence_full_gradient_variance/max_squared_error": 922028.5, "sentence_full_gradient_variance/metric": 3585.978515625, "sentence_full_gradient_variance/p75": 34.0658073425293, "sentence_full_gradient_variance/p90": 52.84867858886719, "sentence_full_gradient_variance/p95": 52.84867858886719, "sentence_full_gradient_variance/p99": 67626.7109375, "state_level_variance/metric": 65.78775024414062, "state_level_variance_full_gradient/metric": 432.2170104980469, "step": 53 }, { "accuracy_reward": 0.703125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20901235938072205, "action_level_variance/metric": 2024.8006591796875, "action_level_variance_full_gradient/metric": 8386.7236328125, "adam_stats/lr_effective_max": 3.957720036851242e-05, "adam_stats/lr_effective_mean": 1.0131476213537383e-10, "adam_stats/lr_effective_min": -4.0265116695081815e-05, "adam_stats/m_t_max": 0.006679334677755833, "adam_stats/m_t_mean": 8.110342952782901e-11, "adam_stats/m_t_min": -0.007691586390137672, "adam_stats/v_t_max": 6.204987585078925e-05, "adam_stats/v_t_mean": 2.323926858241454e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.013635064475238323, "advantages/max": 12.9586820602417, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 4.294292449951172, "all_logprobs": -0.10701359808444977, "all_logprobs/max": 0.0, "all_logprobs/median": -3.933906555175781e-06, "all_logprobs/min": -11.5, "all_logprobs/p1": -2.125, "all_logprobs/p10": -0.2099609375, "all_logprobs/p25": -0.0028076171875, "all_logprobs/p5": -0.66796875, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.17405156791210175, "clip_ratio": 0.0, "completion_length": 530.5078125, "completion_length/correct": 431.1184997558594, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 394.0, "completion_length/correct/min": 82.0, "completion_length/correct/p25": 289.0, "completion_length/correct/p75": 536.25, "completion_length/correct/var": 37753.25, "completion_length/incorrect": 765.9035034179688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 826.0, "completion_length/incorrect/min": 159.0, "completion_length/incorrect/p25": 524.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 74738.2578125, "completion_length/max": 1024.0, "completion_length/median": 450.0, "completion_length/min": 82.0, "completion_length/p25": 331.0, "completion_length/p75": 696.25, "completion_length/var": 72076.3515625, "epoch": 0.6912, "feature_vector_variance/max_squared_error": 106394.8203125, "feature_vector_variance/metric": 24538.779296875, "generated_tokens/total": 23296544.0, "grad_norm": 0.41559863090515137, "grouped_std_rewards": 0.2000323385000229, "learning_rate": 7.76174622526876e-06, "loss": 0.0136, "mean_logprobs": -0.103515625, "mean_logprobs/var": 0.003509521484375, "num_completions/total": 41472, "per_sentence_gradient_norm": 6.670111656188965, "per_sentence_gradient_norm/max": 750.145263671875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 26.298494338989258, "per_sentence_gradient_norm/p99": 158.99632263183594, "per_sentence_gradient_norm/var": 1982.892333984375, "per_token_feature_norm": 158.93040466308594, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 64.0, "per_token_feature_norm/p25": 126.0, "per_token_feature_norm/p75": 186.0, "per_token_feature_norm/var": 1797.794189453125, "per_token_full_gradient_variance/max_squared_error": 383.5071105957031, "per_token_full_gradient_variance/variance": 0.0991649404168129, "per_token_gradient_norm": 7.0704665184021, "per_token_gradient_norm/max": 7979.2109375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 13136.6962890625, "per_token_policy_error_norm": 0.05728314071893692, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0499020554125309, "policy_entropy": 0.11899790912866592, "policy_entropy/max": 3.6875, "policy_entropy/median": 5.4836273193359375e-05, "policy_entropy/min": 1.2612133559741778e-13, "policy_entropy/p25": 2.0563602447509766e-06, "policy_entropy/p75": 0.0206298828125, "policy_entropy/var": 0.08899152278900146, "policy_error_vector_variance/max_squared_error": 2.0119810104370117, "policy_error_vector_variance/metric": 0.05719413980841637, "policy_loss": 0.01363505981862545, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958683013916016, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 4.294292449951172, "policy_sharpness": 7.760797023773193, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.5, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.93163013458252, "reward": 0.703125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20901235938072205, "rewards/accuracy_reward": 0.703125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20901235938072205, "sentence_full_gradient_variance/max_squared_error": 1590964.625, "sentence_full_gradient_variance/metric": 9505.6904296875, "sentence_full_gradient_variance/p75": 86.30388641357422, "sentence_full_gradient_variance/p90": 429.3558349609375, "sentence_full_gradient_variance/p95": 429.3558349609375, "sentence_full_gradient_variance/p99": 201926.171875, "state_level_variance/metric": 210.80557250976562, "state_level_variance_full_gradient/metric": 1118.9654541015625, "step": 54 }, { "accuracy_reward": 0.7565104365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18444257974624634, "action_level_variance/metric": 733.154296875, "action_level_variance_full_gradient/metric": 4699.31201171875, "adam_stats/lr_effective_max": 3.840104545815848e-05, "adam_stats/lr_effective_mean": 8.053398226071096e-11, "adam_stats/lr_effective_min": -3.8049605791457e-05, "adam_stats/m_t_max": 0.006348620168864727, "adam_stats/m_t_mean": 7.607570823298104e-11, "adam_stats/m_t_min": -0.007383243180811405, "adam_stats/v_t_max": 6.199251947691664e-05, "adam_stats/v_t_mean": 2.3230837826321293e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.04934120178222656, "advantages/max": 12.9586820602417, "advantages/median": -0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.3899388313293457, "all_logprobs": -0.0975019633769989, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-06, "all_logprobs/min": -11.125, "all_logprobs/p1": -2.015625, "all_logprobs/p10": -0.1796875, "all_logprobs/p25": -0.00170135498046875, "all_logprobs/p5": -0.58203125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.15626399219036102, "clip_ratio": 0.0, "completion_length": 510.1028747558594, "completion_length/correct": 422.16351318359375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 381.0, "completion_length/correct/min": 70.0, "completion_length/correct/p25": 284.0, "completion_length/correct/p75": 528.0, "completion_length/correct/var": 37303.52734375, "completion_length/incorrect": 783.3262329101562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 855.0, "completion_length/incorrect/min": 224.0, "completion_length/incorrect/p25": 513.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 68731.28125, "completion_length/max": 1024.0, "completion_length/median": 434.0, "completion_length/min": 70.0, "completion_length/p25": 305.0, "completion_length/p75": 695.5, "completion_length/var": 68934.6328125, "epoch": 0.704, "feature_vector_variance/max_squared_error": 112221.6875, "feature_vector_variance/metric": 24493.349609375, "generated_tokens/total": 23688304.0, "grad_norm": 0.08064015954732895, "grouped_std_rewards": 0.17504167556762695, "learning_rate": 7.5e-06, "loss": 0.0493, "mean_logprobs": -0.09619140625, "mean_logprobs/var": 0.00262451171875, "num_completions/total": 42240, "per_sentence_gradient_norm": 4.422374248504639, "per_sentence_gradient_norm/max": 380.48406982421875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 125.3914566040039, "per_sentence_gradient_norm/var": 714.5272827148438, "per_token_feature_norm": 158.77496337890625, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 66.5, "per_token_feature_norm/p25": 126.5, "per_token_feature_norm/p75": 185.0, "per_token_feature_norm/var": 1720.7447509765625, "per_token_full_gradient_variance/max_squared_error": 390.5020751953125, "per_token_full_gradient_variance/variance": 0.06556802988052368, "per_token_gradient_norm": 6.132735252380371, "per_token_gradient_norm/max": 5223.96875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 9023.7109375, "per_token_policy_error_norm": 0.05262552946805954, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.046184442937374115, "policy_entropy": 0.1085972711443901, "policy_entropy/max": 3.765625, "policy_entropy/median": 3.528594970703125e-05, "policy_entropy/min": 3.6193270602780103e-14, "policy_entropy/p25": 1.5050172805786133e-06, "policy_entropy/p75": 0.01312255859375, "policy_entropy/var": 0.07753043621778488, "policy_error_vector_variance/max_squared_error": 2.0190157890319824, "policy_error_vector_variance/metric": 0.05257287621498108, "policy_loss": 0.049341194331645966, "policy_loss/max": 12.958683013916016, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.3899388313293457, "policy_sharpness": 7.913443565368652, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.49609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.300814628601074, "reward": 0.7565104365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18444257974624634, "rewards/accuracy_reward": 0.7565104365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18444257974624634, "sentence_full_gradient_variance/max_squared_error": 1075492.75, "sentence_full_gradient_variance/metric": 5338.552734375, "sentence_full_gradient_variance/p75": 47.088287353515625, "sentence_full_gradient_variance/p90": 104.4586410522461, "sentence_full_gradient_variance/p95": 104.4586410522461, "sentence_full_gradient_variance/p99": 73656.5234375, "state_level_variance/metric": 72.84571075439453, "state_level_variance_full_gradient/metric": 639.2421875, "step": 55 }, { "accuracy_reward": 0.7942708730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16361773014068604, "action_level_variance/metric": 434.3526611328125, "action_level_variance_full_gradient/metric": 2935.251708984375, "adam_stats/lr_effective_max": 3.7245266867103055e-05, "adam_stats/lr_effective_mean": 2.943796595378245e-11, "adam_stats/lr_effective_min": -3.936920620617457e-05, "adam_stats/m_t_max": 0.006592664401978254, "adam_stats/m_t_mean": 7.075956487412327e-11, "adam_stats/m_t_min": -0.00705080246552825, "adam_stats/v_t_max": 6.197642505867407e-05, "adam_stats/v_t_mean": 2.324611206652727e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.06182350963354111, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.2772884368896484, "all_logprobs": -0.09719530493021011, "all_logprobs/max": 0.0, "all_logprobs/median": -2.86102294921875e-06, "all_logprobs/min": -11.0625, "all_logprobs/p1": -1.9765625, "all_logprobs/p10": -0.1796875, "all_logprobs/p25": -0.001861572265625, "all_logprobs/p5": -0.58984375, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.15283715724945068, "clip_ratio": 0.0, "completion_length": 495.72918701171875, "completion_length/correct": 436.00164794921875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 412.0, "completion_length/correct/min": 145.0, "completion_length/correct/p25": 302.0, "completion_length/correct/p75": 528.75, "completion_length/correct/var": 33208.28515625, "completion_length/incorrect": 726.3228149414062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 737.0, "completion_length/incorrect/min": 172.0, "completion_length/incorrect/p25": 519.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 63783.33203125, "completion_length/max": 1024.0, "completion_length/median": 452.0, "completion_length/min": 145.0, "completion_length/p25": 324.75, "completion_length/p75": 612.25, "completion_length/var": 53214.24609375, "epoch": 0.7168, "feature_vector_variance/max_squared_error": 103924.0078125, "feature_vector_variance/metric": 24002.599609375, "generated_tokens/total": 24069024.0, "grad_norm": 0.1175653263926506, "grouped_std_rewards": 0.15518298745155334, "learning_rate": 7.238253774731245e-06, "loss": -0.0618, "mean_logprobs": -0.095703125, "mean_logprobs/var": 0.0027923583984375, "num_completions/total": 43008, "per_sentence_gradient_norm": 3.504033088684082, "per_sentence_gradient_norm/max": 290.6064453125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 114.07117462158203, "per_sentence_gradient_norm/var": 422.6247253417969, "per_token_feature_norm": 157.27188110351562, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 125.5, "per_token_feature_norm/p75": 184.0, "per_token_feature_norm/var": 1724.6986083984375, "per_token_full_gradient_variance/max_squared_error": 194.19683837890625, "per_token_full_gradient_variance/variance": 0.05006915330886841, "per_token_gradient_norm": 4.717787742614746, "per_token_gradient_norm/max": 6657.07470703125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6231.75390625, "per_token_policy_error_norm": 0.052674803882837296, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.045794907957315445, "policy_entropy": 0.10909517854452133, "policy_entropy/max": 3.734375, "policy_entropy/median": 4.076957702636719e-05, "policy_entropy/min": 1.3233858453531866e-13, "policy_entropy/p25": 1.7881393432617188e-06, "policy_entropy/p75": 0.01409912109375, "policy_entropy/var": 0.07810679078102112, "policy_error_vector_variance/max_squared_error": 2.016148090362549, "policy_error_vector_variance/metric": 0.05260168015956879, "policy_loss": -0.06182349473237991, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.2772889137268066, "policy_sharpness": 7.8878173828125, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.37109375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.367801666259766, "reward": 0.7942708730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16361773014068604, "rewards/accuracy_reward": 0.7942708730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16361773014068604, "sentence_full_gradient_variance/max_squared_error": 357254.4375, "sentence_full_gradient_variance/metric": 3301.289794921875, "sentence_full_gradient_variance/p75": 90.1605453491211, "sentence_full_gradient_variance/p90": 145.0248565673828, "sentence_full_gradient_variance/p95": 145.0248565673828, "sentence_full_gradient_variance/p99": 110196.8828125, "state_level_variance/metric": 42.458106994628906, "state_level_variance_full_gradient/metric": 366.038330078125, "step": 56 }, { "accuracy_reward": 0.734375, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19532270729541779, "action_level_variance/metric": 973.2535400390625, "action_level_variance_full_gradient/metric": 6645.0341796875, "adam_stats/lr_effective_max": 3.824030136456713e-05, "adam_stats/lr_effective_mean": 2.103439167636112e-12, "adam_stats/lr_effective_min": -3.819914854830131e-05, "adam_stats/m_t_max": 0.0062507810071110725, "adam_stats/m_t_mean": 6.317536077604657e-11, "adam_stats/m_t_min": -0.006998798344284296, "adam_stats/v_t_max": 6.191451393533498e-05, "adam_stats/v_t_mean": 2.329809522388926e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.04108549654483795, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 4.159332275390625, "all_logprobs": -0.09844203293323517, "all_logprobs/max": 0.0, "all_logprobs/median": -3.0994415283203125e-06, "all_logprobs/min": -11.375, "all_logprobs/p1": -2.09375, "all_logprobs/p10": -0.1669921875, "all_logprobs/p25": -0.00133514404296875, "all_logprobs/p5": -0.58203125, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.1628744751214981, "clip_ratio": 0.0, "completion_length": 548.26171875, "completion_length/correct": 464.0443115234375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 422.0, "completion_length/correct/min": 103.0, "completion_length/correct/p25": 293.0, "completion_length/correct/p75": 593.5, "completion_length/correct/var": 46023.08984375, "completion_length/incorrect": 781.0980834960938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 860.0, "completion_length/incorrect/min": 222.0, "completion_length/incorrect/p25": 534.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 69862.2890625, "completion_length/max": 1024.0, "completion_length/median": 482.0, "completion_length/min": 103.0, "completion_length/p25": 337.75, "completion_length/p75": 753.0, "completion_length/var": 71906.984375, "epoch": 0.7296, "feature_vector_variance/max_squared_error": 106470.3046875, "feature_vector_variance/metric": 23805.138671875, "generated_tokens/total": 24490088.0, "grad_norm": 0.14503729343414307, "grouped_std_rewards": 0.18096241354942322, "learning_rate": 6.976826446919061e-06, "loss": 0.0411, "mean_logprobs": -0.09716796875, "mean_logprobs/var": 0.00390625, "num_completions/total": 43776, "per_sentence_gradient_norm": 5.272634506225586, "per_sentence_gradient_norm/max": 439.9798889160156, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 11.234780311584473, "per_sentence_gradient_norm/p99": 149.5370635986328, "per_sentence_gradient_norm/var": 946.685546875, "per_token_feature_norm": 156.63392639160156, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 148.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 124.5, "per_token_feature_norm/p75": 182.0, "per_token_feature_norm/var": 1760.6328125, "per_token_full_gradient_variance/max_squared_error": 473.906005859375, "per_token_full_gradient_variance/variance": 0.09928987175226212, "per_token_gradient_norm": 6.810169696807861, "per_token_gradient_norm/max": 6819.44189453125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 11810.39453125, "per_token_policy_error_norm": 0.052280325442552567, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.045737043023109436, "policy_entropy": 0.10983507335186005, "policy_entropy/max": 3.734375, "policy_entropy/median": 4.38690185546875e-05, "policy_entropy/min": 3.979039320256561e-13, "policy_entropy/p25": 2.384185791015625e-06, "policy_entropy/p75": 0.010986328125, "policy_entropy/var": 0.08608898520469666, "policy_error_vector_variance/max_squared_error": 2.014014720916748, "policy_error_vector_variance/metric": 0.05217776075005531, "policy_loss": 0.04108549654483795, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 4.159332275390625, "policy_sharpness": 7.936781883239746, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.87109375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.320487022399902, "reward": 0.734375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19532270729541779, "rewards/accuracy_reward": 0.734375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19532270729541779, "sentence_full_gradient_variance/max_squared_error": 1923629.125, "sentence_full_gradient_variance/metric": 7526.36279296875, "sentence_full_gradient_variance/p75": 122.6402816772461, "sentence_full_gradient_variance/p90": 126.03656005859375, "sentence_full_gradient_variance/p95": 126.03656005859375, "sentence_full_gradient_variance/p99": 144687.515625, "state_level_variance/metric": 94.84397888183594, "state_level_variance_full_gradient/metric": 881.3283081054688, "step": 57 }, { "accuracy_reward": 0.7278646230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19833599030971527, "action_level_variance/metric": 2123.405029296875, "action_level_variance_full_gradient/metric": 3772.17578125, "adam_stats/lr_effective_max": 3.7599835195578635e-05, "adam_stats/lr_effective_mean": 5.2430029068295525e-11, "adam_stats/lr_effective_min": -3.615778041421436e-05, "adam_stats/m_t_max": 0.005319323390722275, "adam_stats/m_t_mean": 7.569855853262197e-11, "adam_stats/m_t_min": -0.006358427461236715, "adam_stats/v_t_max": 6.185479287523776e-05, "adam_stats/v_t_mean": 2.343493888529169e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.05996270105242729, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.8479926586151123, "all_logprobs": -0.1067977175116539, "all_logprobs/max": 0.0, "all_logprobs/median": -3.6954879760742188e-06, "all_logprobs/min": -12.375, "all_logprobs/p1": -2.15625, "all_logprobs/p10": -0.2041015625, "all_logprobs/p25": -0.00250244140625, "all_logprobs/p5": -0.6484375, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.17811092734336853, "clip_ratio": 0.0, "completion_length": 523.9818115234375, "completion_length/correct": 423.01788330078125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 369.0, "completion_length/correct/min": 115.0, "completion_length/correct/p25": 259.5, "completion_length/correct/p75": 540.5, "completion_length/correct/var": 42869.55078125, "completion_length/incorrect": 794.0238647460938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 840.0, "completion_length/incorrect/min": 319.0, "completion_length/incorrect/p25": 593.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 54434.6328125, "completion_length/max": 1024.0, "completion_length/median": 459.0, "completion_length/min": 115.0, "completion_length/p25": 292.0, "completion_length/p75": 688.0, "completion_length/var": 73250.0, "epoch": 0.7424, "feature_vector_variance/max_squared_error": 100407.1328125, "feature_vector_variance/metric": 24245.3984375, "generated_tokens/total": 24892506.0, "grad_norm": 0.2069534808397293, "grouped_std_rewards": 0.15448690950870514, "learning_rate": 6.7160365254926005e-06, "loss": -0.06, "mean_logprobs": -0.10546875, "mean_logprobs/var": 0.00555419921875, "num_completions/total": 44544, "per_sentence_gradient_norm": 5.6764936447143555, "per_sentence_gradient_norm/max": 896.5189819335938, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 126.45074462890625, "per_sentence_gradient_norm/var": 2093.90869140625, "per_token_feature_norm": 158.5189666748047, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 149.0, "per_token_feature_norm/min": 63.25, "per_token_feature_norm/p25": 125.5, "per_token_feature_norm/p75": 185.0, "per_token_feature_norm/var": 1893.6494140625, "per_token_full_gradient_variance/max_squared_error": 654.5240478515625, "per_token_full_gradient_variance/variance": 0.11780227720737457, "per_token_gradient_norm": 7.160384178161621, "per_token_gradient_norm/max": 7840.0390625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 14879.8154296875, "per_token_policy_error_norm": 0.05645674094557762, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04913795739412308, "policy_entropy": 0.11971291899681091, "policy_entropy/max": 3.71875, "policy_entropy/median": 5.125999450683594e-05, "policy_entropy/min": 6.927791673660977e-14, "policy_entropy/p25": 2.205371856689453e-06, "policy_entropy/p75": 0.018798828125, "policy_entropy/var": 0.0935632660984993, "policy_error_vector_variance/max_squared_error": 2.0164551734924316, "policy_error_vector_variance/metric": 0.056333597749471664, "policy_loss": -0.05996270477771759, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.8479926586151123, "policy_sharpness": 7.787384510040283, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.5625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.909744262695312, "reward": 0.7278646230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19833599030971527, "rewards/accuracy_reward": 0.7278646230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19833599030971527, "sentence_full_gradient_variance/max_squared_error": 785767.5625, "sentence_full_gradient_variance/metric": 4264.56396484375, "sentence_full_gradient_variance/p75": 89.95689392089844, "sentence_full_gradient_variance/p90": 212.2561798095703, "sentence_full_gradient_variance/p95": 212.2561798095703, "sentence_full_gradient_variance/p99": 95529.5625, "state_level_variance/metric": 235.65780639648438, "state_level_variance_full_gradient/metric": 492.3875732421875, "step": 58 }, { "accuracy_reward": 0.7174479365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20298071205615997, "action_level_variance/metric": 752.1105346679688, "action_level_variance_full_gradient/metric": 4123.578125, "adam_stats/lr_effective_max": 3.825082239927724e-05, "adam_stats/lr_effective_mean": 8.83130929496545e-11, "adam_stats/lr_effective_min": -3.633778032963164e-05, "adam_stats/m_t_max": 0.004748481325805187, "adam_stats/m_t_mean": 7.35225907932957e-11, "adam_stats/m_t_min": -0.0057347915135324, "adam_stats/v_t_max": 6.18568665231578e-05, "adam_stats/v_t_mean": 2.346769046451813e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.08792274445295334, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.438237190246582, "all_logprobs": -0.10004693269729614, "all_logprobs/max": 0.0, "all_logprobs/median": -3.337860107421875e-06, "all_logprobs/min": -13.125, "all_logprobs/p1": -2.09375, "all_logprobs/p10": -0.1787109375, "all_logprobs/p25": -0.00150299072265625, "all_logprobs/p5": -0.59765625, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.16589847207069397, "clip_ratio": 0.0, "completion_length": 538.4765625, "completion_length/correct": 436.3284912109375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 389.0, "completion_length/correct/min": 97.0, "completion_length/correct/p25": 280.0, "completion_length/correct/p75": 551.0, "completion_length/correct/var": 41181.9765625, "completion_length/incorrect": 797.847900390625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 909.0, "completion_length/incorrect/min": 181.0, "completion_length/incorrect/p25": 587.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 62594.16015625, "completion_length/max": 1024.0, "completion_length/median": 470.0, "completion_length/min": 97.0, "completion_length/p25": 308.75, "completion_length/p75": 719.25, "completion_length/var": 73687.1328125, "epoch": 0.7552, "feature_vector_variance/max_squared_error": 114825.3671875, "feature_vector_variance/metric": 23793.59375, "generated_tokens/total": 25306056.0, "grad_norm": 0.13355392217636108, "grouped_std_rewards": 0.20129838585853577, "learning_rate": 6.456201742799511e-06, "loss": 0.0879, "mean_logprobs": -0.1025390625, "mean_logprobs/var": 0.0040283203125, "num_completions/total": 45312, "per_sentence_gradient_norm": 5.421852111816406, "per_sentence_gradient_norm/max": 257.69036865234375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 25.215076446533203, "per_sentence_gradient_norm/p99": 142.16302490234375, "per_sentence_gradient_norm/var": 723.6561889648438, "per_token_feature_norm": 156.98886108398438, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 148.0, "per_token_feature_norm/min": 68.0, "per_token_feature_norm/p25": 124.5, "per_token_feature_norm/p75": 183.0, "per_token_feature_norm/var": 1811.7799072265625, "per_token_full_gradient_variance/max_squared_error": 336.77325439453125, "per_token_full_gradient_variance/variance": 0.07244843244552612, "per_token_gradient_norm": 6.65314245223999, "per_token_gradient_norm/max": 5587.61279296875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 8844.8701171875, "per_token_policy_error_norm": 0.053269658237695694, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04674682766199112, "policy_entropy": 0.11126121133565903, "policy_entropy/max": 3.71875, "policy_entropy/median": 4.8160552978515625e-05, "policy_entropy/min": 2.4980018054066022e-14, "policy_entropy/p25": 2.4139881134033203e-06, "policy_entropy/p75": 0.01171875, "policy_entropy/var": 0.08511596918106079, "policy_error_vector_variance/max_squared_error": 2.01055908203125, "policy_error_vector_variance/metric": 0.053177401423454285, "policy_loss": 0.08792273700237274, "policy_loss/max": 19.79339599609375, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.438237190246582, "policy_sharpness": 7.91417932510376, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.74609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.408442497253418, "reward": 0.7174479365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20298071205615997, "rewards/accuracy_reward": 0.7174479365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20298071205615997, "sentence_full_gradient_variance/max_squared_error": 902870.0625, "sentence_full_gradient_variance/metric": 4699.126953125, "sentence_full_gradient_variance/p75": 32.7828369140625, "sentence_full_gradient_variance/p90": 40.70903015136719, "sentence_full_gradient_variance/p95": 40.70903015136719, "sentence_full_gradient_variance/p99": 107734.5703125, "state_level_variance/metric": 65.29751586914062, "state_level_variance_full_gradient/metric": 575.549072265625, "step": 59 }, { "accuracy_reward": 0.7278646230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19833597540855408, "action_level_variance/metric": 2130.002197265625, "action_level_variance_full_gradient/metric": 6868.21875, "adam_stats/lr_effective_max": 3.527920853230171e-05, "adam_stats/lr_effective_mean": 1.248340181225771e-10, "adam_stats/lr_effective_min": -3.6739904317073524e-05, "adam_stats/m_t_max": 0.004269413184374571, "adam_stats/m_t_mean": 6.990190370981253e-11, "adam_stats/m_t_min": -0.005217770114541054, "adam_stats/v_t_max": 6.181123899295926e-05, "adam_stats/v_t_mean": 2.3467651433239922e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.03791451454162598, "advantages/max": 9.659051895141602, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.8475496768951416, "all_logprobs": -0.0981607437133789, "all_logprobs/max": 0.0, "all_logprobs/median": -2.2649765014648438e-06, "all_logprobs/min": -13.375, "all_logprobs/p1": -2.03125, "all_logprobs/p10": -0.173828125, "all_logprobs/p25": -0.001556396484375, "all_logprobs/p5": -0.58984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.15996547043323517, "clip_ratio": 0.0, "completion_length": 523.3294677734375, "completion_length/correct": 434.9534912109375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 388.0, "completion_length/correct/min": 82.0, "completion_length/correct/p25": 275.5, "completion_length/correct/p75": 546.0, "completion_length/correct/var": 44485.04296875, "completion_length/incorrect": 759.7033081054688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 787.0, "completion_length/incorrect/min": 141.0, "completion_length/incorrect/p25": 518.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 70682.78125, "completion_length/max": 1024.0, "completion_length/median": 446.0, "completion_length/min": 82.0, "completion_length/p25": 316.0, "completion_length/p75": 695.5, "completion_length/var": 72448.515625, "epoch": 0.768, "feature_vector_variance/max_squared_error": 103486.0859375, "feature_vector_variance/metric": 24878.173828125, "generated_tokens/total": 25707972.0, "grad_norm": 0.12442989647388458, "grouped_std_rewards": 0.2048044055700302, "learning_rate": 6.197638667498023e-06, "loss": 0.0379, "mean_logprobs": -0.09814453125, "mean_logprobs/var": 0.003875732421875, "num_completions/total": 46080, "per_sentence_gradient_norm": 6.320703029632568, "per_sentence_gradient_norm/max": 962.8082885742188, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 20.489913940429688, "per_sentence_gradient_norm/p99": 148.6063690185547, "per_sentence_gradient_norm/var": 2092.775390625, "per_token_feature_norm": 159.51564025878906, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 127.5, "per_token_feature_norm/p75": 186.0, "per_token_feature_norm/var": 1723.3779296875, "per_token_full_gradient_variance/max_squared_error": 409.4563293457031, "per_token_full_gradient_variance/variance": 0.07552961260080338, "per_token_gradient_norm": 6.333585262298584, "per_token_gradient_norm/max": 6502.12939453125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 9425.40234375, "per_token_policy_error_norm": 0.052498213946819305, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.045818030834198, "policy_entropy": 0.10997019708156586, "policy_entropy/max": 3.734375, "policy_entropy/median": 3.314018249511719e-05, "policy_entropy/min": 2.6201263381153694e-14, "policy_entropy/p25": 1.3262033462524414e-06, "policy_entropy/p75": 0.0125732421875, "policy_entropy/var": 0.08264867961406708, "policy_error_vector_variance/max_squared_error": 2.013721227645874, "policy_error_vector_variance/metric": 0.05240803211927414, "policy_loss": 0.03791450709104538, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.8475499153137207, "policy_sharpness": 7.906859397888184, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.49609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.410331726074219, "reward": 0.7278646230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19833597540855408, "rewards/accuracy_reward": 0.7278646230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19833597540855408, "sentence_full_gradient_variance/max_squared_error": 1810145.625, "sentence_full_gradient_variance/metric": 7772.17138671875, "sentence_full_gradient_variance/p75": 125.70977783203125, "sentence_full_gradient_variance/p90": 182.1924591064453, "sentence_full_gradient_variance/p95": 182.1924591064453, "sentence_full_gradient_variance/p99": 145165.671875, "state_level_variance/metric": 228.68106079101562, "state_level_variance_full_gradient/metric": 903.9532470703125, "step": 60 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18774445354938507, "action_level_variance/metric": 218.78768920898438, "action_level_variance_full_gradient/metric": 2349.26171875, "adam_stats/lr_effective_max": 3.358353933435865e-05, "adam_stats/lr_effective_mean": 1.4948782489643264e-10, "adam_stats/lr_effective_min": -3.343458956805989e-05, "adam_stats/m_t_max": 0.004654239863157272, "adam_stats/m_t_mean": 7.133309914975072e-11, "adam_stats/m_t_min": -0.005989938508719206, "adam_stats/v_t_max": 6.17594996583648e-05, "adam_stats/v_t_mean": 2.350255190117223e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.01111244410276413, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.9597453474998474, "all_logprobs": -0.09788238257169724, "all_logprobs/max": 0.0, "all_logprobs/median": -2.5033950805664062e-06, "all_logprobs/min": -12.625, "all_logprobs/p1": -2.03125, "all_logprobs/p10": -0.173828125, "all_logprobs/p25": -0.00150299072265625, "all_logprobs/p5": -0.58203125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.15783390402793884, "clip_ratio": 0.0, "completion_length": 522.40234375, "completion_length/correct": 436.8993225097656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 418.0, "completion_length/correct/min": 101.0, "completion_length/correct/p25": 295.0, "completion_length/correct/p75": 537.25, "completion_length/correct/var": 32619.390625, "completion_length/incorrect": 778.9114990234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 834.0, "completion_length/incorrect/min": 242.0, "completion_length/incorrect/p25": 567.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 60933.99609375, "completion_length/max": 1024.0, "completion_length/median": 473.0, "completion_length/min": 101.0, "completion_length/p25": 319.0, "completion_length/p75": 652.0, "completion_length/var": 61588.734375, "epoch": 0.7808, "feature_vector_variance/max_squared_error": 111870.34375, "feature_vector_variance/metric": 24275.6875, "generated_tokens/total": 26109178.0, "grad_norm": 0.11464573442935944, "grouped_std_rewards": 0.1565883755683899, "learning_rate": 5.9406623188668065e-06, "loss": -0.0111, "mean_logprobs": -0.0986328125, "mean_logprobs/var": 0.003997802734375, "num_completions/total": 46848, "per_sentence_gradient_norm": 2.644235610961914, "per_sentence_gradient_norm/max": 192.3144073486328, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 84.59402465820312, "per_sentence_gradient_norm/var": 212.0718536376953, "per_token_feature_norm": 157.68186950683594, "per_token_feature_norm/max": 344.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 125.5, "per_token_feature_norm/p75": 184.0, "per_token_feature_norm/var": 1740.00927734375, "per_token_full_gradient_variance/max_squared_error": 55.36688995361328, "per_token_full_gradient_variance/variance": 0.018036728724837303, "per_token_gradient_norm": 3.002389669418335, "per_token_gradient_norm/max": 3095.575439453125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2206.92333984375, "per_token_policy_error_norm": 0.052686359733343124, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0462002232670784, "policy_entropy": 0.10930866003036499, "policy_entropy/max": 3.765625, "policy_entropy/median": 3.600120544433594e-05, "policy_entropy/min": 7.93809462606987e-15, "policy_entropy/p25": 1.5869736671447754e-06, "policy_entropy/p75": 0.0115966796875, "policy_entropy/var": 0.08153488487005234, "policy_error_vector_variance/max_squared_error": 2.014388084411621, "policy_error_vector_variance/metric": 0.05261431261897087, "policy_loss": -0.01111244410276413, "policy_loss/max": 9.659050941467285, "policy_loss/median": 0.0, "policy_loss/min": -7.481915473937988, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.9597452878952026, "policy_sharpness": 7.927286624908447, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.74609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.316604614257812, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.18774445354938507, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18774445354938507, "sentence_full_gradient_variance/max_squared_error": 288533.4375, "sentence_full_gradient_variance/metric": 2652.677734375, "sentence_full_gradient_variance/p75": 73.25362396240234, "sentence_full_gradient_variance/p90": 84.1054458618164, "sentence_full_gradient_variance/p95": 84.1054458618164, "sentence_full_gradient_variance/p99": 92343.3203125, "state_level_variance/metric": 20.570755004882812, "state_level_variance_full_gradient/metric": 303.416015625, "step": 61 }, { "accuracy_reward": 0.7317708730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19653818011283875, "action_level_variance/metric": 466.6175231933594, "action_level_variance_full_gradient/metric": 4749.6923828125, "adam_stats/lr_effective_max": 3.388355253264308e-05, "adam_stats/lr_effective_mean": 1.1531463284253363e-10, "adam_stats/lr_effective_min": -3.38681747962255e-05, "adam_stats/m_t_max": 0.004905978683382273, "adam_stats/m_t_mean": 6.120087769900806e-11, "adam_stats/m_t_min": -0.0067093041725456715, "adam_stats/v_t_max": 6.169799598865211e-05, "adam_stats/v_t_mean": 2.3548639167120244e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.03488830476999283, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.808746576309204, "all_logprobs": -0.11217794567346573, "all_logprobs/max": 0.0, "all_logprobs/median": -3.814697265625e-06, "all_logprobs/min": -12.125, "all_logprobs/p1": -2.1875, "all_logprobs/p10": -0.2265625, "all_logprobs/p25": -0.003173828125, "all_logprobs/p5": -0.6953125, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.1859704554080963, "clip_ratio": 0.0, "completion_length": 507.125, "completion_length/correct": 442.487548828125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 417.0, "completion_length/correct/min": 105.0, "completion_length/correct/p25": 292.0, "completion_length/correct/p75": 537.0, "completion_length/correct/var": 38251.20703125, "completion_length/incorrect": 683.4660034179688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 659.0, "completion_length/incorrect/min": 130.0, "completion_length/incorrect/p25": 431.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 79659.015625, "completion_length/max": 1024.0, "completion_length/median": 448.0, "completion_length/min": 105.0, "completion_length/p25": 321.75, "completion_length/p75": 643.25, "completion_length/var": 60681.7109375, "epoch": 0.7936, "feature_vector_variance/max_squared_error": 100867.4921875, "feature_vector_variance/metric": 24745.966796875, "generated_tokens/total": 26498650.0, "grad_norm": 0.1613021045923233, "grouped_std_rewards": 0.19319918751716614, "learning_rate": 5.685585783002493e-06, "loss": 0.0349, "mean_logprobs": -0.1103515625, "mean_logprobs/var": 0.005828857421875, "num_completions/total": 47616, "per_sentence_gradient_norm": 4.251513957977295, "per_sentence_gradient_norm/max": 208.0381317138672, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 22.833858489990234, "per_sentence_gradient_norm/p99": 119.61213684082031, "per_sentence_gradient_norm/var": 449.1269836425781, "per_token_feature_norm": 159.45848083496094, "per_token_feature_norm/max": 336.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 68.0, "per_token_feature_norm/p25": 126.0, "per_token_feature_norm/p75": 187.0, "per_token_feature_norm/var": 1872.822998046875, "per_token_full_gradient_variance/max_squared_error": 425.9524230957031, "per_token_full_gradient_variance/variance": 0.060999903827905655, "per_token_gradient_norm": 5.12636137008667, "per_token_gradient_norm/max": 6225.02197265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6071.70556640625, "per_token_policy_error_norm": 0.05940643325448036, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05129179358482361, "policy_entropy": 0.12493297457695007, "policy_entropy/max": 3.75, "policy_entropy/median": 5.435943603515625e-05, "policy_entropy/min": 5.906386491005833e-14, "policy_entropy/p25": 2.130866050720215e-06, "policy_entropy/p75": 0.0223388671875, "policy_entropy/var": 0.09780072420835495, "policy_error_vector_variance/max_squared_error": 2.016296863555908, "policy_error_vector_variance/metric": 0.05930372327566147, "policy_loss": 0.03488830104470253, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -7.4819159507751465, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.808746576309204, "policy_sharpness": 7.729403972625732, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.48614501953125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.142126083374023, "reward": 0.7317708730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19653818011283875, "rewards/accuracy_reward": 0.7317708730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19653818011283875, "sentence_full_gradient_variance/max_squared_error": 551390.625, "sentence_full_gradient_variance/metric": 5366.01904296875, "sentence_full_gradient_variance/p75": 153.33877563476562, "sentence_full_gradient_variance/p90": 237.35321044921875, "sentence_full_gradient_variance/p95": 237.35321044921875, "sentence_full_gradient_variance/p99": 147348.0625, "state_level_variance/metric": 40.67552185058594, "state_level_variance_full_gradient/metric": 616.3265991210938, "step": 62 }, { "accuracy_reward": 0.7734375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17546039819717407, "action_level_variance/metric": 4949.3310546875, "action_level_variance_full_gradient/metric": 9855.201171875, "adam_stats/lr_effective_max": 3.375588858034462e-05, "adam_stats/lr_effective_mean": 8.581915039718169e-11, "adam_stats/lr_effective_min": -3.293560439487919e-05, "adam_stats/m_t_max": 0.004339086823165417, "adam_stats/m_t_mean": 6.855012391060455e-11, "adam_stats/m_t_min": -0.005629437975585461, "adam_stats/v_t_max": 6.171354471007362e-05, "adam_stats/v_t_mean": 2.379845886330001e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.1503308266401291, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 4.039684295654297, "all_logprobs": -0.10452745109796524, "all_logprobs/max": 0.0, "all_logprobs/median": -2.9802322387695312e-06, "all_logprobs/min": -10.375, "all_logprobs/p1": -2.125, "all_logprobs/p10": -0.1982421875, "all_logprobs/p25": -0.00193023681640625, "all_logprobs/p5": -0.63671875, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.17330972850322723, "clip_ratio": 0.0, "completion_length": 504.3411560058594, "completion_length/correct": 436.8771057128906, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 386.0, "completion_length/correct/min": 78.0, "completion_length/correct/p25": 291.0, "completion_length/correct/p75": 559.0, "completion_length/correct/var": 40472.640625, "completion_length/incorrect": 734.6494140625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 813.0, "completion_length/incorrect/min": 133.0, "completion_length/incorrect/p25": 466.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 84359.8046875, "completion_length/max": 1024.0, "completion_length/median": 427.0, "completion_length/min": 78.0, "completion_length/p25": 304.0, "completion_length/p75": 651.75, "completion_length/var": 65876.5859375, "epoch": 0.8064, "feature_vector_variance/max_squared_error": 108286.609375, "feature_vector_variance/metric": 24573.359375, "generated_tokens/total": 26885984.0, "grad_norm": 0.2933928072452545, "grouped_std_rewards": 0.1947314739227295, "learning_rate": 5.432719831372507e-06, "loss": -0.1503, "mean_logprobs": -0.1064453125, "mean_logprobs/var": 0.00537109375, "num_completions/total": 48384, "per_sentence_gradient_norm": 8.803719520568848, "per_sentence_gradient_norm/max": 1146.0130615234375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 18.56193733215332, "per_sentence_gradient_norm/p99": 198.7168731689453, "per_sentence_gradient_norm/var": 4878.177734375, "per_token_feature_norm": 158.79727172851562, "per_token_feature_norm/max": 340.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 126.0, "per_token_feature_norm/p75": 185.0, "per_token_feature_norm/var": 1825.892333984375, "per_token_full_gradient_variance/max_squared_error": 775.5554809570312, "per_token_full_gradient_variance/variance": 0.16669861972332, "per_token_gradient_norm": 10.18667984008789, "per_token_gradient_norm/max": 7810.96728515625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 25705.16796875, "per_token_policy_error_norm": 0.05548298358917236, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04845726490020752, "policy_entropy": 0.11642183363437653, "policy_entropy/max": 3.78125, "policy_entropy/median": 4.315376281738281e-05, "policy_entropy/min": 1.787459069646502e-14, "policy_entropy/p25": 1.8477439880371094e-06, "policy_entropy/p75": 0.01434326171875, "policy_entropy/var": 0.09019078314304352, "policy_error_vector_variance/max_squared_error": 2.013223886489868, "policy_error_vector_variance/metric": 0.055396027863025665, "policy_loss": -0.1503308266401291, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 4.039684295654297, "policy_sharpness": 7.862528324127197, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.24609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.674659729003906, "reward": 0.7734375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17546039819717407, "rewards/accuracy_reward": 0.7734375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17546039819717407, "sentence_full_gradient_variance/max_squared_error": 1934448.0, "sentence_full_gradient_variance/metric": 11035.576171875, "sentence_full_gradient_variance/p75": 421.3467102050781, "sentence_full_gradient_variance/p90": 890.2626953125, "sentence_full_gradient_variance/p95": 890.2626953125, "sentence_full_gradient_variance/p99": 259586.75, "state_level_variance/metric": 546.8573608398438, "state_level_variance_full_gradient/metric": 1180.3741455078125, "step": 63 }, { "accuracy_reward": 0.6901041865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21413923799991608, "action_level_variance/metric": 1094.231689453125, "action_level_variance_full_gradient/metric": 8431.1513671875, "adam_stats/lr_effective_max": 3.004846985277254e-05, "adam_stats/lr_effective_mean": 1.0621536983279611e-10, "adam_stats/lr_effective_min": -2.906613372033462e-05, "adam_stats/m_t_max": 0.003694606712087989, "adam_stats/m_t_mean": 7.37289326813162e-11, "adam_stats/m_t_min": -0.0048528709448874, "adam_stats/v_t_max": 6.166371895233169e-05, "adam_stats/v_t_mean": 2.385610372440672e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.1473710536956787, "advantages/max": 7.48191499710083, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.276237726211548, "all_logprobs": -0.09988126903772354, "all_logprobs/max": 0.0, "all_logprobs/median": -2.86102294921875e-06, "all_logprobs/min": -10.625, "all_logprobs/p1": -2.0625, "all_logprobs/p10": -0.1806640625, "all_logprobs/p25": -0.001861572265625, "all_logprobs/p5": -0.6015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.16157421469688416, "clip_ratio": 0.0, "completion_length": 509.46875, "completion_length/correct": 437.3132019042969, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 420.0, "completion_length/correct/min": 80.0, "completion_length/correct/p25": 306.0, "completion_length/correct/p75": 535.0, "completion_length/correct/var": 33235.0078125, "completion_length/incorrect": 670.1513061523438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 653.0, "completion_length/incorrect/min": 153.0, "completion_length/incorrect/p25": 431.0, "completion_length/incorrect/p75": 995.25, "completion_length/incorrect/var": 81771.6875, "completion_length/max": 1024.0, "completion_length/median": 456.0, "completion_length/min": 80.0, "completion_length/p25": 332.75, "completion_length/p75": 627.25, "completion_length/var": 59798.578125, "epoch": 0.8192, "feature_vector_variance/max_squared_error": 107438.46875, "feature_vector_variance/metric": 24739.54296875, "generated_tokens/total": 27277256.0, "grad_norm": 0.15830159187316895, "grouped_std_rewards": 0.2241913378238678, "learning_rate": 5.182372542187895e-06, "loss": 0.1474, "mean_logprobs": -0.10107421875, "mean_logprobs/var": 0.0040283203125, "num_completions/total": 49152, "per_sentence_gradient_norm": 6.13263463973999, "per_sentence_gradient_norm/max": 363.2262878417969, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 31.305316925048828, "per_sentence_gradient_norm/p99": 182.71237182617188, "per_sentence_gradient_norm/var": 1058.0001220703125, "per_token_feature_norm": 159.25213623046875, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 127.5, "per_token_feature_norm/p75": 186.0, "per_token_feature_norm/var": 1721.455078125, "per_token_full_gradient_variance/max_squared_error": 155.64859008789062, "per_token_full_gradient_variance/variance": 0.07460957765579224, "per_token_gradient_norm": 6.967105865478516, "per_token_gradient_norm/max": 6275.27880859375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 9017.3837890625, "per_token_policy_error_norm": 0.053547170013189316, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04688069969415665, "policy_entropy": 0.1114596426486969, "policy_entropy/max": 3.78125, "policy_entropy/median": 4.124641418457031e-05, "policy_entropy/min": 7.638334409421077e-14, "policy_entropy/p25": 1.519918441772461e-06, "policy_entropy/p75": 0.01416015625, "policy_entropy/var": 0.08218415081501007, "policy_error_vector_variance/max_squared_error": 2.017174243927002, "policy_error_vector_variance/metric": 0.05348015949130058, "policy_loss": 0.1473710536956787, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -7.481915473937988, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.276237726211548, "policy_sharpness": 7.877426624298096, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.24609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.481554985046387, "reward": 0.6901041865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21413923799991608, "rewards/accuracy_reward": 0.6901041865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21413923799991608, "sentence_full_gradient_variance/max_squared_error": 1352594.25, "sentence_full_gradient_variance/metric": 9523.251953125, "sentence_full_gradient_variance/p75": 172.20181274414062, "sentence_full_gradient_variance/p90": 433.73046875, "sentence_full_gradient_variance/p95": 8468.732421875, "sentence_full_gradient_variance/p99": 174578.53125, "state_level_variance/metric": 100.21366119384766, "state_level_variance_full_gradient/metric": 1092.100341796875, "step": 64 }, { "accuracy_reward": 0.7734375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17546039819717407, "action_level_variance/metric": 433.29248046875, "action_level_variance_full_gradient/metric": 2999.97998046875, "adam_stats/lr_effective_max": 2.711102570174262e-05, "adam_stats/lr_effective_mean": 3.443981905215665e-11, "adam_stats/lr_effective_min": -2.749733903328888e-05, "adam_stats/m_t_max": 0.0031360990833491087, "adam_stats/m_t_mean": 7.853358935383525e-11, "adam_stats/m_t_min": -0.004495757631957531, "adam_stats/v_t_max": 6.165089143905789e-05, "adam_stats/v_t_mean": 2.394938847932737e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.00668209046125412, "advantages/max": 9.659051895141602, "advantages/median": -0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.8696708679199219, "all_logprobs": -0.10619208961725235, "all_logprobs/max": 0.0, "all_logprobs/median": -3.2186508178710938e-06, "all_logprobs/min": -11.1875, "all_logprobs/p1": -2.15625, "all_logprobs/p10": -0.201171875, "all_logprobs/p25": -0.00213623046875, "all_logprobs/p5": -0.6484375, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.17846593260765076, "clip_ratio": 0.0, "completion_length": 498.52996826171875, "completion_length/correct": 417.1111145019531, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 385.0, "completion_length/correct/min": 52.0, "completion_length/correct/p25": 287.0, "completion_length/correct/p75": 518.75, "completion_length/correct/var": 35361.5234375, "completion_length/incorrect": 776.4769897460938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 824.0, "completion_length/incorrect/min": 178.0, "completion_length/incorrect/p25": 571.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 64493.29296875, "completion_length/max": 1024.0, "completion_length/median": 444.0, "completion_length/min": 52.0, "completion_length/p25": 307.75, "completion_length/p75": 619.5, "completion_length/var": 64545.84765625, "epoch": 0.832, "feature_vector_variance/max_squared_error": 106555.7265625, "feature_vector_variance/metric": 25133.1875, "generated_tokens/total": 27660128.0, "grad_norm": 0.17904338240623474, "grouped_std_rewards": 0.17368456721305847, "learning_rate": 4.934848925057485e-06, "loss": -0.0067, "mean_logprobs": -0.107421875, "mean_logprobs/var": 0.006927490234375, "num_completions/total": 49920, "per_sentence_gradient_norm": 3.7143306732177734, "per_sentence_gradient_norm/max": 259.0080871582031, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 96.69074249267578, "per_sentence_gradient_norm/var": 420.04315185546875, "per_token_feature_norm": 160.68763732910156, "per_token_feature_norm/max": 336.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 62.25, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 187.0, "per_token_feature_norm/var": 1841.20458984375, "per_token_full_gradient_variance/max_squared_error": 141.15274047851562, "per_token_full_gradient_variance/variance": 0.045489899814128876, "per_token_gradient_norm": 4.874124050140381, "per_token_gradient_norm/max": 4666.74560546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5691.30126953125, "per_token_policy_error_norm": 0.05598212406039238, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04881250113248825, "policy_entropy": 0.1185789555311203, "policy_entropy/max": 3.734375, "policy_entropy/median": 4.5299530029296875e-05, "policy_entropy/min": 6.765421556309548e-16, "policy_entropy/p25": 1.7434358596801758e-06, "policy_entropy/p75": 0.0159912109375, "policy_entropy/var": 0.09319404512643814, "policy_error_vector_variance/max_squared_error": 2.013366460800171, "policy_error_vector_variance/metric": 0.05587618052959442, "policy_loss": -0.006682089529931545, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.8696703910827637, "policy_sharpness": 7.825313091278076, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.86712646484375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.811202049255371, "reward": 0.7734375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17546039819717407, "rewards/accuracy_reward": 0.7734375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17546039819717407, "sentence_full_gradient_variance/max_squared_error": 570934.25, "sentence_full_gradient_variance/metric": 3357.28955078125, "sentence_full_gradient_variance/p75": 115.02239990234375, "sentence_full_gradient_variance/p90": 322.5362854003906, "sentence_full_gradient_variance/p95": 322.5362854003906, "sentence_full_gradient_variance/p99": 63627.83984375, "state_level_variance/metric": 40.79020690917969, "state_level_variance_full_gradient/metric": 357.30938720703125, "step": 65 }, { "accuracy_reward": 0.71484375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20410791039466858, "action_level_variance/metric": 242.2905731201172, "action_level_variance_full_gradient/metric": 2270.66943359375, "adam_stats/lr_effective_max": 2.689378561626654e-05, "adam_stats/lr_effective_mean": 5.513635381593218e-11, "adam_stats/lr_effective_min": -2.650812893989496e-05, "adam_stats/m_t_max": 0.0029918616637587547, "adam_stats/m_t_mean": 7.344769237249693e-11, "adam_stats/m_t_min": -0.0040026940405368805, "adam_stats/v_t_max": 6.160051270853728e-05, "adam_stats/v_t_mean": 2.3951958038476162e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.03167508542537689, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.2952799797058105, "all_logprobs": -0.10072005540132523, "all_logprobs/max": 0.0, "all_logprobs/median": -2.6226043701171875e-06, "all_logprobs/min": -11.3125, "all_logprobs/p1": -2.0625, "all_logprobs/p10": -0.1806640625, "all_logprobs/p25": -0.00170135498046875, "all_logprobs/p5": -0.61328125, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.16451360285282135, "clip_ratio": 0.0, "completion_length": 530.3411865234375, "completion_length/correct": 425.3315124511719, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 385.0, "completion_length/correct/min": 90.0, "completion_length/correct/p25": 269.0, "completion_length/correct/p75": 535.0, "completion_length/correct/var": 42274.2890625, "completion_length/incorrect": 793.5844116210938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 855.0, "completion_length/incorrect/min": 183.0, "completion_length/incorrect/p25": 621.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 62424.89453125, "completion_length/max": 1024.0, "completion_length/median": 468.0, "completion_length/min": 90.0, "completion_length/p25": 308.75, "completion_length/p75": 727.0, "completion_length/var": 75625.5859375, "epoch": 0.8448, "feature_vector_variance/max_squared_error": 114549.6484375, "feature_vector_variance/metric": 24307.44140625, "generated_tokens/total": 28067428.0, "grad_norm": 0.09769926965236664, "grouped_std_rewards": 0.15729478001594543, "learning_rate": 4.6904505493806595e-06, "loss": -0.0317, "mean_logprobs": -0.099609375, "mean_logprobs/var": 0.00506591796875, "num_completions/total": 50688, "per_sentence_gradient_norm": 2.7468581199645996, "per_sentence_gradient_norm/max": 169.60545349121094, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 87.17987060546875, "per_sentence_gradient_norm/var": 235.05142211914062, "per_token_feature_norm": 158.05868530273438, "per_token_feature_norm/max": 338.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 66.5, "per_token_feature_norm/p25": 126.5, "per_token_feature_norm/p75": 184.0, "per_token_feature_norm/var": 1705.4827880859375, "per_token_full_gradient_variance/max_squared_error": 107.12752532958984, "per_token_full_gradient_variance/variance": 0.026811450719833374, "per_token_gradient_norm": 3.574932813644409, "per_token_gradient_norm/max": 5284.712890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3276.403564453125, "per_token_policy_error_norm": 0.053632065653800964, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.046642888337373734, "policy_entropy": 0.11305569857358932, "policy_entropy/max": 3.703125, "policy_entropy/median": 3.743171691894531e-05, "policy_entropy/min": 1.48318857196017e-16, "policy_entropy/p25": 1.646578311920166e-06, "policy_entropy/p75": 0.01324462890625, "policy_entropy/var": 0.08643285930156708, "policy_error_vector_variance/max_squared_error": 2.0157651901245117, "policy_error_vector_variance/metric": 0.05354803428053856, "policy_loss": -0.031675081700086594, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.295279860496521, "policy_sharpness": 7.8908843994140625, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.49609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.522080421447754, "reward": 0.71484375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20410791039466858, "rewards/accuracy_reward": 0.71484375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20410791039466858, "sentence_full_gradient_variance/max_squared_error": 303589.03125, "sentence_full_gradient_variance/metric": 2558.20361328125, "sentence_full_gradient_variance/p75": 70.24543762207031, "sentence_full_gradient_variance/p90": 80.04395294189453, "sentence_full_gradient_variance/p95": 80.04395294189453, "sentence_full_gradient_variance/p99": 87593.0234375, "state_level_variance/metric": 22.980470657348633, "state_level_variance_full_gradient/metric": 287.5341796875, "step": 66 }, { "accuracy_reward": 0.71484375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20410792529582977, "action_level_variance/metric": 892.0700073242188, "action_level_variance_full_gradient/metric": 8485.4521484375, "adam_stats/lr_effective_max": 2.5525761884637177e-05, "adam_stats/lr_effective_mean": 2.578958227528183e-11, "adam_stats/lr_effective_min": -2.566455805208534e-05, "adam_stats/m_t_max": 0.002737903967499733, "adam_stats/m_t_mean": 6.230291282882661e-11, "adam_stats/m_t_min": -0.0041212234646081924, "adam_stats/v_t_max": 6.160680641187355e-05, "adam_stats/v_t_mean": 2.398645735160465e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.02250199392437935, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 4.137567043304443, "all_logprobs": -0.10192302614450455, "all_logprobs/max": 0.0, "all_logprobs/median": -3.337860107421875e-06, "all_logprobs/min": -11.3125, "all_logprobs/p1": -2.102344512939453, "all_logprobs/p10": -0.189453125, "all_logprobs/p25": -0.00194549560546875, "all_logprobs/p5": -0.6328125, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.1635509580373764, "clip_ratio": 0.0, "completion_length": 519.3307495117188, "completion_length/correct": 424.6229553222656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 382.0, "completion_length/correct/min": 74.0, "completion_length/correct/p25": 283.0, "completion_length/correct/p75": 514.0, "completion_length/correct/var": 38917.58984375, "completion_length/incorrect": 756.7488403320312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 830.0, "completion_length/incorrect/min": 144.0, "completion_length/incorrect/p25": 556.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 78503.8046875, "completion_length/max": 1024.0, "completion_length/median": 439.0, "completion_length/min": 74.0, "completion_length/p25": 311.0, "completion_length/p75": 688.25, "completion_length/var": 72632.875, "epoch": 0.8576, "feature_vector_variance/max_squared_error": 103175.875, "feature_vector_variance/metric": 24622.962890625, "generated_tokens/total": 28466276.0, "grad_norm": 0.1466471403837204, "grouped_std_rewards": 0.20634835958480835, "learning_rate": 4.4494751769315e-06, "loss": 0.0225, "mean_logprobs": -0.09912109375, "mean_logprobs/var": 0.00433349609375, "num_completions/total": 51456, "per_sentence_gradient_norm": 5.616503715515137, "per_sentence_gradient_norm/max": 387.85528564453125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 28.20726203918457, "per_sentence_gradient_norm/p99": 161.6636962890625, "per_sentence_gradient_norm/var": 861.6468505859375, "per_token_feature_norm": 159.2957305908203, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 185.0, "per_token_feature_norm/var": 1671.0386962890625, "per_token_full_gradient_variance/max_squared_error": 573.8504028320312, "per_token_full_gradient_variance/variance": 0.0842636451125145, "per_token_gradient_norm": 7.15415096282959, "per_token_gradient_norm/max": 6643.1572265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 10419.853515625, "per_token_policy_error_norm": 0.054691608995199203, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04778126999735832, "policy_entropy": 0.11403946578502655, "policy_entropy/max": 3.375, "policy_entropy/median": 4.673004150390625e-05, "policy_entropy/min": 2.398081733190338e-14, "policy_entropy/p25": 1.996755599975586e-06, "policy_entropy/p75": 0.01507568359375, "policy_entropy/var": 0.0857180804014206, "policy_error_vector_variance/max_squared_error": 2.0129520893096924, "policy_error_vector_variance/metric": 0.05462294816970825, "policy_loss": 0.02250199019908905, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 4.137567043304443, "policy_sharpness": 7.855667591094971, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.11712646484375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.64202880859375, "reward": 0.71484375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20410792529582977, "rewards/accuracy_reward": 0.71484375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20410792529582977, "sentence_full_gradient_variance/max_squared_error": 1696231.875, "sentence_full_gradient_variance/metric": 9638.75390625, "sentence_full_gradient_variance/p75": 141.60520935058594, "sentence_full_gradient_variance/p90": 143.81436157226562, "sentence_full_gradient_variance/p95": 143.81436157226562, "sentence_full_gradient_variance/p99": 193681.46875, "state_level_variance/metric": 80.80537414550781, "state_level_variance_full_gradient/metric": 1153.300537109375, "step": 67 }, { "accuracy_reward": 0.72265625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2006855010986328, "action_level_variance/metric": 2632.76513671875, "action_level_variance_full_gradient/metric": 13131.2890625, "adam_stats/lr_effective_max": 2.3654409233131446e-05, "adam_stats/lr_effective_mean": 1.7496103524305973e-11, "adam_stats/lr_effective_min": -2.3715479983366095e-05, "adam_stats/m_t_max": 0.002371797803789377, "adam_stats/m_t_mean": 5.055111884644248e-11, "adam_stats/m_t_min": -0.0027935735415667295, "adam_stats/v_t_max": 6.158061296446249e-05, "adam_stats/v_t_mean": 2.3985080414845594e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.04171628877520561, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.8811793327331543, "all_logprobs": -0.10070203989744186, "all_logprobs/max": 0.0, "all_logprobs/median": -3.0994415283203125e-06, "all_logprobs/min": -10.5, "all_logprobs/p1": -2.0625, "all_logprobs/p10": -0.1796875, "all_logprobs/p25": -0.001800537109375, "all_logprobs/p5": -0.61328125, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.1658833771944046, "clip_ratio": 0.0, "completion_length": 512.5872802734375, "completion_length/correct": 419.1513671875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 372.0, "completion_length/correct/min": 68.0, "completion_length/correct/p25": 261.0, "completion_length/correct/p75": 532.0, "completion_length/correct/var": 46215.6015625, "completion_length/incorrect": 756.0469360351562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 803.0, "completion_length/incorrect/min": 99.0, "completion_length/incorrect/p25": 568.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 62295.63671875, "completion_length/max": 1024.0, "completion_length/median": 433.0, "completion_length/min": 68.0, "completion_length/p25": 291.25, "completion_length/p75": 716.0, "completion_length/var": 73377.421875, "epoch": 0.8704, "feature_vector_variance/max_squared_error": 112061.875, "feature_vector_variance/metric": 24176.708984375, "generated_tokens/total": 28859942.0, "grad_norm": 0.10964144766330719, "grouped_std_rewards": 0.17377319931983948, "learning_rate": 4.212216399081919e-06, "loss": 0.0417, "mean_logprobs": -0.099609375, "mean_logprobs/var": 0.00433349609375, "num_completions/total": 52224, "per_sentence_gradient_norm": 5.843713760375977, "per_sentence_gradient_norm/max": 1026.9110107421875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 10.091050148010254, "per_sentence_gradient_norm/p99": 123.82996368408203, "per_sentence_gradient_norm/var": 2602.004150390625, "per_token_feature_norm": 157.46986389160156, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 63.75, "per_token_feature_norm/p25": 127.0, "per_token_feature_norm/p75": 183.0, "per_token_feature_norm/var": 1616.327392578125, "per_token_full_gradient_variance/max_squared_error": 880.9561767578125, "per_token_full_gradient_variance/variance": 0.0999060571193695, "per_token_gradient_norm": 6.053894996643066, "per_token_gradient_norm/max": 7665.60986328125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 13465.3544921875, "per_token_policy_error_norm": 0.05373280495405197, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.047056593000888824, "policy_entropy": 0.11176695674657822, "policy_entropy/max": 3.75, "policy_entropy/median": 4.4345855712890625e-05, "policy_entropy/min": 8.881784197001252e-16, "policy_entropy/p25": 1.8775463104248047e-06, "policy_entropy/p75": 0.0140380859375, "policy_entropy/var": 0.0849524512887001, "policy_error_vector_variance/max_squared_error": 2.0119078159332275, "policy_error_vector_variance/metric": 0.053653500974178314, "policy_loss": 0.04171627759933472, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.8811793327331543, "policy_sharpness": 7.881523132324219, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.24609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.457846641540527, "reward": 0.72265625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2006855010986328, "rewards/accuracy_reward": 0.72265625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2006855010986328, "sentence_full_gradient_variance/max_squared_error": 4006748.0, "sentence_full_gradient_variance/metric": 14794.5966796875, "sentence_full_gradient_variance/p75": 347.81671142578125, "sentence_full_gradient_variance/p90": 864.5999145507812, "sentence_full_gradient_variance/p95": 864.5999145507812, "sentence_full_gradient_variance/p99": 158466.421875, "state_level_variance/metric": 298.05133056640625, "state_level_variance_full_gradient/metric": 1663.3074951171875, "step": 68 }, { "accuracy_reward": 0.6901041865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21413922309875488, "action_level_variance/metric": 1211.049560546875, "action_level_variance_full_gradient/metric": 4985.5078125, "adam_stats/lr_effective_max": 2.2895008441992104e-05, "adam_stats/lr_effective_mean": -2.077844667158768e-11, "adam_stats/lr_effective_min": -2.3662312742089853e-05, "adam_stats/m_t_max": 0.0024702998343855143, "adam_stats/m_t_mean": 4.135911405023229e-11, "adam_stats/m_t_min": -0.0025831330567598343, "adam_stats/v_t_max": 6.153000140329823e-05, "adam_stats/v_t_mean": 2.4061937338448747e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.016578128561377525, "advantages/max": 12.9586820602417, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.804189920425415, "all_logprobs": -0.10602201521396637, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-06, "all_logprobs/min": -12.625, "all_logprobs/p1": -2.140625, "all_logprobs/p10": -0.201171875, "all_logprobs/p25": -0.0022125244140625, "all_logprobs/p5": -0.63671875, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.18183684349060059, "clip_ratio": 0.0, "completion_length": 487.18359375, "completion_length/correct": 409.35284423828125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 356.0, "completion_length/correct/min": 66.0, "completion_length/correct/p25": 250.5, "completion_length/correct/p75": 541.0, "completion_length/correct/var": 44708.90234375, "completion_length/incorrect": 660.5042114257812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 650.0, "completion_length/incorrect/min": 90.0, "completion_length/incorrect/p25": 469.5, "completion_length/incorrect/p75": 870.5, "completion_length/incorrect/var": 69174.65625, "completion_length/max": 1024.0, "completion_length/median": 421.0, "completion_length/min": 66.0, "completion_length/p25": 283.0, "completion_length/p75": 655.5, "completion_length/var": 65717.6953125, "epoch": 0.8832, "feature_vector_variance/max_squared_error": 108230.921875, "feature_vector_variance/metric": 24602.96484375, "generated_tokens/total": 29234100.0, "grad_norm": 0.22485150396823883, "grouped_std_rewards": 0.16132251918315887, "learning_rate": 3.978963279105821e-06, "loss": -0.0166, "mean_logprobs": -0.10595703125, "mean_logprobs/var": 0.00592041015625, "num_completions/total": 52992, "per_sentence_gradient_norm": 5.103490352630615, "per_sentence_gradient_norm/max": 501.166015625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 135.06658935546875, "per_sentence_gradient_norm/var": 1186.5489501953125, "per_token_feature_norm": 158.76902770996094, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 56.75, "per_token_feature_norm/p25": 127.0, "per_token_feature_norm/p75": 184.0, "per_token_feature_norm/var": 1716.8455810546875, "per_token_full_gradient_variance/max_squared_error": 261.6153564453125, "per_token_full_gradient_variance/variance": 0.08311314135789871, "per_token_gradient_norm": 6.741937160491943, "per_token_gradient_norm/max": 6813.87548828125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 11485.736328125, "per_token_policy_error_norm": 0.055819492787122726, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04895208775997162, "policy_entropy": 0.11749886721372604, "policy_entropy/max": 3.734375, "policy_entropy/median": 4.9591064453125e-05, "policy_entropy/min": 6.83481049534862e-16, "policy_entropy/p25": 1.9222497940063477e-06, "policy_entropy/p75": 0.01708984375, "policy_entropy/var": 0.09038043022155762, "policy_error_vector_variance/max_squared_error": 2.012826919555664, "policy_error_vector_variance/metric": 0.0557246208190918, "policy_loss": -0.016578122973442078, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.804189920425415, "policy_sharpness": 7.814713954925537, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.74212646484375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.798893928527832, "reward": 0.6901041865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21413922309875488, "rewards/accuracy_reward": 0.6901041865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21413922309875488, "sentence_full_gradient_variance/max_squared_error": 917746.0, "sentence_full_gradient_variance/metric": 5649.0009765625, "sentence_full_gradient_variance/p75": 78.30414581298828, "sentence_full_gradient_variance/p90": 128.90985107421875, "sentence_full_gradient_variance/p95": 128.90985107421875, "sentence_full_gradient_variance/p99": 79155.4296875, "state_level_variance/metric": 126.65491485595703, "state_level_variance_full_gradient/metric": 663.4931640625, "step": 69 }, { "accuracy_reward": 0.7057291865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20794625580310822, "action_level_variance/metric": 643.501220703125, "action_level_variance_full_gradient/metric": 5686.0546875, "adam_stats/lr_effective_max": 2.1438496332848445e-05, "adam_stats/lr_effective_mean": 9.215565116571511e-11, "adam_stats/lr_effective_min": -2.2352809537551366e-05, "adam_stats/m_t_max": 0.002410593908280134, "adam_stats/m_t_mean": 5.142400047453144e-11, "adam_stats/m_t_min": -0.0028849253430962563, "adam_stats/v_t_max": 6.151603884063661e-05, "adam_stats/v_t_mean": 2.4170871467527055e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.008675414137542248, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.0462357997894287, "all_logprobs": -0.09166128188371658, "all_logprobs/max": 0.0, "all_logprobs/median": -2.0265579223632812e-06, "all_logprobs/min": -12.375, "all_logprobs/p1": -1.9140625, "all_logprobs/p10": -0.158203125, "all_logprobs/p25": -0.00127410888671875, "all_logprobs/p5": -0.546875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.14528781175613403, "clip_ratio": 0.0, "completion_length": 496.45574951171875, "completion_length/correct": 415.4649353027344, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 350.0, "completion_length/correct/min": 92.0, "completion_length/correct/p25": 272.0, "completion_length/correct/p75": 517.75, "completion_length/correct/var": 40734.53125, "completion_length/incorrect": 690.6902465820312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 665.0, "completion_length/incorrect/min": 135.0, "completion_length/incorrect/p25": 466.0, "completion_length/incorrect/p75": 1000.0, "completion_length/incorrect/var": 68738.3125, "completion_length/max": 1024.0, "completion_length/median": 419.0, "completion_length/min": 92.0, "completion_length/p25": 292.0, "completion_length/p75": 665.5, "completion_length/var": 64648.06640625, "epoch": 0.896, "feature_vector_variance/max_squared_error": 105995.59375, "feature_vector_variance/metric": 24248.427734375, "generated_tokens/total": 29615376.0, "grad_norm": 0.20628312230110168, "grouped_std_rewards": 0.19981597363948822, "learning_rate": 3.750000000000002e-06, "loss": -0.0087, "mean_logprobs": -0.0927734375, "mean_logprobs/var": 0.003631591796875, "num_completions/total": 53760, "per_sentence_gradient_norm": 4.2606916427612305, "per_sentence_gradient_norm/max": 426.9796142578125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 20.922237396240234, "per_sentence_gradient_norm/p99": 112.75196075439453, "per_sentence_gradient_norm/var": 626.1630249023438, "per_token_feature_norm": 157.76661682128906, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 68.0, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 183.0, "per_token_feature_norm/var": 1489.5341796875, "per_token_full_gradient_variance/max_squared_error": 512.2593994140625, "per_token_full_gradient_variance/variance": 0.06405975669622421, "per_token_gradient_norm": 5.345715045928955, "per_token_gradient_norm/max": 6589.65283203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 7739.98828125, "per_token_policy_error_norm": 0.04964079707860947, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.043489906936883926, "policy_entropy": 0.10271193087100983, "policy_entropy/max": 3.734375, "policy_entropy/median": 2.9206275939941406e-05, "policy_entropy/min": 1.7430501486614958e-14, "policy_entropy/p25": 1.1771917343139648e-06, "policy_entropy/p75": 0.01019287109375, "policy_entropy/var": 0.07283186912536621, "policy_error_vector_variance/max_squared_error": 2.017238140106201, "policy_error_vector_variance/metric": 0.049591775983572006, "policy_loss": -0.008675421588122845, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.046236276626587, "policy_sharpness": 7.980716705322266, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.99609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.975028038024902, "reward": 0.7057291865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20794625580310822, "rewards/accuracy_reward": 0.7057291865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20794625580310822, "sentence_full_gradient_variance/max_squared_error": 773278.0625, "sentence_full_gradient_variance/metric": 6394.24609375, "sentence_full_gradient_variance/p75": 180.3995361328125, "sentence_full_gradient_variance/p90": 377.9211120605469, "sentence_full_gradient_variance/p95": 377.9211120605469, "sentence_full_gradient_variance/p99": 200513.90625, "state_level_variance/metric": 62.939781188964844, "state_level_variance_full_gradient/metric": 708.191650390625, "step": 70 }, { "accuracy_reward": 0.7578125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1837719976902008, "action_level_variance/metric": 345.1279296875, "action_level_variance_full_gradient/metric": 6411.7783203125, "adam_stats/lr_effective_max": 2.053415846603457e-05, "adam_stats/lr_effective_mean": 7.044416439061507e-11, "adam_stats/lr_effective_min": -2.074026815535035e-05, "adam_stats/m_t_max": 0.002401878358796239, "adam_stats/m_t_mean": 4.979272202887408e-11, "adam_stats/m_t_min": -0.00375813408754766, "adam_stats/v_t_max": 6.148778629722074e-05, "adam_stats/v_t_mean": 2.4188717435286167e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.007886987179517746, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.9919397830963135, "all_logprobs": -0.08084549009799957, "all_logprobs/max": 0.0, "all_logprobs/median": -1.0728836059570312e-06, "all_logprobs/min": -12.5, "all_logprobs/p1": -1.828125, "all_logprobs/p10": -0.1103515625, "all_logprobs/p25": -0.0004405975341796875, "all_logprobs/p5": -0.462890625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1280677765607834, "clip_ratio": 0.0, "completion_length": 483.7734375, "completion_length/correct": 420.7903747558594, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 397.0, "completion_length/correct/min": 98.0, "completion_length/correct/p25": 303.0, "completion_length/correct/p75": 508.75, "completion_length/correct/var": 28504.833984375, "completion_length/incorrect": 680.8494873046875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 678.0, "completion_length/incorrect/min": 135.0, "completion_length/incorrect/p25": 424.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 83568.296875, "completion_length/max": 1024.0, "completion_length/median": 430.0, "completion_length/min": 98.0, "completion_length/p25": 313.0, "completion_length/p75": 593.25, "completion_length/var": 54177.57421875, "epoch": 0.9088, "feature_vector_variance/max_squared_error": 115264.6640625, "feature_vector_variance/metric": 24500.00390625, "generated_tokens/total": 29986916.0, "grad_norm": 0.1570151299238205, "grouped_std_rewards": 0.15637201070785522, "learning_rate": 3.525605518250964e-06, "loss": 0.0079, "mean_logprobs": -0.08056640625, "mean_logprobs/var": 0.0026397705078125, "num_completions/total": 54528, "per_sentence_gradient_norm": 2.992034673690796, "per_sentence_gradient_norm/max": 216.23257446289062, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 93.75485229492188, "per_sentence_gradient_norm/var": 336.61395263671875, "per_token_feature_norm": 159.0076446533203, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 155.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 130.0, "per_token_feature_norm/p75": 184.0, "per_token_feature_norm/var": 1399.7723388671875, "per_token_full_gradient_variance/max_squared_error": 325.7625427246094, "per_token_full_gradient_variance/variance": 0.04930388554930687, "per_token_gradient_norm": 4.010228633880615, "per_token_gradient_norm/max": 6122.18896484375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5651.9765625, "per_token_policy_error_norm": 0.04406271502375603, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03938364237546921, "policy_entropy": 0.08979105204343796, "policy_entropy/max": 3.75, "policy_entropy/median": 1.621246337890625e-05, "policy_entropy/min": 5.93275428784068e-16, "policy_entropy/p25": 5.811452865600586e-07, "policy_entropy/p75": 0.004119873046875, "policy_entropy/var": 0.0635220855474472, "policy_error_vector_variance/max_squared_error": 2.007056713104248, "policy_error_vector_variance/metric": 0.04401855915784836, "policy_loss": 0.007886983454227448, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.9919397830963135, "policy_sharpness": 8.195624351501465, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.75, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.128456115722656, "reward": 0.7578125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1837719976902008, "rewards/accuracy_reward": 0.7578125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1837719976902008, "sentence_full_gradient_variance/max_squared_error": 1794007.75, "sentence_full_gradient_variance/metric": 7265.15966796875, "sentence_full_gradient_variance/p75": 142.96719360351562, "sentence_full_gradient_variance/p90": 231.9871063232422, "sentence_full_gradient_variance/p95": 231.9871063232422, "sentence_full_gradient_variance/p99": 78098.671875, "state_level_variance/metric": 34.54859924316406, "state_level_variance_full_gradient/metric": 853.3812255859375, "step": 71 }, { "accuracy_reward": 0.76953125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17758414149284363, "action_level_variance/metric": 958.5744018554688, "action_level_variance_full_gradient/metric": 5735.009765625, "adam_stats/lr_effective_max": 1.9663457351271063e-05, "adam_stats/lr_effective_mean": 1.0446490894544525e-10, "adam_stats/lr_effective_min": -1.9466531739453785e-05, "adam_stats/m_t_max": 0.002391047077253461, "adam_stats/m_t_mean": 5.986827006365658e-11, "adam_stats/m_t_min": -0.0038095666095614433, "adam_stats/v_t_max": 6.142725032987073e-05, "adam_stats/v_t_mean": 2.425681183693129e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.043729107826948166, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.8901126384735107, "all_logprobs": -0.07486943900585175, "all_logprobs/max": 0.0, "all_logprobs/median": -7.152557373046875e-07, "all_logprobs/min": -12.0, "all_logprobs/p1": -1.71875, "all_logprobs/p10": -0.08935546875, "all_logprobs/p25": -0.0002956390380859375, "all_logprobs/p5": -0.412109375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11555832624435425, "clip_ratio": 0.0, "completion_length": 498.8294372558594, "completion_length/correct": 439.4839172363281, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 415.0, "completion_length/correct/min": 91.0, "completion_length/correct/p25": 286.5, "completion_length/correct/p75": 546.0, "completion_length/correct/var": 38479.34375, "completion_length/incorrect": 696.9830322265625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 707.0, "completion_length/incorrect/min": 76.0, "completion_length/incorrect/p25": 463.0, "completion_length/incorrect/p75": 984.0, "completion_length/incorrect/var": 69060.65625, "completion_length/max": 1024.0, "completion_length/median": 451.0, "completion_length/min": 76.0, "completion_length/p25": 316.75, "completion_length/p75": 637.75, "completion_length/var": 57221.38671875, "epoch": 0.9216, "feature_vector_variance/max_squared_error": 111092.2421875, "feature_vector_variance/metric": 24164.29296875, "generated_tokens/total": 30370016.0, "grad_norm": 0.2190813422203064, "grouped_std_rewards": 0.18973271548748016, "learning_rate": 3.3060532239694e-06, "loss": 0.0437, "mean_logprobs": -0.07421875, "mean_logprobs/var": 0.0021820068359375, "num_completions/total": 55296, "per_sentence_gradient_norm": 4.420442581176758, "per_sentence_gradient_norm/max": 555.4590454101562, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 16.049009323120117, "per_sentence_gradient_norm/p99": 118.30184173583984, "per_sentence_gradient_norm/var": 940.25830078125, "per_token_feature_norm": 158.40272521972656, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 155.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 130.0, "per_token_feature_norm/p75": 184.0, "per_token_feature_norm/var": 1307.7437744140625, "per_token_full_gradient_variance/max_squared_error": 1099.4935302734375, "per_token_full_gradient_variance/variance": 0.10603000968694687, "per_token_gradient_norm": 6.62750244140625, "per_token_gradient_norm/max": 6688.6201171875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 12666.4619140625, "per_token_policy_error_norm": 0.04115152359008789, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0364774651825428, "policy_entropy": 0.08344563096761703, "policy_entropy/max": 3.703125, "policy_entropy/median": 1.1444091796875e-05, "policy_entropy/min": 1.249000902703301e-16, "policy_entropy/p25": 4.3585896492004395e-07, "policy_entropy/p75": 0.0027618408203125, "policy_entropy/var": 0.0572807714343071, "policy_error_vector_variance/max_squared_error": 2.0126662254333496, "policy_error_vector_variance/metric": 0.04112313315272331, "policy_loss": 0.043729111552238464, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.8901126384735107, "policy_sharpness": 8.293004035949707, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.25, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.609370231628418, "reward": 0.76953125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17758414149284363, "rewards/accuracy_reward": 0.76953125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17758414149284363, "sentence_full_gradient_variance/max_squared_error": 1273276.375, "sentence_full_gradient_variance/metric": 6471.75390625, "sentence_full_gradient_variance/p75": 140.21585083007812, "sentence_full_gradient_variance/p90": 236.89996337890625, "sentence_full_gradient_variance/p95": 236.89996337890625, "sentence_full_gradient_variance/p99": 129907.203125, "state_level_variance/metric": 101.33708953857422, "state_level_variance_full_gradient/metric": 736.7445678710938, "step": 72 }, { "accuracy_reward": 0.7278646230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19833597540855408, "action_level_variance/metric": 436.9732360839844, "action_level_variance_full_gradient/metric": 6635.28515625, "adam_stats/lr_effective_max": 1.829114080464933e-05, "adam_stats/lr_effective_mean": 9.419653251852012e-11, "adam_stats/lr_effective_min": -1.700975735730026e-05, "adam_stats/m_t_max": 0.0021705389954149723, "adam_stats/m_t_mean": 5.527746663180899e-11, "adam_stats/m_t_min": -0.0034682827536016703, "adam_stats/v_t_max": 6.137551827123389e-05, "adam_stats/v_t_mean": 2.424127738820392e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.032293856143951416, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.1620352268218994, "all_logprobs": -0.07585111260414124, "all_logprobs/max": 0.0, "all_logprobs/median": -1.0728836059570312e-06, "all_logprobs/min": -12.0, "all_logprobs/p1": -1.7109375, "all_logprobs/p10": -0.091796875, "all_logprobs/p25": -0.00029754638671875, "all_logprobs/p5": -0.423828125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11877655982971191, "clip_ratio": 0.0, "completion_length": 529.2643432617188, "completion_length/correct": 447.1377258300781, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 412.0, "completion_length/correct/min": 50.0, "completion_length/correct/p25": 309.5, "completion_length/correct/p75": 541.0, "completion_length/correct/var": 35206.140625, "completion_length/incorrect": 748.9234008789062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 781.0, "completion_length/incorrect/min": 236.0, "completion_length/incorrect/p25": 539.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 66742.5234375, "completion_length/max": 1024.0, "completion_length/median": 463.0, "completion_length/min": 50.0, "completion_length/p25": 331.0, "completion_length/p75": 654.25, "completion_length/var": 61775.84375, "epoch": 0.9344, "feature_vector_variance/max_squared_error": 115084.3125, "feature_vector_variance/metric": 23754.056640625, "generated_tokens/total": 30776492.0, "grad_norm": 0.08594515919685364, "grouped_std_rewards": 0.17846564948558807, "learning_rate": 3.0916106078064522e-06, "loss": 0.0323, "mean_logprobs": -0.076171875, "mean_logprobs/var": 0.00177764892578125, "num_completions/total": 56064, "per_sentence_gradient_norm": 3.7131781578063965, "per_sentence_gradient_norm/max": 271.21905517578125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 91.37464904785156, "per_sentence_gradient_norm/var": 423.7372741699219, "per_token_feature_norm": 156.63946533203125, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 65.0, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 181.0, "per_token_feature_norm/var": 1337.023193359375, "per_token_full_gradient_variance/max_squared_error": 231.91311645507812, "per_token_full_gradient_variance/variance": 0.05602920055389404, "per_token_gradient_norm": 4.959705829620361, "per_token_gradient_norm/max": 7306.54541015625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 7061.966796875, "per_token_policy_error_norm": 0.04150184616446495, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.036630358546972275, "policy_entropy": 0.0846782997250557, "policy_entropy/max": 3.78125, "policy_entropy/median": 1.621246337890625e-05, "policy_entropy/min": 8.215650382226158e-15, "policy_entropy/p25": 7.413327693939209e-07, "policy_entropy/p75": 0.0028533935546875, "policy_entropy/var": 0.05904781073331833, "policy_error_vector_variance/max_squared_error": 2.0097768306732178, "policy_error_vector_variance/metric": 0.04147157073020935, "policy_loss": 0.03229385241866112, "policy_loss/max": 19.79339027404785, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.1620349884033203, "policy_sharpness": 8.283647537231445, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.25, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.669358253479004, "reward": 0.7278646230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19833597540855408, "rewards/accuracy_reward": 0.7278646230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19833597540855408, "sentence_full_gradient_variance/max_squared_error": 2535124.5, "sentence_full_gradient_variance/metric": 7505.02880859375, "sentence_full_gradient_variance/p75": 160.91758728027344, "sentence_full_gradient_variance/p90": 290.6499938964844, "sentence_full_gradient_variance/p95": 290.6499938964844, "sentence_full_gradient_variance/p99": 108283.46875, "state_level_variance/metric": 41.2637939453125, "state_level_variance_full_gradient/metric": 869.7425537109375, "step": 73 }, { "accuracy_reward": 0.75390625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18577350676059723, "action_level_variance/metric": 1484.025390625, "action_level_variance_full_gradient/metric": 7910.470703125, "adam_stats/lr_effective_max": 1.5600649931002408e-05, "adam_stats/lr_effective_mean": 1.0591534593817897e-10, "adam_stats/lr_effective_min": -1.5982692275429145e-05, "adam_stats/m_t_max": 0.0019176268251612782, "adam_stats/m_t_mean": 3.6379618068016484e-11, "adam_stats/m_t_min": -0.00275371759198606, "adam_stats/v_t_max": 6.145290535641834e-05, "adam_stats/v_t_mean": 2.429028115799592e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.007379847578704357, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.5639071464538574, "all_logprobs": -0.08603591471910477, "all_logprobs/max": 0.0, "all_logprobs/median": -1.3113021850585938e-06, "all_logprobs/min": -8.875, "all_logprobs/p1": -1.9140625, "all_logprobs/p10": -0.126953125, "all_logprobs/p25": -0.00077056884765625, "all_logprobs/p5": -0.4832038879394531, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1372797191143036, "clip_ratio": 0.0, "completion_length": 470.19921875, "completion_length/correct": 402.02935791015625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 373.0, "completion_length/correct/min": 121.0, "completion_length/correct/p25": 269.0, "completion_length/correct/p75": 485.5, "completion_length/correct/var": 33795.11328125, "completion_length/incorrect": 679.0369873046875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 678.0, "completion_length/incorrect/min": 166.0, "completion_length/incorrect/p25": 372.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 93409.4140625, "completion_length/max": 1024.0, "completion_length/median": 398.0, "completion_length/min": 121.0, "completion_length/p25": 284.0, "completion_length/p75": 596.25, "completion_length/var": 62618.16796875, "epoch": 0.9472, "feature_vector_variance/max_squared_error": 113718.046875, "feature_vector_variance/metric": 24718.84375, "generated_tokens/total": 31137604.0, "grad_norm": 0.1894468367099762, "grouped_std_rewards": 0.17293940484523773, "learning_rate": 2.882538935057563e-06, "loss": 0.0074, "mean_logprobs": -0.0849609375, "mean_logprobs/var": 0.00341796875, "num_completions/total": 56832, "per_sentence_gradient_norm": 5.777256011962891, "per_sentence_gradient_norm/max": 715.2738647460938, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 172.03170776367188, "per_sentence_gradient_norm/var": 1452.5400390625, "per_token_feature_norm": 159.2666015625, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 155.0, "per_token_feature_norm/min": 65.0, "per_token_feature_norm/p25": 130.0, "per_token_feature_norm/p75": 185.0, "per_token_feature_norm/var": 1445.3597412109375, "per_token_full_gradient_variance/max_squared_error": 653.6185302734375, "per_token_full_gradient_variance/variance": 0.10352899134159088, "per_token_gradient_norm": 7.135194301605225, "per_token_gradient_norm/max": 6482.3359375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 12761.556640625, "per_token_policy_error_norm": 0.046724144369363785, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.041677966713905334, "policy_entropy": 0.095438651740551, "policy_entropy/max": 3.703125, "policy_entropy/median": 2.0384788513183594e-05, "policy_entropy/min": 1.2212453270876722e-15, "policy_entropy/p25": 7.078051567077637e-07, "policy_entropy/p75": 0.006591796875, "policy_entropy/var": 0.06815607845783234, "policy_error_vector_variance/max_squared_error": 2.0105934143066406, "policy_error_vector_variance/metric": 0.0466676726937294, "policy_loss": 0.007379844784736633, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.5639071464538574, "policy_sharpness": 8.089951515197754, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.527091979980469, "reward": 0.75390625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18577350676059723, "rewards/accuracy_reward": 0.75390625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18577350676059723, "sentence_full_gradient_variance/max_squared_error": 2715249.75, "sentence_full_gradient_variance/metric": 8957.994140625, "sentence_full_gradient_variance/p75": 250.868408203125, "sentence_full_gradient_variance/p90": 329.4015808105469, "sentence_full_gradient_variance/p95": 329.4015808105469, "sentence_full_gradient_variance/p99": 88247.5234375, "state_level_variance/metric": 153.72781372070312, "state_level_variance_full_gradient/metric": 1047.523681640625, "step": 74 }, { "accuracy_reward": 0.7408854365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19222451746463776, "action_level_variance/metric": 1188.285888671875, "action_level_variance_full_gradient/metric": 6196.3291015625, "adam_stats/lr_effective_max": 1.3762339222012088e-05, "adam_stats/lr_effective_mean": 8.93329923923325e-11, "adam_stats/lr_effective_min": -1.4110091797192581e-05, "adam_stats/m_t_max": 0.001772021991200745, "adam_stats/m_t_mean": 3.338849682843481e-11, "adam_stats/m_t_min": -0.002600416075438261, "adam_stats/v_t_max": 6.139225297374651e-05, "adam_stats/v_t_mean": 2.4271630712224823e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.026432838290929794, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.6924312114715576, "all_logprobs": -0.09744370728731155, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-06, "all_logprobs/min": -10.375, "all_logprobs/p1": -2.015625, "all_logprobs/p10": -0.1708984375, "all_logprobs/p25": -0.00164031982421875, "all_logprobs/p5": -0.58203125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.15816880762577057, "clip_ratio": 0.0, "completion_length": 467.68359375, "completion_length/correct": 390.3128356933594, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 357.0, "completion_length/correct/min": 68.0, "completion_length/correct/p25": 249.0, "completion_length/correct/p75": 498.0, "completion_length/correct/var": 35886.078125, "completion_length/incorrect": 688.9095458984375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 682.0, "completion_length/incorrect/min": 64.0, "completion_length/incorrect/p25": 467.0, "completion_length/incorrect/p75": 984.0, "completion_length/incorrect/var": 66323.3046875, "completion_length/max": 1024.0, "completion_length/median": 418.0, "completion_length/min": 64.0, "completion_length/p25": 286.0, "completion_length/p75": 600.5, "completion_length/var": 60835.34765625, "epoch": 0.96, "feature_vector_variance/max_squared_error": 106605.28125, "feature_vector_variance/metric": 24854.951171875, "generated_tokens/total": 31496784.0, "grad_norm": 0.08449138700962067, "grouped_std_rewards": 0.139755517244339, "learning_rate": 2.6790929273509547e-06, "loss": 0.0264, "mean_logprobs": -0.09716796875, "mean_logprobs/var": 0.00433349609375, "num_completions/total": 57600, "per_sentence_gradient_norm": 4.41793966293335, "per_sentence_gradient_norm/max": 582.5040893554688, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 118.00836181640625, "per_sentence_gradient_norm/var": 1170.29150390625, "per_token_feature_norm": 159.4192657470703, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 154.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 129.0, "per_token_feature_norm/p75": 185.0, "per_token_feature_norm/var": 1581.72412109375, "per_token_full_gradient_variance/max_squared_error": 829.7325439453125, "per_token_full_gradient_variance/variance": 0.07524526864290237, "per_token_gradient_norm": 5.334110736846924, "per_token_gradient_norm/max": 6589.6533203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 9658.58203125, "per_token_policy_error_norm": 0.0522301122546196, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.045657481998205185, "policy_entropy": 0.10927015542984009, "policy_entropy/max": 3.71875, "policy_entropy/median": 3.4809112548828125e-05, "policy_entropy/min": 3.660266534311063e-16, "policy_entropy/p25": 1.125037670135498e-06, "policy_entropy/p75": 0.0128173828125, "policy_entropy/var": 0.0813007578253746, "policy_error_vector_variance/max_squared_error": 2.0159709453582764, "policy_error_vector_variance/metric": 0.052144113928079605, "policy_loss": 0.026432842016220093, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.6924314498901367, "policy_sharpness": 7.906267166137695, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.49609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.384970664978027, "reward": 0.7408854365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19222451746463776, "rewards/accuracy_reward": 0.7408854365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19222451746463776, "sentence_full_gradient_variance/max_squared_error": 2183785.5, "sentence_full_gradient_variance/metric": 7026.484375, "sentence_full_gradient_variance/p75": 117.72126770019531, "sentence_full_gradient_variance/p90": 180.2894287109375, "sentence_full_gradient_variance/p95": 180.2894287109375, "sentence_full_gradient_variance/p99": 117915.6484375, "state_level_variance/metric": 130.37562561035156, "state_level_variance_full_gradient/metric": 830.1566772460938, "step": 75 }, { "accuracy_reward": 0.7174479365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20298071205615997, "action_level_variance/metric": 861.58056640625, "action_level_variance_full_gradient/metric": 8310.736328125, "adam_stats/lr_effective_max": 1.2790066648449283e-05, "adam_stats/lr_effective_mean": 6.359431731217668e-11, "adam_stats/lr_effective_min": -1.3017131095693912e-05, "adam_stats/m_t_max": 0.001633763313293457, "adam_stats/m_t_mean": 2.7554963519249576e-11, "adam_stats/m_t_min": -0.002280865330249071, "adam_stats/v_t_max": 6.133235001470894e-05, "adam_stats/v_t_mean": 2.4260083959087853e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.017479196190834045, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.1848902702331543, "all_logprobs": -0.08838307857513428, "all_logprobs/max": 0.0, "all_logprobs/median": -1.6689300537109375e-06, "all_logprobs/min": -10.75, "all_logprobs/p1": -1.921875, "all_logprobs/p10": -0.12890625, "all_logprobs/p25": -0.000774383544921875, "all_logprobs/p5": -0.515625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1447322815656662, "clip_ratio": 0.0, "completion_length": 484.42578125, "completion_length/correct": 403.10888671875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 358.0, "completion_length/correct/min": 73.0, "completion_length/correct/p25": 261.0, "completion_length/correct/p75": 503.5, "completion_length/correct/var": 36809.86328125, "completion_length/incorrect": 690.9031982421875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 689.0, "completion_length/incorrect/min": 128.0, "completion_length/incorrect/p25": 492.0, "completion_length/incorrect/p75": 973.0, "completion_length/incorrect/var": 64408.59765625, "completion_length/max": 1024.0, "completion_length/median": 423.0, "completion_length/min": 73.0, "completion_length/p25": 286.0, "completion_length/p75": 643.25, "completion_length/var": 61346.12109375, "epoch": 0.9728, "feature_vector_variance/max_squared_error": 111101.3359375, "feature_vector_variance/metric": 24731.1640625, "generated_tokens/total": 31868824.0, "grad_norm": 0.11273834854364395, "grouped_std_rewards": 0.1534351110458374, "learning_rate": 2.4815204523085656e-06, "loss": -0.0175, "mean_logprobs": -0.09130859375, "mean_logprobs/var": 0.005279541015625, "num_completions/total": 58368, "per_sentence_gradient_norm": 3.870727062225342, "per_sentence_gradient_norm/max": 604.5639038085938, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 96.06060791015625, "per_sentence_gradient_norm/var": 847.7018432617188, "per_token_feature_norm": 159.2911376953125, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 154.0, "per_token_feature_norm/min": 67.5, "per_token_feature_norm/p25": 129.0, "per_token_feature_norm/p75": 185.0, "per_token_feature_norm/var": 1518.4891357421875, "per_token_full_gradient_variance/max_squared_error": 206.0087890625, "per_token_full_gradient_variance/variance": 0.04042378067970276, "per_token_gradient_norm": 3.877227544784546, "per_token_gradient_norm/max": 4861.935546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5260.75341796875, "per_token_policy_error_norm": 0.047455817461013794, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.042022548615932465, "policy_entropy": 0.09834911674261093, "policy_entropy/max": 3.6875, "policy_entropy/median": 2.467632293701172e-05, "policy_entropy/min": 1.951563910473908e-16, "policy_entropy/p25": 8.717179298400879e-07, "policy_entropy/p75": 0.00665283203125, "policy_entropy/var": 0.07285899668931961, "policy_error_vector_variance/max_squared_error": 2.015536308288574, "policy_error_vector_variance/metric": 0.047399651259183884, "policy_loss": -0.017479196190834045, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.184890031814575, "policy_sharpness": 8.07723331451416, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.70063591003418, "reward": 0.7174479365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20298071205615997, "rewards/accuracy_reward": 0.7174479365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20298071205615997, "sentence_full_gradient_variance/max_squared_error": 3512012.75, "sentence_full_gradient_variance/metric": 9459.0732421875, "sentence_full_gradient_variance/p75": 61.126468658447266, "sentence_full_gradient_variance/p90": 137.52047729492188, "sentence_full_gradient_variance/p95": 137.52047729492188, "sentence_full_gradient_variance/p99": 136046.3125, "state_level_variance/metric": 93.69099426269531, "state_level_variance_full_gradient/metric": 1148.336669921875, "step": 76 }, { "accuracy_reward": 0.79296875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16438336670398712, "action_level_variance/metric": 220.7324676513672, "action_level_variance_full_gradient/metric": 2675.025634765625, "adam_stats/lr_effective_max": 1.0872166967601515e-05, "adam_stats/lr_effective_mean": 5.374089367959911e-11, "adam_stats/lr_effective_min": -1.1277420526312198e-05, "adam_stats/m_t_max": 0.0014328418765217066, "adam_stats/m_t_mean": 2.307643312549068e-11, "adam_stats/m_t_min": -0.0018436877289786935, "adam_stats/v_t_max": 6.12720032222569e-05, "adam_stats/v_t_mean": 2.4243059816575485e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.09199478477239609, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.3128234148025513, "all_logprobs": -0.07983240485191345, "all_logprobs/max": 0.0, "all_logprobs/median": -1.430511474609375e-06, "all_logprobs/min": -12.375, "all_logprobs/p1": -1.8046875, "all_logprobs/p10": -0.10493087768554688, "all_logprobs/p25": -0.000553131103515625, "all_logprobs/p5": -0.46484375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12476702034473419, "clip_ratio": 0.0, "completion_length": 486.21356201171875, "completion_length/correct": 432.8932800292969, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 408.0, "completion_length/correct/min": 38.0, "completion_length/correct/p25": 282.0, "completion_length/correct/p75": 541.0, "completion_length/correct/var": 40877.4140625, "completion_length/incorrect": 690.4402465820312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 708.0, "completion_length/incorrect/min": 185.0, "completion_length/incorrect/p25": 445.0, "completion_length/incorrect/p75": 978.5, "completion_length/incorrect/var": 74110.234375, "completion_length/max": 1024.0, "completion_length/median": 438.0, "completion_length/min": 38.0, "completion_length/p25": 297.0, "completion_length/p75": 609.25, "completion_length/var": 58573.62109375, "epoch": 0.9856, "feature_vector_variance/max_squared_error": 110355.25, "feature_vector_variance/metric": 24192.66796875, "generated_tokens/total": 32242236.0, "grad_norm": 0.052969809621572495, "grouped_std_rewards": 0.140719473361969, "learning_rate": 2.29006222155752e-06, "loss": -0.092, "mean_logprobs": -0.0791015625, "mean_logprobs/var": 0.0024261474609375, "num_completions/total": 59136, "per_sentence_gradient_norm": 2.3792338371276855, "per_sentence_gradient_norm/max": 175.92691040039062, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 79.10427856445312, "per_sentence_gradient_norm/var": 215.35208129882812, "per_token_feature_norm": 157.41664123535156, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 183.0, "per_token_feature_norm/var": 1434.060791015625, "per_token_full_gradient_variance/max_squared_error": 302.0867919921875, "per_token_full_gradient_variance/variance": 0.02770599164068699, "per_token_gradient_norm": 2.989776611328125, "per_token_gradient_norm/max": 3499.755126953125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3025.531494140625, "per_token_policy_error_norm": 0.0436367504298687, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03870663046836853, "policy_entropy": 0.08932531625032425, "policy_entropy/max": 3.734375, "policy_entropy/median": 2.0742416381835938e-05, "policy_entropy/min": 1.231653667943533e-16, "policy_entropy/p25": 7.450580596923828e-07, "policy_entropy/p75": 0.004791259765625, "policy_entropy/var": 0.062390949577093124, "policy_error_vector_variance/max_squared_error": 2.01371693611145, "policy_error_vector_variance/metric": 0.04359884560108185, "policy_loss": -0.09199477732181549, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.3128232955932617, "policy_sharpness": 8.166534423828125, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.5, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.17090892791748, "reward": 0.79296875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16438336670398712, "rewards/accuracy_reward": 0.79296875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16438336670398712, "sentence_full_gradient_variance/max_squared_error": 817798.0, "sentence_full_gradient_variance/metric": 2959.688232421875, "sentence_full_gradient_variance/p75": 172.58560180664062, "sentence_full_gradient_variance/p90": 396.1665344238281, "sentence_full_gradient_variance/p95": 396.1665344238281, "sentence_full_gradient_variance/p99": 65694.6875, "state_level_variance/metric": 22.161651611328125, "state_level_variance_full_gradient/metric": 284.66259765625, "step": 77 }, { "accuracy_reward": 0.7591146230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18309804797172546, "action_level_variance/metric": 447.5387878417969, "action_level_variance_full_gradient/metric": 7041.34521484375, "adam_stats/lr_effective_max": 1.034094202623237e-05, "adam_stats/lr_effective_mean": 2.4636960874180325e-11, "adam_stats/lr_effective_min": -9.724612027639523e-06, "adam_stats/m_t_max": 0.00155505514703691, "adam_stats/m_t_mean": 1.7298265250764722e-11, "adam_stats/m_t_min": -0.002045366447418928, "adam_stats/v_t_max": 6.122770719230175e-05, "adam_stats/v_t_mean": 2.425638466127533e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.10230584442615509, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.709834575653076, "all_logprobs": -0.08980222791433334, "all_logprobs/max": 0.0, "all_logprobs/median": -1.9073486328125e-06, "all_logprobs/min": -12.625, "all_logprobs/p1": -1.9765625, "all_logprobs/p10": -0.1318359375, "all_logprobs/p25": -0.00091552734375, "all_logprobs/p5": -0.5234375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.14784343540668488, "clip_ratio": 0.0, "completion_length": 470.11590576171875, "completion_length/correct": 392.466552734375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 363.0, "completion_length/correct/min": 81.0, "completion_length/correct/p25": 275.0, "completion_length/correct/p75": 476.0, "completion_length/correct/var": 29388.244140625, "completion_length/incorrect": 714.8162231445312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 680.0, "completion_length/incorrect/min": 125.0, "completion_length/incorrect/p25": 489.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 69510.390625, "completion_length/max": 1024.0, "completion_length/median": 407.0, "completion_length/min": 81.0, "completion_length/p25": 291.75, "completion_length/p75": 576.5, "completion_length/var": 58000.6484375, "epoch": 0.9984, "feature_vector_variance/max_squared_error": 106918.7578125, "feature_vector_variance/metric": 24501.87890625, "generated_tokens/total": 32603284.0, "grad_norm": 0.12303988635540009, "grouped_std_rewards": 0.2052084058523178, "learning_rate": 2.104951497460118e-06, "loss": -0.1023, "mean_logprobs": -0.0908203125, "mean_logprobs/var": 0.004547119140625, "num_completions/total": 59904, "per_sentence_gradient_norm": 4.030129909515381, "per_sentence_gradient_norm/max": 228.35458374023438, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 17.2451171875, "per_sentence_gradient_norm/p99": 125.06488800048828, "per_sentence_gradient_norm/var": 431.8591613769531, "per_token_feature_norm": 158.32066345214844, "per_token_feature_norm/max": 336.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 184.0, "per_token_feature_norm/var": 1544.806884765625, "per_token_full_gradient_variance/max_squared_error": 169.304931640625, "per_token_full_gradient_variance/variance": 0.04337022453546524, "per_token_gradient_norm": 4.700672149658203, "per_token_gradient_norm/max": 6711.197265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5830.3076171875, "per_token_policy_error_norm": 0.048036832362413406, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04251472279429436, "policy_entropy": 0.10005170106887817, "policy_entropy/max": 3.71875, "policy_entropy/median": 2.849102020263672e-05, "policy_entropy/min": 2.5118795932144167e-15, "policy_entropy/p25": 9.685754776000977e-07, "policy_entropy/p75": 0.007720947265625, "policy_entropy/var": 0.07627830654382706, "policy_error_vector_variance/max_squared_error": 2.012552261352539, "policy_error_vector_variance/metric": 0.047954585403203964, "policy_loss": -0.28427785634994507, "policy_loss/max": 7.48191499710083, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.9756920337677, "policy_sharpness": 8.041114807128906, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.793907165527344, "reward": 0.7591146230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18309804797172546, "rewards/accuracy_reward": 0.7591146230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18309804797172546, "sentence_full_gradient_variance/max_squared_error": 1976815.375, "sentence_full_gradient_variance/metric": 7902.3125, "sentence_full_gradient_variance/p75": 202.99241638183594, "sentence_full_gradient_variance/p90": 432.93804931640625, "sentence_full_gradient_variance/p95": 4230.48193359375, "sentence_full_gradient_variance/p99": 179035.59375, "state_level_variance/metric": 40.11830139160156, "state_level_variance_full_gradient/metric": 860.9666748046875, "step": 78 }, { "accuracy_reward": 0.7161458730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20354601740837097, "action_level_variance/metric": 628.49853515625, "action_level_variance_full_gradient/metric": 5890.98828125, "adam_stats/lr_effective_max": 9.614335795049556e-06, "adam_stats/lr_effective_mean": 3.171292395354186e-11, "adam_stats/lr_effective_min": -9.485492228122894e-06, "adam_stats/m_t_max": 0.001569534419104457, "adam_stats/m_t_mean": 1.448296170492025e-11, "adam_stats/m_t_min": -0.002117013791576028, "adam_stats/v_t_max": 6.116708391346037e-05, "adam_stats/v_t_mean": 2.428106977633848e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.028221501037478447, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 3.5222437381744385, "all_logprobs": -0.09076780825853348, "all_logprobs/max": 0.0, "all_logprobs/median": -2.2649765014648438e-06, "all_logprobs/min": -10.375, "all_logprobs/p1": -1.9453125, "all_logprobs/p10": -0.140625, "all_logprobs/p25": -0.00099945068359375, "all_logprobs/p5": -0.53125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.14920634031295776, "clip_ratio": 0.0, "completion_length": 501.54559326171875, "completion_length/correct": 416.9891052246094, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 377.0, "completion_length/correct/min": 69.0, "completion_length/correct/p25": 254.25, "completion_length/correct/p75": 529.5, "completion_length/correct/var": 43630.9609375, "completion_length/incorrect": 714.8760986328125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 717.0, "completion_length/incorrect/min": 209.0, "completion_length/incorrect/p25": 496.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 67804.703125, "completion_length/max": 1024.0, "completion_length/median": 449.0, "completion_length/min": 69.0, "completion_length/p25": 290.0, "completion_length/p75": 663.5, "completion_length/var": 68475.328125, "epoch": 1.0128, "feature_vector_variance/max_squared_error": 109672.765625, "feature_vector_variance/metric": 24385.041015625, "generated_tokens/total": 32988472.0, "grad_norm": 0.19826188683509827, "grouped_std_rewards": 0.20661330223083496, "learning_rate": 1.9264138089195424e-06, "loss": 0.0282, "mean_logprobs": -0.0908203125, "mean_logprobs/var": 0.00408935546875, "num_completions/total": 60672, "per_sentence_gradient_norm": 4.612773418426514, "per_sentence_gradient_norm/max": 285.3846130371094, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 21.512468338012695, "per_sentence_gradient_norm/p99": 154.74560546875, "per_sentence_gradient_norm/var": 608.012451171875, "per_token_feature_norm": 158.0149688720703, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 183.0, "per_token_feature_norm/var": 1542.8297119140625, "per_token_full_gradient_variance/max_squared_error": 140.95114135742188, "per_token_full_gradient_variance/variance": 0.05997861176729202, "per_token_gradient_norm": 5.619486331939697, "per_token_gradient_norm/max": 6286.8759765625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 7683.13037109375, "per_token_policy_error_norm": 0.04864611476659775, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04296848177909851, "policy_entropy": 0.10111895203590393, "policy_entropy/max": 3.734375, "policy_entropy/median": 3.266334533691406e-05, "policy_entropy/min": 1.7208456881689926e-15, "policy_entropy/p25": 1.2889504432678223e-06, "policy_entropy/p75": 0.00823974609375, "policy_entropy/var": 0.07509865611791611, "policy_error_vector_variance/max_squared_error": 2.0130813121795654, "policy_error_vector_variance/metric": 0.04858464375138283, "policy_loss": 0.028221510350704193, "policy_loss/max": 19.79339599609375, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.5222442150115967, "policy_sharpness": 8.020001411437988, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.5, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.890504837036133, "reward": 0.7161458730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20354601740837097, "rewards/accuracy_reward": 0.7161458730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20354601740837097, "sentence_full_gradient_variance/max_squared_error": 780151.9375, "sentence_full_gradient_variance/metric": 6676.0263671875, "sentence_full_gradient_variance/p75": 64.2680435180664, "sentence_full_gradient_variance/p90": 112.80702209472656, "sentence_full_gradient_variance/p95": 2474.810302734375, "sentence_full_gradient_variance/p99": 263670.59375, "state_level_variance/metric": 57.88762283325195, "state_level_variance_full_gradient/metric": 785.0374755859375, "step": 79 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18774448335170746, "action_level_variance/metric": 561.2606201171875, "action_level_variance_full_gradient/metric": 3898.84326171875, "adam_stats/lr_effective_max": 8.830240403767675e-06, "adam_stats/lr_effective_mean": 3.860920264053824e-11, "adam_stats/lr_effective_min": -8.858643923304044e-06, "adam_stats/m_t_max": 0.0013113042805343866, "adam_stats/m_t_mean": 9.596328072458693e-12, "adam_stats/m_t_min": -0.0018236779142171144, "adam_stats/v_t_max": 6.110853428253904e-05, "adam_stats/v_t_mean": 2.4277381320547686e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.04018472135066986, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.7992453575134277, "all_logprobs": -0.08670757710933685, "all_logprobs/max": 0.0, "all_logprobs/median": -1.7881393432617188e-06, "all_logprobs/min": -12.5625, "all_logprobs/p1": -1.9140625, "all_logprobs/p10": -0.126953125, "all_logprobs/p25": -0.000762939453125, "all_logprobs/p5": -0.486328125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.14283525943756104, "clip_ratio": 0.0, "completion_length": 501.1888122558594, "completion_length/correct": 427.6076354980469, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 398.0, "completion_length/correct/min": 87.0, "completion_length/correct/p25": 295.0, "completion_length/correct/p75": 545.0, "completion_length/correct/var": 32778.16015625, "completion_length/incorrect": 721.9323120117188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 714.0, "completion_length/incorrect/min": 195.0, "completion_length/incorrect/p25": 482.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 65761.2578125, "completion_length/max": 1024.0, "completion_length/median": 445.0, "completion_length/min": 87.0, "completion_length/p25": 312.75, "completion_length/p75": 629.25, "completion_length/var": 57212.68359375, "epoch": 1.0256, "feature_vector_variance/max_squared_error": 114046.8046875, "feature_vector_variance/metric": 24656.873046875, "generated_tokens/total": 33373384.0, "grad_norm": 0.10953383147716522, "grouped_std_rewards": 0.1627262830734253, "learning_rate": 1.7546666766076658e-06, "loss": 0.0402, "mean_logprobs": -0.08544921875, "mean_logprobs/var": 0.003631591796875, "num_completions/total": 61440, "per_sentence_gradient_norm": 3.5992021560668945, "per_sentence_gradient_norm/max": 324.7635192871094, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 116.84805297851562, "per_sentence_gradient_norm/var": 549.0211181640625, "per_token_feature_norm": 158.77505493164062, "per_token_feature_norm/max": 334.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 65.0, "per_token_feature_norm/p25": 129.0, "per_token_feature_norm/p75": 185.0, "per_token_feature_norm/var": 1532.4129638671875, "per_token_full_gradient_variance/max_squared_error": 693.8333129882812, "per_token_full_gradient_variance/variance": 0.057519782334566116, "per_token_gradient_norm": 4.319868087768555, "per_token_gradient_norm/max": 6670.8369140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6583.359375, "per_token_policy_error_norm": 0.04648542031645775, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.041525471955537796, "policy_entropy": 0.09655755013227463, "policy_entropy/max": 3.765625, "policy_entropy/median": 2.574920654296875e-05, "policy_entropy/min": 9.161508357502512e-18, "policy_entropy/p25": 8.083879947662354e-07, "policy_entropy/p75": 0.006591796875, "policy_entropy/var": 0.07123696804046631, "policy_error_vector_variance/max_squared_error": 2.0142722129821777, "policy_error_vector_variance/metric": 0.046420615166425705, "policy_loss": 0.04018472507596016, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.7992453575134277, "policy_sharpness": 8.083566665649414, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.599595069885254, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.18774448335170746, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18774448335170746, "sentence_full_gradient_variance/max_squared_error": 673550.0, "sentence_full_gradient_variance/metric": 4406.6376953125, "sentence_full_gradient_variance/p75": 116.39212036132812, "sentence_full_gradient_variance/p90": 134.5906524658203, "sentence_full_gradient_variance/p95": 134.5906524658203, "sentence_full_gradient_variance/p99": 140912.0625, "state_level_variance/metric": 57.80545425415039, "state_level_variance_full_gradient/metric": 507.7947998046875, "step": 80 }, { "accuracy_reward": 0.7213541865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2012643665075302, "action_level_variance/metric": 828.77294921875, "action_level_variance_full_gradient/metric": 5167.86865234375, "adam_stats/lr_effective_max": 8.138651537592523e-06, "adam_stats/lr_effective_mean": 1.793623236046038e-11, "adam_stats/lr_effective_min": -8.174873983080033e-06, "adam_stats/m_t_max": 0.0023556919768452644, "adam_stats/m_t_mean": -3.4520593848730297e-12, "adam_stats/m_t_min": -0.0015996333677321672, "adam_stats/v_t_max": 6.140542245702818e-05, "adam_stats/v_t_mean": 2.4356575784034717e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.016990192234516144, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.3176982402801514, "all_logprobs": -0.09050021320581436, "all_logprobs/max": 0.0, "all_logprobs/median": -2.1457672119140625e-06, "all_logprobs/min": -11.625, "all_logprobs/p1": -1.9296875, "all_logprobs/p10": -0.14453125, "all_logprobs/p25": -0.0011138916015625, "all_logprobs/p5": -0.52734375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1453661024570465, "clip_ratio": 0.0, "completion_length": 518.3541870117188, "completion_length/correct": 425.10107421875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 385.0, "completion_length/correct/min": 65.0, "completion_length/correct/p25": 282.0, "completion_length/correct/p75": 522.75, "completion_length/correct/var": 39187.55078125, "completion_length/incorrect": 759.766357421875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 794.0, "completion_length/incorrect/min": 219.0, "completion_length/incorrect/p25": 539.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 62655.5546875, "completion_length/max": 1024.0, "completion_length/median": 446.0, "completion_length/min": 65.0, "completion_length/p25": 311.0, "completion_length/p75": 697.25, "completion_length/var": 68195.4296875, "epoch": 1.0384, "feature_vector_variance/max_squared_error": 109335.453125, "feature_vector_variance/metric": 24294.982421875, "generated_tokens/total": 33771480.0, "grad_norm": 0.18845070898532867, "grouped_std_rewards": 0.2013004869222641, "learning_rate": 1.5899193479495858e-06, "loss": 0.017, "mean_logprobs": -0.08935546875, "mean_logprobs/var": 0.004241943359375, "num_completions/total": 62208, "per_sentence_gradient_norm": 4.289559841156006, "per_sentence_gradient_norm/max": 427.6639099121094, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 18.750436782836914, "per_sentence_gradient_norm/p99": 104.2740249633789, "per_sentence_gradient_norm/var": 811.4290771484375, "per_token_feature_norm": 157.53927612304688, "per_token_feature_norm/max": 334.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 66.5, "per_token_feature_norm/p25": 127.0, "per_token_feature_norm/p75": 183.0, "per_token_feature_norm/var": 1548.7490234375, "per_token_full_gradient_variance/max_squared_error": 328.6741943359375, "per_token_full_gradient_variance/variance": 0.06743799149990082, "per_token_gradient_norm": 4.838956356048584, "per_token_gradient_norm/max": 7140.46630859375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 7554.9189453125, "per_token_policy_error_norm": 0.04875507205724716, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04290804639458656, "policy_entropy": 0.10151639580726624, "policy_entropy/max": 3.71875, "policy_entropy/median": 3.075599670410156e-05, "policy_entropy/min": 4.218847493575595e-15, "policy_entropy/p25": 1.087784767150879e-06, "policy_entropy/p75": 0.00909423828125, "policy_entropy/var": 0.07417795807123184, "policy_error_vector_variance/max_squared_error": 2.0174899101257324, "policy_error_vector_variance/metric": 0.048691339790821075, "policy_loss": 0.016990190371870995, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659052848815918, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.3176984786987305, "policy_sharpness": 7.999854564666748, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.25, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.948599815368652, "reward": 0.7213541865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2012643665075302, "rewards/accuracy_reward": 0.7213541865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2012643665075302, "sentence_full_gradient_variance/max_squared_error": 652505.5, "sentence_full_gradient_variance/metric": 5875.67333984375, "sentence_full_gradient_variance/p75": 67.471435546875, "sentence_full_gradient_variance/p90": 97.61656951904297, "sentence_full_gradient_variance/p95": 97.70018005371094, "sentence_full_gradient_variance/p99": 142613.15625, "state_level_variance/metric": 86.09309387207031, "state_level_variance_full_gradient/metric": 707.8041381835938, "step": 81 }, { "accuracy_reward": 0.734375, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1953226923942566, "action_level_variance/metric": 1310.9639892578125, "action_level_variance_full_gradient/metric": 10797.1435546875, "adam_stats/lr_effective_max": 7.687287506996654e-06, "adam_stats/lr_effective_mean": 1.5916981274677333e-11, "adam_stats/lr_effective_min": -7.353673481702572e-06, "adam_stats/m_t_max": 0.0015769695164635777, "adam_stats/m_t_mean": 6.686219909983124e-14, "adam_stats/m_t_min": -0.0025546362157911062, "adam_stats/v_t_max": 6.14323653280735e-05, "adam_stats/v_t_mean": 2.4647295922969326e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.10876920074224472, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.5486278533935547, "all_logprobs": -0.08820164948701859, "all_logprobs/max": 0.0, "all_logprobs/median": -1.7881393432617188e-06, "all_logprobs/min": -15.0, "all_logprobs/p1": -1.9140625, "all_logprobs/p10": -0.134765625, "all_logprobs/p25": -0.000911712646484375, "all_logprobs/p5": -0.51953125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1419341266155243, "clip_ratio": 0.0, "completion_length": 507.29168701171875, "completion_length/correct": 424.7943115234375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 389.0, "completion_length/correct/min": 34.0, "completion_length/correct/p25": 272.75, "completion_length/correct/p75": 529.25, "completion_length/correct/var": 39376.15234375, "completion_length/incorrect": 735.37255859375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 777.0, "completion_length/incorrect/min": 164.0, "completion_length/incorrect/p25": 526.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 76397.8046875, "completion_length/max": 1024.0, "completion_length/median": 447.0, "completion_length/min": 34.0, "completion_length/p25": 296.0, "completion_length/p75": 665.5, "completion_length/var": 67963.8515625, "epoch": 1.0512, "feature_vector_variance/max_squared_error": 118658.0703125, "feature_vector_variance/metric": 23981.962890625, "generated_tokens/total": 34161080.0, "grad_norm": 0.3438369929790497, "grouped_std_rewards": 0.16452403366565704, "learning_rate": 1.432372542187895e-06, "loss": -0.1088, "mean_logprobs": -0.087890625, "mean_logprobs/var": 0.003753662109375, "num_completions/total": 62976, "per_sentence_gradient_norm": 4.513017654418945, "per_sentence_gradient_norm/max": 754.930419921875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 111.57376098632812, "per_sentence_gradient_norm/var": 1292.2791748046875, "per_token_feature_norm": 157.10728454589844, "per_token_feature_norm/max": 334.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 127.5, "per_token_feature_norm/p75": 182.0, "per_token_feature_norm/var": 1482.1497802734375, "per_token_full_gradient_variance/max_squared_error": 641.5545043945312, "per_token_full_gradient_variance/variance": 0.06918732821941376, "per_token_gradient_norm": 4.794299602508545, "per_token_gradient_norm/max": 7979.2109375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 8311.4541015625, "per_token_policy_error_norm": 0.04763362929224968, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04208102822303772, "policy_entropy": 0.0983477234840393, "policy_entropy/max": 3.53125, "policy_entropy/median": 2.6702880859375e-05, "policy_entropy/min": 1.5699247457590104e-16, "policy_entropy/p25": 9.834766387939453e-07, "policy_entropy/p75": 0.00762939453125, "policy_entropy/var": 0.0706910565495491, "policy_error_vector_variance/max_squared_error": 2.0144412517547607, "policy_error_vector_variance/metric": 0.047573793679475784, "policy_loss": -0.10876920819282532, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.5486276149749756, "policy_sharpness": 8.050305366516113, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.714594841003418, "reward": 0.734375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.1953226923942566, "rewards/accuracy_reward": 0.734375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1953226923942566, "sentence_full_gradient_variance/max_squared_error": 2948920.75, "sentence_full_gradient_variance/metric": 12139.11328125, "sentence_full_gradient_variance/p75": 287.4079284667969, "sentence_full_gradient_variance/p90": 824.54638671875, "sentence_full_gradient_variance/p95": 824.54638671875, "sentence_full_gradient_variance/p99": 190856.96875, "state_level_variance/metric": 145.0137176513672, "state_level_variance_full_gradient/metric": 1341.96923828125, "step": 82 }, { "accuracy_reward": 0.7434896230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19096145033836365, "action_level_variance/metric": 398.4344482421875, "action_level_variance_full_gradient/metric": 3782.95849609375, "adam_stats/lr_effective_max": 6.1955042838235386e-06, "adam_stats/lr_effective_mean": 1.4112931309706589e-11, "adam_stats/lr_effective_min": -6.220343493623659e-06, "adam_stats/m_t_max": 0.0015327802393585443, "adam_stats/m_t_mean": 7.47877454410073e-13, "adam_stats/m_t_min": -0.0024075100664049387, "adam_stats/v_t_max": 6.137238960945979e-05, "adam_stats/v_t_mean": 2.463341379835282e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0030387542210519314, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.9710915088653564, "all_logprobs": -0.08284860849380493, "all_logprobs/max": 0.0, "all_logprobs/median": -1.0728836059570312e-06, "all_logprobs/min": -12.0625, "all_logprobs/p1": -1.875, "all_logprobs/p10": -0.11279296875, "all_logprobs/p25": -0.00048828125, "all_logprobs/p5": -0.474609375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.134722501039505, "clip_ratio": 0.0, "completion_length": 500.85809326171875, "completion_length/correct": 425.7880859375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 391.0, "completion_length/correct/min": 63.0, "completion_length/correct/p25": 282.0, "completion_length/correct/p75": 530.5, "completion_length/correct/var": 38813.2578125, "completion_length/incorrect": 718.4466552734375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 723.0, "completion_length/incorrect/min": 117.0, "completion_length/incorrect/p25": 506.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 65064.71484375, "completion_length/max": 1024.0, "completion_length/median": 438.0, "completion_length/min": 63.0, "completion_length/p25": 320.0, "completion_length/p75": 652.25, "completion_length/var": 61826.65625, "epoch": 1.064, "feature_vector_variance/max_squared_error": 109702.546875, "feature_vector_variance/metric": 24393.40234375, "generated_tokens/total": 34545740.0, "grad_norm": 0.09485433995723724, "grouped_std_rewards": 0.1931505650281906, "learning_rate": 1.282218205837188e-06, "loss": -0.003, "mean_logprobs": -0.083984375, "mean_logprobs/var": 0.003509521484375, "num_completions/total": 63744, "per_sentence_gradient_norm": 3.3309478759765625, "per_sentence_gradient_norm/max": 347.0408630371094, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 16.761104583740234, "per_sentence_gradient_norm/p99": 86.33845520019531, "per_sentence_gradient_norm/var": 387.84423828125, "per_token_feature_norm": 158.15708923339844, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 184.0, "per_token_feature_norm/var": 1467.5992431640625, "per_token_full_gradient_variance/max_squared_error": 388.2906188964844, "per_token_full_gradient_variance/variance": 0.0518014132976532, "per_token_gradient_norm": 4.087138652801514, "per_token_gradient_norm/max": 7528.9111328125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5702.85009765625, "per_token_policy_error_norm": 0.044880013912916183, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04005495458841324, "policy_entropy": 0.09154317528009415, "policy_entropy/max": 3.8125, "policy_entropy/median": 1.728534698486328e-05, "policy_entropy/min": 1.1188966420050406e-16, "policy_entropy/p25": 6.631016731262207e-07, "policy_entropy/p75": 0.004425048828125, "policy_entropy/var": 0.06609344482421875, "policy_error_vector_variance/max_squared_error": 2.0118207931518555, "policy_error_vector_variance/metric": 0.04482978209853172, "policy_loss": -0.0030387542210519314, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.9710915088653564, "policy_sharpness": 8.17745590209961, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.214188575744629, "reward": 0.7434896230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19096145033836365, "rewards/accuracy_reward": 0.7434896230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19096145033836365, "sentence_full_gradient_variance/max_squared_error": 459987.71875, "sentence_full_gradient_variance/metric": 4286.759765625, "sentence_full_gradient_variance/p75": 26.213319778442383, "sentence_full_gradient_variance/p90": 189.15960693359375, "sentence_full_gradient_variance/p95": 4156.326171875, "sentence_full_gradient_variance/p99": 130283.7265625, "state_level_variance/metric": 39.11656188964844, "state_level_variance_full_gradient/metric": 503.80169677734375, "step": 83 }, { "accuracy_reward": 0.7825521230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1703861653804779, "action_level_variance/metric": 1106.275634765625, "action_level_variance_full_gradient/metric": 8856.4892578125, "adam_stats/lr_effective_max": 5.528231213247636e-06, "adam_stats/lr_effective_mean": 1.7433162552427106e-11, "adam_stats/lr_effective_min": -5.72878661841969e-06, "adam_stats/m_t_max": 0.001396668259985745, "adam_stats/m_t_mean": 1.352602058135588e-12, "adam_stats/m_t_min": -0.0020370592828840017, "adam_stats/v_t_max": 6.133086571935564e-05, "adam_stats/v_t_mean": 2.4625473101641537e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.00871215295046568, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.8314993381500244, "all_logprobs": -0.08537911623716354, "all_logprobs/max": 0.0, "all_logprobs/median": -1.3113021850585938e-06, "all_logprobs/min": -10.625, "all_logprobs/p1": -1.8984375, "all_logprobs/p10": -0.126953125, "all_logprobs/p25": -0.0007171630859375, "all_logprobs/p5": -0.486328125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1352115273475647, "clip_ratio": 0.0, "completion_length": 492.54296875, "completion_length/correct": 430.26287841796875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 391.0, "completion_length/correct/min": 52.0, "completion_length/correct/p25": 287.0, "completion_length/correct/p75": 537.0, "completion_length/correct/var": 38700.46484375, "completion_length/incorrect": 716.6766967773438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 703.0, "completion_length/incorrect/min": 89.0, "completion_length/incorrect/p25": 534.0, "completion_length/incorrect/p75": 1018.0, "completion_length/incorrect/var": 64405.8359375, "completion_length/max": 1024.0, "completion_length/median": 431.0, "completion_length/min": 52.0, "completion_length/p25": 314.0, "completion_length/p75": 649.25, "completion_length/var": 58190.625, "epoch": 1.0768, "feature_vector_variance/max_squared_error": 109395.6640625, "feature_vector_variance/metric": 24379.43359375, "generated_tokens/total": 34924012.0, "grad_norm": 0.11839216202497482, "grouped_std_rewards": 0.17074766755104065, "learning_rate": 1.1396392788268054e-06, "loss": 0.0087, "mean_logprobs": -0.08447265625, "mean_logprobs/var": 0.004302978515625, "num_completions/total": 64512, "per_sentence_gradient_norm": 5.159142017364502, "per_sentence_gradient_norm/max": 432.4154357910156, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 174.91954040527344, "per_sentence_gradient_norm/var": 1081.06640625, "per_token_feature_norm": 158.09121704101562, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 64.0, "per_token_feature_norm/p25": 129.0, "per_token_feature_norm/p75": 184.0, "per_token_feature_norm/var": 1450.238525390625, "per_token_full_gradient_variance/max_squared_error": 355.329833984375, "per_token_full_gradient_variance/variance": 0.07638378441333771, "per_token_gradient_norm": 5.701384544372559, "per_token_gradient_norm/max": 6124.50830078125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 9300.9326171875, "per_token_policy_error_norm": 0.046205852180719376, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04066551849246025, "policy_entropy": 0.0958857461810112, "policy_entropy/max": 3.609375, "policy_entropy/median": 1.9311904907226562e-05, "policy_entropy/min": 1.5785983631388945e-16, "policy_entropy/p25": 6.221234798431396e-07, "policy_entropy/p75": 0.006317138671875, "policy_entropy/var": 0.06940954923629761, "policy_error_vector_variance/max_squared_error": 2.0096216201782227, "policy_error_vector_variance/metric": 0.04615228250622749, "policy_loss": 0.008712163195014, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.8314993381500244, "policy_sharpness": 8.098200798034668, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.529733657836914, "reward": 0.7825521230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1703861653804779, "rewards/accuracy_reward": 0.7825521230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1703861653804779, "sentence_full_gradient_variance/max_squared_error": 2080410.125, "sentence_full_gradient_variance/metric": 10085.6435546875, "sentence_full_gradient_variance/p75": 50.69424819946289, "sentence_full_gradient_variance/p90": 131.53359985351562, "sentence_full_gradient_variance/p95": 131.53359985351562, "sentence_full_gradient_variance/p99": 157739.671875, "state_level_variance/metric": 112.84315490722656, "state_level_variance_full_gradient/metric": 1229.154541015625, "step": 84 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18774445354938507, "action_level_variance/metric": 429.0433349609375, "action_level_variance_full_gradient/metric": 5412.779296875, "adam_stats/lr_effective_max": 4.789862487086793e-06, "adam_stats/lr_effective_mean": 1.4337168605105255e-11, "adam_stats/lr_effective_min": -4.584106591210002e-06, "adam_stats/m_t_max": 0.002347907517105341, "adam_stats/m_t_mean": 1.0591427908324125e-11, "adam_stats/m_t_min": -0.0029099176172167063, "adam_stats/v_t_max": 6.135113653726876e-05, "adam_stats/v_t_mean": 2.465266272372313e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.03681766241788864, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.27087140083313, "all_logprobs": -0.08024172484874725, "all_logprobs/max": 0.0, "all_logprobs/median": -1.0728836059570312e-06, "all_logprobs/min": -12.375, "all_logprobs/p1": -1.8046875, "all_logprobs/p10": -0.10498046875, "all_logprobs/p25": -0.0004367828369140625, "all_logprobs/p5": -0.45703125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12746712565422058, "clip_ratio": 0.0, "completion_length": 493.07684326171875, "completion_length/correct": 409.7899475097656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 367.0, "completion_length/correct/min": 77.0, "completion_length/correct/p25": 242.0, "completion_length/correct/p75": 530.25, "completion_length/correct/var": 41535.984375, "completion_length/incorrect": 742.9375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 755.0, "completion_length/incorrect/min": 206.0, "completion_length/incorrect/p25": 539.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 64344.37890625, "completion_length/max": 1024.0, "completion_length/median": 435.0, "completion_length/min": 77.0, "completion_length/p25": 284.75, "completion_length/p75": 655.0, "completion_length/var": 67998.875, "epoch": 1.0896, "feature_vector_variance/max_squared_error": 114158.34375, "feature_vector_variance/metric": 23634.03125, "generated_tokens/total": 35302696.0, "grad_norm": 0.14831164479255676, "grouped_std_rewards": 0.16881053149700165, "learning_rate": 1.0048094716167097e-06, "loss": 0.0368, "mean_logprobs": -0.08056640625, "mean_logprobs/var": 0.003021240234375, "num_completions/total": 65280, "per_sentence_gradient_norm": 3.537755012512207, "per_sentence_gradient_norm/max": 273.779052734375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 103.40017700195312, "per_sentence_gradient_norm/var": 417.0706787109375, "per_token_feature_norm": 156.02525329589844, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 127.0, "per_token_feature_norm/p75": 181.0, "per_token_feature_norm/var": 1432.2261962890625, "per_token_full_gradient_variance/max_squared_error": 247.02810668945312, "per_token_full_gradient_variance/variance": 0.06269268691539764, "per_token_gradient_norm": 5.138054847717285, "per_token_gradient_norm/max": 6270.79443359375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 7521.97021484375, "per_token_policy_error_norm": 0.043687738478183746, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.038766972720623016, "policy_entropy": 0.08931978046894073, "policy_entropy/max": 3.75, "policy_entropy/median": 1.7404556274414062e-05, "policy_entropy/min": 7.4593109467002705e-16, "policy_entropy/p25": 7.301568984985352e-07, "policy_entropy/p75": 0.0040283203125, "policy_entropy/var": 0.06316622346639633, "policy_error_vector_variance/max_squared_error": 2.009408712387085, "policy_error_vector_variance/metric": 0.04364989325404167, "policy_loss": 0.036817677319049835, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.27087140083313, "policy_sharpness": 8.202614784240723, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.75, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.077373504638672, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.18774445354938507, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18774445354938507, "sentence_full_gradient_variance/max_squared_error": 691455.3125, "sentence_full_gradient_variance/metric": 6139.09521484375, "sentence_full_gradient_variance/p75": 77.8857192993164, "sentence_full_gradient_variance/p90": 152.89817810058594, "sentence_full_gradient_variance/p95": 152.89817810058594, "sentence_full_gradient_variance/p99": 173933.578125, "state_level_variance/metric": 41.54749298095703, "state_level_variance_full_gradient/metric": 726.315673828125, "step": 85 }, { "accuracy_reward": 0.7630208730697632, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18105579912662506, "action_level_variance/metric": 786.4474487304688, "action_level_variance_full_gradient/metric": 4318.2255859375, "adam_stats/lr_effective_max": 3.936162556783529e-06, "adam_stats/lr_effective_mean": 1.1230859991995246e-11, "adam_stats/lr_effective_min": -3.860983269987628e-06, "adam_stats/m_t_max": 0.0021372446790337563, "adam_stats/m_t_mean": 8.908563123299906e-12, "adam_stats/m_t_min": -0.0026247433852404356, "adam_stats/v_t_max": 6.129044049885124e-05, "adam_stats/v_t_mean": 2.4629673300857746e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.07676141709089279, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.268261432647705, "all_logprobs": -0.08507983386516571, "all_logprobs/max": 0.0, "all_logprobs/median": -1.430511474609375e-06, "all_logprobs/min": -12.5, "all_logprobs/p1": -1.875, "all_logprobs/p10": -0.126953125, "all_logprobs/p25": -0.00070953369140625, "all_logprobs/p5": -0.484375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.13454215228557587, "clip_ratio": 0.0, "completion_length": 507.03387451171875, "completion_length/correct": 436.498291015625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 390.0, "completion_length/correct/min": 52.0, "completion_length/correct/p25": 277.25, "completion_length/correct/p75": 544.25, "completion_length/correct/var": 43576.78515625, "completion_length/incorrect": 734.1428833007812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 736.0, "completion_length/incorrect/min": 197.0, "completion_length/incorrect/p25": 559.25, "completion_length/incorrect/p75": 1003.5, "completion_length/incorrect/var": 57664.8828125, "completion_length/max": 1024.0, "completion_length/median": 449.0, "completion_length/min": 52.0, "completion_length/p25": 305.0, "completion_length/p75": 673.25, "completion_length/var": 62884.6875, "epoch": 1.1024, "feature_vector_variance/max_squared_error": 110299.71875, "feature_vector_variance/metric": 24066.46875, "generated_tokens/total": 35692096.0, "grad_norm": 0.043008655309677124, "grouped_std_rewards": 0.159915491938591, "learning_rate": 8.778930535580476e-07, "loss": -0.0768, "mean_logprobs": -0.0859375, "mean_logprobs/var": 0.0030670166015625, "num_completions/total": 66048, "per_sentence_gradient_norm": 3.6315860748291016, "per_sentence_gradient_norm/max": 519.3458251953125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 96.04585266113281, "per_sentence_gradient_norm/var": 774.26708984375, "per_token_feature_norm": 157.31871032714844, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 66.5, "per_token_feature_norm/p25": 127.5, "per_token_feature_norm/p75": 183.0, "per_token_feature_norm/var": 1466.209716796875, "per_token_full_gradient_variance/max_squared_error": 418.6335754394531, "per_token_full_gradient_variance/variance": 0.06434561312198639, "per_token_gradient_norm": 4.887495517730713, "per_token_gradient_norm/max": 7427.470703125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 8063.41796875, "per_token_policy_error_norm": 0.04618768021464348, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.040826719254255295, "policy_entropy": 0.09499844908714294, "policy_entropy/max": 3.734375, "policy_entropy/median": 2.205371856689453e-05, "policy_entropy/min": 1.1310397063368782e-15, "policy_entropy/p25": 8.158385753631592e-07, "policy_entropy/p75": 0.006103515625, "policy_entropy/var": 0.0671023353934288, "policy_error_vector_variance/max_squared_error": 2.017084836959839, "policy_error_vector_variance/metric": 0.046134669333696365, "policy_loss": -0.0767614096403122, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.268261432647705, "policy_sharpness": 8.107531547546387, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.474492073059082, "reward": 0.7630208730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18105579912662506, "rewards/accuracy_reward": 0.7630208730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18105579912662506, "sentence_full_gradient_variance/max_squared_error": 1873683.125, "sentence_full_gradient_variance/metric": 4863.4404296875, "sentence_full_gradient_variance/p75": 122.32120513916016, "sentence_full_gradient_variance/p90": 343.75439453125, "sentence_full_gradient_variance/p95": 343.75439453125, "sentence_full_gradient_variance/p99": 77068.5703125, "state_level_variance/metric": 86.01348876953125, "state_level_variance_full_gradient/metric": 545.2154541015625, "step": 86 }, { "accuracy_reward": 0.7369791865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19409359991550446, "action_level_variance/metric": 1317.76318359375, "action_level_variance_full_gradient/metric": 12015.9951171875, "adam_stats/lr_effective_max": 3.528457455104217e-06, "adam_stats/lr_effective_mean": 1.0536248089276778e-11, "adam_stats/lr_effective_min": -3.1997599307942437e-06, "adam_stats/m_t_max": 0.0017541474662721157, "adam_stats/m_t_mean": 6.277336289661761e-12, "adam_stats/m_t_min": -0.0022790490183979273, "adam_stats/v_t_max": 6.123611819930375e-05, "adam_stats/v_t_mean": 2.4613082839214373e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.07404300570487976, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 4.464665412902832, "all_logprobs": -0.08603266626596451, "all_logprobs/max": 0.0, "all_logprobs/median": -1.5497207641601562e-06, "all_logprobs/min": -12.9375, "all_logprobs/p1": -1.9140625, "all_logprobs/p10": -0.126953125, "all_logprobs/p25": -0.000629425048828125, "all_logprobs/p5": -0.484375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.14050738513469696, "clip_ratio": 0.0, "completion_length": 521.5573120117188, "completion_length/correct": 448.0512390136719, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 407.0, "completion_length/correct/min": 82.0, "completion_length/correct/p25": 281.0, "completion_length/correct/p75": 576.0, "completion_length/correct/var": 45287.0625, "completion_length/incorrect": 727.519775390625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 787.0, "completion_length/incorrect/min": 80.0, "completion_length/incorrect/p25": 501.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 83182.2109375, "completion_length/max": 1024.0, "completion_length/median": 459.0, "completion_length/min": 80.0, "completion_length/p25": 298.0, "completion_length/p75": 701.75, "completion_length/var": 70318.046875, "epoch": 1.1152, "feature_vector_variance/max_squared_error": 113771.9609375, "feature_vector_variance/metric": 24388.619140625, "generated_tokens/total": 36092656.0, "grad_norm": 0.0692303329706192, "grouped_std_rewards": 0.18930208683013916, "learning_rate": 7.59044652756249e-07, "loss": 0.074, "mean_logprobs": -0.0869140625, "mean_logprobs/var": 0.004730224609375, "num_completions/total": 66816, "per_sentence_gradient_norm": 5.419990539550781, "per_sentence_gradient_norm/max": 635.3728637695312, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 22.676733016967773, "per_sentence_gradient_norm/p99": 133.5966339111328, "per_sentence_gradient_norm/var": 1290.0667724609375, "per_token_feature_norm": 158.04364013671875, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 127.5, "per_token_feature_norm/p75": 184.0, "per_token_feature_norm/var": 1526.57177734375, "per_token_full_gradient_variance/max_squared_error": 552.7352905273438, "per_token_full_gradient_variance/variance": 0.08162921667098999, "per_token_gradient_norm": 5.754554271697998, "per_token_gradient_norm/max": 7468.9130859375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 9736.814453125, "per_token_policy_error_norm": 0.04599260538816452, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04056892916560173, "policy_entropy": 0.09647615253925323, "policy_entropy/max": 3.796875, "policy_entropy/median": 2.3365020751953125e-05, "policy_entropy/min": 9.020562075079397e-17, "policy_entropy/p25": 7.972121238708496e-07, "policy_entropy/p75": 0.005523681640625, "policy_entropy/var": 0.07363279163837433, "policy_error_vector_variance/max_squared_error": 2.012014627456665, "policy_error_vector_variance/metric": 0.04590282589197159, "policy_loss": 0.07404300570487976, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 4.46466588973999, "policy_sharpness": 8.12149429321289, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.25, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.484915733337402, "reward": 0.7369791865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19409359991550446, "rewards/accuracy_reward": 0.7369791865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19409359991550446, "sentence_full_gradient_variance/max_squared_error": 5145858.5, "sentence_full_gradient_variance/metric": 13641.8232421875, "sentence_full_gradient_variance/p75": 235.66116333007812, "sentence_full_gradient_variance/p90": 247.8936767578125, "sentence_full_gradient_variance/p95": 247.8936767578125, "sentence_full_gradient_variance/p99": 174338.421875, "state_level_variance/metric": 136.76878356933594, "state_level_variance_full_gradient/metric": 1625.82763671875, "step": 87 }, { "accuracy_reward": 0.78515625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16890586912631989, "action_level_variance/metric": 530.1829833984375, "action_level_variance_full_gradient/metric": 3253.354248046875, "adam_stats/lr_effective_max": 2.7952307846135227e-06, "adam_stats/lr_effective_mean": 1.5260494257152146e-11, "adam_stats/lr_effective_min": -2.785192236842704e-06, "adam_stats/m_t_max": 0.0018793308408930898, "adam_stats/m_t_mean": 1.0385136327284439e-11, "adam_stats/m_t_min": -0.00245380075648427, "adam_stats/v_t_max": 6.120850594015792e-05, "adam_stats/v_t_mean": 2.461524473834631e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.06709431856870651, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.5960502624511719, "all_logprobs": -0.0950227677822113, "all_logprobs/max": 0.0, "all_logprobs/median": -2.1457672119140625e-06, "all_logprobs/min": -15.3125, "all_logprobs/p1": -1.9765625, "all_logprobs/p10": -0.1611328125, "all_logprobs/p25": -0.00150299072265625, "all_logprobs/p5": -0.57421875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.15396246314048767, "clip_ratio": 0.0, "completion_length": 487.39453125, "completion_length/correct": 407.0198974609375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 374.0, "completion_length/correct/min": 56.0, "completion_length/correct/p25": 282.0, "completion_length/correct/p75": 484.0, "completion_length/correct/var": 32928.66015625, "completion_length/incorrect": 781.1272583007812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 906.0, "completion_length/incorrect/min": 114.0, "completion_length/incorrect/p25": 541.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 72984.828125, "completion_length/max": 1024.0, "completion_length/median": 416.0, "completion_length/min": 56.0, "completion_length/p25": 300.0, "completion_length/p75": 598.75, "completion_length/var": 65089.98046875, "epoch": 1.1280000000000001, "feature_vector_variance/max_squared_error": 111106.578125, "feature_vector_variance/metric": 24564.119140625, "generated_tokens/total": 36466972.0, "grad_norm": 0.1296578049659729, "grouped_std_rewards": 0.11477439105510712, "learning_rate": 6.484090676804927e-07, "loss": -0.0671, "mean_logprobs": -0.091796875, "mean_logprobs/var": 0.00421142578125, "num_completions/total": 67584, "per_sentence_gradient_norm": 2.6006407737731934, "per_sentence_gradient_norm/max": 465.0931396484375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 54.12263870239258, "per_sentence_gradient_norm/var": 524.1021118164062, "per_token_feature_norm": 158.65829467773438, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 67.5, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 185.0, "per_token_feature_norm/var": 1539.42724609375, "per_token_full_gradient_variance/max_squared_error": 414.37841796875, "per_token_full_gradient_variance/variance": 0.032028667628765106, "per_token_gradient_norm": 2.9927024841308594, "per_token_gradient_norm/max": 6914.0791015625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4051.364990234375, "per_token_policy_error_norm": 0.051090389490127563, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.044762518256902695, "policy_entropy": 0.10602709650993347, "policy_entropy/max": 3.609375, "policy_entropy/median": 3.147125244140625e-05, "policy_entropy/min": 5.0523821237824507e-17, "policy_entropy/p25": 1.0952353477478027e-06, "policy_entropy/p75": 0.01141357421875, "policy_entropy/var": 0.07640133798122406, "policy_error_vector_variance/max_squared_error": 2.0137875080108643, "policy_error_vector_variance/metric": 0.05103885382413864, "policy_loss": -0.06709431856870651, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.5960502624511719, "policy_sharpness": 7.949156284332275, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.87109375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.171916961669922, "reward": 0.78515625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16890586912631989, "rewards/accuracy_reward": 0.78515625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16890586912631989, "sentence_full_gradient_variance/max_squared_error": 1179347.5, "sentence_full_gradient_variance/metric": 3678.649169921875, "sentence_full_gradient_variance/p75": 59.60350799560547, "sentence_full_gradient_variance/p90": 142.92315673828125, "sentence_full_gradient_variance/p95": 142.92315673828125, "sentence_full_gradient_variance/p99": 63381.1015625, "state_level_variance/metric": 60.13595962524414, "state_level_variance_full_gradient/metric": 425.29498291015625, "step": 88 }, { "accuracy_reward": 0.7643229365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18036822974681854, "action_level_variance/metric": 210.34072875976562, "action_level_variance_full_gradient/metric": 1821.4853515625, "adam_stats/lr_effective_max": 2.2610058749705786e-06, "adam_stats/lr_effective_mean": 1.1187576906546148e-11, "adam_stats/lr_effective_min": -2.291067403348279e-06, "adam_stats/m_t_max": 0.0017592994263395667, "adam_stats/m_t_mean": 9.761396552177004e-12, "adam_stats/m_t_min": -0.002386948326602578, "adam_stats/v_t_max": 6.11477589700371e-05, "adam_stats/v_t_mean": 2.459410713279153e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0963163673877716, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -9.659051895141602, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 1.8679897785186768, "all_logprobs": -0.08104287832975388, "all_logprobs/max": 0.0, "all_logprobs/median": -1.430511474609375e-06, "all_logprobs/min": -10.6875, "all_logprobs/p1": -1.8359375, "all_logprobs/p10": -0.10888671875, "all_logprobs/p25": -0.00048828125, "all_logprobs/p5": -0.474609375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12662683427333832, "clip_ratio": 0.0, "completion_length": 510.05340576171875, "completion_length/correct": 433.4701843261719, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 412.0, "completion_length/correct/min": 71.0, "completion_length/correct/p25": 315.0, "completion_length/correct/p75": 521.0, "completion_length/correct/var": 33864.7578125, "completion_length/incorrect": 758.419921875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 827.0, "completion_length/incorrect/min": 189.0, "completion_length/incorrect/p25": 509.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 71511.375, "completion_length/max": 1024.0, "completion_length/median": 446.0, "completion_length/min": 71.0, "completion_length/p25": 339.0, "completion_length/p75": 639.0, "completion_length/var": 61701.0390625, "epoch": 1.1408, "feature_vector_variance/max_squared_error": 108019.6484375, "feature_vector_variance/metric": 24070.798828125, "generated_tokens/total": 36858696.0, "grad_norm": 0.0550420917570591, "grouped_std_rewards": 0.15908217430114746, "learning_rate": 5.461210907490952e-07, "loss": -0.0963, "mean_logprobs": -0.07958984375, "mean_logprobs/var": 0.00274658203125, "num_completions/total": 68352, "per_sentence_gradient_norm": 2.428060293197632, "per_sentence_gradient_norm/max": 201.65774536132812, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 63.2221794128418, "per_sentence_gradient_norm/var": 204.71177673339844, "per_token_feature_norm": 157.2506866455078, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 66.5, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 183.0, "per_token_feature_norm/var": 1437.56005859375, "per_token_full_gradient_variance/max_squared_error": 169.7821807861328, "per_token_full_gradient_variance/variance": 0.035701487213373184, "per_token_gradient_norm": 3.5342800617218018, "per_token_gradient_norm/max": 5347.30859375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4294.99853515625, "per_token_policy_error_norm": 0.04395778849720955, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03855609893798828, "policy_entropy": 0.09109295904636383, "policy_entropy/max": 3.734375, "policy_entropy/median": 2.110004425048828e-05, "policy_entropy/min": 6.38378239159465e-16, "policy_entropy/p25": 8.381903171539307e-07, "policy_entropy/p75": 0.00439453125, "policy_entropy/var": 0.06652620434761047, "policy_error_vector_variance/max_squared_error": 2.0131871700286865, "policy_error_vector_variance/metric": 0.04391968622803688, "policy_loss": -0.0963163673877716, "policy_loss/max": 9.659050941467285, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.8679896593093872, "policy_sharpness": 8.185272216796875, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.12457275390625, "reward": 0.7643229365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18036822974681854, "rewards/accuracy_reward": 0.7643229365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18036822974681854, "sentence_full_gradient_variance/max_squared_error": 245668.671875, "sentence_full_gradient_variance/metric": 2014.4544677734375, "sentence_full_gradient_variance/p75": 157.12680053710938, "sentence_full_gradient_variance/p90": 279.8136901855469, "sentence_full_gradient_variance/p95": 279.8136901855469, "sentence_full_gradient_variance/p99": 62927.25390625, "state_level_variance/metric": 20.611818313598633, "state_level_variance_full_gradient/metric": 192.96905517578125, "step": 89 }, { "accuracy_reward": 0.7747396230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17474567890167236, "action_level_variance/metric": 298.02532958984375, "action_level_variance_full_gradient/metric": 2450.7568359375, "adam_stats/lr_effective_max": 1.8734584728008485e-06, "adam_stats/lr_effective_mean": 9.937236797319393e-12, "adam_stats/lr_effective_min": -2.036789965131902e-06, "adam_stats/m_t_max": 0.0018075767438858747, "adam_stats/m_t_mean": 7.636493867813066e-12, "adam_stats/m_t_min": -0.002850157907232642, "adam_stats/v_t_max": 6.109279638621956e-05, "adam_stats/v_t_mean": 2.459880389660274e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.03311116248369217, "advantages/max": 12.9586820602417, "advantages/median": -0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.4543391466140747, "all_logprobs": -0.08014107495546341, "all_logprobs/max": 0.0, "all_logprobs/median": -8.344650268554688e-07, "all_logprobs/min": -11.3125, "all_logprobs/p1": -1.8125, "all_logprobs/p10": -0.10693359375, "all_logprobs/p25": -0.0003910064697265625, "all_logprobs/p5": -0.462890625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1247488409280777, "clip_ratio": 0.0, "completion_length": 533.0573120117188, "completion_length/correct": 452.9260559082031, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 436.0, "completion_length/correct/min": 32.0, "completion_length/correct/p25": 288.0, "completion_length/correct/p75": 578.5, "completion_length/correct/var": 38895.39453125, "completion_length/incorrect": 808.6531372070312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 851.0, "completion_length/incorrect/min": 247.0, "completion_length/incorrect/p25": 616.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 51269.234375, "completion_length/max": 1024.0, "completion_length/median": 487.0, "completion_length/min": 32.0, "completion_length/p25": 339.5, "completion_length/p75": 700.75, "completion_length/var": 63732.15234375, "epoch": 1.1536, "feature_vector_variance/max_squared_error": 120046.4921875, "feature_vector_variance/metric": 24234.548828125, "generated_tokens/total": 37268080.0, "grad_norm": 0.13116610050201416, "grouped_std_rewards": 0.16400396823883057, "learning_rate": 4.5230534410568764e-07, "loss": -0.0331, "mean_logprobs": -0.07666015625, "mean_logprobs/var": 0.0024261474609375, "num_completions/total": 69120, "per_sentence_gradient_norm": 2.8205783367156982, "per_sentence_gradient_norm/max": 212.3269500732422, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 84.96583557128906, "per_sentence_gradient_norm/var": 290.4478454589844, "per_token_feature_norm": 157.8640594482422, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 66.5, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 184.0, "per_token_feature_norm/var": 1435.5457763671875, "per_token_full_gradient_variance/max_squared_error": 115.0986099243164, "per_token_full_gradient_variance/variance": 0.0316040925681591, "per_token_gradient_norm": 3.58260178565979, "per_token_gradient_norm/max": 4297.017578125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3974.244384765625, "per_token_policy_error_norm": 0.04380619153380394, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03884804993867874, "policy_entropy": 0.08931180834770203, "policy_entropy/max": 3.6875, "policy_entropy/median": 1.2934207916259766e-05, "policy_entropy/min": 2.194425197110661e-16, "policy_entropy/p25": 5.289912223815918e-07, "policy_entropy/p75": 0.0037078857421875, "policy_entropy/var": 0.06296923011541367, "policy_error_vector_variance/max_squared_error": 2.0119118690490723, "policy_error_vector_variance/metric": 0.04377236217260361, "policy_loss": -0.03311116248369217, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.4543391466140747, "policy_sharpness": 8.220620155334473, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.024298667907715, "reward": 0.7747396230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17474567890167236, "rewards/accuracy_reward": 0.7747396230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17474567890167236, "sentence_full_gradient_variance/max_squared_error": 311808.34375, "sentence_full_gradient_variance/metric": 2762.66162109375, "sentence_full_gradient_variance/p75": 48.75177764892578, "sentence_full_gradient_variance/p90": 129.0189971923828, "sentence_full_gradient_variance/p95": 129.0189971923828, "sentence_full_gradient_variance/p99": 76870.3984375, "state_level_variance/metric": 29.605892181396484, "state_level_variance_full_gradient/metric": 311.90472412109375, "step": 90 }, { "accuracy_reward": 0.7591146230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18309803307056427, "action_level_variance/metric": 396.38421630859375, "action_level_variance_full_gradient/metric": 3761.53076171875, "adam_stats/lr_effective_max": 1.5996661204553675e-06, "adam_stats/lr_effective_mean": 3.2619250182885917e-12, "adam_stats/lr_effective_min": -1.6514849221493932e-06, "adam_stats/m_t_max": 0.0036792901810258627, "adam_stats/m_t_mean": 1.934783583346089e-11, "adam_stats/m_t_min": -0.004664751701056957, "adam_stats/v_t_max": 6.103552004788071e-05, "adam_stats/v_t_mean": 2.4905967045685262e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.05595340579748154, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.1993218660354614, "all_logprobs": -0.08111319690942764, "all_logprobs/max": 0.0, "all_logprobs/median": -1.3113021850585938e-06, "all_logprobs/min": -13.625, "all_logprobs/p1": -1.8046875, "all_logprobs/p10": -0.11328125, "all_logprobs/p25": -0.00051116943359375, "all_logprobs/p5": -0.474609375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12724021077156067, "clip_ratio": 0.0, "completion_length": 478.00653076171875, "completion_length/correct": 389.66552734375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 336.0, "completion_length/correct/min": 73.0, "completion_length/correct/p25": 253.0, "completion_length/correct/p75": 488.0, "completion_length/correct/var": 37749.12109375, "completion_length/incorrect": 756.4000244140625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 776.0, "completion_length/incorrect/min": 170.0, "completion_length/incorrect/p25": 578.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 71072.2109375, "completion_length/max": 1024.0, "completion_length/median": 397.0, "completion_length/min": 73.0, "completion_length/p25": 275.0, "completion_length/p75": 643.25, "completion_length/var": 70319.5859375, "epoch": 1.1663999999999999, "feature_vector_variance/max_squared_error": 114197.28125, "feature_vector_variance/metric": 24086.029296875, "generated_tokens/total": 37635192.0, "grad_norm": 0.2811209559440613, "grouped_std_rewards": 0.14365391433238983, "learning_rate": 3.6707612778634855e-07, "loss": -0.056, "mean_logprobs": -0.0791015625, "mean_logprobs/var": 0.003021240234375, "num_completions/total": 69888, "per_sentence_gradient_norm": 2.738339900970459, "per_sentence_gradient_norm/max": 302.29644775390625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 73.85781860351562, "per_sentence_gradient_norm/var": 389.3926696777344, "per_token_feature_norm": 157.2758026123047, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 127.5, "per_token_feature_norm/p75": 183.0, "per_token_feature_norm/var": 1440.641357421875, "per_token_full_gradient_variance/max_squared_error": 287.4969177246094, "per_token_full_gradient_variance/variance": 0.030080990865826607, "per_token_gradient_norm": 3.6240293979644775, "per_token_gradient_norm/max": 3428.96337890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4021.37109375, "per_token_policy_error_norm": 0.04406649246811867, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03871503472328186, "policy_entropy": 0.09142231196165085, "policy_entropy/max": 3.75, "policy_entropy/median": 1.9550323486328125e-05, "policy_entropy/min": 3.8163916471489756e-16, "policy_entropy/p25": 7.450580596923828e-07, "policy_entropy/p75": 0.00469970703125, "policy_entropy/var": 0.0651731789112091, "policy_error_vector_variance/max_squared_error": 2.0156118869781494, "policy_error_vector_variance/metric": 0.04401477053761482, "policy_loss": -0.05595340579748154, "policy_loss/max": 9.659051895141602, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.1993218660354614, "policy_sharpness": 8.170604705810547, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.5, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.2201566696167, "reward": 0.7591146230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18309803307056427, "rewards/accuracy_reward": 0.7591146230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18309803307056427, "sentence_full_gradient_variance/max_squared_error": 792738.375, "sentence_full_gradient_variance/metric": 4268.55517578125, "sentence_full_gradient_variance/p75": 45.82564163208008, "sentence_full_gradient_variance/p90": 85.70288848876953, "sentence_full_gradient_variance/p95": 85.70288848876953, "sentence_full_gradient_variance/p99": 92052.9609375, "state_level_variance/metric": 42.49214172363281, "state_level_variance_full_gradient/metric": 507.0244140625, "step": 91 }, { "accuracy_reward": 0.7252604365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19951750338077545, "action_level_variance/metric": 545.49267578125, "action_level_variance_full_gradient/metric": 4924.9736328125, "adam_stats/lr_effective_max": 1.2858158697781619e-06, "adam_stats/lr_effective_mean": 5.377645486033533e-13, "adam_stats/lr_effective_min": -1.3684990562978783e-06, "adam_stats/m_t_max": 0.0031862391624599695, "adam_stats/m_t_mean": 1.9975280110551275e-11, "adam_stats/m_t_min": -0.0039480323903262615, "adam_stats/v_t_max": 6.099299935158342e-05, "adam_stats/v_t_mean": 2.48912912850785e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.0439995676279068, "advantages/max": 12.9586820602417, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.400709867477417, "all_logprobs": -0.0885075181722641, "all_logprobs/max": 0.0, "all_logprobs/median": -1.3113021850585938e-06, "all_logprobs/min": -11.5625, "all_logprobs/p1": -1.921875, "all_logprobs/p10": -0.134765625, "all_logprobs/p25": -0.00077056884765625, "all_logprobs/p5": -0.515625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.14353808760643005, "clip_ratio": 0.0, "completion_length": 475.3606872558594, "completion_length/correct": 400.9192199707031, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 345.0, "completion_length/correct/min": 120.0, "completion_length/correct/p25": 270.0, "completion_length/correct/p75": 462.0, "completion_length/correct/var": 38751.1875, "completion_length/incorrect": 671.8720703125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 656.0, "completion_length/incorrect/min": 86.0, "completion_length/incorrect/p25": 412.5, "completion_length/incorrect/p75": 1015.0, "completion_length/incorrect/var": 83299.484375, "completion_length/max": 1024.0, "completion_length/median": 384.0, "completion_length/min": 86.0, "completion_length/p25": 287.0, "completion_length/p75": 620.25, "completion_length/var": 65545.390625, "epoch": 1.1792, "feature_vector_variance/max_squared_error": 116348.8984375, "feature_vector_variance/metric": 24838.31640625, "generated_tokens/total": 38000268.0, "grad_norm": 0.08783075213432312, "grouped_std_rewards": 0.1777239441871643, "learning_rate": 2.905372804626083e-07, "loss": 0.044, "mean_logprobs": -0.08935546875, "mean_logprobs/var": 0.004852294921875, "num_completions/total": 70656, "per_sentence_gradient_norm": 3.2827887535095215, "per_sentence_gradient_norm/max": 343.8291931152344, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 12.621380805969238, "per_sentence_gradient_norm/p99": 76.95979309082031, "per_sentence_gradient_norm/var": 535.4130859375, "per_token_feature_norm": 159.7892608642578, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 155.0, "per_token_feature_norm/min": 68.5, "per_token_feature_norm/p25": 130.0, "per_token_feature_norm/p75": 186.0, "per_token_feature_norm/var": 1505.6263427734375, "per_token_full_gradient_variance/max_squared_error": 430.7980041503906, "per_token_full_gradient_variance/variance": 0.05108002945780754, "per_token_gradient_norm": 4.457535743713379, "per_token_gradient_norm/max": 6805.52490234375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6259.4775390625, "per_token_policy_error_norm": 0.04766338691115379, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.042209308594465256, "policy_entropy": 0.09858889877796173, "policy_entropy/max": 3.75, "policy_entropy/median": 1.9788742065429688e-05, "policy_entropy/min": 3.219646771412954e-15, "policy_entropy/p25": 6.668269634246826e-07, "policy_entropy/p75": 0.006591796875, "policy_entropy/var": 0.07218848168849945, "policy_error_vector_variance/max_squared_error": 2.011786937713623, "policy_error_vector_variance/metric": 0.04760412499308586, "policy_loss": 0.043999575078487396, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.400709867477417, "policy_sharpness": 8.082252502441406, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.9375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.665379524230957, "reward": 0.7252604365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19951750338077545, "rewards/accuracy_reward": 0.7252604365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19951750338077545, "sentence_full_gradient_variance/max_squared_error": 1595957.5, "sentence_full_gradient_variance/metric": 5561.85546875, "sentence_full_gradient_variance/p75": 132.597412109375, "sentence_full_gradient_variance/p90": 148.1846466064453, "sentence_full_gradient_variance/p95": 148.1846466064453, "sentence_full_gradient_variance/p99": 80420.2734375, "state_level_variance/metric": 58.014190673828125, "state_level_variance_full_gradient/metric": 636.88232421875, "step": 92 }, { "accuracy_reward": 0.7578125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1837719827890396, "action_level_variance/metric": 1152.7044677734375, "action_level_variance_full_gradient/metric": 4520.79931640625, "adam_stats/lr_effective_max": 9.674860166342114e-07, "adam_stats/lr_effective_mean": 5.106670294963145e-13, "adam_stats/lr_effective_min": -9.441728821002471e-07, "adam_stats/m_t_max": 0.0030423281714320183, "adam_stats/m_t_mean": 2.436255884530336e-11, "adam_stats/m_t_min": -0.0038233096711337566, "adam_stats/v_t_max": 6.0937531088711694e-05, "adam_stats/v_t_mean": 2.4878198459643563e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.04534851759672165, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.358213424682617, "all_logprobs": -0.08315152674913406, "all_logprobs/max": 0.0, "all_logprobs/median": -9.5367431640625e-07, "all_logprobs/min": -10.375, "all_logprobs/p1": -1.890625, "all_logprobs/p10": -0.1123046875, "all_logprobs/p25": -0.0004634857177734375, "all_logprobs/p5": -0.474609375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1354856640100479, "clip_ratio": 0.0, "completion_length": 489.40106201171875, "completion_length/correct": 411.2079162597656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 375.0, "completion_length/correct/min": 71.0, "completion_length/correct/p25": 282.0, "completion_length/correct/p75": 490.5, "completion_length/correct/var": 33838.6875, "completion_length/incorrect": 734.0698852539062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 770.0, "completion_length/incorrect/min": 151.0, "completion_length/incorrect/p25": 477.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 77617.78125, "completion_length/max": 1024.0, "completion_length/median": 421.0, "completion_length/min": 71.0, "completion_length/p25": 307.75, "completion_length/p75": 608.25, "completion_length/var": 63510.43359375, "epoch": 1.192, "feature_vector_variance/max_squared_error": 118541.1171875, "feature_vector_variance/metric": 24988.572265625, "generated_tokens/total": 38376128.0, "grad_norm": 0.09875607490539551, "grouped_std_rewards": 0.1511576771736145, "learning_rate": 2.2278205293002645e-07, "loss": 0.0453, "mean_logprobs": -0.08154296875, "mean_logprobs/var": 0.0036468505859375, "num_completions/total": 71424, "per_sentence_gradient_norm": 3.5898704528808594, "per_sentence_gradient_norm/max": 797.7249145507812, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 80.3184585571289, "per_sentence_gradient_norm/var": 1141.3033447265625, "per_token_feature_norm": 159.9940643310547, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 156.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 130.0, "per_token_feature_norm/p75": 186.0, "per_token_feature_norm/var": 1483.488037109375, "per_token_full_gradient_variance/max_squared_error": 490.9827575683594, "per_token_full_gradient_variance/variance": 0.0754600241780281, "per_token_gradient_norm": 4.719181537628174, "per_token_gradient_norm/max": 7429.94482421875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 10257.5068359375, "per_token_policy_error_norm": 0.04475763812661171, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.040011804550886154, "policy_entropy": 0.09196097403764725, "policy_entropy/max": 3.765625, "policy_entropy/median": 1.4781951904296875e-05, "policy_entropy/min": 1.7997756063259374e-17, "policy_entropy/p25": 5.178153514862061e-07, "policy_entropy/p75": 0.004241943359375, "policy_entropy/var": 0.06781153380870819, "policy_error_vector_variance/max_squared_error": 2.007237195968628, "policy_error_vector_variance/metric": 0.04469463601708412, "policy_loss": 0.04534851759672165, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.358213424682617, "policy_sharpness": 8.184222221374512, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.671875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.21384334564209, "reward": 0.7578125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1837719827890396, "rewards/accuracy_reward": 0.7578125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1837719827890396, "sentence_full_gradient_variance/max_squared_error": 753101.0, "sentence_full_gradient_variance/metric": 5156.50244140625, "sentence_full_gradient_variance/p75": 16.579444885253906, "sentence_full_gradient_variance/p90": 21.152090072631836, "sentence_full_gradient_variance/p95": 21.152090072631836, "sentence_full_gradient_variance/p99": 104223.8671875, "state_level_variance/metric": 132.58193969726562, "state_level_variance_full_gradient/metric": 635.7032470703125, "step": 93 }, { "accuracy_reward": 0.75390625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18577349185943604, "action_level_variance/metric": 308.94329833984375, "action_level_variance_full_gradient/metric": 3524.754638671875, "adam_stats/lr_effective_max": 6.376606620506209e-07, "adam_stats/lr_effective_mean": -3.175846358897255e-14, "adam_stats/lr_effective_min": -6.25435291112808e-07, "adam_stats/m_t_max": 0.0026216376572847366, "adam_stats/m_t_mean": 2.0960674168568616e-11, "adam_stats/m_t_min": -0.003257873235270381, "adam_stats/v_t_max": 6.087665678933263e-05, "adam_stats/v_t_mean": 2.4854731987822287e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.03702150285243988, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.5291693210601807, "all_logprobs": -0.08629021048545837, "all_logprobs/max": 0.0, "all_logprobs/median": -1.5497207641601562e-06, "all_logprobs/min": -10.8125, "all_logprobs/p1": -1.8828125, "all_logprobs/p10": -0.1318359375, "all_logprobs/p25": -0.0008087158203125, "all_logprobs/p5": -0.494140625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.13609302043914795, "clip_ratio": 0.0, "completion_length": 486.5234375, "completion_length/correct": 419.80828857421875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 367.0, "completion_length/correct/min": 92.0, "completion_length/correct/p25": 272.0, "completion_length/correct/p75": 536.5, "completion_length/correct/var": 39505.2421875, "completion_length/incorrect": 690.9047241210938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 680.0, "completion_length/incorrect/min": 156.0, "completion_length/incorrect/p25": 414.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 84684.9921875, "completion_length/max": 1024.0, "completion_length/median": 415.0, "completion_length/min": 92.0, "completion_length/p25": 290.0, "completion_length/p75": 631.25, "completion_length/var": 64180.88671875, "epoch": 1.2048, "feature_vector_variance/max_squared_error": 111807.25, "feature_vector_variance/metric": 24131.33203125, "generated_tokens/total": 38749776.0, "grad_norm": 0.034780774265527725, "grouped_std_rewards": 0.15317261219024658, "learning_rate": 1.6389299449645734e-07, "loss": 0.037, "mean_logprobs": -0.0859375, "mean_logprobs/var": 0.0034637451171875, "num_completions/total": 72192, "per_sentence_gradient_norm": 2.933504819869995, "per_sentence_gradient_norm/max": 223.72616577148438, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 96.5540542602539, "per_sentence_gradient_norm/var": 300.7293701171875, "per_token_feature_norm": 157.30738830566406, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 67.5, "per_token_feature_norm/p25": 127.5, "per_token_feature_norm/p75": 183.0, "per_token_feature_norm/var": 1500.968994140625, "per_token_full_gradient_variance/max_squared_error": 133.32772827148438, "per_token_full_gradient_variance/variance": 0.042308155447244644, "per_token_gradient_norm": 3.861257314682007, "per_token_gradient_norm/max": 6213.11474609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4936.4658203125, "per_token_policy_error_norm": 0.046715933829545975, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04104842618107796, "policy_entropy": 0.09729871153831482, "policy_entropy/max": 3.640625, "policy_entropy/median": 2.3603439331054688e-05, "policy_entropy/min": 6.217248937900877e-15, "policy_entropy/p25": 9.275972843170166e-07, "policy_entropy/p75": 0.00695037841796875, "policy_entropy/var": 0.06948426365852356, "policy_error_vector_variance/max_squared_error": 2.016594171524048, "policy_error_vector_variance/metric": 0.04664977267384529, "policy_loss": 0.03702151030302048, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958683013916016, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.5291695594787598, "policy_sharpness": 8.073078155517578, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.634438514709473, "reward": 0.75390625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18577349185943604, "rewards/accuracy_reward": 0.75390625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18577349185943604, "sentence_full_gradient_variance/max_squared_error": 750976.1875, "sentence_full_gradient_variance/metric": 3991.928466796875, "sentence_full_gradient_variance/p75": 53.18174743652344, "sentence_full_gradient_variance/p90": 185.1329345703125, "sentence_full_gradient_variance/p95": 185.1329345703125, "sentence_full_gradient_variance/p99": 77675.765625, "state_level_variance/metric": 30.328378677368164, "state_level_variance_full_gradient/metric": 467.173828125, "step": 94 }, { "accuracy_reward": 0.6770833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2189265638589859, "action_level_variance/metric": 1090.789306640625, "action_level_variance_full_gradient/metric": 10202.8447265625, "adam_stats/lr_effective_max": 4.882041935161396e-07, "adam_stats/lr_effective_mean": -2.5154474315135233e-13, "adam_stats/lr_effective_min": -4.87937370508007e-07, "adam_stats/m_t_max": 0.0024952772073447704, "adam_stats/m_t_mean": 1.892706477657491e-11, "adam_stats/m_t_min": -0.0031220577657222748, "adam_stats/v_t_max": 6.0815804317826405e-05, "adam_stats/v_t_mean": 2.483707467124119e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.07491806149482727, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 4.261159896850586, "all_logprobs": -0.09234194457530975, "all_logprobs/max": 0.0, "all_logprobs/median": -2.6226043701171875e-06, "all_logprobs/min": -9.75, "all_logprobs/p1": -2.0, "all_logprobs/p10": -0.14453125, "all_logprobs/p25": -0.00110626220703125, "all_logprobs/p5": -0.53125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.15234944224357605, "clip_ratio": 0.0, "completion_length": 542.8919677734375, "completion_length/correct": 461.1423034667969, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 411.0, "completion_length/correct/min": 126.0, "completion_length/correct/p25": 301.0, "completion_length/correct/p75": 564.0, "completion_length/correct/var": 42820.0078125, "completion_length/incorrect": 714.3023681640625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 728.0, "completion_length/incorrect/min": 145.0, "completion_length/incorrect/p25": 499.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 72453.0390625, "completion_length/max": 1024.0, "completion_length/median": 483.0, "completion_length/min": 126.0, "completion_length/p25": 330.0, "completion_length/p75": 728.0, "completion_length/var": 66338.03125, "epoch": 1.2176, "feature_vector_variance/max_squared_error": 108900.1015625, "feature_vector_variance/metric": 23663.498046875, "generated_tokens/total": 39166720.0, "grad_norm": 0.07779530435800552, "grouped_std_rewards": 0.21918007731437683, "learning_rate": 1.1394185240843985e-07, "loss": -0.0749, "mean_logprobs": -0.08935546875, "mean_logprobs/var": 0.0037689208984375, "num_completions/total": 72960, "per_sentence_gradient_norm": 5.514296531677246, "per_sentence_gradient_norm/max": 594.3683471679688, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 21.181949615478516, "per_sentence_gradient_norm/p99": 131.9208526611328, "per_sentence_gradient_norm/var": 1061.764404296875, "per_token_feature_norm": 156.4165496826172, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 62.0, "per_token_feature_norm/p25": 126.0, "per_token_feature_norm/p75": 182.0, "per_token_feature_norm/var": 1566.3330078125, "per_token_full_gradient_variance/max_squared_error": 279.7569580078125, "per_token_full_gradient_variance/variance": 0.07535908371210098, "per_token_gradient_norm": 6.445279121398926, "per_token_gradient_norm/max": 7847.77099609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 9182.3857421875, "per_token_policy_error_norm": 0.04938284307718277, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.043972209095954895, "policy_entropy": 0.1022113487124443, "policy_entropy/max": 3.71875, "policy_entropy/median": 3.695487976074219e-05, "policy_entropy/min": 7.389922007661198e-16, "policy_entropy/p25": 1.4901161193847656e-06, "policy_entropy/p75": 0.00909423828125, "policy_entropy/var": 0.07556947320699692, "policy_error_vector_variance/max_squared_error": 2.013850450515747, "policy_error_vector_variance/metric": 0.04931918531656265, "policy_loss": -0.07491806149482727, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 4.261159896850586, "policy_sharpness": 7.996319770812988, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.25, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.95046329498291, "reward": 0.6770833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2189265638589859, "rewards/accuracy_reward": 0.6770833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2189265638589859, "sentence_full_gradient_variance/max_squared_error": 3573665.75, "sentence_full_gradient_variance/metric": 11611.5107421875, "sentence_full_gradient_variance/p75": 70.16439819335938, "sentence_full_gradient_variance/p90": 256.1082763671875, "sentence_full_gradient_variance/p95": 516.5338745117188, "sentence_full_gradient_variance/p99": 215884.4375, "state_level_variance/metric": 107.05638122558594, "state_level_variance_full_gradient/metric": 1408.66748046875, "step": 95 }, { "accuracy_reward": 0.7408854365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19222451746463776, "action_level_variance/metric": 807.4954833984375, "action_level_variance_full_gradient/metric": 10095.2587890625, "adam_stats/lr_effective_max": 3.236900454339775e-07, "adam_stats/lr_effective_mean": -8.406958939763398e-13, "adam_stats/lr_effective_min": -3.151859857553063e-07, "adam_stats/m_t_max": 0.0018856418319046497, "adam_stats/m_t_mean": -3.2358046363895054e-13, "adam_stats/m_t_min": -0.002168982522562146, "adam_stats/v_t_max": 6.0870381275890395e-05, "adam_stats/v_t_mean": 2.4901727815190844e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.016877155750989914, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.2308199405670166, "all_logprobs": -0.07725818455219269, "all_logprobs/max": 0.0, "all_logprobs/median": -9.5367431640625e-07, "all_logprobs/min": -13.0, "all_logprobs/p1": -1.765625, "all_logprobs/p10": -0.10009765625, "all_logprobs/p25": -0.0003204345703125, "all_logprobs/p5": -0.4296875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12255760282278061, "clip_ratio": 0.0, "completion_length": 541.8671875, "completion_length/correct": 454.53253173828125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 419.0, "completion_length/correct/min": 129.0, "completion_length/correct/p25": 319.0, "completion_length/correct/p75": 557.0, "completion_length/correct/var": 34006.25, "completion_length/incorrect": 791.5828857421875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 930.0, "completion_length/incorrect/min": 194.0, "completion_length/incorrect/p25": 581.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 68599.75, "completion_length/max": 1024.0, "completion_length/median": 475.0, "completion_length/min": 129.0, "completion_length/p25": 340.5, "completion_length/p75": 693.75, "completion_length/var": 64729.44921875, "epoch": 1.2304, "feature_vector_variance/max_squared_error": 113254.3515625, "feature_vector_variance/metric": 23715.9765625, "generated_tokens/total": 39582872.0, "grad_norm": 0.23900002241134644, "grouped_std_rewards": 0.18569540977478027, "learning_rate": 7.298948443822229e-08, "loss": 0.0169, "mean_logprobs": -0.078125, "mean_logprobs/var": 0.003173828125, "num_completions/total": 73728, "per_sentence_gradient_norm": 4.037513732910156, "per_sentence_gradient_norm/max": 569.7072143554688, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 11.974952697753906, "per_sentence_gradient_norm/p99": 93.77979278564453, "per_sentence_gradient_norm/var": 792.2254028320312, "per_token_feature_norm": 156.68099975585938, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 52.0, "per_token_feature_norm/p25": 127.5, "per_token_feature_norm/p75": 182.0, "per_token_feature_norm/var": 1422.7286376953125, "per_token_full_gradient_variance/max_squared_error": 771.8363037109375, "per_token_full_gradient_variance/variance": 0.05219849571585655, "per_token_gradient_norm": 4.296998977661133, "per_token_gradient_norm/max": 6680.26953125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6087.5302734375, "per_token_policy_error_norm": 0.04208395257592201, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.037343256175518036, "policy_entropy": 0.08620954304933548, "policy_entropy/max": 3.796875, "policy_entropy/median": 1.4424324035644531e-05, "policy_entropy/min": 2.393918396847994e-16, "policy_entropy/p25": 6.742775440216064e-07, "policy_entropy/p75": 0.0030364990234375, "policy_entropy/var": 0.061424143612384796, "policy_error_vector_variance/max_squared_error": 2.0121009349823, "policy_error_vector_variance/metric": 0.042035769671201706, "policy_loss": 0.016877157613635063, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.2308199405670166, "policy_sharpness": 8.263504028320312, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.25, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.826728820800781, "reward": 0.7408854365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19222451746463776, "rewards/accuracy_reward": 0.7408854365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19222451746463776, "sentence_full_gradient_variance/max_squared_error": 4079406.25, "sentence_full_gradient_variance/metric": 11450.24609375, "sentence_full_gradient_variance/p75": 174.7630157470703, "sentence_full_gradient_variance/p90": 261.2863464355469, "sentence_full_gradient_variance/p95": 261.2863464355469, "sentence_full_gradient_variance/p99": 96226.3203125, "state_level_variance/metric": 85.52632141113281, "state_level_variance_full_gradient/metric": 1354.987548828125, "step": 96 }, { "accuracy_reward": 0.7356771230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19470982253551483, "action_level_variance/metric": 450.470458984375, "action_level_variance_full_gradient/metric": 5440.41796875, "adam_stats/lr_effective_max": 1.7678812014310097e-07, "adam_stats/lr_effective_mean": -7.993970069231082e-13, "adam_stats/lr_effective_min": -1.8140552526801912e-07, "adam_stats/m_t_max": 0.0025650046300143003, "adam_stats/m_t_mean": 1.7517320927140645e-11, "adam_stats/m_t_min": -0.0035512056201696396, "adam_stats/v_t_max": 6.137612217571586e-05, "adam_stats/v_t_mean": 2.501970419038768e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.046124305576086044, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.3664755821228027, "all_logprobs": -0.0810641273856163, "all_logprobs/max": 0.0, "all_logprobs/median": -1.0728836059570312e-06, "all_logprobs/min": -14.25, "all_logprobs/p1": -1.8046875, "all_logprobs/p10": -0.111328125, "all_logprobs/p25": -0.0004367828369140625, "all_logprobs/p5": -0.474609375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1293797343969345, "clip_ratio": 0.0, "completion_length": 513.9921875, "completion_length/correct": 418.4442443847656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 369.0, "completion_length/correct/min": 77.0, "completion_length/correct/p25": 253.0, "completion_length/correct/p75": 520.0, "completion_length/correct/var": 49688.66796875, "completion_length/incorrect": 779.9260864257812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 838.0, "completion_length/incorrect/min": 138.0, "completion_length/incorrect/p25": 575.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 63923.6328125, "completion_length/max": 1024.0, "completion_length/median": 437.0, "completion_length/min": 77.0, "completion_length/p25": 277.75, "completion_length/p75": 716.5, "completion_length/var": 78815.421875, "epoch": 1.2432, "feature_vector_variance/max_squared_error": 114379.2265625, "feature_vector_variance/metric": 23811.505859375, "generated_tokens/total": 39977620.0, "grad_norm": 0.22049102187156677, "grouped_std_rewards": 0.14326155185699463, "learning_rate": 4.108578473795033e-08, "loss": 0.0461, "mean_logprobs": -0.0810546875, "mean_logprobs/var": 0.002716064453125, "num_completions/total": 74496, "per_sentence_gradient_norm": 2.8863558769226074, "per_sentence_gradient_norm/max": 327.68499755859375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 69.40322875976562, "per_sentence_gradient_norm/var": 442.7158508300781, "per_token_feature_norm": 156.78797912597656, "per_token_feature_norm/max": 334.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 66.5, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 182.0, "per_token_feature_norm/var": 1410.910888671875, "per_token_full_gradient_variance/max_squared_error": 1387.8421630859375, "per_token_full_gradient_variance/variance": 0.04658053442835808, "per_token_gradient_norm": 3.625260591506958, "per_token_gradient_norm/max": 7097.78662109375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5664.251953125, "per_token_policy_error_norm": 0.04403790086507797, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.038923878222703934, "policy_entropy": 0.09039664268493652, "policy_entropy/max": 3.734375, "policy_entropy/median": 1.6570091247558594e-05, "policy_entropy/min": 6.83481049534862e-16, "policy_entropy/p25": 7.487833499908447e-07, "policy_entropy/p75": 0.004058837890625, "policy_entropy/var": 0.06468703597784042, "policy_error_vector_variance/max_squared_error": 2.0110607147216797, "policy_error_vector_variance/metric": 0.04399275407195091, "policy_loss": 0.046124301850795746, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.3664755821228027, "policy_sharpness": 8.197400093078613, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.75, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.130582809448242, "reward": 0.7356771230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19470982253551483, "rewards/accuracy_reward": 0.7356771230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19470982253551483, "sentence_full_gradient_variance/max_squared_error": 2528439.75, "sentence_full_gradient_variance/metric": 6164.427734375, "sentence_full_gradient_variance/p75": 131.32106018066406, "sentence_full_gradient_variance/p90": 137.7561798095703, "sentence_full_gradient_variance/p95": 137.7561798095703, "sentence_full_gradient_variance/p99": 114103.1015625, "state_level_variance/metric": 48.4827880859375, "state_level_variance_full_gradient/metric": 724.0097045898438, "step": 97 }, { "accuracy_reward": 0.69140625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21364182233810425, "action_level_variance/metric": 559.847412109375, "action_level_variance_full_gradient/metric": 8977.05859375, "adam_stats/lr_effective_max": 8.538950169167947e-08, "adam_stats/lr_effective_mean": -3.874527651839821e-13, "adam_stats/lr_effective_min": -8.604041568105458e-08, "adam_stats/m_t_max": 0.002437099115923047, "adam_stats/m_t_mean": 1.812798695877138e-11, "adam_stats/m_t_min": -0.003336465684697032, "adam_stats/v_t_max": 6.136147567303851e-05, "adam_stats/v_t_mean": 2.5045564580605806e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.03144645690917969, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.6659131050109863, "all_logprobs": -0.08090877532958984, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-06, "all_logprobs/min": -11.8125, "all_logprobs/p1": -1.8125, "all_logprobs/p10": -0.1103515625, "all_logprobs/p25": -0.0004863739013671875, "all_logprobs/p5": -0.462890625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12764890491962433, "clip_ratio": 0.0, "completion_length": 547.4830932617188, "completion_length/correct": 447.052734375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 403.0, "completion_length/correct/min": 109.0, "completion_length/correct/p25": 303.0, "completion_length/correct/p75": 578.5, "completion_length/correct/var": 38820.00390625, "completion_length/incorrect": 772.4978637695312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 852.0, "completion_length/incorrect/min": 179.0, "completion_length/incorrect/p25": 555.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 66367.3828125, "completion_length/max": 1024.0, "completion_length/median": 485.0, "completion_length/min": 109.0, "completion_length/p25": 336.0, "completion_length/p75": 716.25, "completion_length/var": 69873.2890625, "epoch": 1.256, "feature_vector_variance/max_squared_error": 113612.3203125, "feature_vector_variance/metric": 23529.125, "generated_tokens/total": 40398088.0, "grad_norm": 0.14749158918857574, "grouped_std_rewards": 0.14361517131328583, "learning_rate": 1.8269623051318517e-08, "loss": -0.0314, "mean_logprobs": -0.07958984375, "mean_logprobs/var": 0.0029449462890625, "num_completions/total": 75264, "per_sentence_gradient_norm": 3.4831981658935547, "per_sentence_gradient_norm/max": 314.91644287109375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 109.9445571899414, "per_sentence_gradient_norm/var": 548.4288330078125, "per_token_feature_norm": 155.93824768066406, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 67.5, "per_token_feature_norm/p25": 127.0, "per_token_feature_norm/p75": 181.0, "per_token_feature_norm/var": 1429.6448974609375, "per_token_full_gradient_variance/max_squared_error": 260.3194580078125, "per_token_full_gradient_variance/variance": 0.05799761414527893, "per_token_gradient_norm": 4.099884033203125, "per_token_gradient_norm/max": 6658.00244140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6410.2841796875, "per_token_policy_error_norm": 0.044033415615558624, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03909013047814369, "policy_entropy": 0.09015648812055588, "policy_entropy/max": 3.375, "policy_entropy/median": 1.8477439880371094e-05, "policy_entropy/min": 1.8908485888147197e-16, "policy_entropy/p25": 8.009374141693115e-07, "policy_entropy/p75": 0.00433349609375, "policy_entropy/var": 0.06330426782369614, "policy_error_vector_variance/max_squared_error": 2.014585018157959, "policy_error_vector_variance/metric": 0.04400051757693291, "policy_loss": -0.031446442008018494, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.6659131050109863, "policy_sharpness": 8.186666488647461, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.122931480407715, "reward": 0.69140625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21364182233810425, "rewards/accuracy_reward": 0.69140625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21364182233810425, "sentence_full_gradient_variance/max_squared_error": 2306732.75, "sentence_full_gradient_variance/metric": 10183.6396484375, "sentence_full_gradient_variance/p75": 205.37579345703125, "sentence_full_gradient_variance/p90": 285.39239501953125, "sentence_full_gradient_variance/p95": 285.39239501953125, "sentence_full_gradient_variance/p99": 140640.234375, "state_level_variance/metric": 58.457183837890625, "state_level_variance_full_gradient/metric": 1206.580322265625, "step": 98 }, { "accuracy_reward": 0.7330729365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19593213498592377, "action_level_variance/metric": 611.976318359375, "action_level_variance_full_gradient/metric": 8061.60546875, "adam_stats/lr_effective_max": 2.0489837382342557e-08, "adam_stats/lr_effective_mean": -8.900272554547561e-14, "adam_stats/lr_effective_min": -2.2065892224532035e-08, "adam_stats/m_t_max": 0.002410063985735178, "adam_stats/m_t_mean": 1.630491575088655e-11, "adam_stats/m_t_min": -0.0032561151310801506, "adam_stats/v_t_max": 6.13020674791187e-05, "adam_stats/v_t_mean": 2.503025130912162e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.042001873254776, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.482421875, "all_logprobs": -0.08166328072547913, "all_logprobs/max": 0.0, "all_logprobs/median": -1.430511474609375e-06, "all_logprobs/min": -10.8125, "all_logprobs/p1": -1.8125, "all_logprobs/p10": -0.11279296875, "all_logprobs/p25": -0.000492095947265625, "all_logprobs/p5": -0.474609375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1285901516675949, "clip_ratio": 0.0, "completion_length": 540.01171875, "completion_length/correct": 456.01422119140625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 398.0, "completion_length/correct/min": 103.0, "completion_length/correct/p25": 291.5, "completion_length/correct/p75": 588.5, "completion_length/correct/var": 46640.66796875, "completion_length/incorrect": 770.6975708007812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 900.0, "completion_length/incorrect/min": 118.0, "completion_length/incorrect/p25": 545.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 77098.7421875, "completion_length/max": 1024.0, "completion_length/median": 477.0, "completion_length/min": 103.0, "completion_length/p25": 315.75, "completion_length/p75": 741.25, "completion_length/var": 74083.1328125, "epoch": 1.2688, "feature_vector_variance/max_squared_error": 110769.4609375, "feature_vector_variance/metric": 23329.603515625, "generated_tokens/total": 40812816.0, "grad_norm": 0.10721082240343094, "grouped_std_rewards": 0.197793647646904, "learning_rate": 4.568797356781784e-09, "loss": 0.042, "mean_logprobs": -0.08251953125, "mean_logprobs/var": 0.002685546875, "num_completions/total": 76032, "per_sentence_gradient_norm": 4.149986267089844, "per_sentence_gradient_norm/max": 415.48760986328125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 21.36093521118164, "per_sentence_gradient_norm/p99": 116.9016342163086, "per_sentence_gradient_norm/var": 595.5293579101562, "per_token_feature_norm": 155.1552276611328, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 149.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 126.0, "per_token_feature_norm/p75": 180.0, "per_token_feature_norm/var": 1422.423828125, "per_token_full_gradient_variance/max_squared_error": 150.450439453125, "per_token_full_gradient_variance/variance": 0.05248600244522095, "per_token_gradient_norm": 4.881479740142822, "per_token_gradient_norm/max": 5957.81103515625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6494.54052734375, "per_token_policy_error_norm": 0.04436353221535683, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03919766843318939, "policy_entropy": 0.09161948412656784, "policy_entropy/max": 3.734375, "policy_entropy/median": 2.0623207092285156e-05, "policy_entropy/min": 2.4424906541753444e-15, "policy_entropy/p25": 9.238719940185547e-07, "policy_entropy/p75": 0.00445556640625, "policy_entropy/var": 0.06552077829837799, "policy_error_vector_variance/max_squared_error": 2.0157644748687744, "policy_error_vector_variance/metric": 0.044315777719020844, "policy_loss": 0.0420018695294857, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.482421875, "policy_sharpness": 8.176507949829102, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.194221496582031, "reward": 0.7330729365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19593213498592377, "rewards/accuracy_reward": 0.7330729365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19593213498592377, "sentence_full_gradient_variance/max_squared_error": 3058532.5, "sentence_full_gradient_variance/metric": 9132.126953125, "sentence_full_gradient_variance/p75": 187.1688995361328, "sentence_full_gradient_variance/p90": 223.8455810546875, "sentence_full_gradient_variance/p95": 223.8455810546875, "sentence_full_gradient_variance/p99": 145828.59375, "state_level_variance/metric": 59.898597717285156, "state_level_variance_full_gradient/metric": 1070.520263671875, "step": 99 }, { "accuracy_reward": 0.80078125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15973864495754242, "action_level_variance/metric": 440.47662353515625, "action_level_variance_full_gradient/metric": 8449.73046875, "adam_stats/lr_effective_max": -0.0, "adam_stats/lr_effective_mean": 0.0, "adam_stats/lr_effective_min": -0.0, "adam_stats/m_t_max": 0.0023643700405955315, "adam_stats/m_t_mean": 2.232455713346848e-11, "adam_stats/m_t_min": -0.003327232087031007, "adam_stats/v_t_max": 6.127617962192744e-05, "adam_stats/v_t_mean": 2.5026270118744254e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.05410628020763397, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.5755720138549805, "all_logprobs": -0.08364827930927277, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-06, "all_logprobs/min": -12.5625, "all_logprobs/p1": -1.875, "all_logprobs/p10": -0.11767578125, "all_logprobs/p25": -0.0005645751953125, "all_logprobs/p5": -0.4765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.13240082561969757, "clip_ratio": 0.0, "completion_length": 495.6067810058594, "completion_length/correct": 430.37396240234375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 388.0, "completion_length/correct/min": 95.0, "completion_length/correct/p25": 289.0, "completion_length/correct/p75": 527.5, "completion_length/correct/var": 37693.7421875, "completion_length/incorrect": 757.8170166015625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 879.0, "completion_length/incorrect/min": 151.0, "completion_length/incorrect/p25": 527.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 82399.921875, "completion_length/max": 1024.0, "completion_length/median": 422.0, "completion_length/min": 95.0, "completion_length/p25": 305.0, "completion_length/p75": 631.75, "completion_length/var": 63631.23828125, "epoch": 1.2816, "feature_vector_variance/max_squared_error": 111732.3046875, "feature_vector_variance/metric": 23756.62890625, "generated_tokens/total": 41193440.0, "grad_norm": 0.13889339566230774, "grouped_std_rewards": 0.1600952297449112, "learning_rate": 0.0, "loss": -0.0541, "mean_logprobs": -0.0810546875, "mean_logprobs/var": 0.002532958984375, "num_completions/total": 76800, "per_sentence_gradient_norm": 3.3275766372680664, "per_sentence_gradient_norm/max": 344.563232421875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 105.06635284423828, "per_sentence_gradient_norm/var": 429.9636535644531, "per_token_feature_norm": 156.32345581054688, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 63.75, "per_token_feature_norm/p25": 127.0, "per_token_feature_norm/p75": 181.0, "per_token_feature_norm/var": 1444.8541259765625, "per_token_full_gradient_variance/max_squared_error": 293.7066955566406, "per_token_full_gradient_variance/variance": 0.04254545643925667, "per_token_gradient_norm": 4.072396755218506, "per_token_gradient_norm/max": 4225.2705078125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4577.44091796875, "per_token_policy_error_norm": 0.04551321268081665, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04033314064145088, "policy_entropy": 0.09267936646938324, "policy_entropy/max": 3.78125, "policy_entropy/median": 1.9073486328125e-05, "policy_entropy/min": 8.066464163292153e-17, "policy_entropy/p25": 7.338821887969971e-07, "policy_entropy/p75": 0.005126953125, "policy_entropy/var": 0.06590993702411652, "policy_error_vector_variance/max_squared_error": 2.0087156295776367, "policy_error_vector_variance/metric": 0.04547927528619766, "policy_loss": -0.05410628765821457, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.5755720138549805, "policy_sharpness": 8.150750160217285, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.290011405944824, "reward": 0.80078125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15973864495754242, "rewards/accuracy_reward": 0.80078125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15973864495754242, "sentence_full_gradient_variance/max_squared_error": 2357278.5, "sentence_full_gradient_variance/metric": 9585.0546875, "sentence_full_gradient_variance/p75": 136.93934631347656, "sentence_full_gradient_variance/p90": 221.48550415039062, "sentence_full_gradient_variance/p95": 221.48550415039062, "sentence_full_gradient_variance/p99": 134924.609375, "state_level_variance/metric": 44.44982147216797, "state_level_variance_full_gradient/metric": 1135.3245849609375, "step": 100 }, { "adam_stats/lr_effective_max": -0.0, "adam_stats/lr_effective_mean": 0.0, "adam_stats/lr_effective_min": -0.0, "adam_stats/m_t_max": 0.0023643700405955315, "adam_stats/m_t_mean": 2.232455713346848e-11, "adam_stats/m_t_min": -0.003327232087031007, "adam_stats/v_t_max": 6.127617962192744e-05, "adam_stats/v_t_mean": 2.5026270118744254e-12, "adam_stats/v_t_min": 0.0, "epoch": 1.2816, "step": 100, "total_flos": 0.0, "train_loss": 0.003623776203021407, "train_runtime": 74281.8123, "train_samples_per_second": 1.034, "train_steps_per_second": 0.001 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 24, "trial_name": null, "trial_params": null }