{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.2816, "eval_steps": 10, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "accuracy_reward": 0.5690104365348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24555730819702148, "action_level_variance/metric": 7819.75, "action_level_variance_full_gradient/metric": 21696.34765625, "adam_stats/lr_effective_max": 4.743407316709636e-06, "adam_stats/lr_effective_mean": -6.496157344687559e-12, "adam_stats/lr_effective_min": -4.7434045882255305e-06, "adam_stats/m_t_max": 0.0036865242291241884, "adam_stats/m_t_mean": -4.9113921263277405e-11, "adam_stats/m_t_min": -0.003466797759756446, "adam_stats/v_t_max": 1.3590280332209659e-06, "adam_stats/v_t_mean": 1.3584012387157784e-13, "adam_stats/v_t_min": 0.0, "advantages": 0.06461089849472046, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 5.050792217254639, "all_logprobs": -0.1560993194580078, "all_logprobs/max": 0.0, "all_logprobs/median": -2.4318695068359375e-05, "all_logprobs/min": -13.875, "all_logprobs/p1": -2.65625, "all_logprobs/p10": -0.4140625, "all_logprobs/p25": -0.01483154296875, "all_logprobs/p5": -0.98828125, "all_logprobs/p75": -4.76837158203125e-07, "all_logprobs/var": 0.2731952965259552, "clip_ratio": 0.0, "completion_length": 612.8763427734375, "completion_length/correct": 523.2471313476562, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 468.0, "completion_length/correct/min": 7.0, "completion_length/correct/p25": 315.0, "completion_length/correct/p75": 694.0, "completion_length/correct/var": 71338.71875, "completion_length/incorrect": 731.2084350585938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 996.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 422.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 129463.8203125, "completion_length/max": 1024.0, "completion_length/median": 581.0, "completion_length/min": 2.0, "completion_length/p25": 334.75, "completion_length/p75": 1024.0, "completion_length/var": 106873.7421875, "epoch": 0.0128, "feature_vector_variance/max_squared_error": 104647.7265625, "feature_vector_variance/metric": 24687.375, "generated_tokens/total": 470689.0, "grad_norm": 0.5280356407165527, "grouped_std_rewards": 0.3329414129257202, "learning_rate": 1.5e-06, "loss": -0.0646, "mean_logprobs": -0.1904296875, "mean_logprobs/var": 0.04052734375, "num_completions/total": 768, "per_sentence_gradient_norm": 15.756444931030273, "per_sentence_gradient_norm/max": 1676.722412109375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 96.5174789428711, "per_sentence_gradient_norm/p99": 319.3581848144531, "per_sentence_gradient_norm/var": 7581.35595703125, "per_token_feature_norm": 161.6763458251953, "per_token_feature_norm/max": 340.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 62.5, "per_token_feature_norm/p25": 123.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 2377.87158203125, "per_token_full_gradient_variance/max_squared_error": 351.6488952636719, "per_token_full_gradient_variance/variance": 0.11968386918306351, "per_token_gradient_norm": 11.789984703063965, "per_token_gradient_norm/max": 6951.5009765625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 16752.578125, "per_token_policy_error_norm": 0.07990707457065582, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06715479493141174, "policy_entropy": 0.17294414341449738, "policy_entropy/max": 3.765625, "policy_entropy/median": 0.0002956390380859375, "policy_entropy/min": 1.6764367671839864e-14, "policy_entropy/p25": 9.179115295410156e-06, "policy_entropy/p75": 0.08349609375, "policy_entropy/var": 0.14960762858390808, "policy_error_vector_variance/max_squared_error": 2.0191116333007812, "policy_error_vector_variance/metric": 0.07945062220096588, "policy_loss": -0.06461089104413986, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 5.050792217254639, "policy_sharpness": 7.184231758117676, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.1279296875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.095711708068848, "reward": 0.5690104365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24555730819702148, "rewards/accuracy_reward": 0.5690104365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24555730819702148, "sentence_full_gradient_variance/max_squared_error": 4261071.5, "sentence_full_gradient_variance/metric": 24485.33984375, "sentence_full_gradient_variance/p75": 595.4851684570312, "sentence_full_gradient_variance/p90": 677.557373046875, "sentence_full_gradient_variance/p95": 30516.966796875, "sentence_full_gradient_variance/p99": 557760.5625, "state_level_variance/metric": 736.8790893554688, "state_level_variance_full_gradient/metric": 2788.989990234375, "step": 1 }, { "accuracy_reward": 0.6432291865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.22978457808494568, "action_level_variance/metric": 18659.220703125, "action_level_variance_full_gradient/metric": 27829.28125, "adam_stats/lr_effective_max": 1.2765775863954332e-05, "adam_stats/lr_effective_mean": 2.0650993415305408e-10, "adam_stats/lr_effective_min": -1.2765945029968861e-05, "adam_stats/m_t_max": 0.010065997019410133, "adam_stats/m_t_mean": 1.2283730976836438e-10, "adam_stats/m_t_min": -0.012003075331449509, "adam_stats/v_t_max": 1.8091988749802113e-05, "adam_stats/v_t_mean": 6.059802182614704e-13, "adam_stats/v_t_min": 0.0, "advantages": -0.13988877832889557, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 7.640215873718262, "all_logprobs": -0.15809345245361328, "all_logprobs/max": 0.0, "all_logprobs/median": -2.372264862060547e-05, "all_logprobs/min": -12.0, "all_logprobs/p1": -2.6871871948242188, "all_logprobs/p10": -0.42578125, "all_logprobs/p25": -0.0159912109375, "all_logprobs/p5": -1.0078125, "all_logprobs/p75": -4.76837158203125e-07, "all_logprobs/var": 0.27630048990249634, "clip_ratio": 0.0, "completion_length": 607.1654052734375, "completion_length/correct": 564.9453735351562, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 496.0, "completion_length/correct/min": 24.0, "completion_length/correct/p25": 369.0, "completion_length/correct/p75": 767.75, "completion_length/correct/var": 72451.640625, "completion_length/incorrect": 683.28466796875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 828.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 336.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 136555.828125, "completion_length/max": 1024.0, "completion_length/median": 559.0, "completion_length/min": 2.0, "completion_length/p25": 365.5, "completion_length/p75": 972.5, "completion_length/var": 98391.875, "epoch": 0.0256, "feature_vector_variance/max_squared_error": 99639.3125, "feature_vector_variance/metric": 25086.638671875, "generated_tokens/total": 936992.0, "grad_norm": 1.5490106344223022, "grouped_std_rewards": 0.395636647939682, "learning_rate": 3e-06, "loss": 0.1399, "mean_logprobs": -0.2109375, "mean_logprobs/var": 0.10205078125, "num_completions/total": 1536, "per_sentence_gradient_norm": 26.563568115234375, "per_sentence_gradient_norm/max": 2069.01171875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 40.29927062988281, "per_sentence_gradient_norm/p95": 112.88459014892578, "per_sentence_gradient_norm/p99": 562.3847045898438, "per_sentence_gradient_norm/var": 17977.005859375, "per_token_feature_norm": 162.4080352783203, "per_token_feature_norm/max": 346.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 60.5, "per_token_feature_norm/p25": 123.5, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 2381.980224609375, "per_token_full_gradient_variance/max_squared_error": 831379.875, "per_token_full_gradient_variance/variance": 2.0858049392700195, "per_token_gradient_norm": 20.17038345336914, "per_token_gradient_norm/max": 7858.59521484375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 40275.78125, "per_token_policy_error_norm": 0.08074752241373062, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06747222691774368, "policy_entropy": 0.17592249810695648, "policy_entropy/max": 3.765625, "policy_entropy/median": 0.0002899169921875, "policy_entropy/min": 1.4224732503009818e-15, "policy_entropy/p25": 8.285045623779297e-06, "policy_entropy/p75": 0.08935546875, "policy_entropy/var": 0.1539994776248932, "policy_error_vector_variance/max_squared_error": 2.0190200805664062, "policy_error_vector_variance/metric": 0.08022210001945496, "policy_loss": 0.13988877832889557, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 7.64021635055542, "policy_sharpness": 7.149451732635498, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.998828172683716, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.27035140991211, "reward": 0.6432291865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.22978457808494568, "rewards/accuracy_reward": 0.6432291865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.22978457808494568, "sentence_full_gradient_variance/max_squared_error": 4919875.0, "sentence_full_gradient_variance/metric": 31633.427734375, "sentence_full_gradient_variance/p75": 285.924560546875, "sentence_full_gradient_variance/p90": 743.4141845703125, "sentence_full_gradient_variance/p95": 54455.12890625, "sentence_full_gradient_variance/p99": 654409.4375, "state_level_variance/metric": 1643.9034423828125, "state_level_variance_full_gradient/metric": 3804.143310546875, "step": 2 }, { "accuracy_reward": 0.5625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24641458690166473, "action_level_variance/metric": 4565.50732421875, "action_level_variance_full_gradient/metric": 19198.236328125, "adam_stats/lr_effective_max": 2.2348145648720674e-05, "adam_stats/lr_effective_mean": 2.077978900061339e-10, "adam_stats/lr_effective_min": -2.2353271560859866e-05, "adam_stats/m_t_max": 0.011981425806879997, "adam_stats/m_t_mean": 1.0544754652785926e-10, "adam_stats/m_t_min": -0.014757846482098103, "adam_stats/v_t_max": 1.9638140656752512e-05, "adam_stats/v_t_mean": 1.0316644457122881e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.07609724998474121, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 4.5029826164245605, "all_logprobs": -0.1602993756532669, "all_logprobs/max": 0.0, "all_logprobs/median": -2.5987625122070312e-05, "all_logprobs/min": -14.25, "all_logprobs/p1": -2.703125, "all_logprobs/p10": -0.4296875, "all_logprobs/p25": -0.0164794921875, "all_logprobs/p5": -1.015625, "all_logprobs/p75": -5.960464477539062e-07, "all_logprobs/var": 0.2826879024505615, "clip_ratio": 0.0, "completion_length": 641.16015625, "completion_length/correct": 548.2106323242188, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 506.0, "completion_length/correct/min": 78.0, "completion_length/correct/p25": 349.75, "completion_length/correct/p75": 714.0, "completion_length/correct/var": 64695.18359375, "completion_length/incorrect": 760.6666870117188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 511.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 125678.3984375, "completion_length/max": 1024.0, "completion_length/median": 616.0, "completion_length/min": 2.0, "completion_length/p25": 369.5, "completion_length/p75": 1024.0, "completion_length/var": 102368.8125, "epoch": 0.0384, "feature_vector_variance/max_squared_error": 97680.4765625, "feature_vector_variance/metric": 24956.03125, "generated_tokens/total": 1429403.0, "grad_norm": 0.9809289574623108, "grouped_std_rewards": 0.33661389350891113, "learning_rate": 4.5e-06, "loss": 0.0761, "mean_logprobs": -0.1923828125, "mean_logprobs/var": 0.05810546875, "num_completions/total": 2304, "per_sentence_gradient_norm": 13.944941520690918, "per_sentence_gradient_norm/max": 866.2412109375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 81.7944107055664, "per_sentence_gradient_norm/p99": 266.68328857421875, "per_sentence_gradient_norm/var": 4376.74462890625, "per_token_feature_norm": 162.16439819335938, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 61.0, "per_token_feature_norm/p25": 123.5, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 2402.30908203125, "per_token_full_gradient_variance/max_squared_error": 10308525.0, "per_token_full_gradient_variance/variance": 24.857603073120117, "per_token_gradient_norm": 10.671329498291016, "per_token_gradient_norm/max": 5110.37744140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 15177.0439453125, "per_token_policy_error_norm": 0.0817776620388031, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0688956007361412, "policy_entropy": 0.17695631086826324, "policy_entropy/max": 3.78125, "policy_entropy/median": 0.000316619873046875, "policy_entropy/min": 6.938893903907228e-16, "policy_entropy/p25": 9.834766387939453e-06, "policy_entropy/p75": 0.09033203125, "policy_entropy/var": 0.1539929211139679, "policy_error_vector_variance/max_squared_error": 2.017479419708252, "policy_error_vector_variance/metric": 0.08131411671638489, "policy_loss": 0.07609724998474121, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 4.5029826164245605, "policy_sharpness": 7.136756420135498, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.998779296875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.280470848083496, "reward": 0.5625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24641458690166473, "rewards/accuracy_reward": 0.5625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24641458690166473, "sentence_full_gradient_variance/max_squared_error": 4396233.5, "sentence_full_gradient_variance/metric": 21767.70703125, "sentence_full_gradient_variance/p75": 328.7176208496094, "sentence_full_gradient_variance/p90": 539.719970703125, "sentence_full_gradient_variance/p95": 37764.359375, "sentence_full_gradient_variance/p99": 437603.4375, "state_level_variance/metric": 380.1872863769531, "state_level_variance_full_gradient/metric": 2569.46875, "step": 3 }, { "accuracy_reward": 0.66015625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.22464245557785034, "action_level_variance/metric": 5276.9501953125, "action_level_variance_full_gradient/metric": 19831.9453125, "adam_stats/lr_effective_max": 3.2857507903827354e-05, "adam_stats/lr_effective_mean": 2.856167247156094e-10, "adam_stats/lr_effective_min": -3.282064062659629e-05, "adam_stats/m_t_max": 0.016679121181368828, "adam_stats/m_t_mean": 1.828315693286342e-10, "adam_stats/m_t_min": -0.02401869185268879, "adam_stats/v_t_max": 3.114587161689997e-05, "adam_stats/v_t_mean": 1.5276656892618257e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.012082990258932114, "advantages/max": 12.9586820602417, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 5.478076457977295, "all_logprobs": -0.1597290188074112, "all_logprobs/max": 0.0, "all_logprobs/median": -2.5987625122070312e-05, "all_logprobs/min": -14.6875, "all_logprobs/p1": -2.6875, "all_logprobs/p10": -0.431640625, "all_logprobs/p25": -0.01708984375, "all_logprobs/p5": -1.015625, "all_logprobs/p75": -4.76837158203125e-07, "all_logprobs/var": 0.2772267758846283, "clip_ratio": 0.0, "completion_length": 599.1549682617188, "completion_length/correct": 566.7061157226562, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 527.0, "completion_length/correct/min": 67.0, "completion_length/correct/p25": 351.5, "completion_length/correct/p75": 752.5, "completion_length/correct/var": 68795.015625, "completion_length/incorrect": 662.187744140625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 749.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 330.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 135929.46875, "completion_length/max": 1024.0, "completion_length/median": 563.0, "completion_length/min": 2.0, "completion_length/p25": 347.0, "completion_length/p75": 921.5, "completion_length/var": 93510.765625, "epoch": 0.0512, "feature_vector_variance/max_squared_error": 102499.8671875, "feature_vector_variance/metric": 24927.970703125, "generated_tokens/total": 1889554.0, "grad_norm": 1.5735406875610352, "grouped_std_rewards": 0.36178839206695557, "learning_rate": 6e-06, "loss": 0.0121, "mean_logprobs": -0.203125, "mean_logprobs/var": 0.0693359375, "num_completions/total": 3072, "per_sentence_gradient_norm": 14.986808776855469, "per_sentence_gradient_norm/max": 1098.7222900390625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 25.7145938873291, "per_sentence_gradient_norm/p95": 88.62165069580078, "per_sentence_gradient_norm/p99": 350.6887512207031, "per_sentence_gradient_norm/var": 5058.9326171875, "per_token_feature_norm": 162.7161102294922, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 61.75, "per_token_feature_norm/p25": 123.5, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 2453.195556640625, "per_token_full_gradient_variance/max_squared_error": 360.702392578125, "per_token_full_gradient_variance/variance": 0.15067313611507416, "per_token_gradient_norm": 12.048707962036133, "per_token_gradient_norm/max": 7306.54541015625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 17909.541015625, "per_token_policy_error_norm": 0.08181656897068024, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06825359165668488, "policy_entropy": 0.17762140929698944, "policy_entropy/max": 3.796875, "policy_entropy/median": 0.0003185272216796875, "policy_entropy/min": 1.096345236817342e-15, "policy_entropy/p25": 9.119510650634766e-06, "policy_entropy/p75": 0.0927734375, "policy_entropy/var": 0.15284153819084167, "policy_error_vector_variance/max_squared_error": 2.0252041816711426, "policy_error_vector_variance/metric": 0.08133509755134583, "policy_loss": 0.012082993984222412, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958683013916016, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 5.478076457977295, "policy_sharpness": 7.121942520141602, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.9613280296325684, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.33455753326416, "reward": 0.66015625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.22464245557785034, "rewards/accuracy_reward": 0.66015625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.22464245557785034, "sentence_full_gradient_variance/max_squared_error": 4682243.0, "sentence_full_gradient_variance/metric": 22353.890625, "sentence_full_gradient_variance/p75": 279.25140380859375, "sentence_full_gradient_variance/p90": 1648.078369140625, "sentence_full_gradient_variance/p95": 52215.91015625, "sentence_full_gradient_variance/p99": 450360.84375, "state_level_variance/metric": 439.5932922363281, "state_level_variance_full_gradient/metric": 2521.9482421875, "step": 4 }, { "accuracy_reward": 0.6770833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2189265340566635, "action_level_variance/metric": 2101.18701171875, "action_level_variance_full_gradient/metric": 7412.95703125, "adam_stats/lr_effective_max": 4.3723593989852816e-05, "adam_stats/lr_effective_mean": 3.4454977759779126e-10, "adam_stats/lr_effective_min": -4.3790205381810665e-05, "adam_stats/m_t_max": 0.009623775258660316, "adam_stats/m_t_mean": 1.0588169924163893e-10, "adam_stats/m_t_min": -0.009874100796878338, "adam_stats/v_t_max": 5.144298847881146e-05, "adam_stats/v_t_mean": 1.9881865251369923e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.08492681384086609, "advantages/max": 12.9586820602417, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.7029552459716797, "all_logprobs": -0.14210492372512817, "all_logprobs/max": 0.0, "all_logprobs/median": -1.52587890625e-05, "all_logprobs/min": -12.8125, "all_logprobs/p1": -2.5, "all_logprobs/p10": -0.359375, "all_logprobs/p25": -0.00958251953125, "all_logprobs/p5": -0.8984375, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.24217984080314636, "clip_ratio": 0.0, "completion_length": 609.7018432617188, "completion_length/correct": 556.9038696289062, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 472.0, "completion_length/correct/min": 84.0, "completion_length/correct/p25": 335.0, "completion_length/correct/p75": 762.0, "completion_length/correct/var": 70707.296875, "completion_length/incorrect": 720.4072265625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 933.0, "completion_length/incorrect/min": 4.0, "completion_length/incorrect/p25": 410.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 128930.90625, "completion_length/max": 1024.0, "completion_length/median": 556.0, "completion_length/min": 4.0, "completion_length/p25": 336.75, "completion_length/p75": 979.5, "completion_length/var": 95217.7421875, "epoch": 0.064, "feature_vector_variance/max_squared_error": 101939.421875, "feature_vector_variance/metric": 24516.9375, "generated_tokens/total": 2357805.0, "grad_norm": 0.9503998160362244, "grouped_std_rewards": 0.3252720236778259, "learning_rate": 7.5e-06, "loss": -0.0849, "mean_logprobs": -0.162109375, "mean_logprobs/var": 0.0203857421875, "num_completions/total": 3840, "per_sentence_gradient_norm": 10.328163146972656, "per_sentence_gradient_norm/max": 651.5977172851562, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 64.11333465576172, "per_sentence_gradient_norm/p99": 204.94680786132812, "per_sentence_gradient_norm/var": 1997.1165771484375, "per_token_feature_norm": 159.73190307617188, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 149.0, "per_token_feature_norm/min": 59.5, "per_token_feature_norm/p25": 122.5, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 2268.667236328125, "per_token_full_gradient_variance/max_squared_error": 424879.75, "per_token_full_gradient_variance/variance": 1.0098323822021484, "per_token_gradient_norm": 10.948210716247559, "per_token_gradient_norm/max": 7459.634765625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 14024.38671875, "per_token_policy_error_norm": 0.07375631481409073, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06205785274505615, "policy_entropy": 0.15834511816501617, "policy_entropy/max": 3.8125, "policy_entropy/median": 0.00019550323486328125, "policy_entropy/min": 9.71445146547012e-16, "policy_entropy/p25": 6.973743438720703e-06, "policy_entropy/p75": 0.057861328125, "policy_entropy/var": 0.13291004300117493, "policy_error_vector_variance/max_squared_error": 2.019089460372925, "policy_error_vector_variance/metric": 0.07331915199756622, "policy_loss": -0.08492681384086609, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.7029552459716797, "policy_sharpness": 7.3589324951171875, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.523775100708008, "reward": 0.6770833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2189265340566635, "rewards/accuracy_reward": 0.6770833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2189265340566635, "sentence_full_gradient_variance/max_squared_error": 858171.3125, "sentence_full_gradient_variance/metric": 8245.447265625, "sentence_full_gradient_variance/p75": 434.385498046875, "sentence_full_gradient_variance/p90": 667.9683227539062, "sentence_full_gradient_variance/p95": 30102.4375, "sentence_full_gradient_variance/p99": 157848.921875, "state_level_variance/metric": 157.61927795410156, "state_level_variance_full_gradient/metric": 832.490234375, "step": 5 }, { "accuracy_reward": 0.6549479365348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.22628577053546906, "action_level_variance/metric": 3191.171875, "action_level_variance_full_gradient/metric": 9255.2373046875, "adam_stats/lr_effective_max": 5.471120311995037e-05, "adam_stats/lr_effective_mean": 1.433947960371995e-10, "adam_stats/lr_effective_min": -5.468957533594221e-05, "adam_stats/m_t_max": 0.00894826278090477, "adam_stats/m_t_mean": 3.5952227306346174e-11, "adam_stats/m_t_min": -0.009777803905308247, "adam_stats/v_t_max": 5.262749255052768e-05, "adam_stats/v_t_mean": 2.032017566364064e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.19023805856704712, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 4.61489200592041, "all_logprobs": -0.16107900440692902, "all_logprobs/max": 0.0, "all_logprobs/median": -3.409385681152344e-05, "all_logprobs/min": -11.75, "all_logprobs/p1": -2.6875, "all_logprobs/p10": -0.443359375, "all_logprobs/p25": -0.0186767578125, "all_logprobs/p5": -1.0234375, "all_logprobs/p75": -5.960464477539062e-07, "all_logprobs/var": 0.27792680263519287, "clip_ratio": 0.0, "completion_length": 616.0703125, "completion_length/correct": 538.3001708984375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 483.0, "completion_length/correct/min": 32.0, "completion_length/correct/p25": 331.0, "completion_length/correct/p75": 692.0, "completion_length/correct/var": 68469.7421875, "completion_length/incorrect": 763.6868286132812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 15.0, "completion_length/incorrect/p25": 444.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 111239.5234375, "completion_length/max": 1024.0, "completion_length/median": 562.0, "completion_length/min": 15.0, "completion_length/p25": 343.0, "completion_length/p75": 1024.0, "completion_length/var": 94596.8671875, "epoch": 0.0768, "feature_vector_variance/max_squared_error": 102605.4140625, "feature_vector_variance/metric": 25243.775390625, "generated_tokens/total": 2830947.0, "grad_norm": 0.3218063414096832, "grouped_std_rewards": 0.3443847596645355, "learning_rate": 9e-06, "loss": -0.1902, "mean_logprobs": -0.1826171875, "mean_logprobs/var": 0.0162353515625, "num_completions/total": 4608, "per_sentence_gradient_norm": 12.64547348022461, "per_sentence_gradient_norm/max": 648.781005859375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 88.27383422851562, "per_sentence_gradient_norm/p99": 239.76535034179688, "per_sentence_gradient_norm/var": 3035.2158203125, "per_token_feature_norm": 164.25355529785156, "per_token_feature_norm/max": 334.0, "per_token_feature_norm/median": 154.0, "per_token_feature_norm/min": 58.75, "per_token_feature_norm/p25": 124.5, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 2436.237060546875, "per_token_full_gradient_variance/max_squared_error": 823.4221801757812, "per_token_full_gradient_variance/variance": 0.135040745139122, "per_token_gradient_norm": 12.86369514465332, "per_token_gradient_norm/max": 7362.83251953125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 17887.779296875, "per_token_policy_error_norm": 0.08257220685482025, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06868641078472137, "policy_entropy": 0.1798093616962433, "policy_entropy/max": 3.796875, "policy_entropy/median": 0.00040435791015625, "policy_entropy/min": 1.366962099069724e-15, "policy_entropy/p25": 1.1026859283447266e-05, "policy_entropy/p75": 0.1005859375, "policy_entropy/var": 0.15380579233169556, "policy_error_vector_variance/max_squared_error": 2.0185694694519043, "policy_error_vector_variance/metric": 0.08201576769351959, "policy_loss": -0.19023805856704712, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 4.61489200592041, "policy_sharpness": 7.08561372756958, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.8970212936401367, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.421631813049316, "reward": 0.6549479365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.22628577053546906, "rewards/accuracy_reward": 0.6549479365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.22628577053546906, "sentence_full_gradient_variance/max_squared_error": 1090511.875, "sentence_full_gradient_variance/metric": 10277.4091796875, "sentence_full_gradient_variance/p75": 288.6682434082031, "sentence_full_gradient_variance/p90": 1648.3511962890625, "sentence_full_gradient_variance/p95": 19135.0625, "sentence_full_gradient_variance/p99": 245828.015625, "state_level_variance/metric": 241.504150390625, "state_level_variance_full_gradient/metric": 1022.1719970703125, "step": 6 }, { "accuracy_reward": 0.7109375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20577330887317657, "action_level_variance/metric": 3565.18994140625, "action_level_variance_full_gradient/metric": 6812.30224609375, "adam_stats/lr_effective_max": 6.60949808661826e-05, "adam_stats/lr_effective_mean": -2.5897126110230317e-10, "adam_stats/lr_effective_min": -6.557820597663522e-05, "adam_stats/m_t_max": 0.014674816280603409, "adam_stats/m_t_mean": 8.148551278175376e-11, "adam_stats/m_t_min": -0.01569865085184574, "adam_stats/v_t_max": 6.922059401404113e-05, "adam_stats/v_t_mean": 2.504001780229137e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.03321941941976547, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.90073561668396, "all_logprobs": -0.14488138258457184, "all_logprobs/max": 0.0, "all_logprobs/median": -1.71661376953125e-05, "all_logprobs/min": -12.125, "all_logprobs/p1": -2.546875, "all_logprobs/p10": -0.376953125, "all_logprobs/p25": -0.01104736328125, "all_logprobs/p5": -0.921875, "all_logprobs/p75": -4.76837158203125e-07, "all_logprobs/var": 0.2482609897851944, "clip_ratio": 0.0, "completion_length": 626.94140625, "completion_length/correct": 554.4560546875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 500.0, "completion_length/correct/min": 95.0, "completion_length/correct/p25": 353.0, "completion_length/correct/p75": 720.5, "completion_length/correct/var": 65025.3203125, "completion_length/incorrect": 805.2162475585938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 569.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 84164.984375, "completion_length/max": 1024.0, "completion_length/median": 573.0, "completion_length/min": 2.0, "completion_length/p25": 377.75, "completion_length/p75": 946.25, "completion_length/var": 83394.5234375, "epoch": 0.0896, "feature_vector_variance/max_squared_error": 99100.1484375, "feature_vector_variance/metric": 24059.515625, "generated_tokens/total": 3312438.0, "grad_norm": 1.809020757675171, "grouped_std_rewards": 0.2855607867240906, "learning_rate": 1.05e-05, "loss": -0.0332, "mean_logprobs": -0.1650390625, "mean_logprobs/var": 0.047607421875, "num_completions/total": 5376, "per_sentence_gradient_norm": 10.52778434753418, "per_sentence_gradient_norm/max": 1186.2457275390625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 59.068443298339844, "per_sentence_gradient_norm/p99": 232.0110626220703, "per_sentence_gradient_norm/var": 3458.859130859375, "per_token_feature_norm": 159.07015991210938, "per_token_feature_norm/max": 336.0, "per_token_feature_norm/median": 148.0, "per_token_feature_norm/min": 61.0, "per_token_feature_norm/p25": 121.5, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 2275.869140625, "per_token_full_gradient_variance/max_squared_error": 8578207.0, "per_token_full_gradient_variance/variance": 18.402868270874023, "per_token_gradient_norm": 10.15084171295166, "per_token_gradient_norm/max": 7098.4052734375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 15939.2470703125, "per_token_policy_error_norm": 0.07508238404989243, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06302721053361893, "policy_entropy": 0.16068735718727112, "policy_entropy/max": 3.75, "policy_entropy/median": 0.000217437744140625, "policy_entropy/min": 3.0184188481996443e-16, "policy_entropy/p25": 9.059906005859375e-06, "policy_entropy/p75": 0.0634765625, "policy_entropy/var": 0.13317403197288513, "policy_error_vector_variance/max_squared_error": 2.021959066390991, "policy_error_vector_variance/metric": 0.07458607852458954, "policy_loss": -0.033219411969184875, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.900735378265381, "policy_sharpness": 7.334780216217041, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.49951171875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.607458114624023, "reward": 0.7109375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20577330887317657, "rewards/accuracy_reward": 0.7109375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20577330887317657, "sentence_full_gradient_variance/max_squared_error": 1310099.0, "sentence_full_gradient_variance/metric": 7667.97607421875, "sentence_full_gradient_variance/p75": 263.50018310546875, "sentence_full_gradient_variance/p90": 269.1328125, "sentence_full_gradient_variance/p95": 20648.638671875, "sentence_full_gradient_variance/p99": 182110.09375, "state_level_variance/metric": 338.3388671875, "state_level_variance_full_gradient/metric": 855.6728515625, "step": 7 }, { "accuracy_reward": 0.6666666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.22251196205615997, "action_level_variance/metric": 2382.9912109375, "action_level_variance_full_gradient/metric": 6510.93896484375, "adam_stats/lr_effective_max": 7.688036566833034e-05, "adam_stats/lr_effective_mean": -4.208192405652511e-10, "adam_stats/lr_effective_min": -7.596211798954755e-05, "adam_stats/m_t_max": 0.012673276476562023, "adam_stats/m_t_mean": 7.416239844459938e-11, "adam_stats/m_t_min": -0.013335328549146652, "adam_stats/v_t_max": 6.921433669049293e-05, "adam_stats/v_t_mean": 2.5770973896555027e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.09272240102291107, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.796598434448242, "all_logprobs": -0.15846621990203857, "all_logprobs/max": 0.0, "all_logprobs/median": -3.361701965332031e-05, "all_logprobs/min": -12.25, "all_logprobs/p1": -2.625, "all_logprobs/p10": -0.431640625, "all_logprobs/p25": -0.0191650390625, "all_logprobs/p5": -1.0078125, "all_logprobs/p75": -8.344650268554688e-07, "all_logprobs/var": 0.2679429352283478, "clip_ratio": 0.0, "completion_length": 662.5625, "completion_length/correct": 563.4453125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 498.0, "completion_length/correct/min": 147.0, "completion_length/correct/p25": 366.0, "completion_length/correct/p75": 746.5, "completion_length/correct/var": 62315.66796875, "completion_length/incorrect": 860.796875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 23.0, "completion_length/incorrect/p25": 699.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 58604.52734375, "completion_length/max": 1024.0, "completion_length/median": 603.0, "completion_length/min": 23.0, "completion_length/p25": 412.0, "completion_length/p75": 1024.0, "completion_length/var": 80674.65625, "epoch": 0.1024, "feature_vector_variance/max_squared_error": 103279.0078125, "feature_vector_variance/metric": 24077.92578125, "generated_tokens/total": 3821286.0, "grad_norm": 0.4025658369064331, "grouped_std_rewards": 0.2355465143918991, "learning_rate": 1.2e-05, "loss": 0.0927, "mean_logprobs": -0.1669921875, "mean_logprobs/var": 0.0135498046875, "num_completions/total": 6144, "per_sentence_gradient_norm": 9.288363456726074, "per_sentence_gradient_norm/max": 745.3187255859375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 56.48509216308594, "per_sentence_gradient_norm/p99": 213.6743621826172, "per_sentence_gradient_norm/var": 2299.7119140625, "per_token_feature_norm": 161.23304748535156, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 61.25, "per_token_feature_norm/p25": 122.5, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 2402.576171875, "per_token_full_gradient_variance/max_squared_error": 887.216796875, "per_token_full_gradient_variance/variance": 0.12309814989566803, "per_token_gradient_norm": 9.9465970993042, "per_token_gradient_norm/max": 6821.60693359375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 15855.8427734375, "per_token_policy_error_norm": 0.0819774642586708, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06838034838438034, "policy_entropy": 0.17630234360694885, "policy_entropy/max": 3.78125, "policy_entropy/median": 0.000400543212890625, "policy_entropy/min": 1.7780915628762273e-17, "policy_entropy/p25": 1.3947486877441406e-05, "policy_entropy/p75": 0.10107421875, "policy_entropy/var": 0.14607736468315125, "policy_error_vector_variance/max_squared_error": 2.022257089614868, "policy_error_vector_variance/metric": 0.08142057061195374, "policy_loss": 0.09272241592407227, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.7965989112854004, "policy_sharpness": 7.0970306396484375, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.9541869163513184, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.243340492248535, "reward": 0.6666666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.22251196205615997, "rewards/accuracy_reward": 0.6666666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.22251196205615997, "sentence_full_gradient_variance/max_squared_error": 1626473.375, "sentence_full_gradient_variance/metric": 7386.4638671875, "sentence_full_gradient_variance/p75": 125.28727722167969, "sentence_full_gradient_variance/p90": 164.40110778808594, "sentence_full_gradient_variance/p95": 164.40110778808594, "sentence_full_gradient_variance/p99": 165242.515625, "state_level_variance/metric": 213.8275909423828, "state_level_variance_full_gradient/metric": 875.5250854492188, "step": 8 }, { "accuracy_reward": 0.6471354365348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.22864890098571777, "action_level_variance/metric": 3906.68017578125, "action_level_variance_full_gradient/metric": 10103.2509765625, "adam_stats/lr_effective_max": 8.526090095983818e-05, "adam_stats/lr_effective_mean": -3.547672433601434e-10, "adam_stats/lr_effective_min": -8.603603055234998e-05, "adam_stats/m_t_max": 0.012321475893259048, "adam_stats/m_t_mean": 7.60472240735055e-11, "adam_stats/m_t_min": -0.012773890979588032, "adam_stats/v_t_max": 6.92047324264422e-05, "adam_stats/v_t_mean": 2.5839753513973163e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.032721228897571564, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 5.549973964691162, "all_logprobs": -0.16452854871749878, "all_logprobs/max": 0.0, "all_logprobs/median": -3.838539123535156e-05, "all_logprobs/min": -13.0, "all_logprobs/p1": -2.703125, "all_logprobs/p10": -0.458984375, "all_logprobs/p25": -0.021728515625, "all_logprobs/p5": -1.046875, "all_logprobs/p75": -9.5367431640625e-07, "all_logprobs/var": 0.2846592962741852, "clip_ratio": 0.0, "completion_length": 692.19921875, "completion_length/correct": 604.5995483398438, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 547.0, "completion_length/correct/min": 179.0, "completion_length/correct/p25": 419.0, "completion_length/correct/p75": 780.0, "completion_length/correct/var": 57761.55078125, "completion_length/incorrect": 852.8524169921875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 23.0, "completion_length/incorrect/p25": 686.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 51492.78125, "completion_length/max": 1024.0, "completion_length/median": 667.0, "completion_length/min": 23.0, "completion_length/p25": 468.75, "completion_length/p75": 1024.0, "completion_length/var": 69571.0078125, "epoch": 0.1152, "feature_vector_variance/max_squared_error": 108568.640625, "feature_vector_variance/metric": 24479.111328125, "generated_tokens/total": 4352895.0, "grad_norm": 0.18563701212406158, "grouped_std_rewards": 0.2778139114379883, "learning_rate": 1.3500000000000001e-05, "loss": -0.0327, "mean_logprobs": -0.169921875, "mean_logprobs/var": 0.00921630859375, "num_completions/total": 6912, "per_sentence_gradient_norm": 12.505756378173828, "per_sentence_gradient_norm/max": 813.6565551757812, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 80.3398666381836, "per_sentence_gradient_norm/p99": 296.9717102050781, "per_sentence_gradient_norm/var": 3755.176025390625, "per_token_feature_norm": 162.5606231689453, "per_token_feature_norm/max": 336.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 58.0, "per_token_feature_norm/p25": 123.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 2441.76513671875, "per_token_full_gradient_variance/max_squared_error": 329183.65625, "per_token_full_gradient_variance/variance": 0.7802909016609192, "per_token_gradient_norm": 12.382017135620117, "per_token_gradient_norm/max": 8205.9072265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 22977.6484375, "per_token_policy_error_norm": 0.08458972722291946, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.07067150622606277, "policy_entropy": 0.18230512738227844, "policy_entropy/max": 3.796875, "policy_entropy/median": 0.00045013427734375, "policy_entropy/min": 1.7416623698807143e-15, "policy_entropy/p25": 1.4901161193847656e-05, "policy_entropy/p75": 0.11181640625, "policy_entropy/var": 0.1524876058101654, "policy_error_vector_variance/max_squared_error": 2.0215394496917725, "policy_error_vector_variance/metric": 0.08399044722318649, "policy_loss": -0.032721228897571564, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 5.54997444152832, "policy_sharpness": 7.043859481811523, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.861328125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.484260559082031, "reward": 0.6471354365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.22864890098571777, "rewards/accuracy_reward": 0.6471354365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.22864890098571777, "sentence_full_gradient_variance/max_squared_error": 2122195.0, "sentence_full_gradient_variance/metric": 11452.7734375, "sentence_full_gradient_variance/p75": 148.2578887939453, "sentence_full_gradient_variance/p90": 216.5675506591797, "sentence_full_gradient_variance/p95": 6491.078125, "sentence_full_gradient_variance/p99": 240438.46875, "state_level_variance/metric": 335.4351806640625, "state_level_variance_full_gradient/metric": 1349.523681640625, "step": 9 }, { "accuracy_reward": 0.703125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20901235938072205, "action_level_variance/metric": 3117.663818359375, "action_level_variance_full_gradient/metric": 4771.36279296875, "adam_stats/lr_effective_max": 9.634427260607481e-05, "adam_stats/lr_effective_mean": -3.7832489918621093e-10, "adam_stats/lr_effective_min": -9.636855975259095e-05, "adam_stats/m_t_max": 0.011362460441887379, "adam_stats/m_t_mean": 7.494873471847185e-11, "adam_stats/m_t_min": -0.01249747909605503, "adam_stats/v_t_max": 6.923572800587863e-05, "adam_stats/v_t_mean": 2.593672238787592e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.03979356214404106, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 5.781472206115723, "all_logprobs": -0.16488991677761078, "all_logprobs/max": 0.0, "all_logprobs/median": -3.4332275390625e-05, "all_logprobs/min": -14.875, "all_logprobs/p1": -2.671875, "all_logprobs/p10": -0.474609375, "all_logprobs/p25": -0.0244140625, "all_logprobs/p5": -1.0546875, "all_logprobs/p75": -8.344650268554688e-07, "all_logprobs/var": 0.27706387639045715, "clip_ratio": 0.0, "completion_length": 651.1536865234375, "completion_length/correct": 554.57958984375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 491.0, "completion_length/correct/min": 179.0, "completion_length/correct/p25": 363.75, "completion_length/correct/p75": 692.75, "completion_length/correct/var": 56165.41015625, "completion_length/incorrect": 879.881591796875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 113.0, "completion_length/incorrect/p25": 746.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 43875.234375, "completion_length/max": 1024.0, "completion_length/median": 604.0, "completion_length/min": 113.0, "completion_length/p25": 419.75, "completion_length/p75": 993.5, "completion_length/var": 74572.7734375, "epoch": 0.128, "feature_vector_variance/max_squared_error": 98971.703125, "feature_vector_variance/metric": 24079.72265625, "generated_tokens/total": 4852981.0, "grad_norm": 0.2142612338066101, "grouped_std_rewards": 0.23501527309417725, "learning_rate": 1.5e-05, "loss": -0.0398, "mean_logprobs": -0.16796875, "mean_logprobs/var": 0.004852294921875, "num_completions/total": 7680, "per_sentence_gradient_norm": 10.924694061279297, "per_sentence_gradient_norm/max": 603.653076171875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 54.398929595947266, "per_sentence_gradient_norm/p99": 311.46759033203125, "per_sentence_gradient_norm/var": 3002.2236328125, "per_token_feature_norm": 161.52337646484375, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 60.25, "per_token_feature_norm/p25": 122.5, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 2421.5693359375, "per_token_full_gradient_variance/max_squared_error": 744.9137573242188, "per_token_full_gradient_variance/variance": 0.19635412096977234, "per_token_gradient_norm": 13.07180118560791, "per_token_gradient_norm/max": 7641.48681640625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 26060.986328125, "per_token_policy_error_norm": 0.0855560302734375, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.07112357020378113, "policy_entropy": 0.18279030919075012, "policy_entropy/max": 3.78125, "policy_entropy/median": 0.000408172607421875, "policy_entropy/min": 2.6506574712925612e-15, "policy_entropy/p25": 1.3172626495361328e-05, "policy_entropy/p75": 0.12255859375, "policy_entropy/var": 0.14792917668819427, "policy_error_vector_variance/max_squared_error": 2.018209934234619, "policy_error_vector_variance/metric": 0.08511826395988464, "policy_loss": -0.03979355841875076, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 5.781472206115723, "policy_sharpness": 7.032755374908447, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.824414014816284, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.47594165802002, "reward": 0.703125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20901235938072205, "rewards/accuracy_reward": 0.703125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20901235938072205, "sentence_full_gradient_variance/max_squared_error": 704846.3125, "sentence_full_gradient_variance/metric": 5336.42138671875, "sentence_full_gradient_variance/p75": 257.62457275390625, "sentence_full_gradient_variance/p90": 283.9346618652344, "sentence_full_gradient_variance/p95": 283.9346618652344, "sentence_full_gradient_variance/p99": 105342.625, "state_level_variance/metric": 273.20489501953125, "state_level_variance_full_gradient/metric": 565.0586547851562, "step": 10 }, { "accuracy_reward": 0.7200521230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2018398940563202, "action_level_variance/metric": 2442.6767578125, "action_level_variance_full_gradient/metric": 5438.18310546875, "adam_stats/lr_effective_max": 9.642512304708362e-05, "adam_stats/lr_effective_mean": -2.1778745473710615e-10, "adam_stats/lr_effective_min": -9.715063060866669e-05, "adam_stats/m_t_max": 0.011861956678330898, "adam_stats/m_t_mean": 9.519009191993888e-11, "adam_stats/m_t_min": -0.013615895062685013, "adam_stats/v_t_max": 6.972730625420809e-05, "adam_stats/v_t_mean": 2.6209507654473274e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.05769258737564087, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 5.051448822021484, "all_logprobs": -0.1595654934644699, "all_logprobs/max": 0.0, "all_logprobs/median": -2.7179718017578125e-05, "all_logprobs/min": -12.75, "all_logprobs/p1": -2.671875, "all_logprobs/p10": -0.431640625, "all_logprobs/p25": -0.0167236328125, "all_logprobs/p5": -1.0234375, "all_logprobs/p75": -7.152557373046875e-07, "all_logprobs/var": 0.2753432095050812, "clip_ratio": 0.0, "completion_length": 691.3177490234375, "completion_length/correct": 627.2405395507812, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 583.0, "completion_length/correct/min": 189.0, "completion_length/correct/p25": 442.0, "completion_length/correct/p75": 781.0, "completion_length/correct/var": 52873.0234375, "completion_length/incorrect": 856.1302490234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 689.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 52678.78515625, "completion_length/max": 1024.0, "completion_length/median": 660.0, "completion_length/min": 2.0, "completion_length/p25": 485.5, "completion_length/p75": 1007.25, "completion_length/var": 63324.390625, "epoch": 0.1408, "feature_vector_variance/max_squared_error": 104853.8828125, "feature_vector_variance/metric": 24237.84765625, "generated_tokens/total": 5383913.0, "grad_norm": 0.30311644077301025, "grouped_std_rewards": 0.22256295382976532, "learning_rate": 1.4995431202643219e-05, "loss": 0.0577, "mean_logprobs": -0.173828125, "mean_logprobs/var": 0.04443359375, "num_completions/total": 8448, "per_sentence_gradient_norm": 9.067361831665039, "per_sentence_gradient_norm/max": 651.784423828125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 55.00558090209961, "per_sentence_gradient_norm/p99": 232.51295471191406, "per_sentence_gradient_norm/var": 2363.537109375, "per_token_feature_norm": 161.7747039794922, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 54.75, "per_token_feature_norm/p25": 123.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 2424.546630859375, "per_token_full_gradient_variance/max_squared_error": 21631660.0, "per_token_full_gradient_variance/variance": 46.103233337402344, "per_token_gradient_norm": 10.989493370056152, "per_token_gradient_norm/max": 8032.40576171875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 21403.33984375, "per_token_policy_error_norm": 0.08187873661518097, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06837870925664902, "policy_entropy": 0.17735683917999268, "policy_entropy/max": 3.78125, "policy_entropy/median": 0.0003299713134765625, "policy_entropy/min": 2.0095036745715333e-14, "policy_entropy/p25": 1.2636184692382812e-05, "policy_entropy/p75": 0.0908203125, "policy_entropy/var": 0.15227235853672028, "policy_error_vector_variance/max_squared_error": 2.0190367698669434, "policy_error_vector_variance/metric": 0.08141925185918808, "policy_loss": 0.057692598551511765, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 5.051448822021484, "policy_sharpness": 7.154346466064453, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.0101380348205566, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.265374183654785, "reward": 0.7200521230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2018398940563202, "rewards/accuracy_reward": 0.7200521230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2018398940563202, "sentence_full_gradient_variance/max_squared_error": 2607561.0, "sentence_full_gradient_variance/metric": 6159.1787109375, "sentence_full_gradient_variance/p75": 103.24983978271484, "sentence_full_gradient_variance/p90": 142.844482421875, "sentence_full_gradient_variance/p95": 142.844482421875, "sentence_full_gradient_variance/p99": 71491.5625, "state_level_variance/metric": 225.46612548828125, "state_level_variance_full_gradient/metric": 720.9954833984375, "step": 11 }, { "accuracy_reward": 0.6875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21512386202812195, "action_level_variance/metric": 1320.9344482421875, "action_level_variance_full_gradient/metric": 2336.261474609375, "adam_stats/lr_effective_max": 9.372103522764519e-05, "adam_stats/lr_effective_mean": -2.052432113153202e-10, "adam_stats/lr_effective_min": -9.166816744254902e-05, "adam_stats/m_t_max": 0.009915873408317566, "adam_stats/m_t_mean": 7.547460573187337e-11, "adam_stats/m_t_min": -0.011161775328218937, "adam_stats/v_t_max": 6.977694283705205e-05, "adam_stats/v_t_mean": 2.622127992166212e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.03844552859663963, "advantages/max": 9.659051895141602, "advantages/median": -0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.1233084201812744, "all_logprobs": -0.1604868769645691, "all_logprobs/max": 0.0, "all_logprobs/median": -3.147125244140625e-05, "all_logprobs/min": -14.875, "all_logprobs/p1": -2.65625, "all_logprobs/p10": -0.443359375, "all_logprobs/p25": -0.0186767578125, "all_logprobs/p5": -1.015625, "all_logprobs/p75": -8.344650268554688e-07, "all_logprobs/var": 0.2748382091522217, "clip_ratio": 0.0, "completion_length": 702.96484375, "completion_length/correct": 597.587158203125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 563.0, "completion_length/correct/min": 188.0, "completion_length/correct/p25": 430.5, "completion_length/correct/p75": 724.25, "completion_length/correct/var": 49460.6015625, "completion_length/incorrect": 934.7958984375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 274.0, "completion_length/incorrect/p25": 923.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 28571.810546875, "completion_length/max": 1024.0, "completion_length/median": 667.0, "completion_length/min": 188.0, "completion_length/p25": 486.75, "completion_length/p75": 1024.0, "completion_length/var": 67348.7578125, "epoch": 0.1536, "feature_vector_variance/max_squared_error": 98438.484375, "feature_vector_variance/metric": 23959.291015625, "generated_tokens/total": 5923790.0, "grad_norm": 0.09757667779922485, "grouped_std_rewards": 0.24725203216075897, "learning_rate": 1.4981730376948682e-05, "loss": -0.0384, "mean_logprobs": -0.1650390625, "mean_logprobs/var": 0.005645751953125, "num_completions/total": 9216, "per_sentence_gradient_norm": 7.967013359069824, "per_sentence_gradient_norm/max": 360.4595031738281, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 61.46986389160156, "per_sentence_gradient_norm/p99": 191.1277313232422, "per_sentence_gradient_norm/var": 1259.1005859375, "per_token_feature_norm": 161.5159912109375, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 61.25, "per_token_feature_norm/p25": 122.5, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 2424.458740234375, "per_token_full_gradient_variance/max_squared_error": 165.93968200683594, "per_token_full_gradient_variance/variance": 0.06387678533792496, "per_token_gradient_norm": 8.612545013427734, "per_token_gradient_norm/max": 4459.00146484375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 9726.7060546875, "per_token_policy_error_norm": 0.08238786458969116, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06834746897220612, "policy_entropy": 0.17972786724567413, "policy_entropy/max": 3.78125, "policy_entropy/median": 0.000377655029296875, "policy_entropy/min": 1.0547118733938987e-14, "policy_entropy/p25": 1.33514404296875e-05, "policy_entropy/p75": 0.1005859375, "policy_entropy/var": 0.1511714607477188, "policy_error_vector_variance/max_squared_error": 2.0222320556640625, "policy_error_vector_variance/metric": 0.08201432228088379, "policy_loss": -0.03844553232192993, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.1233084201812744, "policy_sharpness": 7.103696823120117, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.9351806640625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.395743370056152, "reward": 0.6875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21512386202812195, "rewards/accuracy_reward": 0.6875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21512386202812195, "sentence_full_gradient_variance/max_squared_error": 492539.84375, "sentence_full_gradient_variance/metric": 2602.453125, "sentence_full_gradient_variance/p75": 99.07122802734375, "sentence_full_gradient_variance/p90": 175.8088836669922, "sentence_full_gradient_variance/p95": 175.8941650390625, "sentence_full_gradient_variance/p99": 51754.8984375, "state_level_variance/metric": 102.71344757080078, "state_level_variance_full_gradient/metric": 266.1916809082031, "step": 12 }, { "accuracy_reward": 0.66015625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.22464245557785034, "action_level_variance/metric": 2738.16748046875, "action_level_variance_full_gradient/metric": 6381.5078125, "adam_stats/lr_effective_max": 9.361866250401363e-05, "adam_stats/lr_effective_mean": -2.0586812810030608e-10, "adam_stats/lr_effective_min": -9.136895823758096e-05, "adam_stats/m_t_max": 0.006531707011163235, "adam_stats/m_t_mean": 4.5173299129119826e-11, "adam_stats/m_t_min": -0.007543155457824469, "adam_stats/v_t_max": 7.033337897155434e-05, "adam_stats/v_t_mean": 2.679769600666404e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.012431505136191845, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 4.6202826499938965, "all_logprobs": -0.16767285764217377, "all_logprobs/max": 0.0, "all_logprobs/median": -3.981590270996094e-05, "all_logprobs/min": -12.75, "all_logprobs/p1": -2.703125, "all_logprobs/p10": -0.4765625, "all_logprobs/p25": -0.0234375, "all_logprobs/p5": -1.0703125, "all_logprobs/p75": -8.344650268554688e-07, "all_logprobs/var": 0.2854352593421936, "clip_ratio": 0.0, "completion_length": 730.5182495117188, "completion_length/correct": 633.94677734375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 584.0, "completion_length/correct/min": 171.0, "completion_length/correct/p25": 444.0, "completion_length/correct/p75": 827.0, "completion_length/correct/var": 57198.0703125, "completion_length/incorrect": 918.111083984375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 294.0, "completion_length/incorrect/p25": 853.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 33881.09765625, "completion_length/max": 1024.0, "completion_length/median": 729.0, "completion_length/min": 171.0, "completion_length/p25": 495.75, "completion_length/p75": 1024.0, "completion_length/var": 67359.1796875, "epoch": 0.1664, "feature_vector_variance/max_squared_error": 105254.484375, "feature_vector_variance/metric": 24003.41015625, "generated_tokens/total": 6484828.0, "grad_norm": 0.3654309809207916, "grouped_std_rewards": 0.275564044713974, "learning_rate": 1.495891421526205e-05, "loss": -0.0124, "mean_logprobs": -0.169921875, "mean_logprobs/var": 0.005035400390625, "num_completions/total": 9984, "per_sentence_gradient_norm": 10.934791564941406, "per_sentence_gradient_norm/max": 603.870361328125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 71.20174407958984, "per_sentence_gradient_norm/p99": 237.62826538085938, "per_sentence_gradient_norm/var": 2622.011474609375, "per_token_feature_norm": 162.46046447753906, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 61.75, "per_token_feature_norm/p25": 123.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 2471.50537109375, "per_token_full_gradient_variance/max_squared_error": 8783169.0, "per_token_full_gradient_variance/variance": 15.782729148864746, "per_token_gradient_norm": 11.61191177368164, "per_token_gradient_norm/max": 7369.3271484375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 17660.099609375, "per_token_policy_error_norm": 0.0862489864230156, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.07126473635435104, "policy_entropy": 0.1868823617696762, "policy_entropy/max": 3.796875, "policy_entropy/median": 0.00046539306640625, "policy_entropy/min": 1.3270634591222574e-16, "policy_entropy/p25": 1.4066696166992188e-05, "policy_entropy/p75": 0.12060546875, "policy_entropy/var": 0.15808114409446716, "policy_error_vector_variance/max_squared_error": 2.021073341369629, "policy_error_vector_variance/metric": 0.08584874123334885, "policy_loss": -0.012431517243385315, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 4.620282173156738, "policy_sharpness": 7.0228800773620605, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.8199219703674316, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.581043243408203, "reward": 0.66015625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.22464245557785034, "rewards/accuracy_reward": 0.66015625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.22464245557785034, "sentence_full_gradient_variance/max_squared_error": 2878168.25, "sentence_full_gradient_variance/metric": 7232.607421875, "sentence_full_gradient_variance/p75": 112.02001953125, "sentence_full_gradient_variance/p90": 133.83444213867188, "sentence_full_gradient_variance/p95": 134.30096435546875, "sentence_full_gradient_variance/p99": 102760.046875, "state_level_variance/metric": 225.0454559326172, "state_level_variance_full_gradient/metric": 851.0988159179688, "step": 13 }, { "accuracy_reward": 0.6809896230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21752598881721497, "action_level_variance/metric": 5380.0693359375, "action_level_variance_full_gradient/metric": 3756.14404296875, "adam_stats/lr_effective_max": 9.120021422859281e-05, "adam_stats/lr_effective_mean": -1.6637514976824974e-10, "adam_stats/lr_effective_min": -8.921115659177303e-05, "adam_stats/m_t_max": 0.005820552818477154, "adam_stats/m_t_mean": 5.29111685021455e-11, "adam_stats/m_t_min": -0.006245626602321863, "adam_stats/v_t_max": 7.029255357338116e-05, "adam_stats/v_t_mean": 2.6984126738627268e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.044144563376903534, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 5.061094760894775, "all_logprobs": -0.18628843128681183, "all_logprobs/max": 0.0, "all_logprobs/median": -7.152557373046875e-05, "all_logprobs/min": -12.75, "all_logprobs/p1": -2.890625, "all_logprobs/p10": -0.5546875, "all_logprobs/p25": -0.035888671875, "all_logprobs/p5": -1.1796875, "all_logprobs/p75": -1.7881393432617188e-06, "all_logprobs/var": 0.32931751012802124, "clip_ratio": 0.0, "completion_length": 724.11328125, "completion_length/correct": 622.2064819335938, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 578.0, "completion_length/correct/min": 193.0, "completion_length/correct/p25": 443.5, "completion_length/correct/p75": 789.5, "completion_length/correct/var": 52340.953125, "completion_length/incorrect": 941.6530151367188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 283.0, "completion_length/incorrect/p25": 943.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 25813.45703125, "completion_length/max": 1024.0, "completion_length/median": 710.0, "completion_length/min": 193.0, "completion_length/p25": 505.75, "completion_length/p75": 1024.0, "completion_length/var": 66031.3984375, "epoch": 0.1792, "feature_vector_variance/max_squared_error": 106966.4453125, "feature_vector_variance/metric": 24039.875, "generated_tokens/total": 7040947.0, "grad_norm": 0.2506157457828522, "grouped_std_rewards": 0.24391409754753113, "learning_rate": 1.4927010515561777e-05, "loss": -0.0441, "mean_logprobs": -0.189453125, "mean_logprobs/var": 0.006622314453125, "num_completions/total": 10752, "per_sentence_gradient_norm": 12.38483715057373, "per_sentence_gradient_norm/max": 1229.374755859375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 72.33045959472656, "per_sentence_gradient_norm/p99": 251.9599609375, "per_sentence_gradient_norm/var": 5233.49951171875, "per_token_feature_norm": 165.50668334960938, "per_token_feature_norm/max": 338.0, "per_token_feature_norm/median": 154.0, "per_token_feature_norm/min": 53.75, "per_token_feature_norm/p25": 123.5, "per_token_feature_norm/p75": 200.0, "per_token_feature_norm/var": 2724.195556640625, "per_token_full_gradient_variance/max_squared_error": 22934358.0, "per_token_full_gradient_variance/variance": 64.6756820678711, "per_token_gradient_norm": 13.886796951293945, "per_token_gradient_norm/max": 8313.224609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 32286.984375, "per_token_policy_error_norm": 0.09419108927249908, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.07772098481655121, "policy_entropy": 0.20685884356498718, "policy_entropy/max": 3.828125, "policy_entropy/median": 0.00080108642578125, "policy_entropy/min": 5.9674487573602164e-15, "policy_entropy/p25": 2.7298927307128906e-05, "policy_entropy/p75": 0.16796875, "policy_entropy/var": 0.17998196184635162, "policy_error_vector_variance/max_squared_error": 2.021909236907959, "policy_error_vector_variance/metric": 0.0937500074505806, "policy_loss": -0.04414454847574234, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 5.061094760894775, "policy_sharpness": 6.794856071472168, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.5009765625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 15.228362083435059, "reward": 0.6809896230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21752598881721497, "rewards/accuracy_reward": 0.6809896230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21752598881721497, "sentence_full_gradient_variance/max_squared_error": 482202.65625, "sentence_full_gradient_variance/metric": 4191.0322265625, "sentence_full_gradient_variance/p75": 227.7181854248047, "sentence_full_gradient_variance/p90": 295.0648498535156, "sentence_full_gradient_variance/p95": 952.94580078125, "sentence_full_gradient_variance/p99": 71866.4453125, "state_level_variance/metric": 524.5889282226562, "state_level_variance_full_gradient/metric": 434.88787841796875, "step": 14 }, { "accuracy_reward": 0.7799479365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1718529313802719, "action_level_variance/metric": 2156.09130859375, "action_level_variance_full_gradient/metric": 2201.48095703125, "adam_stats/lr_effective_max": 8.748360414756462e-05, "adam_stats/lr_effective_mean": -2.1874627109674805e-10, "adam_stats/lr_effective_min": -9.025011240737513e-05, "adam_stats/m_t_max": 0.005148470867425203, "adam_stats/m_t_mean": 4.347595547460337e-11, "adam_stats/m_t_min": -0.005840790458023548, "adam_stats/v_t_max": 7.022709178272635e-05, "adam_stats/v_t_mean": 2.7081798175537797e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.0036698828916996717, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.4041972160339355, "all_logprobs": -0.19382381439208984, "all_logprobs/max": 0.0, "all_logprobs/median": -9.489059448242188e-05, "all_logprobs/min": -14.3125, "all_logprobs/p1": -3.0, "all_logprobs/p10": -0.578125, "all_logprobs/p25": -0.03857421875, "all_logprobs/p5": -1.2265625, "all_logprobs/p75": -2.384185791015625e-06, "all_logprobs/var": 0.35147547721862793, "clip_ratio": 0.0, "completion_length": 694.16796875, "completion_length/correct": 631.1685791015625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 597.0, "completion_length/correct/min": 185.0, "completion_length/correct/p25": 442.5, "completion_length/correct/p75": 802.5, "completion_length/correct/var": 56075.6171875, "completion_length/incorrect": 917.4615478515625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 93.0, "completion_length/incorrect/p25": 864.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 34867.94140625, "completion_length/max": 1024.0, "completion_length/median": 661.0, "completion_length/min": 93.0, "completion_length/p25": 480.75, "completion_length/p75": 1024.0, "completion_length/var": 65442.97265625, "epoch": 0.192, "feature_vector_variance/max_squared_error": 111409.375, "feature_vector_variance/metric": 24213.576171875, "generated_tokens/total": 7574068.0, "grad_norm": 0.16425828635692596, "grouped_std_rewards": 0.20279619097709656, "learning_rate": 1.488605814759156e-05, "loss": 0.0037, "mean_logprobs": -0.2001953125, "mean_logprobs/var": 0.00872802734375, "num_completions/total": 11520, "per_sentence_gradient_norm": 8.716080665588379, "per_sentence_gradient_norm/max": 617.7246704101562, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 56.94747543334961, "per_sentence_gradient_norm/p99": 215.99862670898438, "per_sentence_gradient_norm/var": 2082.8330078125, "per_token_feature_norm": 167.51133728027344, "per_token_feature_norm/max": 338.0, "per_token_feature_norm/median": 156.0, "per_token_feature_norm/min": 59.75, "per_token_feature_norm/p25": 125.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 2767.1435546875, "per_token_full_gradient_variance/max_squared_error": 1185.992431640625, "per_token_full_gradient_variance/variance": 0.12958525121212006, "per_token_gradient_norm": 10.4173002243042, "per_token_gradient_norm/max": 7340.8740234375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 17967.513671875, "per_token_policy_error_norm": 0.09660933166742325, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.07930446416139603, "policy_entropy": 0.21621152758598328, "policy_entropy/max": 3.78125, "policy_entropy/median": 0.00102996826171875, "policy_entropy/min": 1.0047518372857667e-14, "policy_entropy/p25": 3.504753112792969e-05, "policy_entropy/p75": 0.1787109375, "policy_entropy/var": 0.19698572158813477, "policy_error_vector_variance/max_squared_error": 2.019986391067505, "policy_error_vector_variance/metric": 0.09587320685386658, "policy_loss": 0.0036698803305625916, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.4041965007781982, "policy_sharpness": 6.705787181854248, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.326061964035034, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 15.61894702911377, "reward": 0.7799479365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1718529313802719, "rewards/accuracy_reward": 0.7799479365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1718529313802719, "sentence_full_gradient_variance/max_squared_error": 617017.125, "sentence_full_gradient_variance/metric": 2498.891357421875, "sentence_full_gradient_variance/p75": 29.15709114074707, "sentence_full_gradient_variance/p90": 44.15487289428711, "sentence_full_gradient_variance/p95": 44.15487289428711, "sentence_full_gradient_variance/p99": 60563.70703125, "state_level_variance/metric": 195.57859802246094, "state_level_variance_full_gradient/metric": 297.41033935546875, "step": 15 }, { "accuracy_reward": 0.7109375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20577330887317657, "action_level_variance/metric": 1983.909912109375, "action_level_variance_full_gradient/metric": 1589.841796875, "adam_stats/lr_effective_max": 8.891418110579252e-05, "adam_stats/lr_effective_mean": -3.054657082834922e-10, "adam_stats/lr_effective_min": -8.712815179023892e-05, "adam_stats/m_t_max": 0.003995299339294434, "adam_stats/m_t_mean": -1.275536859295101e-12, "adam_stats/m_t_min": -0.004388749599456787, "adam_stats/v_t_max": 7.204535359051079e-05, "adam_stats/v_t_mean": 2.767994817728936e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.041300754994153976, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.565761089324951, "all_logprobs": -0.20843367278575897, "all_logprobs/max": 0.0, "all_logprobs/median": -0.000141143798828125, "all_logprobs/min": -14.75, "all_logprobs/p1": -3.078125, "all_logprobs/p10": -0.63671875, "all_logprobs/p25": -0.05712890625, "all_logprobs/p5": -1.3125, "all_logprobs/p75": -2.86102294921875e-06, "all_logprobs/var": 0.3723924458026886, "clip_ratio": 0.0, "completion_length": 717.0885620117188, "completion_length/correct": 635.9029541015625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 602.0, "completion_length/correct/min": 194.0, "completion_length/correct/p25": 464.75, "completion_length/correct/p75": 774.75, "completion_length/correct/var": 48469.0, "completion_length/incorrect": 916.7612915039062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 327.0, "completion_length/incorrect/p25": 845.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 32786.70703125, "completion_length/max": 1024.0, "completion_length/median": 673.0, "completion_length/min": 194.0, "completion_length/p25": 509.0, "completion_length/p75": 1024.0, "completion_length/var": 60118.86328125, "epoch": 0.2048, "feature_vector_variance/max_squared_error": 123977.296875, "feature_vector_variance/metric": 24329.19921875, "generated_tokens/total": 8124792.0, "grad_norm": 0.3641529679298401, "grouped_std_rewards": 0.19845223426818848, "learning_rate": 1.4836107005503543e-05, "loss": -0.0413, "mean_logprobs": -0.2119140625, "mean_logprobs/var": 0.0072021484375, "num_completions/total": 12288, "per_sentence_gradient_norm": 9.101045608520508, "per_sentence_gradient_norm/max": 472.31988525390625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 70.5029067993164, "per_sentence_gradient_norm/p99": 273.9361267089844, "per_sentence_gradient_norm/var": 1903.559326171875, "per_token_feature_norm": 169.19119262695312, "per_token_feature_norm/max": 336.0, "per_token_feature_norm/median": 158.0, "per_token_feature_norm/min": 58.0, "per_token_feature_norm/p25": 125.5, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 2849.91943359375, "per_token_full_gradient_variance/max_squared_error": 2171524.5, "per_token_full_gradient_variance/variance": 4.044502258300781, "per_token_gradient_norm": 10.539895057678223, "per_token_gradient_norm/max": 6362.33837890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 14198.705078125, "per_token_policy_error_norm": 0.10380686074495316, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0078125, "per_token_policy_error_norm/var": 0.08426681905984879, "policy_entropy": 0.2331170290708542, "policy_entropy/max": 3.796875, "policy_entropy/median": 0.0014801025390625, "policy_entropy/min": 4.926614671774132e-16, "policy_entropy/p25": 4.172325134277344e-05, "policy_entropy/p75": 0.2412109375, "policy_entropy/var": 0.20757217705249786, "policy_error_vector_variance/max_squared_error": 2.026060104370117, "policy_error_vector_variance/metric": 0.10312414914369583, "policy_loss": -0.041300754994153976, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.565760850906372, "policy_sharpness": 6.508166313171387, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.083966016769409, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 15.95085334777832, "reward": 0.7109375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20577330887317657, "rewards/accuracy_reward": 0.7109375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20577330887317657, "sentence_full_gradient_variance/max_squared_error": 373328.09375, "sentence_full_gradient_variance/metric": 1787.359375, "sentence_full_gradient_variance/p75": 47.703269958496094, "sentence_full_gradient_variance/p90": 109.70722961425781, "sentence_full_gradient_variance/p95": 109.70722961425781, "sentence_full_gradient_variance/p99": 36424.3359375, "state_level_variance/metric": 166.89820861816406, "state_level_variance_full_gradient/metric": 197.51751708984375, "step": 16 }, { "accuracy_reward": 0.7044271230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20848102867603302, "action_level_variance/metric": 5147.375, "action_level_variance_full_gradient/metric": 5060.728515625, "adam_stats/lr_effective_max": 8.999794954434037e-05, "adam_stats/lr_effective_mean": -3.2015740059065934e-10, "adam_stats/lr_effective_min": -8.751658606342971e-05, "adam_stats/m_t_max": 0.0037330985069274902, "adam_stats/m_t_mean": 5.695814479789174e-12, "adam_stats/m_t_min": -0.004284134600311518, "adam_stats/v_t_max": 7.208223541965708e-05, "adam_stats/v_t_mean": 2.785858132722807e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.17578363418579102, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 5.714510917663574, "all_logprobs": -0.18183650076389313, "all_logprobs/max": 0.0, "all_logprobs/median": -5.888938903808594e-05, "all_logprobs/min": -11.875, "all_logprobs/p1": -2.875, "all_logprobs/p10": -0.52734375, "all_logprobs/p25": -0.031005859375, "all_logprobs/p5": -1.1484375, "all_logprobs/p75": -1.3113021850585938e-06, "all_logprobs/var": 0.32386481761932373, "clip_ratio": 0.0, "completion_length": 743.2994995117188, "completion_length/correct": 656.50830078125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 600.0, "completion_length/correct/min": 201.0, "completion_length/correct/p25": 480.0, "completion_length/correct/p75": 884.0, "completion_length/correct/var": 57648.3046875, "completion_length/incorrect": 950.1453247070312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 214.0, "completion_length/incorrect/p25": 1014.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 24753.3828125, "completion_length/max": 1024.0, "completion_length/median": 734.0, "completion_length/min": 201.0, "completion_length/p25": 525.75, "completion_length/p75": 1024.0, "completion_length/var": 65856.3046875, "epoch": 0.2176, "feature_vector_variance/max_squared_error": 105173.8828125, "feature_vector_variance/metric": 24040.423828125, "generated_tokens/total": 8695646.0, "grad_norm": 0.23672588169574738, "grouped_std_rewards": 0.23234383761882782, "learning_rate": 1.4777217947069972e-05, "loss": -0.1758, "mean_logprobs": -0.1875, "mean_logprobs/var": 0.0067138671875, "num_completions/total": 13056, "per_sentence_gradient_norm": 11.926051139831543, "per_sentence_gradient_norm/max": 1278.7147216796875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 62.0040397644043, "per_sentence_gradient_norm/p99": 261.9454345703125, "per_sentence_gradient_norm/var": 5011.669921875, "per_token_feature_norm": 164.1352996826172, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 60.5, "per_token_feature_norm/p25": 124.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 2550.803466796875, "per_token_full_gradient_variance/max_squared_error": 1188.8477783203125, "per_token_full_gradient_variance/variance": 0.18830183148384094, "per_token_gradient_norm": 12.712790489196777, "per_token_gradient_norm/max": 7960.65478515625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 27121.505859375, "per_token_policy_error_norm": 0.09174489229917526, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.07565385848283768, "policy_entropy": 0.20227791368961334, "policy_entropy/max": 3.78125, "policy_entropy/median": 0.00067138671875, "policy_entropy/min": 1.723881454251952e-17, "policy_entropy/p25": 2.0503997802734375e-05, "policy_entropy/p75": 0.1513671875, "policy_entropy/var": 0.1780097782611847, "policy_error_vector_variance/max_squared_error": 2.021521806716919, "policy_error_vector_variance/metric": 0.09118998795747757, "policy_loss": -0.17578363418579102, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 5.714511871337891, "policy_sharpness": 6.8460588455200195, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.5863280296325684, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 15.12243938446045, "reward": 0.7044271230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20848102867603302, "rewards/accuracy_reward": 0.7044271230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20848102867603302, "sentence_full_gradient_variance/max_squared_error": 3167708.25, "sentence_full_gradient_variance/metric": 5622.0400390625, "sentence_full_gradient_variance/p75": 117.46495819091797, "sentence_full_gradient_variance/p90": 984.2857666015625, "sentence_full_gradient_variance/p95": 984.2857666015625, "sentence_full_gradient_variance/p99": 46224.4609375, "state_level_variance/metric": 506.46685791015625, "state_level_variance_full_gradient/metric": 561.3123168945312, "step": 17 }, { "accuracy_reward": 0.7057291865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20794625580310822, "action_level_variance/metric": 3340.08056640625, "action_level_variance_full_gradient/metric": 4428.27197265625, "adam_stats/lr_effective_max": 9.197212057188153e-05, "adam_stats/lr_effective_mean": -2.0498383546119214e-10, "adam_stats/lr_effective_min": -9.06172426766716e-05, "adam_stats/m_t_max": 0.003599351504817605, "adam_stats/m_t_mean": 1.111391794567096e-12, "adam_stats/m_t_min": -0.004040352534502745, "adam_stats/v_t_max": 7.201328844530508e-05, "adam_stats/v_t_mean": 2.7901860509549348e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.05387674272060394, "advantages/max": 12.9586820602417, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.5848498344421387, "all_logprobs": -0.1804545521736145, "all_logprobs/max": 0.0, "all_logprobs/median": -4.601478576660156e-05, "all_logprobs/min": -14.0625, "all_logprobs/p1": -2.84375, "all_logprobs/p10": -0.5234375, "all_logprobs/p25": -0.03125, "all_logprobs/p5": -1.140625, "all_logprobs/p75": -9.5367431640625e-07, "all_logprobs/var": 0.3147335350513458, "clip_ratio": 0.0, "completion_length": 702.2161865234375, "completion_length/correct": 615.2694091796875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 592.0, "completion_length/correct/min": 242.0, "completion_length/correct/p25": 453.0, "completion_length/correct/p75": 746.75, "completion_length/correct/var": 43694.80078125, "completion_length/incorrect": 910.7344970703125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 252.0, "completion_length/incorrect/p25": 825.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 33559.93359375, "completion_length/max": 1024.0, "completion_length/median": 670.0, "completion_length/min": 242.0, "completion_length/p25": 503.0, "completion_length/p75": 1011.5, "completion_length/var": 58818.40234375, "epoch": 0.2304, "feature_vector_variance/max_squared_error": 106257.3984375, "feature_vector_variance/metric": 24791.861328125, "generated_tokens/total": 9234948.0, "grad_norm": 0.15892381966114044, "grouped_std_rewards": 0.25289615988731384, "learning_rate": 1.4709462719537392e-05, "loss": 0.0539, "mean_logprobs": -0.1826171875, "mean_logprobs/var": 0.00732421875, "num_completions/total": 13824, "per_sentence_gradient_norm": 11.22635555267334, "per_sentence_gradient_norm/max": 618.883056640625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 66.87669372558594, "per_sentence_gradient_norm/p99": 283.6802673339844, "per_sentence_gradient_norm/var": 3218.239990234375, "per_token_feature_norm": 166.189453125, "per_token_feature_norm/max": 334.0, "per_token_feature_norm/median": 155.0, "per_token_feature_norm/min": 61.25, "per_token_feature_norm/p25": 125.5, "per_token_feature_norm/p75": 200.0, "per_token_feature_norm/var": 2574.17822265625, "per_token_full_gradient_variance/max_squared_error": 819.1124267578125, "per_token_full_gradient_variance/variance": 0.13667403161525726, "per_token_gradient_norm": 11.978948593139648, "per_token_gradient_norm/max": 6769.34033203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 18592.615234375, "per_token_policy_error_norm": 0.09166914224624634, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.07590518146753311, "policy_entropy": 0.2006140947341919, "policy_entropy/max": 3.796875, "policy_entropy/median": 0.000537872314453125, "policy_entropy/min": 4.385380947269368e-15, "policy_entropy/p25": 1.5735626220703125e-05, "policy_entropy/p75": 0.1513671875, "policy_entropy/var": 0.17248599231243134, "policy_error_vector_variance/max_squared_error": 2.025455951690674, "policy_error_vector_variance/metric": 0.09124830365180969, "policy_loss": 0.05387674272060394, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.5848498344421387, "policy_sharpness": 6.887751579284668, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.617159843444824, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 15.087788581848145, "reward": 0.7057291865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20794625580310822, "rewards/accuracy_reward": 0.7057291865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20794625580310822, "sentence_full_gradient_variance/max_squared_error": 1461873.125, "sentence_full_gradient_variance/metric": 5000.5126953125, "sentence_full_gradient_variance/p75": 129.45626831054688, "sentence_full_gradient_variance/p90": 165.38739013671875, "sentence_full_gradient_variance/p95": 165.38739013671875, "sentence_full_gradient_variance/p99": 98862.9921875, "state_level_variance/metric": 294.5472106933594, "state_level_variance_full_gradient/metric": 572.2401123046875, "step": 18 }, { "accuracy_reward": 0.7083333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20686659216880798, "action_level_variance/metric": 1791.66748046875, "action_level_variance_full_gradient/metric": 4996.60302734375, "adam_stats/lr_effective_max": 9.287433931604028e-05, "adam_stats/lr_effective_mean": -1.4931952896368728e-10, "adam_stats/lr_effective_min": -9.260539809474722e-05, "adam_stats/m_t_max": 0.0030441037379205227, "adam_stats/m_t_mean": 8.005193730120652e-12, "adam_stats/m_t_min": -0.004012122284621, "adam_stats/v_t_max": 7.198635285021737e-05, "adam_stats/v_t_mean": 2.8035475416882116e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.048875853419303894, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.6061017513275146, "all_logprobs": -0.15774136781692505, "all_logprobs/max": 0.0, "all_logprobs/median": -2.3603439331054688e-05, "all_logprobs/min": -13.6875, "all_logprobs/p1": -2.59375, "all_logprobs/p10": -0.43359375, "all_logprobs/p25": -0.018798828125, "all_logprobs/p5": -1.0, "all_logprobs/p75": -5.960464477539062e-07, "all_logprobs/var": 0.2663809061050415, "clip_ratio": 0.0, "completion_length": 716.203125, "completion_length/correct": 644.1727905273438, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 617.0, "completion_length/correct/min": 203.0, "completion_length/correct/p25": 461.5, "completion_length/correct/p75": 808.0, "completion_length/correct/var": 49449.2109375, "completion_length/incorrect": 891.1339721679688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 374.0, "completion_length/incorrect/p25": 763.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 33708.30859375, "completion_length/max": 1024.0, "completion_length/median": 698.0, "completion_length/min": 203.0, "completion_length/p25": 502.0, "completion_length/p75": 1024.0, "completion_length/var": 57424.93359375, "epoch": 0.2432, "feature_vector_variance/max_squared_error": 106642.265625, "feature_vector_variance/metric": 24030.810546875, "generated_tokens/total": 9784992.0, "grad_norm": 0.2603472173213959, "grouped_std_rewards": 0.22546076774597168, "learning_rate": 1.4632923872213653e-05, "loss": -0.0489, "mean_logprobs": -0.16015625, "mean_logprobs/var": 0.004638671875, "num_completions/total": 14592, "per_sentence_gradient_norm": 7.843219757080078, "per_sentence_gradient_norm/max": 719.28759765625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 52.24294662475586, "per_sentence_gradient_norm/p99": 197.42056274414062, "per_sentence_gradient_norm/var": 1732.4071044921875, "per_token_feature_norm": 161.5361785888672, "per_token_feature_norm/max": 336.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 60.25, "per_token_feature_norm/p25": 123.5, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 2322.404541015625, "per_token_full_gradient_variance/max_squared_error": 666.2379150390625, "per_token_full_gradient_variance/variance": 0.08993501216173172, "per_token_gradient_norm": 7.956911563873291, "per_token_gradient_norm/max": 7210.98046875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 11693.0595703125, "per_token_policy_error_norm": 0.08194439858198166, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06841959059238434, "policy_entropy": 0.17554235458374023, "policy_entropy/max": 3.796875, "policy_entropy/median": 0.0002880096435546875, "policy_entropy/min": 3.5388358909926865e-15, "policy_entropy/p25": 1.0192394256591797e-05, "policy_entropy/p75": 0.1005859375, "policy_entropy/var": 0.14318570494651794, "policy_error_vector_variance/max_squared_error": 2.0221617221832275, "policy_error_vector_variance/metric": 0.08164045959711075, "policy_loss": -0.04887586459517479, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.79339599609375, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.6061017513275146, "policy_sharpness": 7.138190746307373, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.998779296875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.221818923950195, "reward": 0.7083333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20686659216880798, "rewards/accuracy_reward": 0.7083333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20686659216880798, "sentence_full_gradient_variance/max_squared_error": 2098960.5, "sentence_full_gradient_variance/metric": 5650.58154296875, "sentence_full_gradient_variance/p75": 110.56169891357422, "sentence_full_gradient_variance/p90": 134.74468994140625, "sentence_full_gradient_variance/p95": 7532.82958984375, "sentence_full_gradient_variance/p99": 70654.09375, "state_level_variance/metric": 164.15228271484375, "state_level_variance_full_gradient/metric": 653.9776000976562, "step": 19 }, { "accuracy_reward": 0.8020833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15895263850688934, "action_level_variance/metric": 2434.30859375, "action_level_variance_full_gradient/metric": 3542.37158203125, "adam_stats/lr_effective_max": 8.37774932733737e-05, "adam_stats/lr_effective_mean": -1.2099646284902121e-10, "adam_stats/lr_effective_min": -8.44755704747513e-05, "adam_stats/m_t_max": 0.002989033469930291, "adam_stats/m_t_mean": -7.080741158335679e-13, "adam_stats/m_t_min": -0.003420938039198518, "adam_stats/v_t_max": 7.207864837255329e-05, "adam_stats/v_t_mean": 2.808186409103408e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.006637305021286011, "advantages/max": 12.9586820602417, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.3686647415161133, "all_logprobs": -0.15276890993118286, "all_logprobs/max": 0.0, "all_logprobs/median": -1.7762184143066406e-05, "all_logprobs/min": -14.0625, "all_logprobs/p1": -2.53125, "all_logprobs/p10": -0.427734375, "all_logprobs/p25": -0.0162353515625, "all_logprobs/p5": -0.9765625, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.2514726519584656, "clip_ratio": 0.0, "completion_length": 660.2630615234375, "completion_length/correct": 609.9545288085938, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 570.0, "completion_length/correct/min": 174.0, "completion_length/correct/p25": 432.0, "completion_length/correct/p75": 756.5, "completion_length/correct/var": 51961.94921875, "completion_length/incorrect": 864.1447143554688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 306.0, "completion_length/incorrect/p25": 777.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 46955.62109375, "completion_length/max": 1024.0, "completion_length/median": 625.0, "completion_length/min": 174.0, "completion_length/p25": 452.0, "completion_length/p75": 900.0, "completion_length/var": 61178.9453125, "epoch": 0.256, "feature_vector_variance/max_squared_error": 101392.84375, "feature_vector_variance/metric": 24283.12890625, "generated_tokens/total": 10292074.0, "grad_norm": 0.14484046399593353, "grouped_std_rewards": 0.17092740535736084, "learning_rate": 1.4547694655894313e-05, "loss": 0.0066, "mean_logprobs": -0.154296875, "mean_logprobs/var": 0.00482177734375, "num_completions/total": 15360, "per_sentence_gradient_norm": 7.326737403869629, "per_sentence_gradient_norm/max": 942.4642944335938, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 17.862884521484375, "per_sentence_gradient_norm/p99": 214.59097290039062, "per_sentence_gradient_norm/var": 2383.7314453125, "per_token_feature_norm": 161.03919982910156, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 62.5, "per_token_feature_norm/p25": 123.5, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 2246.669921875, "per_token_full_gradient_variance/max_squared_error": 1117.2935791015625, "per_token_full_gradient_variance/variance": 0.1032462865114212, "per_token_gradient_norm": 8.170347213745117, "per_token_gradient_norm/max": 7695.2998046875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 14921.1416015625, "per_token_policy_error_norm": 0.08017002046108246, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06661767512559891, "policy_entropy": 0.16958770155906677, "policy_entropy/max": 3.796875, "policy_entropy/median": 0.000225067138671875, "policy_entropy/min": 3.642919299551295e-17, "policy_entropy/p25": 7.212162017822266e-06, "policy_entropy/p75": 0.08935546875, "policy_entropy/var": 0.13537125289440155, "policy_error_vector_variance/max_squared_error": 2.0189626216888428, "policy_error_vector_variance/metric": 0.07996830344200134, "policy_loss": 0.006637312471866608, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.368664026260376, "policy_sharpness": 7.2237162590026855, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.185546875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.905522346496582, "reward": 0.8020833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15895263850688934, "rewards/accuracy_reward": 0.8020833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15895263850688934, "sentence_full_gradient_variance/max_squared_error": 1439376.625, "sentence_full_gradient_variance/metric": 4024.56396484375, "sentence_full_gradient_variance/p75": 28.954519271850586, "sentence_full_gradient_variance/p90": 115.00599670410156, "sentence_full_gradient_variance/p95": 115.00599670410156, "sentence_full_gradient_variance/p99": 98922.53125, "state_level_variance/metric": 253.24545288085938, "state_level_variance_full_gradient/metric": 482.19268798828125, "step": 20 }, { "accuracy_reward": 0.6809896230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21752598881721497, "action_level_variance/metric": 1021.3668212890625, "action_level_variance_full_gradient/metric": 1464.448974609375, "adam_stats/lr_effective_max": 7.813797856215388e-05, "adam_stats/lr_effective_mean": -2.022403217116775e-10, "adam_stats/lr_effective_min": -8.08035911177285e-05, "adam_stats/m_t_max": 0.007889514788985252, "adam_stats/m_t_mean": -2.7629761326086744e-11, "adam_stats/m_t_min": -0.007763583213090897, "adam_stats/v_t_max": 7.277195254573599e-05, "adam_stats/v_t_mean": 3.3305871081912297e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.03942088037729263, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.3907625675201416, "all_logprobs": -0.14919930696487427, "all_logprobs/max": 0.0, "all_logprobs/median": -1.2993812561035156e-05, "all_logprobs/min": -13.75, "all_logprobs/p1": -2.53125, "all_logprobs/p10": -0.400390625, "all_logprobs/p25": -0.01416015625, "all_logprobs/p5": -0.96484375, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.24755705893039703, "clip_ratio": 0.0, "completion_length": 705.359375, "completion_length/correct": 607.4359130859375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 576.0, "completion_length/correct/min": 193.0, "completion_length/correct/p25": 431.0, "completion_length/correct/p75": 738.5, "completion_length/correct/var": 49249.7109375, "completion_length/incorrect": 914.3958740234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 272.0, "completion_length/incorrect/p25": 850.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 33399.71875, "completion_length/max": 1024.0, "completion_length/median": 681.0, "completion_length/min": 193.0, "completion_length/p25": 495.0, "completion_length/p75": 1024.0, "completion_length/var": 64639.52734375, "epoch": 0.2688, "feature_vector_variance/max_squared_error": 101785.296875, "feature_vector_variance/metric": 25142.65234375, "generated_tokens/total": 10833790.0, "grad_norm": 11.719164848327637, "grouped_std_rewards": 0.19902142882347107, "learning_rate": 1.4453878909250906e-05, "loss": -0.0394, "mean_logprobs": -0.1513671875, "mean_logprobs/var": 0.00836181640625, "num_completions/total": 16128, "per_sentence_gradient_norm": 6.247600555419922, "per_sentence_gradient_norm/max": 334.5028076171875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 34.01880645751953, "per_sentence_gradient_norm/p99": 179.96412658691406, "per_sentence_gradient_norm/var": 983.614990234375, "per_token_feature_norm": 162.35409545898438, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 62.0, "per_token_feature_norm/p25": 125.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 2255.14501953125, "per_token_full_gradient_variance/max_squared_error": 2769455.5, "per_token_full_gradient_variance/variance": 5.175050258636475, "per_token_gradient_norm": 7.142555236816406, "per_token_gradient_norm/max": 6386.4619140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 8867.8740234375, "per_token_policy_error_norm": 0.07800092548131943, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06516959518194199, "policy_entropy": 0.16563810408115387, "policy_entropy/max": 3.796875, "policy_entropy/median": 0.00016689300537109375, "policy_entropy/min": 2.688821387764051e-16, "policy_entropy/p25": 5.364418029785156e-06, "policy_entropy/p75": 0.07861328125, "policy_entropy/var": 0.13415345549583435, "policy_error_vector_variance/max_squared_error": 2.018744945526123, "policy_error_vector_variance/metric": 0.07780127972364426, "policy_loss": -0.03942088037729263, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.3907625675201416, "policy_sharpness": 7.293710231781006, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.3230104446411133, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.7709321975708, "reward": 0.6809896230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21752598881721497, "rewards/accuracy_reward": 0.6809896230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21752598881721497, "sentence_full_gradient_variance/max_squared_error": 180847.53125, "sentence_full_gradient_variance/metric": 1649.5614013671875, "sentence_full_gradient_variance/p75": 39.4024658203125, "sentence_full_gradient_variance/p90": 66.60662841796875, "sentence_full_gradient_variance/p95": 66.60662841796875, "sentence_full_gradient_variance/p99": 60971.515625, "state_level_variance/metric": 89.57138061523438, "state_level_variance_full_gradient/metric": 185.1123504638672, "step": 21 }, { "accuracy_reward": 0.7643229365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18036825954914093, "action_level_variance/metric": 1808.586669921875, "action_level_variance_full_gradient/metric": 3477.669921875, "adam_stats/lr_effective_max": 7.235553493956104e-05, "adam_stats/lr_effective_mean": -2.7830696081032613e-10, "adam_stats/lr_effective_min": -7.458158506779e-05, "adam_stats/m_t_max": 0.007062225136905909, "adam_stats/m_t_mean": -2.3132820312077307e-11, "adam_stats/m_t_min": -0.006986271124333143, "adam_stats/v_t_max": 7.271292997756973e-05, "adam_stats/v_t_mean": 3.3283018268520648e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.11874654144048691, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.0787365436553955, "all_logprobs": -0.13737492263317108, "all_logprobs/max": 0.0, "all_logprobs/median": -7.62939453125e-06, "all_logprobs/min": -12.5, "all_logprobs/p1": -2.375, "all_logprobs/p10": -0.359375, "all_logprobs/p25": -0.00872802734375, "all_logprobs/p5": -0.890625, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.21742483973503113, "clip_ratio": 0.0, "completion_length": 680.2604370117188, "completion_length/correct": 611.541748046875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 568.0, "completion_length/correct/min": 216.0, "completion_length/correct/p25": 439.0, "completion_length/correct/p75": 754.5, "completion_length/correct/var": 47353.40625, "completion_length/incorrect": 903.12158203125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 336.0, "completion_length/incorrect/p25": 837.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 37259.265625, "completion_length/max": 1024.0, "completion_length/median": 646.0, "completion_length/min": 216.0, "completion_length/p25": 466.75, "completion_length/p75": 922.0, "completion_length/var": 60257.453125, "epoch": 0.2816, "feature_vector_variance/max_squared_error": 97100.8046875, "feature_vector_variance/metric": 24562.705078125, "generated_tokens/total": 11356230.0, "grad_norm": 0.0777759924530983, "grouped_std_rewards": 0.21581634879112244, "learning_rate": 1.4351590932319506e-05, "loss": -0.1187, "mean_logprobs": -0.13671875, "mean_logprobs/var": 0.0032958984375, "num_completions/total": 16896, "per_sentence_gradient_norm": 7.513842582702637, "per_sentence_gradient_norm/max": 506.0652770996094, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 42.976524353027344, "per_sentence_gradient_norm/p99": 194.3228302001953, "per_sentence_gradient_norm/var": 1754.4130859375, "per_token_feature_norm": 159.63063049316406, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 62.25, "per_token_feature_norm/p25": 124.0, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 2115.07666015625, "per_token_full_gradient_variance/max_squared_error": 355.0660705566406, "per_token_full_gradient_variance/variance": 0.09330734610557556, "per_token_gradient_norm": 8.859217643737793, "per_token_gradient_norm/max": 7003.14990234375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 14236.6357421875, "per_token_policy_error_norm": 0.07293906807899475, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06074488162994385, "policy_entropy": 0.15323245525360107, "policy_entropy/max": 3.796875, "policy_entropy/median": 0.00010156631469726562, "policy_entropy/min": 3.344546861683284e-15, "policy_entropy/p25": 3.904104232788086e-06, "policy_entropy/p75": 0.05517578125, "policy_entropy/var": 0.12017843127250671, "policy_error_vector_variance/max_squared_error": 2.0228652954101562, "policy_error_vector_variance/metric": 0.0728311613202095, "policy_loss": -0.11874654144048691, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.0787365436553955, "policy_sharpness": 7.471223831176758, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.870849609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.1456880569458, "reward": 0.7643229365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18036825954914093, "rewards/accuracy_reward": 0.7643229365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18036825954914093, "sentence_full_gradient_variance/max_squared_error": 1534053.125, "sentence_full_gradient_variance/metric": 3913.837890625, "sentence_full_gradient_variance/p75": 87.39244079589844, "sentence_full_gradient_variance/p90": 138.60745239257812, "sentence_full_gradient_variance/p95": 138.60745239257812, "sentence_full_gradient_variance/p99": 66484.8828125, "state_level_variance/metric": 171.40090942382812, "state_level_variance_full_gradient/metric": 436.16802978515625, "step": 22 }, { "accuracy_reward": 0.7734375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17546039819717407, "action_level_variance/metric": 1135.79736328125, "action_level_variance_full_gradient/metric": 1369.9736328125, "adam_stats/lr_effective_max": 6.83681428199634e-05, "adam_stats/lr_effective_mean": -2.3049730180080275e-10, "adam_stats/lr_effective_min": -6.834104715380818e-05, "adam_stats/m_t_max": 0.0062942043878138065, "adam_stats/m_t_mean": -2.5198305247942088e-11, "adam_stats/m_t_min": -0.006222793832421303, "adam_stats/v_t_max": 7.264907617354766e-05, "adam_stats/v_t_mean": 3.3260900544201943e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.015415887348353863, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.781151056289673, "all_logprobs": -0.13558495044708252, "all_logprobs/max": 0.0, "all_logprobs/median": -5.841255187988281e-06, "all_logprobs/min": -14.3125, "all_logprobs/p1": -2.359375, "all_logprobs/p10": -0.349609375, "all_logprobs/p25": -0.00823974609375, "all_logprobs/p5": -0.875, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.21658997237682343, "clip_ratio": 0.0, "completion_length": 629.48828125, "completion_length/correct": 552.1245727539062, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 511.0, "completion_length/correct/min": 157.0, "completion_length/correct/p25": 396.5, "completion_length/correct/p75": 681.0, "completion_length/correct/var": 43675.578125, "completion_length/incorrect": 893.5919799804688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 256.0, "completion_length/incorrect/p25": 795.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 44925.70703125, "completion_length/max": 1024.0, "completion_length/median": 565.0, "completion_length/min": 157.0, "completion_length/p25": 422.75, "completion_length/p75": 843.5, "completion_length/var": 64359.28125, "epoch": 0.2944, "feature_vector_variance/max_squared_error": 96725.8046875, "feature_vector_variance/metric": 25098.025390625, "generated_tokens/total": 11839677.0, "grad_norm": 0.05808613821864128, "grouped_std_rewards": 0.173471137881279, "learning_rate": 1.4240955347243754e-05, "loss": 0.0154, "mean_logprobs": -0.1376953125, "mean_logprobs/var": 0.003997802734375, "num_completions/total": 17664, "per_sentence_gradient_norm": 5.40058708190918, "per_sentence_gradient_norm/max": 456.68096923828125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 24.50052833557129, "per_sentence_gradient_norm/p99": 134.4169464111328, "per_sentence_gradient_norm/var": 1108.0738525390625, "per_token_feature_norm": 160.36282348632812, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 62.5, "per_token_feature_norm/p25": 124.5, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 2103.1640625, "per_token_full_gradient_variance/max_squared_error": 950.7937622070312, "per_token_full_gradient_variance/variance": 0.080792635679245, "per_token_gradient_norm": 7.260979175567627, "per_token_gradient_norm/max": 6754.4951171875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 12411.591796875, "per_token_policy_error_norm": 0.07215166836977005, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.060473401099443436, "policy_entropy": 0.15031619369983673, "policy_entropy/max": 3.8125, "policy_entropy/median": 8.0108642578125e-05, "policy_entropy/min": 4.145989107584569e-16, "policy_entropy/p25": 2.950429916381836e-06, "policy_entropy/p75": 0.05029296875, "policy_entropy/var": 0.11693821847438812, "policy_error_vector_variance/max_squared_error": 2.0182723999023438, "policy_error_vector_variance/metric": 0.07206054031848907, "policy_loss": 0.015415887348353863, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.781151056289673, "policy_sharpness": 7.506528377532959, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.9375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.018738746643066, "reward": 0.7734375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17546039819717407, "rewards/accuracy_reward": 0.7734375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17546039819717407, "sentence_full_gradient_variance/max_squared_error": 376448.09375, "sentence_full_gradient_variance/metric": 1538.0496826171875, "sentence_full_gradient_variance/p75": 64.42050170898438, "sentence_full_gradient_variance/p90": 105.71647644042969, "sentence_full_gradient_variance/p95": 105.71647644042969, "sentence_full_gradient_variance/p99": 44317.47265625, "state_level_variance/metric": 113.99581909179688, "state_level_variance_full_gradient/metric": 168.0758514404297, "step": 23 }, { "accuracy_reward": 0.7552083730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1851097196340561, "action_level_variance/metric": 1228.6458740234375, "action_level_variance_full_gradient/metric": 3368.344482421875, "adam_stats/lr_effective_max": 6.843258597655222e-05, "adam_stats/lr_effective_mean": -2.4547605614877455e-10, "adam_stats/lr_effective_min": -7.014572474872693e-05, "adam_stats/m_t_max": 0.005529743619263172, "adam_stats/m_t_mean": -3.032205042607927e-11, "adam_stats/m_t_min": -0.0054875994101166725, "adam_stats/v_t_max": 7.26069338270463e-05, "adam_stats/v_t_mean": 3.3255332081844058e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.05675429105758667, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.8137056827545166, "all_logprobs": -0.1384056657552719, "all_logprobs/max": 0.0, "all_logprobs/median": -6.198883056640625e-06, "all_logprobs/min": -18.0, "all_logprobs/p1": -2.40625, "all_logprobs/p10": -0.357421875, "all_logprobs/p25": -0.00927734375, "all_logprobs/p5": -0.8984375, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.22346511483192444, "clip_ratio": 0.0, "completion_length": 644.3815307617188, "completion_length/correct": 554.1827392578125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 527.0, "completion_length/correct/min": 170.0, "completion_length/correct/p25": 382.0, "completion_length/correct/p75": 686.5, "completion_length/correct/var": 50083.94140625, "completion_length/incorrect": 922.6542358398438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 260.0, "completion_length/incorrect/p25": 903.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 33933.328125, "completion_length/max": 1024.0, "completion_length/median": 600.0, "completion_length/min": 170.0, "completion_length/p25": 424.75, "completion_length/p75": 925.75, "completion_length/var": 71213.6015625, "epoch": 0.3072, "feature_vector_variance/max_squared_error": 102387.7578125, "feature_vector_variance/metric": 25263.3515625, "generated_tokens/total": 12334562.0, "grad_norm": 0.08192799240350723, "grouped_std_rewards": 0.178506538271904, "learning_rate": 1.4122106946441953e-05, "loss": 0.0568, "mean_logprobs": -0.140625, "mean_logprobs/var": 0.0040283203125, "num_completions/total": 18432, "per_sentence_gradient_norm": 6.148721218109131, "per_sentence_gradient_norm/max": 349.1404724121094, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 15.070100784301758, "per_sentence_gradient_norm/p99": 210.9252166748047, "per_sentence_gradient_norm/var": 1192.3917236328125, "per_token_feature_norm": 161.05145263671875, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 60.0, "per_token_feature_norm/p25": 125.0, "per_token_feature_norm/p75": 191.0, "per_token_feature_norm/var": 2098.034912109375, "per_token_full_gradient_variance/max_squared_error": 456.4871826171875, "per_token_full_gradient_variance/variance": 0.08026197552680969, "per_token_gradient_norm": 7.678711414337158, "per_token_gradient_norm/max": 6683.05322265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 10841.15234375, "per_token_policy_error_norm": 0.07352498918771744, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.062041692435741425, "policy_entropy": 0.15239205956459045, "policy_entropy/max": 3.8125, "policy_entropy/median": 8.392333984375e-05, "policy_entropy/min": 3.7192471324942744e-15, "policy_entropy/p25": 2.6971101760864258e-06, "policy_entropy/p75": 0.0556640625, "policy_entropy/var": 0.11879201978445053, "policy_error_vector_variance/max_squared_error": 2.018714189529419, "policy_error_vector_variance/metric": 0.07343816012144089, "policy_loss": 0.05675429105758667, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.8137054443359375, "policy_sharpness": 7.476948261260986, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.870849609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.108796119689941, "reward": 0.7552083730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1851097196340561, "rewards/accuracy_reward": 0.7552083730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1851097196340561, "sentence_full_gradient_variance/max_squared_error": 1617193.75, "sentence_full_gradient_variance/metric": 3824.316162109375, "sentence_full_gradient_variance/p75": 45.730228424072266, "sentence_full_gradient_variance/p90": 54.69700622558594, "sentence_full_gradient_variance/p95": 54.69700622558594, "sentence_full_gradient_variance/p99": 61211.2421875, "state_level_variance/metric": 116.99263763427734, "state_level_variance_full_gradient/metric": 455.9720153808594, "step": 24 }, { "accuracy_reward": 0.70703125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2074081301689148, "action_level_variance/metric": 659.8510131835938, "action_level_variance_full_gradient/metric": 2592.798583984375, "adam_stats/lr_effective_max": 6.764694990124553e-05, "adam_stats/lr_effective_mean": -1.7986646894119218e-10, "adam_stats/lr_effective_min": -6.821620627306402e-05, "adam_stats/m_t_max": 0.004970856010913849, "adam_stats/m_t_mean": -2.4911699439411628e-11, "adam_stats/m_t_min": -0.004673336632549763, "adam_stats/v_t_max": 7.260526763275266e-05, "adam_stats/v_t_mean": 3.326587703217365e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.010542958974838257, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.8195699453353882, "all_logprobs": -0.12834763526916504, "all_logprobs/max": 0.0, "all_logprobs/median": -4.172325134277344e-06, "all_logprobs/min": -11.9375, "all_logprobs/p1": -2.296875, "all_logprobs/p10": -0.318359375, "all_logprobs/p25": -0.0062255859375, "all_logprobs/p5": -0.828125, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.20089809596538544, "clip_ratio": 0.0, "completion_length": 616.03125, "completion_length/correct": 514.0792236328125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 483.0, "completion_length/correct/min": 184.0, "completion_length/correct/p25": 364.0, "completion_length/correct/p75": 627.5, "completion_length/correct/var": 39740.59375, "completion_length/incorrect": 862.0755615234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 292.0, "completion_length/incorrect/p25": 732.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 50690.6015625, "completion_length/max": 1024.0, "completion_length/median": 548.0, "completion_length/min": 184.0, "completion_length/p25": 401.75, "completion_length/p75": 848.0, "completion_length/var": 68004.1328125, "epoch": 0.32, "feature_vector_variance/max_squared_error": 104515.65625, "feature_vector_variance/metric": 25271.1328125, "generated_tokens/total": 12807674.0, "grad_norm": 0.12038435786962509, "grouped_std_rewards": 0.16464881598949432, "learning_rate": 1.3995190528383292e-05, "loss": 0.0105, "mean_logprobs": -0.1318359375, "mean_logprobs/var": 0.00360107421875, "num_completions/total": 19200, "per_sentence_gradient_norm": 4.670718669891357, "per_sentence_gradient_norm/max": 314.8869323730469, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 128.27987670898438, "per_sentence_gradient_norm/var": 638.8672485351562, "per_token_feature_norm": 160.3237762451172, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 60.75, "per_token_feature_norm/p25": 125.0, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 2019.5010986328125, "per_token_full_gradient_variance/max_squared_error": 89.32473754882812, "per_token_full_gradient_variance/variance": 0.042836349457502365, "per_token_gradient_norm": 5.4543585777282715, "per_token_gradient_norm/max": 3745.29736328125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5876.5439453125, "per_token_policy_error_norm": 0.068654865026474, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05757628381252289, "policy_entropy": 0.14288626611232758, "policy_entropy/max": 3.8125, "policy_entropy/median": 5.793571472167969e-05, "policy_entropy/min": 8.326672684688674e-16, "policy_entropy/p25": 2.2314488887786865e-06, "policy_entropy/p75": 0.04052734375, "policy_entropy/var": 0.10882021486759186, "policy_error_vector_variance/max_squared_error": 2.0164947509765625, "policy_error_vector_variance/metric": 0.06857121735811234, "policy_loss": 0.010542958974838257, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.819569706916809, "policy_sharpness": 7.592527389526367, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.716444969177246, "reward": 0.70703125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2074081301689148, "rewards/accuracy_reward": 0.70703125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2074081301689148, "sentence_full_gradient_variance/max_squared_error": 700970.375, "sentence_full_gradient_variance/metric": 2932.54052734375, "sentence_full_gradient_variance/p75": 55.37390899658203, "sentence_full_gradient_variance/p90": 73.57392120361328, "sentence_full_gradient_variance/p95": 73.57392120361328, "sentence_full_gradient_variance/p99": 81522.171875, "state_level_variance/metric": 61.30434799194336, "state_level_variance_full_gradient/metric": 339.7418518066406, "step": 25 }, { "accuracy_reward": 0.80859375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15497168898582458, "action_level_variance/metric": 1084.2630615234375, "action_level_variance_full_gradient/metric": 3401.05029296875, "adam_stats/lr_effective_max": 6.570501864189282e-05, "adam_stats/lr_effective_mean": -1.134713781270058e-10, "adam_stats/lr_effective_min": -6.657830090261996e-05, "adam_stats/m_t_max": 0.004693496972322464, "adam_stats/m_t_mean": -3.193298056536342e-11, "adam_stats/m_t_min": -0.004240716341882944, "adam_stats/v_t_max": 7.27000879123807e-05, "adam_stats/v_t_mean": 3.3270458870554576e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.05570385977625847, "advantages/max": 12.9586820602417, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.305098533630371, "all_logprobs": -0.12848390638828278, "all_logprobs/max": 0.0, "all_logprobs/median": -3.2186508178710938e-06, "all_logprobs/min": -10.9375, "all_logprobs/p1": -2.3106231689453125, "all_logprobs/p10": -0.3203125, "all_logprobs/p25": -0.005859375, "all_logprobs/p5": -0.828125, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.20066896080970764, "clip_ratio": 0.0, "completion_length": 624.88671875, "completion_length/correct": 575.04345703125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 509.0, "completion_length/correct/min": 223.0, "completion_length/correct/p25": 401.0, "completion_length/correct/p75": 728.0, "completion_length/correct/var": 49241.84765625, "completion_length/incorrect": 835.448974609375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 915.0, "completion_length/incorrect/min": 305.0, "completion_length/incorrect/p25": 663.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 45019.5234375, "completion_length/max": 1024.0, "completion_length/median": 579.0, "completion_length/min": 223.0, "completion_length/p25": 419.75, "completion_length/p75": 843.75, "completion_length/var": 58882.7109375, "epoch": 0.3328, "feature_vector_variance/max_squared_error": 103932.921875, "feature_vector_variance/metric": 25227.396484375, "generated_tokens/total": 13287587.0, "grad_norm": 0.13521376252174377, "grouped_std_rewards": 0.18455711007118225, "learning_rate": 1.3860360721173195e-05, "loss": -0.0557, "mean_logprobs": -0.130859375, "mean_logprobs/var": 0.003326416015625, "num_completions/total": 19968, "per_sentence_gradient_norm": 5.570864677429199, "per_sentence_gradient_norm/max": 425.00677490234375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 31.44040298461914, "per_sentence_gradient_norm/p99": 142.21234130859375, "per_sentence_gradient_norm/var": 1054.6015625, "per_token_feature_norm": 159.97698974609375, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 63.5, "per_token_feature_norm/p25": 125.0, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 1999.612060546875, "per_token_full_gradient_variance/max_squared_error": 470.9205017089844, "per_token_full_gradient_variance/variance": 0.06079423055052757, "per_token_gradient_norm": 6.233417987823486, "per_token_gradient_norm/max": 6001.10888671875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 8582.6142578125, "per_token_policy_error_norm": 0.06907250732183456, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.058172766119241714, "policy_entropy": 0.1420539766550064, "policy_entropy/max": 3.8125, "policy_entropy/median": 4.506111145019531e-05, "policy_entropy/min": 2.19824158875781e-14, "policy_entropy/p25": 1.6614794731140137e-06, "policy_entropy/p75": 0.037109375, "policy_entropy/var": 0.10791933536529541, "policy_error_vector_variance/max_squared_error": 2.0182578563690186, "policy_error_vector_variance/metric": 0.06900110840797424, "policy_loss": -0.05570385977625847, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.305098533630371, "policy_sharpness": 7.633749485015869, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.2342529296875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.601491928100586, "reward": 0.80859375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15497168898582458, "rewards/accuracy_reward": 0.80859375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15497168898582458, "sentence_full_gradient_variance/max_squared_error": 766545.25, "sentence_full_gradient_variance/metric": 3864.37255859375, "sentence_full_gradient_variance/p75": 44.09136962890625, "sentence_full_gradient_variance/p90": 65.16825866699219, "sentence_full_gradient_variance/p95": 65.16825866699219, "sentence_full_gradient_variance/p99": 75250.3359375, "state_level_variance/metric": 105.59833526611328, "state_level_variance_full_gradient/metric": 463.32196044921875, "step": 26 }, { "accuracy_reward": 0.7252604365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19951753318309784, "action_level_variance/metric": 881.3820190429688, "action_level_variance_full_gradient/metric": 2965.595703125, "adam_stats/lr_effective_max": 6.698070501442999e-05, "adam_stats/lr_effective_mean": 1.6919174394836034e-11, "adam_stats/lr_effective_min": -6.578981265192851e-05, "adam_stats/m_t_max": 0.0044423481449484825, "adam_stats/m_t_mean": -2.8223767073454198e-11, "adam_stats/m_t_min": -0.003938714973628521, "adam_stats/v_t_max": 7.262876897584647e-05, "adam_stats/v_t_mean": 3.325547736493517e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.006702922284603119, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.907405376434326, "all_logprobs": -0.11812495440244675, "all_logprobs/max": 0.0, "all_logprobs/median": -2.2649765014648438e-06, "all_logprobs/min": -12.5, "all_logprobs/p1": -2.1875, "all_logprobs/p10": -0.28125, "all_logprobs/p25": -0.0032806396484375, "all_logprobs/p5": -0.7578125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.18307697772979736, "clip_ratio": 0.0, "completion_length": 674.3893432617188, "completion_length/correct": 578.8312377929688, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 531.0, "completion_length/correct/min": 194.0, "completion_length/correct/p25": 408.0, "completion_length/correct/p75": 725.0, "completion_length/correct/var": 46819.2421875, "completion_length/incorrect": 926.6445922851562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 291.0, "completion_length/incorrect/p25": 1017.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 38224.2421875, "completion_length/max": 1024.0, "completion_length/median": 637.0, "completion_length/min": 194.0, "completion_length/p25": 440.75, "completion_length/p75": 1006.0, "completion_length/var": 68541.390625, "epoch": 0.3456, "feature_vector_variance/max_squared_error": 105770.4296875, "feature_vector_variance/metric": 24714.208984375, "generated_tokens/total": 13805518.0, "grad_norm": 0.10042519122362137, "grouped_std_rewards": 0.19612276554107666, "learning_rate": 1.3717781794162813e-05, "loss": 0.0067, "mean_logprobs": -0.11962890625, "mean_logprobs/var": 0.00244140625, "num_completions/total": 20736, "per_sentence_gradient_norm": 5.884788513183594, "per_sentence_gradient_norm/max": 355.9634094238281, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 31.665090560913086, "per_sentence_gradient_norm/p99": 155.40000915527344, "per_sentence_gradient_norm/var": 847.855224609375, "per_token_feature_norm": 158.27645874023438, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 60.25, "per_token_feature_norm/p25": 125.0, "per_token_feature_norm/p75": 186.0, "per_token_feature_norm/var": 1847.87060546875, "per_token_full_gradient_variance/max_squared_error": 120.07005310058594, "per_token_full_gradient_variance/variance": 0.057002101093530655, "per_token_gradient_norm": 6.745466709136963, "per_token_gradient_norm/max": 4465.07568359375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 7784.69775390625, "per_token_policy_error_norm": 0.06384122371673584, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05390560254454613, "policy_entropy": 0.13077667355537415, "policy_entropy/max": 3.734375, "policy_entropy/median": 3.314018249511719e-05, "policy_entropy/min": 2.020605904817785e-14, "policy_entropy/p25": 1.6316771507263184e-06, "policy_entropy/p75": 0.0238037109375, "policy_entropy/var": 0.0977574959397316, "policy_error_vector_variance/max_squared_error": 2.0147221088409424, "policy_error_vector_variance/metric": 0.06378882378339767, "policy_loss": 0.006702917627990246, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.907405138015747, "policy_sharpness": 7.772586345672607, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.5, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.048362731933594, "reward": 0.7252604365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19951753318309784, "rewards/accuracy_reward": 0.7252604365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19951753318309784, "sentence_full_gradient_variance/max_squared_error": 611915.8125, "sentence_full_gradient_variance/metric": 3358.69189453125, "sentence_full_gradient_variance/p75": 51.40003967285156, "sentence_full_gradient_variance/p90": 86.10294342041016, "sentence_full_gradient_variance/p95": 86.10294342041016, "sentence_full_gradient_variance/p99": 60666.02734375, "state_level_variance/metric": 76.33718872070312, "state_level_variance_full_gradient/metric": 393.09576416015625, "step": 27 }, { "accuracy_reward": 0.7265625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19892846047878265, "action_level_variance/metric": 913.6650390625, "action_level_variance_full_gradient/metric": 2611.17431640625, "adam_stats/lr_effective_max": 6.654956814600155e-05, "adam_stats/lr_effective_mean": 3.4957427577642086e-12, "adam_stats/lr_effective_min": -6.805858720326796e-05, "adam_stats/m_t_max": 0.003870702348649502, "adam_stats/m_t_mean": -2.2182360115419186e-11, "adam_stats/m_t_min": -0.0034357428085058928, "adam_stats/v_t_max": 7.257823017425835e-05, "adam_stats/v_t_mean": 3.323935527863031e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.016852371394634247, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.32533597946167, "all_logprobs": -0.13311846554279327, "all_logprobs/max": 0.0, "all_logprobs/median": -3.814697265625e-06, "all_logprobs/min": -12.25, "all_logprobs/p1": -2.34375, "all_logprobs/p10": -0.34765625, "all_logprobs/p25": -0.007659912109375, "all_logprobs/p5": -0.859375, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.2083868831396103, "clip_ratio": 0.0, "completion_length": 622.5911865234375, "completion_length/correct": 532.9766845703125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 512.0, "completion_length/correct/min": 163.0, "completion_length/correct/p25": 362.0, "completion_length/correct/p75": 683.0, "completion_length/correct/var": 44630.625, "completion_length/incorrect": 860.7095336914062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 970.0, "completion_length/incorrect/min": 243.0, "completion_length/incorrect/p25": 712.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 41616.90625, "completion_length/max": 1024.0, "completion_length/median": 584.0, "completion_length/min": 163.0, "completion_length/p25": 421.0, "completion_length/p75": 831.5, "completion_length/var": 65117.89453125, "epoch": 0.3584, "feature_vector_variance/max_squared_error": 109300.5234375, "feature_vector_variance/metric": 25440.474609375, "generated_tokens/total": 14283668.0, "grad_norm": 0.0920061469078064, "grouped_std_rewards": 0.19039130210876465, "learning_rate": 1.3567627457812107e-05, "loss": 0.0169, "mean_logprobs": -0.1337890625, "mean_logprobs/var": 0.003875732421875, "num_completions/total": 21504, "per_sentence_gradient_norm": 5.836501121520996, "per_sentence_gradient_norm/max": 310.939208984375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 32.905521392822266, "per_sentence_gradient_norm/p99": 159.37960815429688, "per_sentence_gradient_norm/var": 880.7471313476562, "per_token_feature_norm": 161.15350341796875, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 153.0, "per_token_feature_norm/min": 62.5, "per_token_feature_norm/p25": 126.0, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 2008.978515625, "per_token_full_gradient_variance/max_squared_error": 1640.016357421875, "per_token_full_gradient_variance/variance": 0.07234231382608414, "per_token_gradient_norm": 6.892120838165283, "per_token_gradient_norm/max": 6645.94091796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 9777.638671875, "per_token_policy_error_norm": 0.07145841419696808, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.060042716562747955, "policy_entropy": 0.1470557451248169, "policy_entropy/max": 3.78125, "policy_entropy/median": 5.435943603515625e-05, "policy_entropy/min": 6.261657858885883e-14, "policy_entropy/p25": 1.9371509552001953e-06, "policy_entropy/p75": 0.049072265625, "policy_entropy/var": 0.11094148457050323, "policy_error_vector_variance/max_squared_error": 2.0158796310424805, "policy_error_vector_variance/metric": 0.07140326499938965, "policy_loss": 0.016852371394634247, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.325336217880249, "policy_sharpness": 7.5522661209106445, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.011474609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.886159896850586, "reward": 0.7265625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19892846047878265, "rewards/accuracy_reward": 0.7265625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19892846047878265, "sentence_full_gradient_variance/max_squared_error": 405557.6875, "sentence_full_gradient_variance/metric": 2957.35107421875, "sentence_full_gradient_variance/p75": 53.272335052490234, "sentence_full_gradient_variance/p90": 60.67972946166992, "sentence_full_gradient_variance/p95": 60.67972946166992, "sentence_full_gradient_variance/p99": 86969.984375, "state_level_variance/metric": 80.98699951171875, "state_level_variance_full_gradient/metric": 346.1767272949219, "step": 28 }, { "accuracy_reward": 0.7825521230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1703862100839615, "action_level_variance/metric": 1601.5775146484375, "action_level_variance_full_gradient/metric": 2682.8564453125, "adam_stats/lr_effective_max": 6.842275615781546e-05, "adam_stats/lr_effective_mean": -4.1183664117872e-11, "adam_stats/lr_effective_min": -6.509861850645393e-05, "adam_stats/m_t_max": 0.0037888078950345516, "adam_stats/m_t_mean": -2.3573599669535206e-11, "adam_stats/m_t_min": -0.004093128256499767, "adam_stats/v_t_max": 7.264155283337459e-05, "adam_stats/v_t_mean": 3.3269045070921655e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.001917931018397212, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.618875503540039, "all_logprobs": -0.12686659395694733, "all_logprobs/max": 0.0, "all_logprobs/median": -2.7418136596679688e-06, "all_logprobs/min": -11.25, "all_logprobs/p1": -2.25, "all_logprobs/p10": -0.314453125, "all_logprobs/p25": -0.006072998046875, "all_logprobs/p5": -0.82421875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.19756309688091278, "clip_ratio": 0.0, "completion_length": 572.7448120117188, "completion_length/correct": 505.0116271972656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 461.0, "completion_length/correct/min": 126.0, "completion_length/correct/p25": 350.0, "completion_length/correct/p75": 621.0, "completion_length/correct/var": 42021.42578125, "completion_length/incorrect": 816.5030517578125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 257.0, "completion_length/incorrect/p25": 578.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 60241.4140625, "completion_length/max": 1024.0, "completion_length/median": 502.0, "completion_length/min": 126.0, "completion_length/p25": 378.75, "completion_length/p75": 731.25, "completion_length/var": 62441.98828125, "epoch": 0.3712, "feature_vector_variance/max_squared_error": 105667.8515625, "feature_vector_variance/metric": 25662.814453125, "generated_tokens/total": 14723536.0, "grad_norm": 0.1785881519317627, "grouped_std_rewards": 0.12502412497997284, "learning_rate": 1.3410080652050414e-05, "loss": -0.0019, "mean_logprobs": -0.1259765625, "mean_logprobs/var": 0.003021240234375, "num_completions/total": 22272, "per_sentence_gradient_norm": 5.65326452255249, "per_sentence_gradient_norm/max": 658.408935546875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 226.4359588623047, "per_sentence_gradient_norm/var": 1571.6644287109375, "per_token_feature_norm": 161.201416015625, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 154.0, "per_token_feature_norm/min": 61.5, "per_token_feature_norm/p25": 127.0, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 1926.22802734375, "per_token_full_gradient_variance/max_squared_error": 1322.7698974609375, "per_token_full_gradient_variance/variance": 0.11226382851600647, "per_token_gradient_norm": 7.589423656463623, "per_token_gradient_norm/max": 7511.59228515625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 15602.9892578125, "per_token_policy_error_norm": 0.06839156150817871, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05752430856227875, "policy_entropy": 0.14007049798965454, "policy_entropy/max": 3.78125, "policy_entropy/median": 3.981590270996094e-05, "policy_entropy/min": 6.661338147750939e-15, "policy_entropy/p25": 1.2516975402832031e-06, "policy_entropy/p75": 0.040283203125, "policy_entropy/var": 0.10349223017692566, "policy_error_vector_variance/max_squared_error": 2.014653205871582, "policy_error_vector_variance/metric": 0.06832646578550339, "policy_loss": -0.0019179371884092689, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.79339599609375, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.618875741958618, "policy_sharpness": 7.626527786254883, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.1875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.572869300842285, "reward": 0.7825521230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1703862100839615, "rewards/accuracy_reward": 0.7825521230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1703862100839615, "sentence_full_gradient_variance/max_squared_error": 681862.4375, "sentence_full_gradient_variance/metric": 3017.06689453125, "sentence_full_gradient_variance/p75": 101.96746063232422, "sentence_full_gradient_variance/p90": 118.49298858642578, "sentence_full_gradient_variance/p95": 118.49298858642578, "sentence_full_gradient_variance/p99": 69507.734375, "state_level_variance/metric": 170.00869750976562, "state_level_variance_full_gradient/metric": 334.21075439453125, "step": 29 }, { "accuracy_reward": 0.7981771230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16130046546459198, "action_level_variance/metric": 918.77734375, "action_level_variance_full_gradient/metric": 2586.1357421875, "adam_stats/lr_effective_max": 6.514934648294002e-05, "adam_stats/lr_effective_mean": -1.3255876431250702e-11, "adam_stats/lr_effective_min": -6.32787196082063e-05, "adam_stats/m_t_max": 0.003497665049508214, "adam_stats/m_t_mean": -2.044830450798063e-11, "adam_stats/m_t_min": -0.0038135149516165257, "adam_stats/v_t_max": 7.258664845721796e-05, "adam_stats/v_t_mean": 3.324885072125694e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.010604890063405037, "advantages/max": 12.9586820602417, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.976581335067749, "all_logprobs": -0.12745970487594604, "all_logprobs/max": 0.0, "all_logprobs/median": -3.337860107421875e-06, "all_logprobs/min": -13.0625, "all_logprobs/p1": -2.296875, "all_logprobs/p10": -0.314453125, "all_logprobs/p25": -0.00640869140625, "all_logprobs/p5": -0.82421875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.19927506148815155, "clip_ratio": 0.0, "completion_length": 576.6068115234375, "completion_length/correct": 523.8597412109375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 474.0, "completion_length/correct/min": 173.0, "completion_length/correct/p25": 382.0, "completion_length/correct/p75": 634.0, "completion_length/correct/var": 37712.78515625, "completion_length/incorrect": 785.212890625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 842.0, "completion_length/incorrect/min": 205.0, "completion_length/incorrect/p25": 572.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 63134.48046875, "completion_length/max": 1024.0, "completion_length/median": 514.0, "completion_length/min": 173.0, "completion_length/p25": 399.5, "completion_length/p75": 725.0, "completion_length/var": 53785.55078125, "epoch": 0.384, "feature_vector_variance/max_squared_error": 113024.84375, "feature_vector_variance/metric": 26024.7421875, "generated_tokens/total": 15166370.0, "grad_norm": 0.0888582244515419, "grouped_std_rewards": 0.13208399713039398, "learning_rate": 1.3245333323392335e-05, "loss": -0.0106, "mean_logprobs": -0.1279296875, "mean_logprobs/var": 0.00341796875, "num_completions/total": 23040, "per_sentence_gradient_norm": 4.210992813110352, "per_sentence_gradient_norm/max": 512.601806640625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 134.23464965820312, "per_sentence_gradient_norm/var": 902.2197875976562, "per_token_feature_norm": 162.61183166503906, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 156.0, "per_token_feature_norm/min": 63.75, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 1909.1529541015625, "per_token_full_gradient_variance/max_squared_error": 608.7572021484375, "per_token_full_gradient_variance/variance": 0.06988346576690674, "per_token_gradient_norm": 5.6456990242004395, "per_token_gradient_norm/max": 7819.626953125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 9543.8896484375, "per_token_policy_error_norm": 0.06847098469734192, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05774032324552536, "policy_entropy": 0.14128859341144562, "policy_entropy/max": 3.796875, "policy_entropy/median": 4.7206878662109375e-05, "policy_entropy/min": 9.603429163007604e-15, "policy_entropy/p25": 1.430511474609375e-06, "policy_entropy/p75": 0.04052734375, "policy_entropy/var": 0.1066129133105278, "policy_error_vector_variance/max_squared_error": 2.023103713989258, "policy_error_vector_variance/metric": 0.06836915761232376, "policy_loss": -0.010604883544147015, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.976581335067749, "policy_sharpness": 7.598212718963623, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.1405029296875, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.685304641723633, "reward": 0.7981771230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16130046546459198, "rewards/accuracy_reward": 0.7981771230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16130046546459198, "sentence_full_gradient_variance/max_squared_error": 694287.75, "sentence_full_gradient_variance/metric": 2932.60546875, "sentence_full_gradient_variance/p75": 25.752084732055664, "sentence_full_gradient_variance/p90": 78.28060150146484, "sentence_full_gradient_variance/p95": 78.28060150146484, "sentence_full_gradient_variance/p99": 44755.86328125, "state_level_variance/metric": 98.13697814941406, "state_level_variance_full_gradient/metric": 346.4698181152344, "step": 30 }, { "accuracy_reward": 0.7408854365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19222448766231537, "action_level_variance/metric": 1166.23828125, "action_level_variance_full_gradient/metric": 3239.488037109375, "adam_stats/lr_effective_max": 6.397221295628697e-05, "adam_stats/lr_effective_mean": -2.8186617970216155e-11, "adam_stats/lr_effective_min": -6.591320561710745e-05, "adam_stats/m_t_max": 0.003550940426066518, "adam_stats/m_t_mean": -8.24424122719547e-12, "adam_stats/m_t_min": -0.004701297264546156, "adam_stats/v_t_max": 7.423661736538634e-05, "adam_stats/v_t_mean": 3.444652332471043e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.09959080070257187, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.240631580352783, "all_logprobs": -0.1312340646982193, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-06, "all_logprobs/min": -13.5, "all_logprobs/p1": -2.328125, "all_logprobs/p10": -0.337890625, "all_logprobs/p25": -0.0079345703125, "all_logprobs/p5": -0.8359375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.20427945256233215, "clip_ratio": 0.0, "completion_length": 597.2435302734375, "completion_length/correct": 502.1722412109375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 447.0, "completion_length/correct/min": 166.0, "completion_length/correct/p25": 357.0, "completion_length/correct/p75": 625.0, "completion_length/correct/var": 40411.171875, "completion_length/incorrect": 869.0803833007812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 301.0, "completion_length/incorrect/p25": 703.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 45552.57421875, "completion_length/max": 1024.0, "completion_length/median": 543.0, "completion_length/min": 166.0, "completion_length/p25": 378.0, "completion_length/p75": 814.5, "completion_length/var": 67563.3046875, "epoch": 0.3968, "feature_vector_variance/max_squared_error": 110643.65625, "feature_vector_variance/metric": 25622.546875, "generated_tokens/total": 15625053.0, "grad_norm": 0.4931849241256714, "grouped_std_rewards": 0.13295979797840118, "learning_rate": 1.3073586191080456e-05, "loss": 0.0996, "mean_logprobs": -0.1328125, "mean_logprobs/var": 0.0035552978515625, "num_completions/total": 23808, "per_sentence_gradient_norm": 5.312954902648926, "per_sentence_gradient_norm/max": 441.848388671875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 166.0408935546875, "per_sentence_gradient_norm/var": 1139.4945068359375, "per_token_feature_norm": 161.7657928466797, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 154.0, "per_token_feature_norm/min": 63.75, "per_token_feature_norm/p25": 127.5, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 1934.24951171875, "per_token_full_gradient_variance/max_squared_error": 280.39154052734375, "per_token_full_gradient_variance/variance": 0.07619213312864304, "per_token_gradient_norm": 6.613218784332275, "per_token_gradient_norm/max": 6263.37158203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 10927.8125, "per_token_policy_error_norm": 0.07052220404148102, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05914692208170891, "policy_entropy": 0.14540590345859528, "policy_entropy/max": 3.796875, "policy_entropy/median": 5.1021575927734375e-05, "policy_entropy/min": 4.085620730620576e-14, "policy_entropy/p25": 1.5720725059509277e-06, "policy_entropy/p75": 0.0498046875, "policy_entropy/var": 0.10777761042118073, "policy_error_vector_variance/max_squared_error": 2.0178799629211426, "policy_error_vector_variance/metric": 0.07045117020606995, "policy_loss": 0.09959080070257187, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.240631580352783, "policy_sharpness": 7.5443549156188965, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.83449649810791, "reward": 0.7408854365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19222448766231537, "rewards/accuracy_reward": 0.7408854365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19222448766231537, "sentence_full_gradient_variance/max_squared_error": 931052.1875, "sentence_full_gradient_variance/metric": 3640.697998046875, "sentence_full_gradient_variance/p75": 94.89314270019531, "sentence_full_gradient_variance/p90": 319.97100830078125, "sentence_full_gradient_variance/p95": 319.97100830078125, "sentence_full_gradient_variance/p99": 51127.25, "state_level_variance/metric": 118.78966522216797, "state_level_variance_full_gradient/metric": 401.20965576171875, "step": 31 }, { "accuracy_reward": 0.7252604365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19951753318309784, "action_level_variance/metric": 1304.5963134765625, "action_level_variance_full_gradient/metric": 5085.25244140625, "adam_stats/lr_effective_max": 6.758701056241989e-05, "adam_stats/lr_effective_mean": 1.1834266205879018e-11, "adam_stats/lr_effective_min": -6.364363071043044e-05, "adam_stats/m_t_max": 0.0033362270332872868, "adam_stats/m_t_mean": 6.18597404203336e-12, "adam_stats/m_t_min": -0.0042456635273993015, "adam_stats/v_t_max": 7.418333552777767e-05, "adam_stats/v_t_mean": 3.4464993792920895e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.009409806691110134, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.803623676300049, "all_logprobs": -0.12298998981714249, "all_logprobs/max": 0.0, "all_logprobs/median": -2.5033950805664062e-06, "all_logprobs/min": -11.5625, "all_logprobs/p1": -2.234375, "all_logprobs/p10": -0.302734375, "all_logprobs/p25": -0.0052490234375, "all_logprobs/p5": -0.7890625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.19243943691253662, "clip_ratio": 0.0, "completion_length": 608.62890625, "completion_length/correct": 530.3572998046875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 506.0, "completion_length/correct/min": 69.0, "completion_length/correct/p25": 368.0, "completion_length/correct/p75": 662.0, "completion_length/correct/var": 42619.03125, "completion_length/incorrect": 815.251220703125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 893.0, "completion_length/incorrect/min": 271.0, "completion_length/incorrect/p25": 624.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 55822.6484375, "completion_length/max": 1024.0, "completion_length/median": 567.0, "completion_length/min": 69.0, "completion_length/p25": 409.75, "completion_length/p75": 800.0, "completion_length/var": 62372.28125, "epoch": 0.4096, "feature_vector_variance/max_squared_error": 104837.9375, "feature_vector_variance/metric": 25800.62109375, "generated_tokens/total": 16092480.0, "grad_norm": 0.14228376746177673, "grouped_std_rewards": 0.1704835146665573, "learning_rate": 1.2895048502539883e-05, "loss": 0.0094, "mean_logprobs": -0.1259765625, "mean_logprobs/var": 0.00421142578125, "num_completions/total": 24576, "per_sentence_gradient_norm": 5.565011024475098, "per_sentence_gradient_norm/max": 723.393310546875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 133.00961303710938, "per_sentence_gradient_norm/var": 1275.28759765625, "per_token_feature_norm": 161.70309448242188, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 155.0, "per_token_feature_norm/min": 61.0, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 1820.5494384765625, "per_token_full_gradient_variance/max_squared_error": 287448.53125, "per_token_full_gradient_variance/variance": 0.684951901435852, "per_token_gradient_norm": 6.5251874923706055, "per_token_gradient_norm/max": 7637.77490234375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 10402.140625, "per_token_policy_error_norm": 0.06633231788873672, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.056086309254169464, "policy_entropy": 0.13609333336353302, "policy_entropy/max": 3.796875, "policy_entropy/median": 3.647804260253906e-05, "policy_entropy/min": 1.3500311979441904e-13, "policy_entropy/p25": 1.1995434761047363e-06, "policy_entropy/p75": 0.033935546875, "policy_entropy/var": 0.101021908223629, "policy_error_vector_variance/max_squared_error": 2.0142011642456055, "policy_error_vector_variance/metric": 0.06625385582447052, "policy_loss": 0.009409795515239239, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.803623676300049, "policy_sharpness": 7.65946102142334, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.249999523162842, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.43028736114502, "reward": 0.7252604365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19951753318309784, "rewards/accuracy_reward": 0.7252604365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19951753318309784, "sentence_full_gradient_variance/max_squared_error": 1551383.625, "sentence_full_gradient_variance/metric": 5756.8017578125, "sentence_full_gradient_variance/p75": 65.47956848144531, "sentence_full_gradient_variance/p90": 273.92413330078125, "sentence_full_gradient_variance/p95": 273.92413330078125, "sentence_full_gradient_variance/p99": 120406.625, "state_level_variance/metric": 133.49578857421875, "state_level_variance_full_gradient/metric": 671.549560546875, "step": 32 }, { "accuracy_reward": 0.7395833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19285094738006592, "action_level_variance/metric": 1147.342529296875, "action_level_variance_full_gradient/metric": 4546.294921875, "adam_stats/lr_effective_max": 5.99724444327876e-05, "adam_stats/lr_effective_mean": -3.717140484305048e-11, "adam_stats/lr_effective_min": -6.235957698663697e-05, "adam_stats/m_t_max": 0.003142222296446562, "adam_stats/m_t_mean": 6.617955315701973e-12, "adam_stats/m_t_min": -0.0039492710493505, "adam_stats/v_t_max": 7.41138428566046e-05, "adam_stats/v_t_mean": 3.4433124754262856e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0017391828587278724, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 2.9905848503112793, "all_logprobs": -0.12040477991104126, "all_logprobs/max": 0.0, "all_logprobs/median": -2.0265579223632812e-06, "all_logprobs/min": -14.5, "all_logprobs/p1": -2.203125, "all_logprobs/p10": -0.283203125, "all_logprobs/p25": -0.00421142578125, "all_logprobs/p5": -0.765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1901954710483551, "clip_ratio": 0.0, "completion_length": 618.96875, "completion_length/correct": 537.1707763671875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 502.0, "completion_length/correct/min": 179.0, "completion_length/correct/p25": 367.75, "completion_length/correct/p75": 663.25, "completion_length/correct/var": 43210.421875, "completion_length/incorrect": 851.2749633789062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 235.0, "completion_length/incorrect/p25": 658.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 52105.03125, "completion_length/max": 1024.0, "completion_length/median": 584.0, "completion_length/min": 179.0, "completion_length/p25": 405.75, "completion_length/p75": 829.25, "completion_length/var": 64488.7734375, "epoch": 0.4224, "feature_vector_variance/max_squared_error": 102140.7890625, "feature_vector_variance/metric": 25549.26171875, "generated_tokens/total": 16567848.0, "grad_norm": 0.06015798822045326, "grouped_std_rewards": 0.1775471419095993, "learning_rate": 1.270993777844248e-05, "loss": -0.0017, "mean_logprobs": -0.12353515625, "mean_logprobs/var": 0.003753662109375, "num_completions/total": 25344, "per_sentence_gradient_norm": 5.366275787353516, "per_sentence_gradient_norm/max": 526.7509765625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 25.904497146606445, "per_sentence_gradient_norm/p99": 130.7364044189453, "per_sentence_gradient_norm/var": 1120.0040283203125, "per_token_feature_norm": 160.82479858398438, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 154.0, "per_token_feature_norm/min": 60.5, "per_token_feature_norm/p25": 127.5, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 1806.57275390625, "per_token_full_gradient_variance/max_squared_error": 469.95477294921875, "per_token_full_gradient_variance/variance": 0.06915782392024994, "per_token_gradient_norm": 5.999410629272461, "per_token_gradient_norm/max": 6724.80517578125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 8731.18359375, "per_token_policy_error_norm": 0.06464548408985138, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.054618414491415024, "policy_entropy": 0.1336805820465088, "policy_entropy/max": 3.734375, "policy_entropy/median": 2.9921531677246094e-05, "policy_entropy/min": 4.385380947269368e-15, "policy_entropy/p25": 1.1101365089416504e-06, "policy_entropy/p75": 0.029541015625, "policy_entropy/var": 0.0999864712357521, "policy_error_vector_variance/max_squared_error": 2.0156474113464355, "policy_error_vector_variance/metric": 0.0645647868514061, "policy_loss": -0.0017391815781593323, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.9905848503112793, "policy_sharpness": 7.711147308349609, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.373046398162842, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.300061225891113, "reward": 0.7395833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19285094738006592, "rewards/accuracy_reward": 0.7395833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19285094738006592, "sentence_full_gradient_variance/max_squared_error": 2522037.5, "sentence_full_gradient_variance/metric": 5152.98291015625, "sentence_full_gradient_variance/p75": 69.33831787109375, "sentence_full_gradient_variance/p90": 219.79295349121094, "sentence_full_gradient_variance/p95": 219.79295349121094, "sentence_full_gradient_variance/p99": 48101.76171875, "state_level_variance/metric": 115.82743835449219, "state_level_variance_full_gradient/metric": 606.6888427734375, "step": 33 }, { "accuracy_reward": 0.7421875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.191594660282135, "action_level_variance/metric": 738.1448974609375, "action_level_variance_full_gradient/metric": 2707.552001953125, "adam_stats/lr_effective_max": 5.9664598666131496e-05, "adam_stats/lr_effective_mean": -3.4853685909608245e-11, "adam_stats/lr_effective_min": -6.346932059386745e-05, "adam_stats/m_t_max": 0.002791378879919648, "adam_stats/m_t_mean": 8.8999163941339e-12, "adam_stats/m_t_min": -0.0034994122106581926, "adam_stats/v_t_max": 7.404021016554907e-05, "adam_stats/v_t_mean": 3.4406128120167967e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.03916705772280693, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.9327360391616821, "all_logprobs": -0.12032566219568253, "all_logprobs/max": 0.0, "all_logprobs/median": -2.0265579223632812e-06, "all_logprobs/min": -13.75, "all_logprobs/p1": -2.203125, "all_logprobs/p10": -0.28125, "all_logprobs/p25": -0.00408935546875, "all_logprobs/p5": -0.76953125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.18924453854560852, "clip_ratio": 0.0, "completion_length": 615.1784057617188, "completion_length/correct": 535.3140258789062, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 498.0, "completion_length/correct/min": 149.0, "completion_length/correct/p25": 378.25, "completion_length/correct/p75": 656.0, "completion_length/correct/var": 43760.26171875, "completion_length/incorrect": 845.0908813476562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 216.0, "completion_length/incorrect/p25": 662.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 53344.59765625, "completion_length/max": 1024.0, "completion_length/median": 552.0, "completion_length/min": 149.0, "completion_length/p25": 415.75, "completion_length/p75": 827.25, "completion_length/var": 64550.65625, "epoch": 0.4352, "feature_vector_variance/max_squared_error": 104685.421875, "feature_vector_variance/metric": 25450.794921875, "generated_tokens/total": 17040304.0, "grad_norm": 0.07252345234155655, "grouped_std_rewards": 0.17334292829036713, "learning_rate": 1.2518479547691437e-05, "loss": 0.0392, "mean_logprobs": -0.12353515625, "mean_logprobs/var": 0.0034027099609375, "num_completions/total": 26112, "per_sentence_gradient_norm": 4.6060638427734375, "per_sentence_gradient_norm/max": 402.2428894042969, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 8.186493873596191, "per_sentence_gradient_norm/p99": 158.6878204345703, "per_sentence_gradient_norm/var": 717.86376953125, "per_token_feature_norm": 160.8482208251953, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 154.0, "per_token_feature_norm/min": 61.0, "per_token_feature_norm/p25": 127.5, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 1810.271728515625, "per_token_full_gradient_variance/max_squared_error": 587.1907958984375, "per_token_full_gradient_variance/variance": 0.04778974503278732, "per_token_gradient_norm": 5.011244773864746, "per_token_gradient_norm/max": 5907.55419921875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6543.5068359375, "per_token_policy_error_norm": 0.06475617736577988, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05488913133740425, "policy_entropy": 0.13325011730194092, "policy_entropy/max": 3.59375, "policy_entropy/median": 2.9087066650390625e-05, "policy_entropy/min": 4.773959005888173e-15, "policy_entropy/p25": 1.0654330253601074e-06, "policy_entropy/p75": 0.0279541015625, "policy_entropy/var": 0.09956613928079605, "policy_error_vector_variance/max_squared_error": 2.015536069869995, "policy_error_vector_variance/metric": 0.06469320505857468, "policy_loss": 0.03916705772280693, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -7.48191499710083, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.9327360391616821, "policy_sharpness": 7.719906806945801, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.285324096679688, "reward": 0.7421875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.191594660282135, "rewards/accuracy_reward": 0.7421875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.191594660282135, "sentence_full_gradient_variance/max_squared_error": 430479.5, "sentence_full_gradient_variance/metric": 3059.42578125, "sentence_full_gradient_variance/p75": 94.04957580566406, "sentence_full_gradient_variance/p90": 97.69635772705078, "sentence_full_gradient_variance/p95": 97.69635772705078, "sentence_full_gradient_variance/p99": 59932.52734375, "state_level_variance/metric": 71.80020904541016, "state_level_variance_full_gradient/metric": 351.873779296875, "step": 34 }, { "accuracy_reward": 0.8033854365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15816323459148407, "action_level_variance/metric": 1139.1392822265625, "action_level_variance_full_gradient/metric": 3319.740966796875, "adam_stats/lr_effective_max": 5.956537279416807e-05, "adam_stats/lr_effective_mean": -7.815054159365786e-11, "adam_stats/lr_effective_min": -5.596770643023774e-05, "adam_stats/m_t_max": 0.002195231383666396, "adam_stats/m_t_mean": 6.486617666612293e-12, "adam_stats/m_t_min": -0.0029068561270833015, "adam_stats/v_t_max": 7.397643639706075e-05, "adam_stats/v_t_mean": 3.4380965956148923e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.08858293294906616, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.1948330402374268, "all_logprobs": -0.13468122482299805, "all_logprobs/max": 0.0, "all_logprobs/median": -3.6954879760742188e-06, "all_logprobs/min": -12.5, "all_logprobs/p1": -2.375, "all_logprobs/p10": -0.34765625, "all_logprobs/p25": -0.00860595703125, "all_logprobs/p5": -0.85546875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.21683815121650696, "clip_ratio": 0.0, "completion_length": 560.1771240234375, "completion_length/correct": 501.708251953125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 462.0, "completion_length/correct/min": 167.0, "completion_length/correct/p25": 358.0, "completion_length/correct/p75": 621.0, "completion_length/correct/var": 39220.890625, "completion_length/incorrect": 799.0861206054688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 901.0, "completion_length/incorrect/min": 253.0, "completion_length/incorrect/p25": 574.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 61954.46484375, "completion_length/max": 1024.0, "completion_length/median": 509.0, "completion_length/min": 167.0, "completion_length/p25": 375.0, "completion_length/p75": 712.0, "completion_length/var": 57602.62890625, "epoch": 0.448, "feature_vector_variance/max_squared_error": 110642.7421875, "feature_vector_variance/metric": 25661.23828125, "generated_tokens/total": 17470520.0, "grad_norm": 0.07240030169487, "grouped_std_rewards": 0.1920093297958374, "learning_rate": 1.2320907072649045e-05, "loss": 0.0886, "mean_logprobs": -0.1318359375, "mean_logprobs/var": 0.0048828125, "num_completions/total": 26880, "per_sentence_gradient_norm": 6.132970333099365, "per_sentence_gradient_norm/max": 423.0939025878906, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 34.581600189208984, "per_sentence_gradient_norm/p99": 169.5654754638672, "per_sentence_gradient_norm/var": 1102.962158203125, "per_token_feature_norm": 161.53465270996094, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 155.0, "per_token_feature_norm/min": 61.0, "per_token_feature_norm/p25": 128.0, "per_token_feature_norm/p75": 190.0, "per_token_feature_norm/var": 1810.0904541015625, "per_token_full_gradient_variance/max_squared_error": 1003.7889404296875, "per_token_full_gradient_variance/variance": 0.0816141813993454, "per_token_gradient_norm": 7.485046863555908, "per_token_gradient_norm/max": 6541.5615234375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 11056.16796875, "per_token_policy_error_norm": 0.07177506387233734, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06061546504497528, "policy_entropy": 0.1480938345193863, "policy_entropy/max": 3.78125, "policy_entropy/median": 5.173683166503906e-05, "policy_entropy/min": 2.0539125955565396e-14, "policy_entropy/p25": 1.5422701835632324e-06, "policy_entropy/p75": 0.052978515625, "policy_entropy/var": 0.11497347801923752, "policy_error_vector_variance/max_squared_error": 2.016406297683716, "policy_error_vector_variance/metric": 0.07161011546850204, "policy_loss": 0.08858293294906616, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.194833517074585, "policy_sharpness": 7.513739109039307, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.8831634521484375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.973040580749512, "reward": 0.8033854365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15816323459148407, "rewards/accuracy_reward": 0.8033854365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15816323459148407, "sentence_full_gradient_variance/max_squared_error": 769731.4375, "sentence_full_gradient_variance/metric": 3744.435546875, "sentence_full_gradient_variance/p75": 91.70620727539062, "sentence_full_gradient_variance/p90": 100.17528533935547, "sentence_full_gradient_variance/p95": 100.17528533935547, "sentence_full_gradient_variance/p99": 78979.703125, "state_level_variance/metric": 105.88201904296875, "state_level_variance_full_gradient/metric": 424.6944580078125, "step": 35 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16514559090137482, "action_level_variance/metric": 1136.2322998046875, "action_level_variance_full_gradient/metric": 5155.9111328125, "adam_stats/lr_effective_max": 5.7943048886954784e-05, "adam_stats/lr_effective_mean": -1.7546676611690515e-10, "adam_stats/lr_effective_min": -6.041303640813567e-05, "adam_stats/m_t_max": 0.002613525604829192, "adam_stats/m_t_mean": 5.567300526837515e-12, "adam_stats/m_t_min": -0.0026094948407262564, "adam_stats/v_t_max": 7.393098348984495e-05, "adam_stats/v_t_mean": 3.445007300262315e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.018869567662477493, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.7943403720855713, "all_logprobs": -0.11441724747419357, "all_logprobs/max": 0.0, "all_logprobs/median": -1.430511474609375e-06, "all_logprobs/min": -11.125, "all_logprobs/p1": -2.140625, "all_logprobs/p10": -0.255859375, "all_logprobs/p25": -0.003173828125, "all_logprobs/p5": -0.73046875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.17774906754493713, "clip_ratio": 0.0, "completion_length": 595.390625, "completion_length/correct": 528.5082397460938, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 490.0, "completion_length/correct/min": 61.0, "completion_length/correct/p25": 381.0, "completion_length/correct/p75": 642.25, "completion_length/correct/var": 41367.05859375, "completion_length/incorrect": 849.5437622070312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 285.0, "completion_length/incorrect/p25": 684.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 47599.3046875, "completion_length/max": 1024.0, "completion_length/median": 541.0, "completion_length/min": 61.0, "completion_length/p25": 399.75, "completion_length/p75": 770.0, "completion_length/var": 59625.6171875, "epoch": 0.4608, "feature_vector_variance/max_squared_error": 106047.5390625, "feature_vector_variance/metric": 25424.55859375, "generated_tokens/total": 17927780.0, "grad_norm": 0.2138976752758026, "grouped_std_rewards": 0.19600704312324524, "learning_rate": 1.2117461064942437e-05, "loss": 0.0189, "mean_logprobs": -0.1171875, "mean_logprobs/var": 0.002838134765625, "num_completions/total": 27648, "per_sentence_gradient_norm": 5.215394020080566, "per_sentence_gradient_norm/max": 688.5933837890625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 29.673507690429688, "per_sentence_gradient_norm/p99": 112.05218505859375, "per_sentence_gradient_norm/var": 1110.47802734375, "per_token_feature_norm": 160.95738220214844, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 156.0, "per_token_feature_norm/min": 63.25, "per_token_feature_norm/p25": 129.0, "per_token_feature_norm/p75": 188.0, "per_token_feature_norm/var": 1672.5941162109375, "per_token_full_gradient_variance/max_squared_error": 499.9583740234375, "per_token_full_gradient_variance/variance": 0.058241814374923706, "per_token_gradient_norm": 6.2297773361206055, "per_token_gradient_norm/max": 6332.33935546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 8013.9765625, "per_token_policy_error_norm": 0.06193319335579872, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05263875052332878, "policy_entropy": 0.12667420506477356, "policy_entropy/max": 3.734375, "policy_entropy/median": 2.2411346435546875e-05, "policy_entropy/min": 6.838973831690964e-14, "policy_entropy/p25": 7.487833499908447e-07, "policy_entropy/p75": 0.0223388671875, "policy_entropy/var": 0.09266264736652374, "policy_error_vector_variance/max_squared_error": 2.018986940383911, "policy_error_vector_variance/metric": 0.06187666207551956, "policy_loss": 0.018869563937187195, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.7943403720855713, "policy_sharpness": 7.7878546714782715, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.55462646484375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.965502738952637, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16514559090137482, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16514559090137482, "sentence_full_gradient_variance/max_squared_error": 2137032.0, "sentence_full_gradient_variance/metric": 5861.4814453125, "sentence_full_gradient_variance/p75": 62.332923889160156, "sentence_full_gradient_variance/p90": 86.5784683227539, "sentence_full_gradient_variance/p95": 86.5784683227539, "sentence_full_gradient_variance/p99": 51927.23046875, "state_level_variance/metric": 116.03741455078125, "state_level_variance_full_gradient/metric": 705.5695190429688, "step": 36 }, { "accuracy_reward": 0.76171875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1817399561405182, "action_level_variance/metric": 847.0009155273438, "action_level_variance_full_gradient/metric": 6374.01953125, "adam_stats/lr_effective_max": 5.7076773373410106e-05, "adam_stats/lr_effective_mean": -1.3079438920815534e-10, "adam_stats/lr_effective_min": -6.0942697018617764e-05, "adam_stats/m_t_max": 0.00223010266199708, "adam_stats/m_t_mean": 1.134843694711174e-11, "adam_stats/m_t_min": -0.002287510083988309, "adam_stats/v_t_max": 7.38576491130516e-05, "adam_stats/v_t_mean": 3.4435314842651277e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.05799906700849533, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.0657856464385986, "all_logprobs": -0.107604019343853, "all_logprobs/max": 0.0, "all_logprobs/median": -9.5367431640625e-07, "all_logprobs/min": -9.875, "all_logprobs/p1": -2.09375, "all_logprobs/p10": -0.2275390625, "all_logprobs/p25": -0.0020599365234375, "all_logprobs/p5": -0.69140625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.16702838242053986, "clip_ratio": 0.0, "completion_length": 577.7109375, "completion_length/correct": 487.9008483886719, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 448.0, "completion_length/correct/min": 156.0, "completion_length/correct/p25": 329.0, "completion_length/correct/p75": 606.0, "completion_length/correct/var": 44280.75, "completion_length/incorrect": 864.8087158203125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 363.0, "completion_length/incorrect/p25": 668.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 47782.19140625, "completion_length/max": 1024.0, "completion_length/median": 514.0, "completion_length/min": 156.0, "completion_length/p25": 371.75, "completion_length/p75": 783.0, "completion_length/var": 70871.7578125, "epoch": 0.4736, "feature_vector_variance/max_squared_error": 120151.9765625, "feature_vector_variance/metric": 25738.306640625, "generated_tokens/total": 18371464.0, "grad_norm": 0.12202322483062744, "grouped_std_rewards": 0.17236942052841187, "learning_rate": 1.1908389392193549e-05, "loss": -0.058, "mean_logprobs": -0.1142578125, "mean_logprobs/var": 0.00250244140625, "num_completions/total": 28416, "per_sentence_gradient_norm": 4.876709938049316, "per_sentence_gradient_norm/max": 363.8238525390625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 140.35813903808594, "per_sentence_gradient_norm/var": 824.2918090820312, "per_token_feature_norm": 161.82418823242188, "per_token_feature_norm/max": 336.0, "per_token_feature_norm/median": 157.0, "per_token_feature_norm/min": 62.0, "per_token_feature_norm/p25": 131.0, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 1593.8548583984375, "per_token_full_gradient_variance/max_squared_error": 424.7098693847656, "per_token_full_gradient_variance/variance": 0.061630647629499435, "per_token_gradient_norm": 5.487251281738281, "per_token_gradient_norm/max": 7181.2900390625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 8313.4619140625, "per_token_policy_error_norm": 0.05841514468193054, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04996495693922043, "policy_entropy": 0.11929791420698166, "policy_entropy/max": 3.765625, "policy_entropy/median": 1.4960765838623047e-05, "policy_entropy/min": 3.747002708109903e-15, "policy_entropy/p25": 5.401670932769775e-07, "policy_entropy/p75": 0.01556396484375, "policy_entropy/var": 0.0864192321896553, "policy_error_vector_variance/max_squared_error": 2.01699161529541, "policy_error_vector_variance/metric": 0.05835925415158272, "policy_loss": -0.05799907445907593, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.0657856464385986, "policy_sharpness": 7.88505744934082, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.24609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.603094100952148, "reward": 0.76171875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1817399561405182, "rewards/accuracy_reward": 0.76171875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1817399561405182, "sentence_full_gradient_variance/max_squared_error": 2117765.75, "sentence_full_gradient_variance/metric": 7197.7138671875, "sentence_full_gradient_variance/p75": 187.55113220214844, "sentence_full_gradient_variance/p90": 190.55172729492188, "sentence_full_gradient_variance/p95": 190.55172729492188, "sentence_full_gradient_variance/p99": 97345.171875, "state_level_variance/metric": 82.95694732666016, "state_level_variance_full_gradient/metric": 823.6927490234375, "step": 37 }, { "accuracy_reward": 0.81640625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15008249878883362, "action_level_variance/metric": 460.1241149902344, "action_level_variance_full_gradient/metric": 1904.7059326171875, "adam_stats/lr_effective_max": 6.0973128711339086e-05, "adam_stats/lr_effective_mean": -5.306341824273808e-11, "adam_stats/lr_effective_min": -5.8706511481432244e-05, "adam_stats/m_t_max": 0.002442006953060627, "adam_stats/m_t_mean": 2.0094965622052818e-11, "adam_stats/m_t_min": -0.0031024604104459286, "adam_stats/v_t_max": 7.403561903629452e-05, "adam_stats/v_t_mean": 3.4453188999666873e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.04026787728071213, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.4433250427246094, "all_logprobs": -0.10811257362365723, "all_logprobs/max": 0.0, "all_logprobs/median": -1.0728836059570312e-06, "all_logprobs/min": -10.3125, "all_logprobs/p1": -2.078125, "all_logprobs/p10": -0.2314453125, "all_logprobs/p25": -0.002532958984375, "all_logprobs/p5": -0.69140625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.16780652105808258, "clip_ratio": 0.0, "completion_length": 545.1497802734375, "completion_length/correct": 485.9856262207031, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 455.0, "completion_length/correct/min": 126.0, "completion_length/correct/p25": 348.5, "completion_length/correct/p75": 605.0, "completion_length/correct/var": 34964.2578125, "completion_length/incorrect": 808.2410888671875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 796.0, "completion_length/incorrect/min": 319.0, "completion_length/incorrect/p25": 672.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 44194.51171875, "completion_length/max": 1024.0, "completion_length/median": 498.0, "completion_length/min": 126.0, "completion_length/p25": 372.0, "completion_length/p75": 688.25, "completion_length/var": 52189.3203125, "epoch": 0.4864, "feature_vector_variance/max_squared_error": 112011.4296875, "feature_vector_variance/metric": 26255.3515625, "generated_tokens/total": 18790138.0, "grad_norm": 0.11731699854135513, "grouped_std_rewards": 0.14729884266853333, "learning_rate": 1.1693946776030601e-05, "loss": -0.0403, "mean_logprobs": -0.109375, "mean_logprobs/var": 0.0023193359375, "num_completions/total": 29184, "per_sentence_gradient_norm": 3.232868194580078, "per_sentence_gradient_norm/max": 403.0068359375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 80.07925415039062, "per_sentence_gradient_norm/var": 450.25897216796875, "per_token_feature_norm": 163.65736389160156, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 160.0, "per_token_feature_norm/min": 63.25, "per_token_feature_norm/p25": 133.0, "per_token_feature_norm/p75": 191.0, "per_token_feature_norm/var": 1583.5931396484375, "per_token_full_gradient_variance/max_squared_error": 676.34521484375, "per_token_full_gradient_variance/variance": 0.04225867986679077, "per_token_gradient_norm": 4.345276355743408, "per_token_gradient_norm/max": 6816.349609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5700.28466796875, "per_token_policy_error_norm": 0.05868794023990631, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.05016056448221207, "policy_entropy": 0.11991731822490692, "policy_entropy/max": 3.796875, "policy_entropy/median": 1.6570091247558594e-05, "policy_entropy/min": 1.6237011735142914e-15, "policy_entropy/p25": 4.842877388000488e-07, "policy_entropy/p75": 0.01904296875, "policy_entropy/var": 0.08524026721715927, "policy_error_vector_variance/max_squared_error": 2.0137839317321777, "policy_error_vector_variance/metric": 0.05863006040453911, "policy_loss": -0.04026786983013153, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -7.48191499710083, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.443325161933899, "policy_sharpness": 7.846042156219482, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.86712646484375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.70484447479248, "reward": 0.81640625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15008249878883362, "rewards/accuracy_reward": 0.81640625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15008249878883362, "sentence_full_gradient_variance/max_squared_error": 407542.125, "sentence_full_gradient_variance/metric": 2141.515380859375, "sentence_full_gradient_variance/p75": 33.34227752685547, "sentence_full_gradient_variance/p90": 185.65512084960938, "sentence_full_gradient_variance/p95": 185.65512084960938, "sentence_full_gradient_variance/p99": 57309.31640625, "state_level_variance/metric": 47.559486389160156, "state_level_variance_full_gradient/metric": 236.8093719482422, "step": 38 }, { "accuracy_reward": 0.7877604365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1674119383096695, "action_level_variance/metric": 435.5542907714844, "action_level_variance_full_gradient/metric": 3277.71630859375, "adam_stats/lr_effective_max": 5.311491622705944e-05, "adam_stats/lr_effective_mean": -8.171467842654767e-12, "adam_stats/lr_effective_min": -5.37364867341239e-05, "adam_stats/m_t_max": 0.002008597133681178, "adam_stats/m_t_mean": 1.4046077669027657e-11, "adam_stats/m_t_min": -0.0024824608117341995, "adam_stats/v_t_max": 7.398198795272037e-05, "adam_stats/v_t_mean": 3.444147961620403e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0766320526599884, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.2001841068267822, "all_logprobs": -0.10773880034685135, "all_logprobs/max": 0.0, "all_logprobs/median": -7.152557373046875e-07, "all_logprobs/min": -12.5, "all_logprobs/p1": -2.109375, "all_logprobs/p10": -0.228515625, "all_logprobs/p25": -0.0024871826171875, "all_logprobs/p5": -0.69140625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.16434569656848907, "clip_ratio": 0.0, "completion_length": 555.72265625, "completion_length/correct": 491.8859558105469, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 456.0, "completion_length/correct/min": 137.0, "completion_length/correct/p25": 322.0, "completion_length/correct/p75": 630.0, "completion_length/correct/var": 44151.85546875, "completion_length/incorrect": 792.6625366210938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 856.0, "completion_length/incorrect/min": 257.0, "completion_length/incorrect/p25": 577.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 55583.48828125, "completion_length/max": 1024.0, "completion_length/median": 507.0, "completion_length/min": 137.0, "completion_length/p25": 356.75, "completion_length/p75": 722.5, "completion_length/var": 61653.98046875, "epoch": 0.4992, "feature_vector_variance/max_squared_error": 116382.046875, "feature_vector_variance/metric": 26782.88671875, "generated_tokens/total": 19216932.0, "grad_norm": 0.08796430379152298, "grouped_std_rewards": 0.160445898771286, "learning_rate": 1.1474394481749037e-05, "loss": -0.0766, "mean_logprobs": -0.111328125, "mean_logprobs/var": 0.002838134765625, "num_completions/total": 29952, "per_sentence_gradient_norm": 3.6868600845336914, "per_sentence_gradient_norm/max": 270.91351318359375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 102.8224105834961, "per_sentence_gradient_norm/var": 422.511474609375, "per_token_feature_norm": 165.65390014648438, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 163.0, "per_token_feature_norm/min": 64.0, "per_token_feature_norm/p25": 135.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 1582.3271484375, "per_token_full_gradient_variance/max_squared_error": 148.01937866210938, "per_token_full_gradient_variance/variance": 0.03735451027750969, "per_token_gradient_norm": 4.337393283843994, "per_token_gradient_norm/max": 5686.57958984375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4660.4775390625, "per_token_policy_error_norm": 0.05869946628808975, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.050280965864658356, "policy_entropy": 0.1198931410908699, "policy_entropy/max": 3.640625, "policy_entropy/median": 1.233816146850586e-05, "policy_entropy/min": 2.5326962749261384e-16, "policy_entropy/p25": 2.849847078323364e-07, "policy_entropy/p75": 0.018310546875, "policy_entropy/var": 0.08626392483711243, "policy_error_vector_variance/max_squared_error": 2.0181779861450195, "policy_error_vector_variance/metric": 0.058619752526283264, "policy_loss": -0.0766320526599884, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.2001841068267822, "policy_sharpness": 7.853672504425049, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.86712646484375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.667441368103027, "reward": 0.7877604365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1674119383096695, "rewards/accuracy_reward": 0.7877604365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1674119383096695, "sentence_full_gradient_variance/max_squared_error": 429645.4375, "sentence_full_gradient_variance/metric": 3692.918701171875, "sentence_full_gradient_variance/p75": 77.86636352539062, "sentence_full_gradient_variance/p90": 220.07215881347656, "sentence_full_gradient_variance/p95": 220.07215881347656, "sentence_full_gradient_variance/p99": 111012.8828125, "state_level_variance/metric": 41.28136444091797, "state_level_variance_full_gradient/metric": 415.2024841308594, "step": 39 }, { "accuracy_reward": 0.7799479365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17185291647911072, "action_level_variance/metric": 671.8590087890625, "action_level_variance_full_gradient/metric": 9815.8876953125, "adam_stats/lr_effective_max": 5.0157730584032834e-05, "adam_stats/lr_effective_mean": -5.648835219029813e-11, "adam_stats/lr_effective_min": -5.4510164773091674e-05, "adam_stats/m_t_max": 0.001778454752638936, "adam_stats/m_t_mean": -2.3095126071986938e-12, "adam_stats/m_t_min": -0.0017581403953954577, "adam_stats/v_t_max": 7.397690933430567e-05, "adam_stats/v_t_mean": 3.446765659345652e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.006163406185805798, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.3881969451904297, "all_logprobs": -0.10265446454286575, "all_logprobs/max": 0.0, "all_logprobs/median": -4.76837158203125e-07, "all_logprobs/min": -11.0, "all_logprobs/p1": -2.015625, "all_logprobs/p10": -0.2099609375, "all_logprobs/p25": -0.00177001953125, "all_logprobs/p5": -0.640625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.15360309183597565, "clip_ratio": 0.0, "completion_length": 565.3216552734375, "completion_length/correct": 505.9014892578125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 485.0, "completion_length/correct/min": 123.0, "completion_length/correct/p25": 357.5, "completion_length/correct/p75": 621.0, "completion_length/correct/var": 37276.8515625, "completion_length/incorrect": 775.9290161132812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 819.0, "completion_length/incorrect/min": 148.0, "completion_length/incorrect/p25": 616.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 63541.6484375, "completion_length/max": 1024.0, "completion_length/median": 527.0, "completion_length/min": 123.0, "completion_length/p25": 379.75, "completion_length/p75": 726.0, "completion_length/var": 55511.79296875, "epoch": 0.512, "feature_vector_variance/max_squared_error": 108523.4375, "feature_vector_variance/metric": 27464.2109375, "generated_tokens/total": 19651100.0, "grad_norm": 0.12805095314979553, "grouped_std_rewards": 0.17206653952598572, "learning_rate": 1.125e-05, "loss": -0.0062, "mean_logprobs": -0.107421875, "mean_logprobs/var": 0.00225830078125, "num_completions/total": 30720, "per_sentence_gradient_norm": 4.6265692710876465, "per_sentence_gradient_norm/max": 299.98162841796875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 11.96924877166748, "per_sentence_gradient_norm/p99": 132.30868530273438, "per_sentence_gradient_norm/var": 651.302001953125, "per_token_feature_norm": 167.76039123535156, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 166.0, "per_token_feature_norm/min": 63.25, "per_token_feature_norm/p25": 138.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 1499.0667724609375, "per_token_full_gradient_variance/max_squared_error": 186.29473876953125, "per_token_full_gradient_variance/variance": 0.05087680369615555, "per_token_gradient_norm": 4.69643497467041, "per_token_gradient_norm/max": 5693.07421875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5942.32568359375, "per_token_policy_error_norm": 0.05639941617846489, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0484183095395565, "policy_entropy": 0.11410742253065109, "policy_entropy/max": 3.625, "policy_entropy/median": 8.821487426757812e-06, "policy_entropy/min": 2.5326962749261384e-16, "policy_entropy/p25": 1.8719583749771118e-07, "policy_entropy/p75": 0.0140380859375, "policy_entropy/var": 0.07971151173114777, "policy_error_vector_variance/max_squared_error": 2.0148966312408447, "policy_error_vector_variance/metric": 0.05634878948330879, "policy_loss": -0.006163414567708969, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.3881969451904297, "policy_sharpness": 7.92899751663208, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.49609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.36038875579834, "reward": 0.7799479365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17185291647911072, "rewards/accuracy_reward": 0.7799479365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17185291647911072, "sentence_full_gradient_variance/max_squared_error": 2712512.0, "sentence_full_gradient_variance/metric": 10978.763671875, "sentence_full_gradient_variance/p75": 615.4442749023438, "sentence_full_gradient_variance/p90": 1117.8819580078125, "sentence_full_gradient_variance/p95": 1117.8819580078125, "sentence_full_gradient_variance/p99": 185976.90625, "state_level_variance/metric": 63.23594665527344, "state_level_variance_full_gradient/metric": 1162.874267578125, "step": 40 }, { "accuracy_reward": 0.8046875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1573704481124878, "action_level_variance/metric": 657.9805908203125, "action_level_variance_full_gradient/metric": 2906.8486328125, "adam_stats/lr_effective_max": 5.4791289585409686e-05, "adam_stats/lr_effective_mean": -2.6037397238276583e-10, "adam_stats/lr_effective_min": -5.8828634792007506e-05, "adam_stats/m_t_max": 0.0017137809190899134, "adam_stats/m_t_mean": -3.2904343312545326e-12, "adam_stats/m_t_min": -0.002001174259930849, "adam_stats/v_t_max": 7.39658935344778e-05, "adam_stats/v_t_mean": 3.4713453899576363e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.12535151839256287, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 2.3105595111846924, "all_logprobs": -0.09728533029556274, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -9.875, "all_logprobs/p1": -1.96875, "all_logprobs/p10": -0.1845703125, "all_logprobs/p25": -0.00125885009765625, "all_logprobs/p5": -0.5859375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1495426893234253, "clip_ratio": 0.0, "completion_length": 565.08203125, "completion_length/correct": 490.4886779785156, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 442.0, "completion_length/correct/min": 129.0, "completion_length/correct/p25": 339.5, "completion_length/correct/p75": 617.0, "completion_length/correct/var": 42754.390625, "completion_length/incorrect": 872.4066772460938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 236.0, "completion_length/incorrect/p25": 760.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 42369.4140625, "completion_length/max": 1024.0, "completion_length/median": 486.0, "completion_length/min": 129.0, "completion_length/p25": 369.75, "completion_length/p75": 767.75, "completion_length/var": 65578.125, "epoch": 0.5248, "feature_vector_variance/max_squared_error": 128731.2109375, "feature_vector_variance/metric": 27603.96875, "generated_tokens/total": 20085084.0, "grad_norm": 0.2869085669517517, "grouped_std_rewards": 0.1781141757965088, "learning_rate": 1.1021036720894182e-05, "loss": -0.1254, "mean_logprobs": -0.1005859375, "mean_logprobs/var": 0.0027008056640625, "num_completions/total": 31488, "per_sentence_gradient_norm": 4.338146209716797, "per_sentence_gradient_norm/max": 338.6802978515625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 11.22918701171875, "per_sentence_gradient_norm/p99": 138.38059997558594, "per_sentence_gradient_norm/var": 639.9943237304688, "per_token_feature_norm": 169.00437927246094, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 168.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 139.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 1508.8001708984375, "per_token_full_gradient_variance/max_squared_error": 952.1041870117188, "per_token_full_gradient_variance/variance": 0.06422336399555206, "per_token_gradient_norm": 5.607981204986572, "per_token_gradient_norm/max": 6587.79736328125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 7700.84375, "per_token_policy_error_norm": 0.05310405418276787, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04601449519395828, "policy_entropy": 0.10800265520811081, "policy_entropy/max": 3.8125, "policy_entropy/median": 5.632638931274414e-06, "policy_entropy/min": 1.033895191682177e-15, "policy_entropy/p25": 1.0849907994270325e-07, "policy_entropy/p75": 0.0101318359375, "policy_entropy/var": 0.07550007849931717, "policy_error_vector_variance/max_squared_error": 2.013803005218506, "policy_error_vector_variance/metric": 0.05305157229304314, "policy_loss": -0.12535153329372406, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.3105597496032715, "policy_sharpness": 8.001922607421875, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.24609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.059128761291504, "reward": 0.8046875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1573704481124878, "rewards/accuracy_reward": 0.8046875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1573704481124878, "sentence_full_gradient_variance/max_squared_error": 457269.25, "sentence_full_gradient_variance/metric": 3232.60498046875, "sentence_full_gradient_variance/p75": 190.76028442382812, "sentence_full_gradient_variance/p90": 202.84036254882812, "sentence_full_gradient_variance/p95": 202.84036254882812, "sentence_full_gradient_variance/p99": 70490.15625, "state_level_variance/metric": 64.09571838378906, "state_level_variance_full_gradient/metric": 325.7559814453125, "step": 41 }, { "accuracy_reward": 0.7721354365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1761717051267624, "action_level_variance/metric": 296.5523376464844, "action_level_variance_full_gradient/metric": 883.16796875, "adam_stats/lr_effective_max": 5.399841393227689e-05, "adam_stats/lr_effective_mean": -2.3469623466887413e-10, "adam_stats/lr_effective_min": -5.5579246691195294e-05, "adam_stats/m_t_max": 0.001568533480167389, "adam_stats/m_t_mean": 9.342511573390277e-13, "adam_stats/m_t_min": -0.0018094491679221392, "adam_stats/v_t_max": 7.393659325316548e-05, "adam_stats/v_t_mean": 3.469333978087241e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.11551395058631897, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.7881487607955933, "all_logprobs": -0.08645277470350266, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -12.875, "all_logprobs/p1": -1.8515625, "all_logprobs/p10": -0.142578125, "all_logprobs/p25": -0.000484466552734375, "all_logprobs/p5": -0.5234375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.13021782040596008, "clip_ratio": 0.0, "completion_length": 591.984375, "completion_length/correct": 506.54132080078125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 483.0, "completion_length/correct/min": 133.0, "completion_length/correct/p25": 325.0, "completion_length/correct/p75": 663.0, "completion_length/correct/var": 48753.1953125, "completion_length/incorrect": 881.5142822265625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 385.0, "completion_length/incorrect/p25": 718.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 38415.6484375, "completion_length/max": 1024.0, "completion_length/median": 563.0, "completion_length/min": 133.0, "completion_length/p25": 366.0, "completion_length/p75": 806.0, "completion_length/var": 71115.0546875, "epoch": 0.5376, "feature_vector_variance/max_squared_error": 118271.234375, "feature_vector_variance/metric": 28887.826171875, "generated_tokens/total": 20539728.0, "grad_norm": 0.0879029631614685, "grouped_std_rewards": 0.10867391526699066, "learning_rate": 1.078778360091808e-05, "loss": 0.1155, "mean_logprobs": -0.09033203125, "mean_logprobs/var": 0.002227783203125, "num_completions/total": 32256, "per_sentence_gradient_norm": 2.3031203746795654, "per_sentence_gradient_norm/max": 217.94483947753906, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 65.31776428222656, "per_sentence_gradient_norm/var": 291.627685546875, "per_token_feature_norm": 175.25575256347656, "per_token_feature_norm/max": 312.0, "per_token_feature_norm/median": 177.0, "per_token_feature_norm/min": 66.5, "per_token_feature_norm/p25": 147.0, "per_token_feature_norm/p75": 202.0, "per_token_feature_norm/var": 1403.8795166015625, "per_token_full_gradient_variance/max_squared_error": 275.82452392578125, "per_token_full_gradient_variance/variance": 0.03718431293964386, "per_token_gradient_norm": 3.34230637550354, "per_token_gradient_norm/max": 6123.58056640625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4528.359375, "per_token_policy_error_norm": 0.047836676239967346, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04180005565285683, "policy_entropy": 0.09564325958490372, "policy_entropy/max": 3.75, "policy_entropy/median": 2.041459083557129e-06, "policy_entropy/min": 1.249000902703301e-16, "policy_entropy/p25": 3.4458935260772705e-08, "policy_entropy/p75": 0.004241943359375, "policy_entropy/var": 0.06468096375465393, "policy_error_vector_variance/max_squared_error": 2.009164333343506, "policy_error_vector_variance/metric": 0.047813113778829575, "policy_loss": 0.11551394313573837, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -7.48191499710083, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.7881487607955933, "policy_sharpness": 8.177999496459961, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.298463821411133, "reward": 0.7721354365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1761717051267624, "rewards/accuracy_reward": 0.7721354365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1761717051267624, "sentence_full_gradient_variance/max_squared_error": 484357.21875, "sentence_full_gradient_variance/metric": 1001.6049194335938, "sentence_full_gradient_variance/p75": 23.99488639831543, "sentence_full_gradient_variance/p90": 29.886640548706055, "sentence_full_gradient_variance/p95": 29.886640548706055, "sentence_full_gradient_variance/p99": 3347.9736328125, "state_level_variance/metric": 32.09904098510742, "state_level_variance_full_gradient/metric": 118.43695068359375, "step": 42 }, { "accuracy_reward": 0.7526041865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18643391132354736, "action_level_variance/metric": 455.62591552734375, "action_level_variance_full_gradient/metric": 4687.3056640625, "adam_stats/lr_effective_max": 5.6122175010386854e-05, "adam_stats/lr_effective_mean": -3.0083885382836684e-10, "adam_stats/lr_effective_min": -5.434488411992788e-05, "adam_stats/m_t_max": 0.0013103766832500696, "adam_stats/m_t_mean": -4.581138853682054e-12, "adam_stats/m_t_min": -0.0018093209946528077, "adam_stats/v_t_max": 7.386502693407238e-05, "adam_stats/v_t_mean": 3.4703795826623862e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.013876463286578655, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.5115926265716553, "all_logprobs": -0.0858382135629654, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -12.375, "all_logprobs/p1": -1.875, "all_logprobs/p10": -0.1337890625, "all_logprobs/p25": -0.000431060791015625, "all_logprobs/p5": -0.51171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1299348920583725, "clip_ratio": 0.0, "completion_length": 582.0546875, "completion_length/correct": 502.73876953125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 444.0, "completion_length/correct/min": 176.0, "completion_length/correct/p25": 337.0, "completion_length/correct/p75": 631.75, "completion_length/correct/var": 42112.13671875, "completion_length/incorrect": 823.3421630859375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 983.0, "completion_length/incorrect/min": 267.0, "completion_length/incorrect/p25": 621.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 57975.50390625, "completion_length/max": 1024.0, "completion_length/median": 531.0, "completion_length/min": 176.0, "completion_length/p25": 369.0, "completion_length/p75": 773.5, "completion_length/var": 65129.078125, "epoch": 0.5504, "feature_vector_variance/max_squared_error": 122076.3046875, "feature_vector_variance/metric": 29273.177734375, "generated_tokens/total": 20986744.0, "grad_norm": 0.12013561278581619, "grouped_std_rewards": 0.13646666705608368, "learning_rate": 1.0550524823068504e-05, "loss": 0.0139, "mean_logprobs": -0.08740234375, "mean_logprobs/var": 0.002349853515625, "num_completions/total": 33024, "per_sentence_gradient_norm": 3.11124587059021, "per_sentence_gradient_norm/max": 382.92572021484375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 73.26615142822266, "per_sentence_gradient_norm/var": 446.5274658203125, "per_token_feature_norm": 176.4788360595703, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 179.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 149.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 1387.919677734375, "per_token_full_gradient_variance/max_squared_error": 285.21307373046875, "per_token_full_gradient_variance/variance": 0.04931848868727684, "per_token_gradient_norm": 3.818352460861206, "per_token_gradient_norm/max": 7321.69970703125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6010.908203125, "per_token_policy_error_norm": 0.04720994457602501, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04115096107125282, "policy_entropy": 0.09530606120824814, "policy_entropy/max": 3.796875, "policy_entropy/median": 1.862645149230957e-06, "policy_entropy/min": 2.211772431870429e-17, "policy_entropy/p25": 2.735760062932968e-08, "policy_entropy/p75": 0.0037689208984375, "policy_entropy/var": 0.06644779443740845, "policy_error_vector_variance/max_squared_error": 2.016004800796509, "policy_error_vector_variance/metric": 0.047176510095596313, "policy_loss": 0.013876460492610931, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.5115926265716553, "policy_sharpness": 8.198448181152344, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.75, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.228233337402344, "reward": 0.7526041865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18643391132354736, "rewards/accuracy_reward": 0.7526041865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18643391132354736, "sentence_full_gradient_variance/max_squared_error": 2303468.25, "sentence_full_gradient_variance/metric": 5337.1953125, "sentence_full_gradient_variance/p75": 24.82356834411621, "sentence_full_gradient_variance/p90": 38.55428695678711, "sentence_full_gradient_variance/p95": 38.55428695678711, "sentence_full_gradient_variance/p99": 58843.19921875, "state_level_variance/metric": 47.770999908447266, "state_level_variance_full_gradient/metric": 649.8915405273438, "step": 43 }, { "accuracy_reward": 0.7591146230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18309803307056427, "action_level_variance/metric": 379.177978515625, "action_level_variance_full_gradient/metric": 1681.602783203125, "adam_stats/lr_effective_max": 5.3264764574123546e-05, "adam_stats/lr_effective_mean": -2.2840529467771375e-10, "adam_stats/lr_effective_min": -5.3834377467865124e-05, "adam_stats/m_t_max": 0.0014201872982084751, "adam_stats/m_t_mean": -1.0367449954085117e-11, "adam_stats/m_t_min": -0.0017413039458915591, "adam_stats/v_t_max": 7.384752098005265e-05, "adam_stats/v_t_mean": 3.4730131097393535e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0018380408873781562, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.9294040203094482, "all_logprobs": -0.07569937407970428, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.25, "all_logprobs/p1": -1.71875, "all_logprobs/p10": -0.0927734375, "all_logprobs/p25": -0.0001239776611328125, "all_logprobs/p5": -0.419921875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.11789799481630325, "clip_ratio": 0.0, "completion_length": 567.2877807617188, "completion_length/correct": 475.0737609863281, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 431.0, "completion_length/correct/min": 169.0, "completion_length/correct/p25": 329.0, "completion_length/correct/p75": 578.5, "completion_length/correct/var": 40430.3515625, "completion_length/incorrect": 857.8865356445312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 249.0, "completion_length/incorrect/p25": 696.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 49678.13671875, "completion_length/max": 1024.0, "completion_length/median": 492.0, "completion_length/min": 169.0, "completion_length/p25": 355.75, "completion_length/p75": 757.5, "completion_length/var": 69428.359375, "epoch": 0.5632, "feature_vector_variance/max_squared_error": 137462.125, "feature_vector_variance/metric": 30419.044921875, "generated_tokens/total": 21422422.0, "grad_norm": 0.13965947926044464, "grouped_std_rewards": 0.14724811911582947, "learning_rate": 1.0309549450619342e-05, "loss": -0.0018, "mean_logprobs": -0.07861328125, "mean_logprobs/var": 0.0027618408203125, "num_completions/total": 33792, "per_sentence_gradient_norm": 2.7278735637664795, "per_sentence_gradient_norm/max": 351.3748779296875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 74.31112670898438, "per_sentence_gradient_norm/var": 372.2213134765625, "per_token_feature_norm": 182.9429473876953, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 62.0, "per_token_feature_norm/p25": 159.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 1276.828857421875, "per_token_full_gradient_variance/max_squared_error": 310.5561828613281, "per_token_full_gradient_variance/variance": 0.04585323482751846, "per_token_gradient_norm": 3.865523099899292, "per_token_gradient_norm/max": 7390.04833984375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6170.3466796875, "per_token_policy_error_norm": 0.04156889766454697, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03674589842557907, "policy_entropy": 0.08324624598026276, "policy_entropy/max": 3.796875, "policy_entropy/median": 5.252659320831299e-07, "policy_entropy/min": 1.1102230246251565e-16, "policy_entropy/p25": 6.373738870024681e-09, "policy_entropy/p75": 0.00125885009765625, "policy_entropy/var": 0.05879359692335129, "policy_error_vector_variance/max_squared_error": 2.0096192359924316, "policy_error_vector_variance/metric": 0.041503097862005234, "policy_loss": -0.001838040305301547, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.9294039011001587, "policy_sharpness": 8.37968921661377, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.410645484924316, "reward": 0.7591146230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18309803307056427, "rewards/accuracy_reward": 0.7591146230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18309803307056427, "sentence_full_gradient_variance/max_squared_error": 341706.75, "sentence_full_gradient_variance/metric": 1894.03515625, "sentence_full_gradient_variance/p75": 77.67172241210938, "sentence_full_gradient_variance/p90": 84.6331558227539, "sentence_full_gradient_variance/p95": 84.6331558227539, "sentence_full_gradient_variance/p99": 62756.1875, "state_level_variance/metric": 40.37654113769531, "state_level_variance_full_gradient/metric": 212.43258666992188, "step": 44 }, { "accuracy_reward": 0.7200521230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2018398940563202, "action_level_variance/metric": 186.87472534179688, "action_level_variance_full_gradient/metric": 1208.3990478515625, "adam_stats/lr_effective_max": 5.494988363352604e-05, "adam_stats/lr_effective_mean": -1.1544442485300621e-10, "adam_stats/lr_effective_min": -5.19106506544631e-05, "adam_stats/m_t_max": 0.0011928863823413849, "adam_stats/m_t_mean": -3.851076575689394e-12, "adam_stats/m_t_min": -0.0018570906249806285, "adam_stats/v_t_max": 7.38058952265419e-05, "adam_stats/v_t_mean": 3.472708448928885e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.06416819989681244, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 1.253929615020752, "all_logprobs": -0.0665457472205162, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.5625, "all_logprobs/p1": -1.6015625, "all_logprobs/p10": -0.0654296875, "all_logprobs/p25": -4.982948303222656e-05, "all_logprobs/p5": -0.353515625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.09840472042560577, "clip_ratio": 0.0, "completion_length": 615.7838745117188, "completion_length/correct": 527.0470581054688, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 495.0, "completion_length/correct/min": 116.0, "completion_length/correct/p25": 356.0, "completion_length/correct/p75": 667.0, "completion_length/correct/var": 44987.58984375, "completion_length/incorrect": 844.0232543945312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 983.0, "completion_length/incorrect/min": 207.0, "completion_length/incorrect/p25": 654.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 49959.1484375, "completion_length/max": 1024.0, "completion_length/median": 587.0, "completion_length/min": 116.0, "completion_length/p25": 397.75, "completion_length/p75": 825.75, "completion_length/var": 66595.6953125, "epoch": 0.576, "feature_vector_variance/max_squared_error": 132600.625, "feature_vector_variance/metric": 30642.109375, "generated_tokens/total": 21895344.0, "grad_norm": 0.11667156964540482, "grouped_std_rewards": 0.15041089057922363, "learning_rate": 1.0065151074942516e-05, "loss": -0.0642, "mean_logprobs": -0.0693359375, "mean_logprobs/var": 0.0015869140625, "num_completions/total": 34560, "per_sentence_gradient_norm": 2.274091958999634, "per_sentence_gradient_norm/max": 140.2667236328125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 68.5433120727539, "per_sentence_gradient_norm/var": 181.94012451171875, "per_token_feature_norm": 185.99044799804688, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 190.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 164.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 1183.3514404296875, "per_token_full_gradient_variance/max_squared_error": 66.4773941040039, "per_token_full_gradient_variance/variance": 0.019724125042557716, "per_token_gradient_norm": 2.5185251235961914, "per_token_gradient_norm/max": 4218.25341796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2340.49267578125, "per_token_policy_error_norm": 0.037276800721883774, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.033091332763433456, "policy_entropy": 0.07364295423030853, "policy_entropy/max": 3.71875, "policy_entropy/median": 2.980232238769531e-07, "policy_entropy/min": 9.703609443745265e-18, "policy_entropy/p25": 3.812601789832115e-09, "policy_entropy/p75": 0.000560760498046875, "policy_entropy/var": 0.04835543408989906, "policy_error_vector_variance/max_squared_error": 2.0074501037597656, "policy_error_vector_variance/metric": 0.037258297204971313, "policy_loss": -0.06416819989681244, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -7.48191499710083, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.2539294958114624, "policy_sharpness": 8.501315116882324, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.771217346191406, "reward": 0.7200521230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2018398940563202, "rewards/accuracy_reward": 0.7200521230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2018398940563202, "sentence_full_gradient_variance/max_squared_error": 123067.65625, "sentence_full_gradient_variance/metric": 1365.8343505859375, "sentence_full_gradient_variance/p75": 30.785053253173828, "sentence_full_gradient_variance/p90": 33.69972229003906, "sentence_full_gradient_variance/p95": 33.69972229003906, "sentence_full_gradient_variance/p99": 45619.31640625, "state_level_variance/metric": 18.379297256469727, "state_level_variance_full_gradient/metric": 157.43511962890625, "step": 45 }, { "accuracy_reward": 0.8138021230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15172582864761353, "action_level_variance/metric": 619.024169921875, "action_level_variance_full_gradient/metric": 4268.7216796875, "adam_stats/lr_effective_max": 5.457980660139583e-05, "adam_stats/lr_effective_mean": -1.5972380362772043e-10, "adam_stats/lr_effective_min": -5.3062522056279704e-05, "adam_stats/m_t_max": 0.001110218814574182, "adam_stats/m_t_mean": -2.8116244141923596e-12, "adam_stats/m_t_min": -0.0016381936147809029, "adam_stats/v_t_max": 7.373490370810032e-05, "adam_stats/v_t_mean": 3.4714668206009547e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.12060742825269699, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.3498382568359375, "all_logprobs": -0.05938895791769028, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.9375, "all_logprobs/p1": -1.5, "all_logprobs/p10": -0.04296875, "all_logprobs/p25": -1.6927719116210938e-05, "all_logprobs/p5": -0.28125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.08990821987390518, "clip_ratio": 0.0, "completion_length": 587.5182495117188, "completion_length/correct": 526.2767944335938, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 499.0, "completion_length/correct/min": 149.0, "completion_length/correct/p25": 367.0, "completion_length/correct/p75": 678.0, "completion_length/correct/var": 40576.23046875, "completion_length/incorrect": 855.1818237304688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 310.0, "completion_length/incorrect/p25": 710.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 51440.61328125, "completion_length/max": 1024.0, "completion_length/median": 548.0, "completion_length/min": 149.0, "completion_length/p25": 387.0, "completion_length/p75": 751.5, "completion_length/var": 58948.19921875, "epoch": 0.5888, "feature_vector_variance/max_squared_error": 143509.59375, "feature_vector_variance/metric": 30555.595703125, "generated_tokens/total": 22346558.0, "grad_norm": 0.10425768792629242, "grouped_std_rewards": 0.12520305812358856, "learning_rate": 9.817627457812105e-06, "loss": 0.1206, "mean_logprobs": -0.062255859375, "mean_logprobs/var": 0.001708984375, "num_completions/total": 35328, "per_sentence_gradient_norm": 3.1765201091766357, "per_sentence_gradient_norm/max": 409.653564453125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 84.6298599243164, "per_sentence_gradient_norm/var": 609.7279052734375, "per_token_feature_norm": 190.05410766601562, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 194.0, "per_token_feature_norm/min": 70.0, "per_token_feature_norm/p25": 173.0, "per_token_feature_norm/p75": 212.0, "per_token_feature_norm/var": 1012.2535400390625, "per_token_full_gradient_variance/max_squared_error": 363.6437072753906, "per_token_full_gradient_variance/variance": 0.07245254516601562, "per_token_gradient_norm": 3.856923818588257, "per_token_gradient_norm/max": 6413.05908203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 8285.068359375, "per_token_policy_error_norm": 0.033008407801389694, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.029653890058398247, "policy_entropy": 0.06538646668195724, "policy_entropy/max": 3.484375, "policy_entropy/median": 1.150183379650116e-07, "policy_entropy/min": 2.3527187142935446e-17, "policy_entropy/p25": 1.229636836796999e-09, "policy_entropy/p75": 0.00020885467529296875, "policy_entropy/var": 0.04372909665107727, "policy_error_vector_variance/max_squared_error": 2.0094542503356934, "policy_error_vector_variance/metric": 0.03298301249742508, "policy_loss": 0.12060742825269699, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.3498382568359375, "policy_sharpness": 8.637670516967773, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.066560745239258, "reward": 0.8138021230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15172582864761353, "rewards/accuracy_reward": 0.8138021230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15172582864761353, "sentence_full_gradient_variance/max_squared_error": 1650262.0, "sentence_full_gradient_variance/metric": 4835.93701171875, "sentence_full_gradient_variance/p75": 107.52641296386719, "sentence_full_gradient_variance/p90": 171.58457946777344, "sentence_full_gradient_variance/p95": 171.58457946777344, "sentence_full_gradient_variance/p99": 46209.55078125, "state_level_variance/metric": 67.99604034423828, "state_level_variance_full_gradient/metric": 567.2146606445312, "step": 46 }, { "accuracy_reward": 0.7473958730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18904145061969757, "action_level_variance/metric": 234.7720184326172, "action_level_variance_full_gradient/metric": 4695.103515625, "adam_stats/lr_effective_max": 5.649082231684588e-05, "adam_stats/lr_effective_mean": -2.2827879864184553e-10, "adam_stats/lr_effective_min": -5.3694824600825086e-05, "adam_stats/m_t_max": 0.0010499324416741729, "adam_stats/m_t_mean": -2.9381475551149006e-12, "adam_stats/m_t_min": -0.0014095243532210588, "adam_stats/v_t_max": 7.366535282926634e-05, "adam_stats/v_t_mean": 3.4692860563512173e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.0670444518327713, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.4566054344177246, "all_logprobs": -0.061129968613386154, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.5625, "all_logprobs/p1": -1.5078125, "all_logprobs/p10": -0.048095703125, "all_logprobs/p25": -2.300739288330078e-05, "all_logprobs/p5": -0.306640625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.09539376944303513, "clip_ratio": 0.0, "completion_length": 559.1458740234375, "completion_length/correct": 489.31011962890625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 467.0, "completion_length/correct/min": 132.0, "completion_length/correct/p25": 337.25, "completion_length/correct/p75": 617.75, "completion_length/correct/var": 37751.1328125, "completion_length/incorrect": 765.773193359375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 811.0, "completion_length/incorrect/min": 237.0, "completion_length/incorrect/p25": 549.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 58559.71875, "completion_length/max": 1024.0, "completion_length/median": 522.0, "completion_length/min": 132.0, "completion_length/p25": 370.75, "completion_length/p75": 727.0, "completion_length/var": 57386.75390625, "epoch": 0.6016, "feature_vector_variance/max_squared_error": 134604.359375, "feature_vector_variance/metric": 30716.169921875, "generated_tokens/total": 22775982.0, "grad_norm": 0.0955476313829422, "grouped_std_rewards": 0.14553901553153992, "learning_rate": 9.567280168627493e-06, "loss": 0.067, "mean_logprobs": -0.0634765625, "mean_logprobs/var": 0.0025787353515625, "num_completions/total": 36096, "per_sentence_gradient_norm": 2.5080971717834473, "per_sentence_gradient_norm/max": 162.15850830078125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 84.17059326171875, "per_sentence_gradient_norm/var": 228.7793731689453, "per_token_feature_norm": 189.7257843017578, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 194.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 173.0, "per_token_feature_norm/p75": 211.0, "per_token_feature_norm/var": 1022.9802856445312, "per_token_full_gradient_variance/max_squared_error": 266.1631164550781, "per_token_full_gradient_variance/variance": 0.05196314677596092, "per_token_gradient_norm": 3.113696336746216, "per_token_gradient_norm/max": 6782.32958984375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4967.3779296875, "per_token_policy_error_norm": 0.03365159034729004, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.030088838189840317, "policy_entropy": 0.06759242713451385, "policy_entropy/max": 3.75, "policy_entropy/median": 1.387670636177063e-07, "policy_entropy/min": 4.553649124439119e-18, "policy_entropy/p25": 1.229636836796999e-09, "policy_entropy/p75": 0.00028228759765625, "policy_entropy/var": 0.04747755452990532, "policy_error_vector_variance/max_squared_error": 2.0095772743225098, "policy_error_vector_variance/metric": 0.03358267620205879, "policy_loss": 0.0670444518327713, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.4566056728363037, "policy_sharpness": 8.608304023742676, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.225728988647461, "reward": 0.7473958730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.18904145061969757, "rewards/accuracy_reward": 0.7473958730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18904145061969757, "sentence_full_gradient_variance/max_squared_error": 791157.0625, "sentence_full_gradient_variance/metric": 5291.7666015625, "sentence_full_gradient_variance/p75": 128.1251220703125, "sentence_full_gradient_variance/p90": 194.4393768310547, "sentence_full_gradient_variance/p95": 194.4393768310547, "sentence_full_gradient_variance/p99": 94405.9921875, "state_level_variance/metric": 23.298648834228516, "state_level_variance_full_gradient/metric": 596.6632690429688, "step": 47 }, { "accuracy_reward": 0.7747396230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17474570870399475, "action_level_variance/metric": 162.85470581054688, "action_level_variance_full_gradient/metric": 2606.265380859375, "adam_stats/lr_effective_max": 5.7395216572331265e-05, "adam_stats/lr_effective_mean": -2.2823191947463073e-10, "adam_stats/lr_effective_min": -5.534533556783572e-05, "adam_stats/m_t_max": 0.0010669577168300748, "adam_stats/m_t_mean": 2.1528423575084554e-12, "adam_stats/m_t_min": -0.0014928762102499604, "adam_stats/v_t_max": 7.364759221673012e-05, "adam_stats/v_t_mean": 3.470242539507784e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.004133875481784344, "advantages/max": 7.48191499710083, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.957183837890625, "all_logprobs": -0.0566679984331131, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.125, "all_logprobs/p1": -1.5, "all_logprobs/p10": -0.0380859375, "all_logprobs/p25": -1.8596649169921875e-05, "all_logprobs/p5": -0.25390625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.08578583598136902, "clip_ratio": 0.0, "completion_length": 546.4401245117188, "completion_length/correct": 476.3899230957031, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 433.0, "completion_length/correct/min": 100.0, "completion_length/correct/p25": 312.0, "completion_length/correct/p75": 616.5, "completion_length/correct/var": 45310.6640625, "completion_length/incorrect": 787.3641357421875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 867.0, "completion_length/incorrect/min": 267.0, "completion_length/incorrect/p25": 532.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 61754.69921875, "completion_length/max": 1024.0, "completion_length/median": 489.0, "completion_length/min": 100.0, "completion_length/p25": 340.75, "completion_length/p75": 733.25, "completion_length/var": 65837.9375, "epoch": 0.6144, "feature_vector_variance/max_squared_error": 126503.109375, "feature_vector_variance/metric": 30507.49609375, "generated_tokens/total": 23195648.0, "grad_norm": 0.1307685822248459, "grouped_std_rewards": 0.15948650240898132, "learning_rate": 9.314414216997507e-06, "loss": 0.0041, "mean_logprobs": -0.05859375, "mean_logprobs/var": 0.00136566162109375, "num_completions/total": 36864, "per_sentence_gradient_norm": 2.1190171241760254, "per_sentence_gradient_norm/max": 187.7915496826172, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 68.58332824707031, "per_sentence_gradient_norm/var": 158.57090759277344, "per_token_feature_norm": 189.32510375976562, "per_token_feature_norm/max": 304.0, "per_token_feature_norm/median": 194.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 174.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 955.1315307617188, "per_token_full_gradient_variance/max_squared_error": 205.90518188476562, "per_token_full_gradient_variance/variance": 0.030782222747802734, "per_token_gradient_norm": 2.639432430267334, "per_token_gradient_norm/max": 5556.2216796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3309.368408203125, "per_token_policy_error_norm": 0.03174324706196785, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.02878091111779213, "policy_entropy": 0.061978355050086975, "policy_entropy/max": 3.65625, "policy_entropy/median": 1.3504177331924438e-07, "policy_entropy/min": 2.290377089375628e-18, "policy_entropy/p25": 1.1496013030409813e-09, "policy_entropy/p75": 0.000225067138671875, "policy_entropy/var": 0.03984035179018974, "policy_error_vector_variance/max_squared_error": 2.00917911529541, "policy_error_vector_variance/metric": 0.03172362223267555, "policy_loss": 0.004133874084800482, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -7.48191499710083, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.9571837186813354, "policy_sharpness": 8.662108421325684, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 7.874518394470215, "reward": 0.7747396230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17474570870399475, "rewards/accuracy_reward": 0.7747396230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17474570870399475, "sentence_full_gradient_variance/max_squared_error": 950381.25, "sentence_full_gradient_variance/metric": 2964.0947265625, "sentence_full_gradient_variance/p75": 32.26557922363281, "sentence_full_gradient_variance/p90": 34.21684646606445, "sentence_full_gradient_variance/p95": 34.21684646606445, "sentence_full_gradient_variance/p99": 63351.8828125, "state_level_variance/metric": 16.033620834350586, "state_level_variance_full_gradient/metric": 357.829345703125, "step": 48 }, { "accuracy_reward": 0.828125, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14251956343650818, "action_level_variance/metric": 220.5033416748047, "action_level_variance_full_gradient/metric": 1379.162353515625, "adam_stats/lr_effective_max": 5.05640018673148e-05, "adam_stats/lr_effective_mean": -2.1044564701977464e-10, "adam_stats/lr_effective_min": -4.875130980508402e-05, "adam_stats/m_t_max": 0.0009687019046396017, "adam_stats/m_t_mean": 2.261538829470555e-12, "adam_stats/m_t_min": -0.0013446197845041752, "adam_stats/v_t_max": 7.357418508036062e-05, "adam_stats/v_t_mean": 3.4667861029819003e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.08769124001264572, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.449187755584717, "all_logprobs": -0.05020865052938461, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -12.75, "all_logprobs/p1": -1.3125, "all_logprobs/p10": -0.0233154296875, "all_logprobs/p25": -6.4373016357421875e-06, "all_logprobs/p5": -0.2041015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.07475338131189346, "clip_ratio": 0.0, "completion_length": 569.3450927734375, "completion_length/correct": 518.6351928710938, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 473.0, "completion_length/correct/min": 175.0, "completion_length/correct/p25": 366.0, "completion_length/correct/p75": 648.75, "completion_length/correct/var": 40195.5625, "completion_length/incorrect": 813.6742553710938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 949.0, "completion_length/incorrect/min": 277.0, "completion_length/incorrect/p25": 581.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 57147.44140625, "completion_length/max": 1024.0, "completion_length/median": 533.0, "completion_length/min": 175.0, "completion_length/p25": 381.0, "completion_length/p75": 710.25, "completion_length/var": 55444.5, "epoch": 0.6272, "feature_vector_variance/max_squared_error": 134588.234375, "feature_vector_variance/metric": 30309.77734375, "generated_tokens/total": 23632904.0, "grad_norm": 0.011657895520329475, "grouped_std_rewards": 0.10687534511089325, "learning_rate": 9.059337681133194e-06, "loss": 0.0877, "mean_logprobs": -0.05126953125, "mean_logprobs/var": 0.001220703125, "num_completions/total": 37632, "per_sentence_gradient_norm": 1.836769938468933, "per_sentence_gradient_norm/max": 282.76446533203125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 55.37886047363281, "per_sentence_gradient_norm/var": 217.41268920898438, "per_token_feature_norm": 193.18772888183594, "per_token_feature_norm/max": 300.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 212.0, "per_token_feature_norm/var": 825.0177001953125, "per_token_full_gradient_variance/max_squared_error": 288.9124450683594, "per_token_full_gradient_variance/variance": 0.038730688393116, "per_token_gradient_norm": 2.3254733085632324, "per_token_gradient_norm/max": 6760.52587890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4048.444091796875, "per_token_policy_error_norm": 0.028332673013210297, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.02550533413887024, "policy_entropy": 0.05543107911944389, "policy_entropy/max": 3.546875, "policy_entropy/median": 5.960464477539063e-08, "policy_entropy/min": 4.845028458294598e-19, "policy_entropy/p25": 5.820766091346741e-10, "policy_entropy/p75": 8.630752563476562e-05, "policy_entropy/var": 0.0353984497487545, "policy_error_vector_variance/max_squared_error": 2.007999897003174, "policy_error_vector_variance/metric": 0.0283119548112154, "policy_loss": 0.08769124746322632, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.449187755584717, "policy_sharpness": 8.778227806091309, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 7.282679557800293, "reward": 0.828125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14251956343650818, "rewards/accuracy_reward": 0.828125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14251956343650818, "sentence_full_gradient_variance/max_squared_error": 368439.53125, "sentence_full_gradient_variance/metric": 1570.1175537109375, "sentence_full_gradient_variance/p75": 13.494827270507812, "sentence_full_gradient_variance/p90": 13.541082382202148, "sentence_full_gradient_variance/p95": 13.541082382202148, "sentence_full_gradient_variance/p99": 24053.533203125, "state_level_variance/metric": 24.44381332397461, "state_level_variance_full_gradient/metric": 190.955322265625, "step": 49 }, { "accuracy_reward": 0.83203125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.13993747532367706, "action_level_variance/metric": 603.639404296875, "action_level_variance_full_gradient/metric": 1256.57470703125, "adam_stats/lr_effective_max": 4.955342228640802e-05, "adam_stats/lr_effective_mean": -2.6253951790344843e-10, "adam_stats/lr_effective_min": -4.862634523306042e-05, "adam_stats/m_t_max": 0.000868610164616257, "adam_stats/m_t_mean": -3.04094829829335e-13, "adam_stats/m_t_min": -0.001327650505118072, "adam_stats/v_t_max": 7.351011299761012e-05, "adam_stats/v_t_mean": 3.4647783773988916e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.019455179572105408, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.391165256500244, "all_logprobs": -0.04964360222220421, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.375, "all_logprobs/p1": -1.3125, "all_logprobs/p10": -0.023193359375, "all_logprobs/p25": -4.76837158203125e-06, "all_logprobs/p5": -0.2021484375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.07330552488565445, "clip_ratio": 0.0, "completion_length": 552.3255615234375, "completion_length/correct": 490.5555419921875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 441.0, "completion_length/correct/min": 96.0, "completion_length/correct/p25": 346.5, "completion_length/correct/p75": 614.5, "completion_length/correct/var": 43314.66015625, "completion_length/incorrect": 858.3023071289062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 962.0, "completion_length/incorrect/min": 279.0, "completion_length/incorrect/p25": 753.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 43175.8125, "completion_length/max": 1024.0, "completion_length/median": 481.0, "completion_length/min": 96.0, "completion_length/p25": 365.0, "completion_length/p75": 739.0, "completion_length/var": 62159.828125, "epoch": 0.64, "feature_vector_variance/max_squared_error": 134853.078125, "feature_vector_variance/metric": 30322.587890625, "generated_tokens/total": 24057092.0, "grad_norm": 0.09867305308580399, "grouped_std_rewards": 0.1291896104812622, "learning_rate": 8.80236133250198e-06, "loss": -0.0195, "mean_logprobs": -0.0517578125, "mean_logprobs/var": 0.0013885498046875, "num_completions/total": 38400, "per_sentence_gradient_norm": 2.519113302230835, "per_sentence_gradient_norm/max": 556.9674072265625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 61.68351364135742, "per_sentence_gradient_norm/var": 598.0722045898438, "per_token_feature_norm": 193.7740478515625, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 66.5, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 212.0, "per_token_feature_norm/var": 799.4959716796875, "per_token_full_gradient_variance/max_squared_error": 207.88902282714844, "per_token_full_gradient_variance/variance": 0.051126204431056976, "per_token_gradient_norm": 3.3205902576446533, "per_token_gradient_norm/max": 5704.20849609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6178.50634765625, "per_token_policy_error_norm": 0.028062963858246803, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.025379978120326996, "policy_entropy": 0.054663170129060745, "policy_entropy/max": 3.8125, "policy_entropy/median": 6.05359673500061e-08, "policy_entropy/min": 4.553649124439119e-18, "policy_entropy/p25": 6.075424607843161e-10, "policy_entropy/p75": 6.580352783203125e-05, "policy_entropy/var": 0.03484557941555977, "policy_error_vector_variance/max_squared_error": 2.0082287788391113, "policy_error_vector_variance/metric": 0.028040768578648567, "policy_loss": -0.019455179572105408, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.391165256500244, "policy_sharpness": 8.798562049865723, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 7.194507598876953, "reward": 0.83203125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.13993747532367706, "rewards/accuracy_reward": 0.83203125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.13993747532367706, "sentence_full_gradient_variance/max_squared_error": 443460.0, "sentence_full_gradient_variance/metric": 1408.2264404296875, "sentence_full_gradient_variance/p75": 85.60978698730469, "sentence_full_gradient_variance/p90": 87.29026794433594, "sentence_full_gradient_variance/p95": 87.29026794433594, "sentence_full_gradient_variance/p99": 18178.119140625, "state_level_variance/metric": 69.83646392822266, "state_level_variance_full_gradient/metric": 151.65167236328125, "step": 50 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18774443864822388, "action_level_variance/metric": 112.09815979003906, "action_level_variance_full_gradient/metric": 1935.1693115234375, "adam_stats/lr_effective_max": 5.085550947114825e-05, "adam_stats/lr_effective_mean": -1.799268650737318e-10, "adam_stats/lr_effective_min": -4.982125392416492e-05, "adam_stats/m_t_max": 0.0007188067538663745, "adam_stats/m_t_mean": -3.4228728792301544e-12, "adam_stats/m_t_min": -0.0010107132839038968, "adam_stats/v_t_max": 7.3436793172732e-05, "adam_stats/v_t_mean": 3.462503287560148e-12, "adam_stats/v_t_min": 0.0, "advantages": 4.183252895018086e-05, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.114304304122925, "all_logprobs": -0.04344475269317627, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.5, "all_logprobs/p1": -1.15625, "all_logprobs/p10": -0.01416015625, "all_logprobs/p25": -2.0265579223632812e-06, "all_logprobs/p5": -0.16015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.06283074617385864, "clip_ratio": 0.0, "completion_length": 620.0794677734375, "completion_length/correct": 528.3801879882812, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 520.0, "completion_length/correct/min": 173.0, "completion_length/correct/p25": 358.75, "completion_length/correct/p75": 649.0, "completion_length/correct/var": 40981.3984375, "completion_length/incorrect": 895.1771240234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 407.0, "completion_length/incorrect/p25": 777.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 33532.30859375, "completion_length/max": 1024.0, "completion_length/median": 576.0, "completion_length/min": 173.0, "completion_length/p25": 403.75, "completion_length/p75": 825.5, "completion_length/var": 64332.10546875, "epoch": 0.6528, "feature_vector_variance/max_squared_error": 135483.296875, "feature_vector_variance/metric": 30081.0078125, "generated_tokens/total": 24533312.0, "grad_norm": 0.10042762011289597, "grouped_std_rewards": 0.13852691650390625, "learning_rate": 8.543798257200491e-06, "loss": -0.0, "mean_logprobs": -0.04443359375, "mean_logprobs/var": 0.00064849853515625, "num_completions/total": 39168, "per_sentence_gradient_norm": 1.6677355766296387, "per_sentence_gradient_norm/max": 109.26421356201172, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 62.716617584228516, "per_sentence_gradient_norm/var": 109.45933532714844, "per_token_feature_norm": 196.93276977539062, "per_token_feature_norm/max": 300.0, "per_token_feature_norm/median": 199.0, "per_token_feature_norm/min": 69.5, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 213.0, "per_token_feature_norm/var": 678.2456665039062, "per_token_full_gradient_variance/max_squared_error": 145.96934509277344, "per_token_full_gradient_variance/variance": 0.026249069720506668, "per_token_gradient_norm": 2.009917736053467, "per_token_gradient_norm/max": 5704.517578125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2627.586669921875, "per_token_policy_error_norm": 0.024747701361775398, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.022409828379750252, "policy_entropy": 0.0480385385453701, "policy_entropy/max": 3.1875, "policy_entropy/median": 3.725290298461914e-08, "policy_entropy/min": 2.6969529040576923e-18, "policy_entropy/p25": 4.420144250616431e-10, "policy_entropy/p75": 2.8371810913085938e-05, "policy_entropy/var": 0.02927306853234768, "policy_error_vector_variance/max_squared_error": 2.0079257488250732, "policy_error_vector_variance/metric": 0.02473384700715542, "policy_loss": -4.1828803659882396e-05, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.114304304122925, "policy_sharpness": 8.900132179260254, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.618468761444092, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.18774443864822388, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18774443864822388, "sentence_full_gradient_variance/max_squared_error": 330213.8125, "sentence_full_gradient_variance/metric": 2190.44091796875, "sentence_full_gradient_variance/p75": 16.110319137573242, "sentence_full_gradient_variance/p90": 117.63306427001953, "sentence_full_gradient_variance/p95": 117.63306427001953, "sentence_full_gradient_variance/p99": 45819.5, "state_level_variance/metric": 11.34914779663086, "state_level_variance_full_gradient/metric": 255.2718505859375, "step": 51 }, { "accuracy_reward": 0.8111979365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15335553884506226, "action_level_variance/metric": 113.87972259521484, "action_level_variance_full_gradient/metric": 1629.5423583984375, "adam_stats/lr_effective_max": 4.5699394831899554e-05, "adam_stats/lr_effective_mean": -1.3261777787043627e-10, "adam_stats/lr_effective_min": -4.7032575821504e-05, "adam_stats/m_t_max": 0.0007668674224987626, "adam_stats/m_t_mean": -4.4261998580208406e-12, "adam_stats/m_t_min": -0.0008592879166826606, "adam_stats/v_t_max": 7.337158604059368e-05, "adam_stats/v_t_mean": 3.459707563838177e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.08831347525119781, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.6414741277694702, "all_logprobs": -0.04050276428461075, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.0, "all_logprobs/p1": -1.140625, "all_logprobs/p10": -0.009765625, "all_logprobs/p25": -1.0728836059570312e-06, "all_logprobs/p5": -0.1318359375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.05849918723106384, "clip_ratio": 0.0, "completion_length": 567.32421875, "completion_length/correct": 502.7544250488281, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 475.0, "completion_length/correct/min": 129.0, "completion_length/correct/p25": 348.5, "completion_length/correct/p75": 636.0, "completion_length/correct/var": 41540.7109375, "completion_length/incorrect": 844.751708984375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 299.0, "completion_length/incorrect/p25": 726.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 56135.46484375, "completion_length/max": 1024.0, "completion_length/median": 529.0, "completion_length/min": 129.0, "completion_length/p25": 362.0, "completion_length/p75": 752.0, "completion_length/var": 62163.42578125, "epoch": 0.6656, "feature_vector_variance/max_squared_error": 138804.65625, "feature_vector_variance/metric": 29399.408203125, "generated_tokens/total": 24969016.0, "grad_norm": 0.06471672654151917, "grouped_std_rewards": 0.13765618205070496, "learning_rate": 8.283963474507402e-06, "loss": -0.0883, "mean_logprobs": -0.042724609375, "mean_logprobs/var": 0.000614166259765625, "num_completions/total": 39936, "per_sentence_gradient_norm": 1.483828067779541, "per_sentence_gradient_norm/max": 155.97080993652344, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 37.04349899291992, "per_sentence_gradient_norm/var": 111.82357025146484, "per_token_feature_norm": 195.9656524658203, "per_token_feature_norm/max": 306.0, "per_token_feature_norm/median": 199.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 184.0, "per_token_feature_norm/p75": 213.0, "per_token_feature_norm/var": 709.867919921875, "per_token_full_gradient_variance/max_squared_error": 216.5301055908203, "per_token_full_gradient_variance/variance": 0.026998454704880714, "per_token_gradient_norm": 1.9068477153778076, "per_token_gradient_norm/max": 5435.14208984375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2688.98388671875, "per_token_policy_error_norm": 0.0232283603399992, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.021080680191516876, "policy_entropy": 0.044465918093919754, "policy_entropy/max": 3.8125, "policy_entropy/median": 2.9569491744041443e-08, "policy_entropy/min": 3.130633773051894e-18, "policy_entropy/p25": 3.6925484891980886e-10, "policy_entropy/p75": 1.6927719116210938e-05, "policy_entropy/var": 0.02647024393081665, "policy_error_vector_variance/max_squared_error": 2.003851890563965, "policy_error_vector_variance/metric": 0.023214561864733696, "policy_loss": -0.08831347525119781, "policy_loss/max": 9.659051895141602, "policy_loss/median": 0.0, "policy_loss/min": -19.79339599609375, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.6414742469787598, "policy_sharpness": 8.965866088867188, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.285223960876465, "reward": 0.8111979365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15335553884506226, "rewards/accuracy_reward": 0.8111979365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15335553884506226, "sentence_full_gradient_variance/max_squared_error": 680073.25, "sentence_full_gradient_variance/metric": 1832.058349609375, "sentence_full_gradient_variance/p75": 53.37266540527344, "sentence_full_gradient_variance/p90": 84.06962585449219, "sentence_full_gradient_variance/p95": 84.06962585449219, "sentence_full_gradient_variance/p99": 39159.94140625, "state_level_variance/metric": 12.159886360168457, "state_level_variance_full_gradient/metric": 202.51597595214844, "step": 52 }, { "accuracy_reward": 0.8411458730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.13379374146461487, "action_level_variance/metric": 96.32028198242188, "action_level_variance_full_gradient/metric": 1781.10546875, "adam_stats/lr_effective_max": 4.487380647333339e-05, "adam_stats/lr_effective_mean": -1.1540553929156872e-10, "adam_stats/lr_effective_min": -4.183627606835216e-05, "adam_stats/m_t_max": 0.0007117336499504745, "adam_stats/m_t_mean": -1.6312355112513277e-12, "adam_stats/m_t_min": -0.0008993071387521923, "adam_stats/v_t_max": 7.329825166380033e-05, "adam_stats/v_t_mean": 3.457261820577484e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.025249775499105453, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.779266595840454, "all_logprobs": -0.0442609079182148, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.0625, "all_logprobs/p1": -1.1877365112304688, "all_logprobs/p10": -0.015625, "all_logprobs/p25": -3.0994415283203125e-06, "all_logprobs/p5": -0.16015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.06422467529773712, "clip_ratio": 0.0, "completion_length": 550.2578125, "completion_length/correct": 497.12847900390625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 452.0, "completion_length/correct/min": 172.0, "completion_length/correct/p25": 348.0, "completion_length/correct/p75": 611.0, "completion_length/correct/var": 37256.578125, "completion_length/incorrect": 831.5819091796875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 942.0, "completion_length/incorrect/min": 319.0, "completion_length/incorrect/p25": 593.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 51666.12109375, "completion_length/max": 1024.0, "completion_length/median": 482.0, "completion_length/min": 172.0, "completion_length/p25": 364.75, "completion_length/p75": 704.75, "completion_length/var": 54447.26953125, "epoch": 0.6784, "feature_vector_variance/max_squared_error": 135544.21875, "feature_vector_variance/metric": 30292.8515625, "generated_tokens/total": 25391616.0, "grad_norm": 0.09017767757177353, "grouped_std_rewards": 0.09666304290294647, "learning_rate": 8.02317355308094e-06, "loss": 0.0252, "mean_logprobs": -0.0439453125, "mean_logprobs/var": 0.000743865966796875, "num_completions/total": 40704, "per_sentence_gradient_norm": 1.2511171102523804, "per_sentence_gradient_norm/max": 185.6773681640625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 43.09260177612305, "per_sentence_gradient_norm/var": 94.87853240966797, "per_token_feature_norm": 197.2381591796875, "per_token_feature_norm/max": 294.0, "per_token_feature_norm/median": 199.0, "per_token_feature_norm/min": 48.0, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 213.0, "per_token_feature_norm/var": 672.2271728515625, "per_token_full_gradient_variance/max_squared_error": 270.7451171875, "per_token_full_gradient_variance/variance": 0.023663997650146484, "per_token_gradient_norm": 1.6804224252700806, "per_token_gradient_norm/max": 6680.26953125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2429.47412109375, "per_token_policy_error_norm": 0.025162965059280396, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.02282783016562462, "policy_entropy": 0.0490339957177639, "policy_entropy/max": 2.875, "policy_entropy/median": 5.029141902923584e-08, "policy_entropy/min": 7.426784881525705e-18, "policy_entropy/p25": 5.566107574850321e-10, "policy_entropy/p75": 4.4345855712890625e-05, "policy_entropy/var": 0.029716523364186287, "policy_error_vector_variance/max_squared_error": 2.007882833480835, "policy_error_vector_variance/metric": 0.025144562125205994, "policy_loss": 0.025249779224395752, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.779266595840454, "policy_sharpness": 8.86966323852539, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.776910305023193, "reward": 0.8411458730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.13379374146461487, "rewards/accuracy_reward": 0.8411458730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.13379374146461487, "sentence_full_gradient_variance/max_squared_error": 432823.78125, "sentence_full_gradient_variance/metric": 2022.720458984375, "sentence_full_gradient_variance/p75": 13.65489387512207, "sentence_full_gradient_variance/p90": 76.54634857177734, "sentence_full_gradient_variance/p95": 76.54634857177734, "sentence_full_gradient_variance/p99": 37114.73828125, "state_level_variance/metric": 10.585002899169922, "state_level_variance_full_gradient/metric": 241.61485290527344, "step": 53 }, { "accuracy_reward": 0.8190104365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1484256237745285, "action_level_variance/metric": 120.75401306152344, "action_level_variance_full_gradient/metric": 3024.67822265625, "adam_stats/lr_effective_max": 4.3470499804243445e-05, "adam_stats/lr_effective_mean": -1.1388180676252802e-10, "adam_stats/lr_effective_min": -4.269486089469865e-05, "adam_stats/m_t_max": 0.0006062358734197915, "adam_stats/m_t_mean": -4.303276051470517e-12, "adam_stats/m_t_min": -0.0008197338320314884, "adam_stats/v_t_max": 7.324320904444903e-05, "adam_stats/v_t_mean": 3.454789405943348e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.08902405947446823, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.613647699356079, "all_logprobs": -0.041270267218351364, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.625, "all_logprobs/p1": -1.140625, "all_logprobs/p10": -0.01104736328125, "all_logprobs/p25": -1.7881393432617188e-06, "all_logprobs/p5": -0.142578125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.05945087969303131, "clip_ratio": 0.0, "completion_length": 552.7409057617188, "completion_length/correct": 500.40380859375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 453.0, "completion_length/correct/min": 126.0, "completion_length/correct/p25": 332.0, "completion_length/correct/p75": 655.0, "completion_length/correct/var": 42684.8125, "completion_length/incorrect": 789.5755615234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 840.0, "completion_length/incorrect/min": 232.0, "completion_length/incorrect/p25": 562.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 62846.03515625, "completion_length/max": 1024.0, "completion_length/median": 495.0, "completion_length/min": 126.0, "completion_length/p25": 350.0, "completion_length/p75": 733.0, "completion_length/var": 58667.99609375, "epoch": 0.6912, "feature_vector_variance/max_squared_error": 146258.609375, "feature_vector_variance/metric": 29553.421875, "generated_tokens/total": 25816120.0, "grad_norm": 0.0636943057179451, "grouped_std_rewards": 0.10836939513683319, "learning_rate": 7.76174622526876e-06, "loss": 0.089, "mean_logprobs": -0.04296875, "mean_logprobs/var": 0.000682830810546875, "num_completions/total": 41472, "per_sentence_gradient_norm": 1.569475531578064, "per_sentence_gradient_norm/max": 141.7397003173828, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 50.51249313354492, "per_sentence_gradient_norm/var": 118.44497680664062, "per_token_feature_norm": 196.9976348876953, "per_token_feature_norm/max": 304.0, "per_token_feature_norm/median": 199.0, "per_token_feature_norm/min": 68.5, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 213.0, "per_token_feature_norm/var": 643.3179931640625, "per_token_full_gradient_variance/max_squared_error": 170.31576538085938, "per_token_full_gradient_variance/variance": 0.040870700031518936, "per_token_gradient_norm": 2.099789619445801, "per_token_gradient_norm/max": 6278.216796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3592.385498046875, "per_token_policy_error_norm": 0.023567363619804382, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.02143678069114685, "policy_entropy": 0.04572403058409691, "policy_entropy/max": 3.8125, "policy_entropy/median": 3.655441105365753e-08, "policy_entropy/min": 2.3852447794681098e-18, "policy_entropy/p25": 4.3291947804391384e-10, "policy_entropy/p75": 2.5272369384765625e-05, "policy_entropy/var": 0.027228286489844322, "policy_error_vector_variance/max_squared_error": 2.0060784816741943, "policy_error_vector_variance/metric": 0.023546433076262474, "policy_loss": 0.08902406692504883, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.613647699356079, "policy_sharpness": 8.937225341796875, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.423476219177246, "reward": 0.8190104365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1484256237745285, "rewards/accuracy_reward": 0.8190104365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1484256237745285, "sentence_full_gradient_variance/max_squared_error": 664024.25, "sentence_full_gradient_variance/metric": 3388.56396484375, "sentence_full_gradient_variance/p75": 192.1410369873047, "sentence_full_gradient_variance/p90": 266.0538024902344, "sentence_full_gradient_variance/p95": 266.0538024902344, "sentence_full_gradient_variance/p99": 51473.37109375, "state_level_variance/metric": 12.763954162597656, "state_level_variance_full_gradient/metric": 363.8854675292969, "step": 54 }, { "accuracy_reward": 0.7721354365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1761717051267624, "action_level_variance/metric": 286.7735595703125, "action_level_variance_full_gradient/metric": 2107.8232421875, "adam_stats/lr_effective_max": 4.1756840801099315e-05, "adam_stats/lr_effective_mean": -1.2866994969495948e-10, "adam_stats/lr_effective_min": -4.3005940824514255e-05, "adam_stats/m_t_max": 0.0011088851606473327, "adam_stats/m_t_mean": -1.3440972293499165e-11, "adam_stats/m_t_min": -0.0008433082839474082, "adam_stats/v_t_max": 7.32378612156026e-05, "adam_stats/v_t_mean": 3.4547371473986344e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0007467195391654968, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.8175454139709473, "all_logprobs": -0.04268820583820343, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.75, "all_logprobs/p1": -1.1484375, "all_logprobs/p10": -0.01416015625, "all_logprobs/p25": -2.1457672119140625e-06, "all_logprobs/p5": -0.16015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.06136397272348404, "clip_ratio": 0.0, "completion_length": 571.8099365234375, "completion_length/correct": 489.6930847167969, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 456.0, "completion_length/correct/min": 158.0, "completion_length/correct/p25": 348.0, "completion_length/correct/p75": 612.0, "completion_length/correct/var": 33957.87109375, "completion_length/incorrect": 850.0685424804688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 240.0, "completion_length/incorrect/p25": 671.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 54813.97265625, "completion_length/max": 1024.0, "completion_length/median": 516.0, "completion_length/min": 158.0, "completion_length/p25": 372.0, "completion_length/p75": 729.0, "completion_length/var": 61524.46875, "epoch": 0.704, "feature_vector_variance/max_squared_error": 134038.234375, "feature_vector_variance/metric": 29507.517578125, "generated_tokens/total": 26255270.0, "grad_norm": 0.12792421877384186, "grouped_std_rewards": 0.11774790287017822, "learning_rate": 7.5e-06, "loss": -0.0007, "mean_logprobs": -0.04443359375, "mean_logprobs/var": 0.000675201416015625, "num_completions/total": 42240, "per_sentence_gradient_norm": 2.1356914043426514, "per_sentence_gradient_norm/max": 263.77618408203125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 49.26011657714844, "per_sentence_gradient_norm/var": 282.5803527832031, "per_token_feature_norm": 196.63356018066406, "per_token_feature_norm/max": 296.0, "per_token_feature_norm/median": 199.0, "per_token_feature_norm/min": 70.0, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 212.0, "per_token_feature_norm/var": 640.1890869140625, "per_token_full_gradient_variance/max_squared_error": 259.6190185546875, "per_token_full_gradient_variance/variance": 0.04686881601810455, "per_token_gradient_norm": 2.7584235668182373, "per_token_gradient_norm/max": 6541.5615234375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4927.41357421875, "per_token_policy_error_norm": 0.024448640644550323, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.022193806245923042, "policy_entropy": 0.047140203416347504, "policy_entropy/max": 3.46875, "policy_entropy/median": 3.655441105365753e-08, "policy_entropy/min": 3.333921680392926e-18, "policy_entropy/p25": 4.129105946049094e-10, "policy_entropy/p75": 3.0994415283203125e-05, "policy_entropy/var": 0.027848826721310616, "policy_error_vector_variance/max_squared_error": 2.0066492557525635, "policy_error_vector_variance/metric": 0.024424483999609947, "policy_loss": -0.0007467120885848999, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.8175458908081055, "policy_sharpness": 8.90626335144043, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.580687999725342, "reward": 0.7721354365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1761717051267624, "rewards/accuracy_reward": 0.7721354365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1761717051267624, "sentence_full_gradient_variance/max_squared_error": 592849.875, "sentence_full_gradient_variance/metric": 2374.24462890625, "sentence_full_gradient_variance/p75": 72.26671600341797, "sentence_full_gradient_variance/p90": 157.63919067382812, "sentence_full_gradient_variance/p95": 157.63919067382812, "sentence_full_gradient_variance/p99": 56143.9140625, "state_level_variance/metric": 31.614843368530273, "state_level_variance_full_gradient/metric": 266.4216613769531, "step": 55 }, { "accuracy_reward": 0.7291666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1977400928735733, "action_level_variance/metric": 46.80665588378906, "action_level_variance_full_gradient/metric": 2481.017333984375, "adam_stats/lr_effective_max": 4.215644730720669e-05, "adam_stats/lr_effective_mean": -1.1785979137091118e-10, "adam_stats/lr_effective_min": -4.150750828557648e-05, "adam_stats/m_t_max": 0.001210093847475946, "adam_stats/m_t_mean": -1.3229132199421567e-11, "adam_stats/m_t_min": -0.0008627372444607317, "adam_stats/v_t_max": 7.316958362935111e-05, "adam_stats/v_t_mean": 3.4516074894075377e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.06011238321661949, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.5261286497116089, "all_logprobs": -0.03832518681883812, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.75, "all_logprobs/p1": -1.1328125, "all_logprobs/p10": -0.007598876953125, "all_logprobs/p25": -8.344650268554688e-07, "all_logprobs/p5": -0.11745548248291016, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.05608004331588745, "clip_ratio": 0.0, "completion_length": 613.0078125, "completion_length/correct": 514.6571655273438, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 468.0, "completion_length/correct/min": 176.0, "completion_length/correct/p25": 362.75, "completion_length/correct/p75": 642.0, "completion_length/correct/var": 39598.48828125, "completion_length/incorrect": 877.798095703125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 275.0, "completion_length/incorrect/p25": 755.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 48725.640625, "completion_length/max": 1024.0, "completion_length/median": 565.0, "completion_length/min": 176.0, "completion_length/p25": 388.75, "completion_length/p75": 844.25, "completion_length/var": 68086.375, "epoch": 0.7168, "feature_vector_variance/max_squared_error": 141913.46875, "feature_vector_variance/metric": 28619.16015625, "generated_tokens/total": 26726060.0, "grad_norm": 0.049901194870471954, "grouped_std_rewards": 0.1116107925772667, "learning_rate": 7.238253774731245e-06, "loss": -0.0601, "mean_logprobs": -0.0390625, "mean_logprobs/var": 0.00069427490234375, "num_completions/total": 43008, "per_sentence_gradient_norm": 0.9876631498336792, "per_sentence_gradient_norm/max": 88.59687042236328, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 31.002710342407227, "per_sentence_gradient_norm/var": 45.89093017578125, "per_token_feature_norm": 196.8551025390625, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 199.0, "per_token_feature_norm/min": 73.0, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 212.0, "per_token_feature_norm/var": 586.5032958984375, "per_token_full_gradient_variance/max_squared_error": 137.66648864746094, "per_token_full_gradient_variance/variance": 0.014909247867763042, "per_token_gradient_norm": 1.0721663236618042, "per_token_gradient_norm/max": 4937.5234375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1218.8743896484375, "per_token_policy_error_norm": 0.02195601537823677, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.020273638889193535, "policy_entropy": 0.0419527143239975, "policy_entropy/max": 3.75, "policy_entropy/median": 2.6426278054714203e-08, "policy_entropy/min": 4.9060148304969076e-18, "policy_entropy/p25": 3.092281986027956e-10, "policy_entropy/p75": 1.2695789337158203e-05, "policy_entropy/var": 0.02483390085399151, "policy_error_vector_variance/max_squared_error": 2.006274938583374, "policy_error_vector_variance/metric": 0.021929923444986343, "policy_loss": -0.06011237949132919, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.5261286497116089, "policy_sharpness": 9.015398025512695, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.013247966766357, "reward": 0.7291666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.1977400928735733, "rewards/accuracy_reward": 0.7291666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1977400928735733, "sentence_full_gradient_variance/max_squared_error": 1088194.125, "sentence_full_gradient_variance/metric": 2779.806396484375, "sentence_full_gradient_variance/p75": 86.3342056274414, "sentence_full_gradient_variance/p90": 225.9904022216797, "sentence_full_gradient_variance/p95": 225.9904022216797, "sentence_full_gradient_variance/p99": 51450.35546875, "state_level_variance/metric": 4.926673412322998, "state_level_variance_full_gradient/metric": 298.7889709472656, "step": 56 }, { "accuracy_reward": 0.8463541865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1302083432674408, "action_level_variance/metric": 59.893795013427734, "action_level_variance_full_gradient/metric": 686.4127807617188, "adam_stats/lr_effective_max": 3.937382280128077e-05, "adam_stats/lr_effective_mean": -1.0822825274869885e-10, "adam_stats/lr_effective_min": -4.149148298893124e-05, "adam_stats/m_t_max": 0.0007945897523313761, "adam_stats/m_t_mean": -9.016738744538344e-12, "adam_stats/m_t_min": -0.00073664216324687, "adam_stats/v_t_max": 7.31066829757765e-05, "adam_stats/v_t_mean": 3.4495861028771557e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.015960179269313812, "advantages/max": 7.48191499710083, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.1249651908874512, "all_logprobs": -0.03813241049647331, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.1875, "all_logprobs/p1": -1.078125, "all_logprobs/p10": -0.0067138671875, "all_logprobs/p25": -8.344650268554688e-07, "all_logprobs/p5": -0.11279296875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.056194644421339035, "clip_ratio": 0.0, "completion_length": 574.7643432617188, "completion_length/correct": 521.4169311523438, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 490.0, "completion_length/correct/min": 172.0, "completion_length/correct/p25": 372.0, "completion_length/correct/p75": 645.5, "completion_length/correct/var": 40508.0703125, "completion_length/incorrect": 868.6271362304688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 302.0, "completion_length/incorrect/p25": 753.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 48414.64453125, "completion_length/max": 1024.0, "completion_length/median": 535.0, "completion_length/min": 172.0, "completion_length/p25": 388.75, "completion_length/p75": 739.0, "completion_length/var": 57358.59765625, "epoch": 0.7296, "feature_vector_variance/max_squared_error": 165142.234375, "feature_vector_variance/metric": 28791.3203125, "generated_tokens/total": 27167480.0, "grad_norm": 0.08575651794672012, "grouped_std_rewards": 0.10253679752349854, "learning_rate": 6.976826446919061e-06, "loss": -0.016, "mean_logprobs": -0.038330078125, "mean_logprobs/var": 0.00098419189453125, "num_completions/total": 43776, "per_sentence_gradient_norm": 0.9192080497741699, "per_sentence_gradient_norm/max": 170.53170776367188, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 26.657182693481445, "per_sentence_gradient_norm/var": 59.12583923339844, "per_token_feature_norm": 195.77650451660156, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 69.0, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 580.7973022460938, "per_token_full_gradient_variance/max_squared_error": 566.2098388671875, "per_token_full_gradient_variance/variance": 0.014062853530049324, "per_token_gradient_norm": 1.2498981952667236, "per_token_gradient_norm/max": 7212.8359375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1751.254638671875, "per_token_policy_error_norm": 0.021690700203180313, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.019743235781788826, "policy_entropy": 0.042172107845544815, "policy_entropy/max": 3.65625, "policy_entropy/median": 2.270098775625229e-08, "policy_entropy/min": 4.553649124439119e-18, "policy_entropy/p25": 2.382876118645072e-10, "policy_entropy/p75": 1.2874603271484375e-05, "policy_entropy/var": 0.026679379865527153, "policy_error_vector_variance/max_squared_error": 2.005929946899414, "policy_error_vector_variance/metric": 0.021661890670657158, "policy_loss": -0.015960169956088066, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -7.481914520263672, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.1249653100967407, "policy_sharpness": 9.022354125976562, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.985549449920654, "reward": 0.8463541865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1302083432674408, "rewards/accuracy_reward": 0.8463541865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1302083432674408, "sentence_full_gradient_variance/max_squared_error": 97668.1328125, "sentence_full_gradient_variance/metric": 769.2658081054688, "sentence_full_gradient_variance/p75": 18.03800392150879, "sentence_full_gradient_variance/p90": 69.2143783569336, "sentence_full_gradient_variance/p95": 69.2143783569336, "sentence_full_gradient_variance/p99": 28526.990234375, "state_level_variance/metric": 6.71169376373291, "state_level_variance_full_gradient/metric": 82.85304260253906, "step": 57 }, { "accuracy_reward": 0.7877604365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1674119085073471, "action_level_variance/metric": 35.7793083190918, "action_level_variance_full_gradient/metric": 2561.650390625, "adam_stats/lr_effective_max": 3.6149303923593834e-05, "adam_stats/lr_effective_mean": -1.0068215705594241e-10, "adam_stats/lr_effective_min": -3.854066744679585e-05, "adam_stats/m_t_max": 0.0006371806957758963, "adam_stats/m_t_mean": -7.816658778581065e-12, "adam_stats/m_t_min": -0.0006637408514507115, "adam_stats/v_t_max": 7.303365418920293e-05, "adam_stats/v_t_mean": 3.4462903451132343e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.00829335954040289, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": -0.0, "advantages/p75": 0.0, "advantages/var": 1.4011690616607666, "all_logprobs": -0.03588743880391121, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.375, "all_logprobs/p1": -0.98046875, "all_logprobs/p10": -0.0067138671875, "all_logprobs/p25": -8.344650268554688e-07, "all_logprobs/p5": -0.10009765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.05112851411104202, "clip_ratio": 0.0, "completion_length": 563.83203125, "completion_length/correct": 485.8512268066406, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 463.0, "completion_length/correct/min": 139.0, "completion_length/correct/p25": 333.0, "completion_length/correct/p75": 612.0, "completion_length/correct/var": 38583.75, "completion_length/incorrect": 853.2698974609375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 996.0, "completion_length/incorrect/min": 255.0, "completion_length/incorrect/p25": 716.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 51125.71484375, "completion_length/max": 1024.0, "completion_length/median": 520.0, "completion_length/min": 139.0, "completion_length/p25": 348.75, "completion_length/p75": 727.25, "completion_length/var": 63782.4765625, "epoch": 0.7424, "feature_vector_variance/max_squared_error": 145238.765625, "feature_vector_variance/metric": 28304.095703125, "generated_tokens/total": 27600502.0, "grad_norm": 0.030135899782180786, "grouped_std_rewards": 0.10262416303157806, "learning_rate": 6.7160365254926005e-06, "loss": 0.0083, "mean_logprobs": -0.03564453125, "mean_logprobs/var": 0.000576019287109375, "num_completions/total": 44544, "per_sentence_gradient_norm": 0.9233092069625854, "per_sentence_gradient_norm/max": 66.25614166259766, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 34.215858459472656, "per_sentence_gradient_norm/var": 34.97233963012695, "per_token_feature_norm": 195.16464233398438, "per_token_feature_norm/max": 304.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 67.5, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 562.010009765625, "per_token_full_gradient_variance/max_squared_error": 94.75066375732422, "per_token_full_gradient_variance/variance": 0.013776687905192375, "per_token_gradient_norm": 1.1473517417907715, "per_token_gradient_norm/max": 3526.1787109375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1156.4075927734375, "per_token_policy_error_norm": 0.020519059151411057, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.018404722213745117, "policy_entropy": 0.040273573249578476, "policy_entropy/max": 3.78125, "policy_entropy/median": 2.898741513490677e-08, "policy_entropy/min": 9.698527246061739e-20, "policy_entropy/p25": 2.7466739993542433e-10, "policy_entropy/p75": 1.2636184692382812e-05, "policy_entropy/var": 0.023801714181900024, "policy_error_vector_variance/max_squared_error": 2.0065345764160156, "policy_error_vector_variance/metric": 0.02049415372312069, "policy_loss": 0.00829335954040289, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.4011690616607666, "policy_sharpness": 9.038990020751953, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.85551643371582, "reward": 0.7877604365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1674119085073471, "rewards/accuracy_reward": 0.7877604365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1674119085073471, "sentence_full_gradient_variance/max_squared_error": 1072948.0, "sentence_full_gradient_variance/metric": 2909.416015625, "sentence_full_gradient_variance/p75": 34.16775131225586, "sentence_full_gradient_variance/p90": 66.14220428466797, "sentence_full_gradient_variance/p95": 66.14220428466797, "sentence_full_gradient_variance/p99": 57356.0703125, "state_level_variance/metric": 3.658017873764038, "state_level_variance_full_gradient/metric": 347.765380859375, "step": 58 }, { "accuracy_reward": 0.8059896230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15657424926757812, "action_level_variance/metric": 104.6578369140625, "action_level_variance_full_gradient/metric": 2576.455322265625, "adam_stats/lr_effective_max": 3.465537156444043e-05, "adam_stats/lr_effective_mean": -1.8327867001843856e-10, "adam_stats/lr_effective_min": -3.36650809913408e-05, "adam_stats/m_t_max": 0.0010190194007009268, "adam_stats/m_t_mean": -1.1590483781076522e-11, "adam_stats/m_t_min": -0.0009000565623864532, "adam_stats/v_t_max": 7.298046693904325e-05, "adam_stats/v_t_mean": 3.449408293720868e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.050151146948337555, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.4773104190826416, "all_logprobs": -0.03153179958462715, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.25, "all_logprobs/p1": -0.95703125, "all_logprobs/p10": -0.003173828125, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.0712890625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04497332125902176, "clip_ratio": 0.0, "completion_length": 582.3346557617188, "completion_length/correct": 511.7916259765625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 478.0, "completion_length/correct/min": 161.0, "completion_length/correct/p25": 346.5, "completion_length/correct/p75": 641.0, "completion_length/correct/var": 41859.671875, "completion_length/incorrect": 875.39599609375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 278.0, "completion_length/incorrect/p25": 741.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 39608.2265625, "completion_length/max": 1024.0, "completion_length/median": 531.0, "completion_length/min": 161.0, "completion_length/p25": 379.75, "completion_length/p75": 765.5, "completion_length/var": 62071.05078125, "epoch": 0.7552, "feature_vector_variance/max_squared_error": 141732.046875, "feature_vector_variance/metric": 27857.36328125, "generated_tokens/total": 28047736.0, "grad_norm": 0.2387155294418335, "grouped_std_rewards": 0.09372615069150925, "learning_rate": 6.456201742799511e-06, "loss": -0.0502, "mean_logprobs": -0.0311279296875, "mean_logprobs/var": 0.0004444122314453125, "num_completions/total": 45312, "per_sentence_gradient_norm": 1.2481739521026611, "per_sentence_gradient_norm/max": 177.69679260253906, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 48.11866760253906, "per_sentence_gradient_norm/var": 103.23432159423828, "per_token_feature_norm": 194.31471252441406, "per_token_feature_norm/max": 300.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 64.0, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 207.0, "per_token_feature_norm/var": 444.9736633300781, "per_token_full_gradient_variance/max_squared_error": 101.46728515625, "per_token_full_gradient_variance/variance": 0.025288742035627365, "per_token_gradient_norm": 1.4509257078170776, "per_token_gradient_norm/max": 5709.77490234375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2599.183837890625, "per_token_policy_error_norm": 0.01809738017618656, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.016729013994336128, "policy_entropy": 0.034906353801488876, "policy_entropy/max": 3.46875, "policy_entropy/median": 2.1420419216156006e-08, "policy_entropy/min": 1.3552527156068805e-19, "policy_entropy/p25": 1.90084392670542e-10, "policy_entropy/p75": 8.761882781982422e-06, "policy_entropy/var": 0.02053207717835903, "policy_error_vector_variance/max_squared_error": 2.001984119415283, "policy_error_vector_variance/metric": 0.01808425970375538, "policy_loss": -0.05015115067362785, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.4773104190826416, "policy_sharpness": 9.132710456848145, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.287900447845459, "reward": 0.8059896230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15657424926757812, "rewards/accuracy_reward": 0.8059896230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15657424926757812, "sentence_full_gradient_variance/max_squared_error": 706539.375, "sentence_full_gradient_variance/metric": 2905.0966796875, "sentence_full_gradient_variance/p75": 113.72769927978516, "sentence_full_gradient_variance/p90": 126.41804504394531, "sentence_full_gradient_variance/p95": 126.41804504394531, "sentence_full_gradient_variance/p99": 86510.3046875, "state_level_variance/metric": 11.645599365234375, "state_level_variance_full_gradient/metric": 328.6415100097656, "step": 59 }, { "accuracy_reward": 0.7565104365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18444254994392395, "action_level_variance/metric": 183.1912078857422, "action_level_variance_full_gradient/metric": 1336.505126953125, "adam_stats/lr_effective_max": 3.325475336168893e-05, "adam_stats/lr_effective_mean": -1.3887681282742648e-10, "adam_stats/lr_effective_min": -3.305673089926131e-05, "adam_stats/m_t_max": 0.0011978792026638985, "adam_stats/m_t_mean": -1.1441071180728901e-11, "adam_stats/m_t_min": -0.0007581508252769709, "adam_stats/v_t_max": 7.291536894626915e-05, "adam_stats/v_t_mean": 3.447217554811144e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.12139169871807098, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.6881566047668457, "all_logprobs": -0.03670266270637512, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.125, "all_logprobs/p1": -1.0546875, "all_logprobs/p10": -0.0067138671875, "all_logprobs/p25": -1.1920928955078125e-06, "all_logprobs/p5": -0.1005859375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.05294255539774895, "clip_ratio": 0.0, "completion_length": 591.8177490234375, "completion_length/correct": 511.25128173828125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 468.0, "completion_length/correct/min": 161.0, "completion_length/correct/p25": 340.0, "completion_length/correct/p75": 666.0, "completion_length/correct/var": 40733.5078125, "completion_length/incorrect": 842.1337280273438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 981.0, "completion_length/incorrect/min": 225.0, "completion_length/incorrect/p25": 670.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 51234.01171875, "completion_length/max": 1024.0, "completion_length/median": 548.0, "completion_length/min": 161.0, "completion_length/p25": 378.0, "completion_length/p75": 787.25, "completion_length/var": 63420.16015625, "epoch": 0.768, "feature_vector_variance/max_squared_error": 149481.28125, "feature_vector_variance/metric": 28314.9453125, "generated_tokens/total": 28502252.0, "grad_norm": 0.12383240461349487, "grouped_std_rewards": 0.128051295876503, "learning_rate": 6.197638667498023e-06, "loss": -0.1214, "mean_logprobs": -0.0361328125, "mean_logprobs/var": 0.00067138671875, "num_completions/total": 46080, "per_sentence_gradient_norm": 1.7353754043579102, "per_sentence_gradient_norm/max": 237.44044494628906, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 43.12663650512695, "per_sentence_gradient_norm/var": 180.41458129882812, "per_token_feature_norm": 192.01177978515625, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 194.0, "per_token_feature_norm/min": 68.5, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 206.0, "per_token_feature_norm/var": 561.9298095703125, "per_token_full_gradient_variance/max_squared_error": 156.34666442871094, "per_token_full_gradient_variance/variance": 0.038328684866428375, "per_token_gradient_norm": 2.3509597778320312, "per_token_gradient_norm/max": 6191.62060546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3977.370849609375, "per_token_policy_error_norm": 0.02102849818766117, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01915905252099037, "policy_entropy": 0.040728550404310226, "policy_entropy/max": 3.765625, "policy_entropy/median": 3.655441105365753e-08, "policy_entropy/min": 1.2451384324638215e-19, "policy_entropy/p25": 2.7284841053187847e-10, "policy_entropy/p75": 1.7881393432617188e-05, "policy_entropy/var": 0.024847866967320442, "policy_error_vector_variance/max_squared_error": 2.0067012310028076, "policy_error_vector_variance/metric": 0.02099786326289177, "policy_loss": -0.12139169871807098, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.6881563663482666, "policy_sharpness": 9.031051635742188, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.902164459228516, "reward": 0.7565104365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18444254994392395, "rewards/accuracy_reward": 0.7565104365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18444254994392395, "sentence_full_gradient_variance/max_squared_error": 213517.4375, "sentence_full_gradient_variance/metric": 1472.8387451171875, "sentence_full_gradient_variance/p75": 103.83321380615234, "sentence_full_gradient_variance/p90": 226.09791564941406, "sentence_full_gradient_variance/p95": 226.09791564941406, "sentence_full_gradient_variance/p99": 34444.00390625, "state_level_variance/metric": 20.09671401977539, "state_level_variance_full_gradient/metric": 136.33355712890625, "step": 60 }, { "accuracy_reward": 0.8229166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14591483771800995, "action_level_variance/metric": 105.83480834960938, "action_level_variance_full_gradient/metric": 3450.762451171875, "adam_stats/lr_effective_max": 3.359928086865693e-05, "adam_stats/lr_effective_mean": -9.755223018270698e-11, "adam_stats/lr_effective_min": -3.315069989184849e-05, "adam_stats/m_t_max": 0.0009667895501479506, "adam_stats/m_t_mean": -1.3778886018278591e-11, "adam_stats/m_t_min": -0.0008684929925948381, "adam_stats/v_t_max": 7.284688763320446e-05, "adam_stats/v_t_mean": 3.445626379702804e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.03223560377955437, "advantages/max": 7.48191499710083, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.8935390710830688, "all_logprobs": -0.03329417109489441, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.875, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.00408935546875, "all_logprobs/p25": -8.344650268554688e-07, "all_logprobs/p5": -0.07958984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.0486573725938797, "clip_ratio": 0.0, "completion_length": 567.7421875, "completion_length/correct": 516.7484130859375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 496.0, "completion_length/correct/min": 118.0, "completion_length/correct/p25": 391.0, "completion_length/correct/p75": 633.0, "completion_length/correct/var": 32171.09375, "completion_length/incorrect": 804.7132568359375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 855.0, "completion_length/incorrect/min": 295.0, "completion_length/incorrect/p25": 609.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 44418.81640625, "completion_length/max": 1024.0, "completion_length/median": 534.0, "completion_length/min": 118.0, "completion_length/p25": 416.75, "completion_length/p75": 687.0, "completion_length/var": 46384.671875, "epoch": 0.7808, "feature_vector_variance/max_squared_error": 147076.015625, "feature_vector_variance/metric": 27860.0390625, "generated_tokens/total": 28938276.0, "grad_norm": 0.0997760221362114, "grouped_std_rewards": 0.1069260686635971, "learning_rate": 5.9406623188668065e-06, "loss": 0.0322, "mean_logprobs": -0.033447265625, "mean_logprobs/var": 0.0007781982421875, "num_completions/total": 46848, "per_sentence_gradient_norm": 1.215348482131958, "per_sentence_gradient_norm/max": 226.9810791015625, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 41.342559814453125, "per_sentence_gradient_norm/var": 104.49378967285156, "per_token_feature_norm": 192.70120239257812, "per_token_feature_norm/max": 304.0, "per_token_feature_norm/median": 194.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 184.0, "per_token_feature_norm/p75": 205.0, "per_token_feature_norm/var": 473.0445556640625, "per_token_full_gradient_variance/max_squared_error": 385.46453857421875, "per_token_full_gradient_variance/variance": 0.025283293798565865, "per_token_gradient_norm": 1.5795761346817017, "per_token_gradient_norm/max": 6964.18115234375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2497.373779296875, "per_token_policy_error_norm": 0.019014691933989525, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.017237111926078796, "policy_entropy": 0.03703422099351883, "policy_entropy/max": 3.609375, "policy_entropy/median": 2.9569491744041443e-08, "policy_entropy/min": 1.0706496453294356e-18, "policy_entropy/p25": 2.064552973024547e-10, "policy_entropy/p75": 1.245737075805664e-05, "policy_entropy/var": 0.02176433429121971, "policy_error_vector_variance/max_squared_error": 2.0084564685821533, "policy_error_vector_variance/metric": 0.018988313153386116, "policy_loss": 0.032235607504844666, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -7.4819159507751465, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.8935390710830688, "policy_sharpness": 9.09504508972168, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.509760856628418, "reward": 0.8229166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14591483771800995, "rewards/accuracy_reward": 0.8229166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14591483771800995, "sentence_full_gradient_variance/max_squared_error": 1355351.875, "sentence_full_gradient_variance/metric": 3905.1298828125, "sentence_full_gradient_variance/p75": 72.3565902709961, "sentence_full_gradient_variance/p90": 150.17572021484375, "sentence_full_gradient_variance/p95": 150.17572021484375, "sentence_full_gradient_variance/p99": 41799.05859375, "state_level_variance/metric": 11.87598705291748, "state_level_variance_full_gradient/metric": 454.36810302734375, "step": 61 }, { "accuracy_reward": 0.7526041865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18643388152122498, "action_level_variance/metric": 102.76683044433594, "action_level_variance_full_gradient/metric": 1926.5546875, "adam_stats/lr_effective_max": 3.191061477991752e-05, "adam_stats/lr_effective_mean": -1.931724398496204e-11, "adam_stats/lr_effective_min": -3.3248736144742e-05, "adam_stats/m_t_max": 0.0007249435293488204, "adam_stats/m_t_mean": -9.402930907856377e-13, "adam_stats/m_t_min": -0.0007206973386928439, "adam_stats/v_t_max": 7.299947901628911e-05, "adam_stats/v_t_mean": 3.446663527501004e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.01829640567302704, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.5184261798858643, "all_logprobs": -0.03274922072887421, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.6875, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.00408935546875, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.0791015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.045191690325737, "clip_ratio": 0.0, "completion_length": 570.7982177734375, "completion_length/correct": 499.0104064941406, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 479.0, "completion_length/correct/min": 131.0, "completion_length/correct/p25": 348.0, "completion_length/correct/p75": 627.5, "completion_length/correct/var": 38274.17578125, "completion_length/incorrect": 789.1842651367188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 855.0, "completion_length/incorrect/min": 219.0, "completion_length/incorrect/p25": 561.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 62140.2265625, "completion_length/max": 1024.0, "completion_length/median": 533.0, "completion_length/min": 131.0, "completion_length/p25": 379.0, "completion_length/p75": 732.0, "completion_length/var": 59803.109375, "epoch": 0.7936, "feature_vector_variance/max_squared_error": 142159.53125, "feature_vector_variance/metric": 27976.123046875, "generated_tokens/total": 29376650.0, "grad_norm": 0.1459578424692154, "grouped_std_rewards": 0.1369946300983429, "learning_rate": 5.685585783002493e-06, "loss": -0.0183, "mean_logprobs": -0.03271484375, "mean_logprobs/var": 0.000415802001953125, "num_completions/total": 47616, "per_sentence_gradient_norm": 1.5145018100738525, "per_sentence_gradient_norm/max": 156.61398315429688, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 47.49754333496094, "per_sentence_gradient_norm/var": 100.60408782958984, "per_token_feature_norm": 191.01515197753906, "per_token_feature_norm/max": 302.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 68.5, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 496.946044921875, "per_token_full_gradient_variance/max_squared_error": 145.05355834960938, "per_token_full_gradient_variance/variance": 0.033567916601896286, "per_token_gradient_norm": 1.9097342491149902, "per_token_gradient_norm/max": 5706.06396484375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2835.260009765625, "per_token_policy_error_norm": 0.018975839018821716, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01716669276356697, "policy_entropy": 0.0366196446120739, "policy_entropy/max": 3.453125, "policy_entropy/median": 2.5494955480098724e-08, "policy_entropy/min": 1.5856456772600502e-18, "policy_entropy/p25": 1.8098944565281272e-10, "policy_entropy/p75": 1.055002212524414e-05, "policy_entropy/var": 0.020994046702980995, "policy_error_vector_variance/max_squared_error": 1.9946801662445068, "policy_error_vector_variance/metric": 0.018953965976834297, "policy_loss": -0.01829640381038189, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.5184264183044434, "policy_sharpness": 9.108033180236816, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.4643754959106445, "reward": 0.7526041865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18643388152122498, "rewards/accuracy_reward": 0.7526041865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18643388152122498, "sentence_full_gradient_variance/max_squared_error": 213314.59375, "sentence_full_gradient_variance/metric": 2188.13330078125, "sentence_full_gradient_variance/p75": 7.134788513183594, "sentence_full_gradient_variance/p90": 94.30184173583984, "sentence_full_gradient_variance/p95": 94.30184173583984, "sentence_full_gradient_variance/p99": 65242.640625, "state_level_variance/metric": 10.663213729858398, "state_level_variance_full_gradient/metric": 261.57867431640625, "step": 62 }, { "accuracy_reward": 0.7552083730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1851097196340561, "action_level_variance/metric": 44.35204315185547, "action_level_variance_full_gradient/metric": 2539.427001953125, "adam_stats/lr_effective_max": 3.0209976102923974e-05, "adam_stats/lr_effective_mean": -4.168907233315089e-11, "adam_stats/lr_effective_min": -3.1219227821566164e-05, "adam_stats/m_t_max": 0.0007729936041869223, "adam_stats/m_t_mean": 2.7989633180625084e-13, "adam_stats/m_t_min": -0.0007600167882628739, "adam_stats/v_t_max": 7.292772352229804e-05, "adam_stats/v_t_mean": 3.444827539542117e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.013241363689303398, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.4308733940124512, "all_logprobs": -0.030941996723413467, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.6875, "all_logprobs/p1": -0.90234375, "all_logprobs/p10": -0.0024871826171875, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.06396484375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.0448857843875885, "clip_ratio": 0.0, "completion_length": 579.3737182617188, "completion_length/correct": 503.3034362792969, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 457.0, "completion_length/correct/min": 109.0, "completion_length/correct/p25": 351.75, "completion_length/correct/p75": 623.5, "completion_length/correct/var": 40610.05859375, "completion_length/incorrect": 814.0584716796875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 971.0, "completion_length/incorrect/min": 212.0, "completion_length/incorrect/p25": 609.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 66631.109375, "completion_length/max": 1024.0, "completion_length/median": 506.0, "completion_length/min": 109.0, "completion_length/p25": 370.5, "completion_length/p75": 789.0, "completion_length/var": 64777.0390625, "epoch": 0.8064, "feature_vector_variance/max_squared_error": 146796.546875, "feature_vector_variance/metric": 27885.4921875, "generated_tokens/total": 29821608.0, "grad_norm": 0.09887552261352539, "grouped_std_rewards": 0.10424509644508362, "learning_rate": 5.432719831372507e-06, "loss": 0.0132, "mean_logprobs": -0.03125, "mean_logprobs/var": 0.0004444122314453125, "num_completions/total": 48384, "per_sentence_gradient_norm": 0.9478491544723511, "per_sentence_gradient_norm/max": 94.24639892578125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 33.17183303833008, "per_sentence_gradient_norm/var": 43.51027297973633, "per_token_feature_norm": 190.6471710205078, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 72.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 494.67193603515625, "per_token_full_gradient_variance/max_squared_error": 111.61968231201172, "per_token_full_gradient_variance/variance": 0.01388537883758545, "per_token_gradient_norm": 1.1039007902145386, "per_token_gradient_norm/max": 4862.2158203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1248.7249755859375, "per_token_policy_error_norm": 0.01774364709854126, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.016332576051354408, "policy_entropy": 0.03421241044998169, "policy_entropy/max": 3.40625, "policy_entropy/median": 2.3632310330867767e-08, "policy_entropy/min": 1.8465318250143747e-19, "policy_entropy/p25": 1.709850039333105e-10, "policy_entropy/p75": 8.404254913330078e-06, "policy_entropy/var": 0.01997997611761093, "policy_error_vector_variance/max_squared_error": 2.0061542987823486, "policy_error_vector_variance/metric": 0.017720771953463554, "policy_loss": 0.013241356238722801, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.4308733940124512, "policy_sharpness": 9.15993881225586, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.1724700927734375, "reward": 0.7552083730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1851097196340561, "rewards/accuracy_reward": 0.7552083730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1851097196340561, "sentence_full_gradient_variance/max_squared_error": 915163.875, "sentence_full_gradient_variance/metric": 2885.3994140625, "sentence_full_gradient_variance/p75": 30.054460525512695, "sentence_full_gradient_variance/p90": 64.50424194335938, "sentence_full_gradient_variance/p95": 64.50424194335938, "sentence_full_gradient_variance/p99": 33880.375, "state_level_variance/metric": 4.694487571716309, "state_level_variance_full_gradient/metric": 345.97235107421875, "step": 63 }, { "accuracy_reward": 0.8463541865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1302083432674408, "action_level_variance/metric": 49.160125732421875, "action_level_variance_full_gradient/metric": 1279.115234375, "adam_stats/lr_effective_max": 2.6809071641764604e-05, "adam_stats/lr_effective_mean": -4.568857098208312e-11, "adam_stats/lr_effective_min": -2.7366246285964735e-05, "adam_stats/m_t_max": 0.0005489049945026636, "adam_stats/m_t_mean": 3.73820237460111e-14, "adam_stats/m_t_min": -0.0005800182116217911, "adam_stats/v_t_max": 7.285823085112497e-05, "adam_stats/v_t_mean": 3.441787653490902e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.010617944411933422, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.873626708984375, "all_logprobs": -0.030053557828068733, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.6875, "all_logprobs/p1": -0.875, "all_logprobs/p10": -0.00193023681640625, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04391307383775711, "clip_ratio": 0.0, "completion_length": 558.6315307617188, "completion_length/correct": 510.0430908203125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 476.0, "completion_length/correct/min": 159.0, "completion_length/correct/p25": 351.75, "completion_length/correct/p75": 643.5, "completion_length/correct/var": 39392.4296875, "completion_length/incorrect": 826.2796630859375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 222.0, "completion_length/incorrect/p25": 630.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 68571.5078125, "completion_length/max": 1024.0, "completion_length/median": 507.0, "completion_length/min": 159.0, "completion_length/p25": 364.75, "completion_length/p75": 706.25, "completion_length/var": 56813.67578125, "epoch": 0.8192, "feature_vector_variance/max_squared_error": 128068.8359375, "feature_vector_variance/metric": 27714.408203125, "generated_tokens/total": 30250638.0, "grad_norm": 0.057441070675849915, "grouped_std_rewards": 0.12034021317958832, "learning_rate": 5.182372542187895e-06, "loss": 0.0106, "mean_logprobs": -0.031005859375, "mean_logprobs/var": 0.000469207763671875, "num_completions/total": 49152, "per_sentence_gradient_norm": 0.9820560216903687, "per_sentence_gradient_norm/max": 98.40292358398438, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 25.024303436279297, "per_sentence_gradient_norm/var": 48.25852966308594, "per_token_feature_norm": 189.0646209716797, "per_token_feature_norm/max": 296.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 69.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 540.5189208984375, "per_token_full_gradient_variance/max_squared_error": 107.84678649902344, "per_token_full_gradient_variance/variance": 0.01731765642762184, "per_token_gradient_norm": 1.2331846952438354, "per_token_gradient_norm/max": 5035.25341796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1702.0572509765625, "per_token_policy_error_norm": 0.01721976324915886, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015861591324210167, "policy_entropy": 0.032981257885694504, "policy_entropy/max": 3.46875, "policy_entropy/median": 2.561137080192566e-08, "policy_entropy/min": 6.471331717022855e-19, "policy_entropy/p25": 1.9372237147763371e-10, "policy_entropy/p75": 7.3909759521484375e-06, "policy_entropy/var": 0.019973844289779663, "policy_error_vector_variance/max_squared_error": 2.0034162998199463, "policy_error_vector_variance/metric": 0.01719626970589161, "policy_loss": 0.010617947205901146, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.873626708984375, "policy_sharpness": 9.192229270935059, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.983604431152344, "reward": 0.8463541865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1302083432674408, "rewards/accuracy_reward": 0.8463541865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1302083432674408, "sentence_full_gradient_variance/max_squared_error": 438880.625, "sentence_full_gradient_variance/metric": 1454.9739990234375, "sentence_full_gradient_variance/p75": 9.82681655883789, "sentence_full_gradient_variance/p90": 26.065908432006836, "sentence_full_gradient_variance/p95": 26.065908432006836, "sentence_full_gradient_variance/p99": 23732.625, "state_level_variance/metric": 5.235115051269531, "state_level_variance_full_gradient/metric": 175.85879516601562, "step": 64 }, { "accuracy_reward": 0.8046875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1573704332113266, "action_level_variance/metric": 93.89555358886719, "action_level_variance_full_gradient/metric": 2188.93701171875, "adam_stats/lr_effective_max": 2.7526069970917888e-05, "adam_stats/lr_effective_mean": -4.992845720197536e-11, "adam_stats/lr_effective_min": -2.8111220672144555e-05, "adam_stats/m_t_max": 0.002074031624943018, "adam_stats/m_t_mean": 2.7684821449214247e-11, "adam_stats/m_t_min": -0.002938993740826845, "adam_stats/v_t_max": 7.340549200307578e-05, "adam_stats/v_t_mean": 3.4511842168793994e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.03496674820780754, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.103943347930908, "all_logprobs": -0.03189842775464058, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.0, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.0028533935546875, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.06982421875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.047426458448171616, "clip_ratio": 0.0, "completion_length": 549.7265625, "completion_length/correct": 482.53399658203125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 444.0, "completion_length/correct/min": 191.0, "completion_length/correct/p25": 339.25, "completion_length/correct/p75": 596.75, "completion_length/correct/var": 35184.3125, "completion_length/incorrect": 826.5599975585938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 899.0, "completion_length/incorrect/min": 250.0, "completion_length/incorrect/p25": 672.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 51728.0078125, "completion_length/max": 1024.0, "completion_length/median": 501.0, "completion_length/min": 191.0, "completion_length/p25": 355.75, "completion_length/p75": 714.5, "completion_length/var": 56977.6796875, "epoch": 0.832, "feature_vector_variance/max_squared_error": 155555.296875, "feature_vector_variance/metric": 27856.65234375, "generated_tokens/total": 30672828.0, "grad_norm": 0.18789061903953552, "grouped_std_rewards": 0.14918524026870728, "learning_rate": 4.934848925057485e-06, "loss": -0.035, "mean_logprobs": -0.031005859375, "mean_logprobs/var": 0.0004634857177734375, "num_completions/total": 49920, "per_sentence_gradient_norm": 1.191488265991211, "per_sentence_gradient_norm/max": 209.87530517578125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 25.83624267578125, "per_sentence_gradient_norm/var": 92.59646606445312, "per_token_feature_norm": 190.3195037841797, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 505.95068359375, "per_token_full_gradient_variance/max_squared_error": 232.11985778808594, "per_token_full_gradient_variance/variance": 0.022978246212005615, "per_token_gradient_norm": 1.5184521675109863, "per_token_gradient_norm/max": 5760.49560546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2584.88037109375, "per_token_policy_error_norm": 0.018279388546943665, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01698954775929451, "policy_entropy": 0.034737661480903625, "policy_entropy/max": 3.640625, "policy_entropy/median": 2.9336661100387573e-08, "policy_entropy/min": 5.353248226647178e-19, "policy_entropy/p25": 2.419255906715989e-10, "policy_entropy/p75": 9.5367431640625e-06, "policy_entropy/var": 0.020539315417408943, "policy_error_vector_variance/max_squared_error": 2.006328582763672, "policy_error_vector_variance/metric": 0.018248984590172768, "policy_loss": -0.034966759383678436, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.79339599609375, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.1039435863494873, "policy_sharpness": 9.144235610961914, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.249436855316162, "reward": 0.8046875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1573704332113266, "rewards/accuracy_reward": 0.8046875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1573704332113266, "sentence_full_gradient_variance/max_squared_error": 370350.3125, "sentence_full_gradient_variance/metric": 2461.798828125, "sentence_full_gradient_variance/p75": 30.966110229492188, "sentence_full_gradient_variance/p90": 212.71730041503906, "sentence_full_gradient_variance/p95": 212.71730041503906, "sentence_full_gradient_variance/p99": 48836.140625, "state_level_variance/metric": 10.42590045928955, "state_level_variance_full_gradient/metric": 272.86181640625, "step": 65 }, { "accuracy_reward": 0.79296875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16438336670398712, "action_level_variance/metric": 24.869029998779297, "action_level_variance_full_gradient/metric": 952.4189453125, "adam_stats/lr_effective_max": 2.4726679839659482e-05, "adam_stats/lr_effective_mean": -2.9498972708985605e-11, "adam_stats/lr_effective_min": -2.4555518393754028e-05, "adam_stats/m_t_max": 0.0019616144709289074, "adam_stats/m_t_mean": 2.274590758799544e-11, "adam_stats/m_t_min": -0.002642805455252528, "adam_stats/v_t_max": 7.333208486670628e-05, "adam_stats/v_t_mean": 3.4488666263154943e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.024394728243350983, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.8814336657524109, "all_logprobs": -0.0338255912065506, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.5, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.00408935546875, "all_logprobs/p25": -8.344650268554688e-07, "all_logprobs/p5": -0.08203125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.049162063747644424, "clip_ratio": 0.0, "completion_length": 543.0794677734375, "completion_length/correct": 470.0624084472656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 428.0, "completion_length/correct/min": 106.0, "completion_length/correct/p25": 317.0, "completion_length/correct/p75": 570.0, "completion_length/correct/var": 43067.01953125, "completion_length/incorrect": 822.7484130859375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 933.0, "completion_length/incorrect/min": 187.0, "completion_length/incorrect/p25": 629.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 53801.21484375, "completion_length/max": 1024.0, "completion_length/median": 477.0, "completion_length/min": 106.0, "completion_length/p25": 344.0, "completion_length/p75": 718.5, "completion_length/var": 65669.3125, "epoch": 0.8448, "feature_vector_variance/max_squared_error": 145109.015625, "feature_vector_variance/metric": 28233.7890625, "generated_tokens/total": 31089912.0, "grad_norm": 0.079984650015831, "grouped_std_rewards": 0.09552391618490219, "learning_rate": 4.6904505493806595e-06, "loss": -0.0244, "mean_logprobs": -0.0341796875, "mean_logprobs/var": 0.00095367431640625, "num_completions/total": 50688, "per_sentence_gradient_norm": 0.661768913269043, "per_sentence_gradient_norm/max": 57.647666931152344, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 25.2336368560791, "per_sentence_gradient_norm/var": 24.462942123413086, "per_token_feature_norm": 190.34136962890625, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 478.18426513671875, "per_token_full_gradient_variance/max_squared_error": 54.151432037353516, "per_token_full_gradient_variance/variance": 0.008968017995357513, "per_token_gradient_norm": 0.8740770816802979, "per_token_gradient_norm/max": 3748.096435546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 765.7369995117188, "per_token_policy_error_norm": 0.01941780000925064, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.017872069031000137, "policy_entropy": 0.037389323115348816, "policy_entropy/max": 3.765625, "policy_entropy/median": 2.782326191663742e-08, "policy_entropy/min": 3.3034284942917713e-19, "policy_entropy/p25": 1.9917933968827128e-10, "policy_entropy/p75": 1.245737075805664e-05, "policy_entropy/var": 0.02246534824371338, "policy_error_vector_variance/max_squared_error": 2.0067625045776367, "policy_error_vector_variance/metric": 0.019386103376746178, "policy_loss": -0.024394724518060684, "policy_loss/max": 9.659051895141602, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.8814336657524109, "policy_sharpness": 9.097464561462402, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.538632392883301, "reward": 0.79296875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16438336670398712, "rewards/accuracy_reward": 0.79296875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16438336670398712, "sentence_full_gradient_variance/max_squared_error": 335264.03125, "sentence_full_gradient_variance/metric": 1074.716552734375, "sentence_full_gradient_variance/p75": 34.73926544189453, "sentence_full_gradient_variance/p90": 37.28718185424805, "sentence_full_gradient_variance/p95": 37.28718185424805, "sentence_full_gradient_variance/p99": 20776.8203125, "state_level_variance/metric": 2.698803186416626, "state_level_variance_full_gradient/metric": 122.29759979248047, "step": 66 }, { "accuracy_reward": 0.82421875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1450711041688919, "action_level_variance/metric": 74.07305908203125, "action_level_variance_full_gradient/metric": 3800.780029296875, "adam_stats/lr_effective_max": 2.5009709133883007e-05, "adam_stats/lr_effective_mean": -4.340440507011323e-11, "adam_stats/lr_effective_min": -2.2768645067117177e-05, "adam_stats/m_t_max": 0.0015229105483740568, "adam_stats/m_t_mean": 1.2809804432467597e-11, "adam_stats/m_t_min": -0.0015240323264151812, "adam_stats/v_t_max": 7.333176472457126e-05, "adam_stats/v_t_mean": 3.4597940831715412e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.10056664794683456, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.0533335208892822, "all_logprobs": -0.03344983607530594, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.25, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.003662109375, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.0791015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04960845783352852, "clip_ratio": 0.0, "completion_length": 528.796875, "completion_length/correct": 472.1927490234375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 438.0, "completion_length/correct/min": 94.0, "completion_length/correct/p25": 320.0, "completion_length/correct/p75": 587.0, "completion_length/correct/var": 36229.23046875, "completion_length/incorrect": 794.2073974609375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 930.0, "completion_length/incorrect/min": 218.0, "completion_length/incorrect/p25": 572.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 69544.359375, "completion_length/max": 1024.0, "completion_length/median": 482.0, "completion_length/min": 94.0, "completion_length/p25": 338.0, "completion_length/p75": 676.75, "completion_length/var": 57045.2890625, "epoch": 0.8576, "feature_vector_variance/max_squared_error": 146211.390625, "feature_vector_variance/metric": 28143.8046875, "generated_tokens/total": 31496028.0, "grad_norm": 0.20450016856193542, "grouped_std_rewards": 0.10122168064117432, "learning_rate": 4.4494751769315e-06, "loss": 0.1006, "mean_logprobs": -0.034912109375, "mean_logprobs/var": 0.000972747802734375, "num_completions/total": 51456, "per_sentence_gradient_norm": 0.9950002431869507, "per_sentence_gradient_norm/max": 175.12977600097656, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 23.722097396850586, "per_sentence_gradient_norm/var": 73.17831420898438, "per_token_feature_norm": 189.46945190429688, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 63.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 595.7549438476562, "per_token_full_gradient_variance/max_squared_error": 271.7835998535156, "per_token_full_gradient_variance/variance": 0.022430360317230225, "per_token_gradient_norm": 1.2789956331253052, "per_token_gradient_norm/max": 4952.2138671875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1852.2098388671875, "per_token_policy_error_norm": 0.019005995243787766, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.017407549545168877, "policy_entropy": 0.03735748678445816, "policy_entropy/max": 3.796875, "policy_entropy/median": 2.4097971618175507e-08, "policy_entropy/min": 1.4094628242311558e-18, "policy_entropy/p25": 1.8189894035458565e-10, "policy_entropy/p75": 9.059906005859375e-06, "policy_entropy/var": 0.023500943556427956, "policy_error_vector_variance/max_squared_error": 2.0059351921081543, "policy_error_vector_variance/metric": 0.018960624933242798, "policy_loss": 0.10056664794683456, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.0533335208892822, "policy_sharpness": 9.113700866699219, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.4845099449157715, "reward": 0.82421875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1450711041688919, "rewards/accuracy_reward": 0.82421875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1450711041688919, "sentence_full_gradient_variance/max_squared_error": 1202774.375, "sentence_full_gradient_variance/metric": 4322.6298828125, "sentence_full_gradient_variance/p75": 29.699296951293945, "sentence_full_gradient_variance/p90": 86.53016662597656, "sentence_full_gradient_variance/p95": 86.53016662597656, "sentence_full_gradient_variance/p99": 52647.015625, "state_level_variance/metric": 8.356149673461914, "state_level_variance_full_gradient/metric": 521.849609375, "step": 67 }, { "accuracy_reward": 0.8138021230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15172581374645233, "action_level_variance/metric": 58.57830047607422, "action_level_variance_full_gradient/metric": 1131.6424560546875, "adam_stats/lr_effective_max": 2.1807751181768253e-05, "adam_stats/lr_effective_mean": -6.363688742627716e-11, "adam_stats/lr_effective_min": -2.048956048383843e-05, "adam_stats/m_t_max": 0.001269911415874958, "adam_stats/m_t_mean": 1.0318282686605507e-11, "adam_stats/m_t_min": -0.0009194655576720834, "adam_stats/v_t_max": 7.328860374400392e-05, "adam_stats/v_t_mean": 3.4579119082001064e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.015412203967571259, "advantages/max": 12.9586820602417, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.108414888381958, "all_logprobs": -0.03124142996966839, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.4375, "all_logprobs/p1": -0.9296875, "all_logprobs/p10": -0.0024871826171875, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.0634765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04578729718923569, "clip_ratio": 0.0, "completion_length": 538.5286865234375, "completion_length/correct": 486.7887878417969, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 449.0, "completion_length/correct/min": 162.0, "completion_length/correct/p25": 326.0, "completion_length/correct/p75": 596.0, "completion_length/correct/var": 36768.015625, "completion_length/incorrect": 764.664306640625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 826.0, "completion_length/incorrect/min": 231.0, "completion_length/incorrect/p25": 521.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 71224.03125, "completion_length/max": 1024.0, "completion_length/median": 495.0, "completion_length/min": 162.0, "completion_length/p25": 337.0, "completion_length/p75": 664.0, "completion_length/var": 54814.640625, "epoch": 0.8704, "feature_vector_variance/max_squared_error": 139570.96875, "feature_vector_variance/metric": 28131.658203125, "generated_tokens/total": 31909620.0, "grad_norm": 0.09443406015634537, "grouped_std_rewards": 0.11639320850372314, "learning_rate": 4.212216399081919e-06, "loss": 0.0154, "mean_logprobs": -0.03173828125, "mean_logprobs/var": 0.000522613525390625, "num_completions/total": 52224, "per_sentence_gradient_norm": 1.0557334423065186, "per_sentence_gradient_norm/max": 98.8786392211914, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 36.23039627075195, "per_sentence_gradient_norm/var": 57.538639068603516, "per_token_feature_norm": 189.9246368408203, "per_token_feature_norm/max": 304.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 74.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 502.9931945800781, "per_token_full_gradient_variance/max_squared_error": 137.32736206054688, "per_token_full_gradient_variance/variance": 0.021763136610388756, "per_token_gradient_norm": 1.4811960458755493, "per_token_gradient_norm/max": 5216.48681640625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2104.375, "per_token_policy_error_norm": 0.017968419939279556, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01661592535674572, "policy_entropy": 0.034227460622787476, "policy_entropy/max": 3.65625, "policy_entropy/median": 2.922024577856064e-08, "policy_entropy/min": 5.353248226647178e-19, "policy_entropy/p25": 2.1100277081131935e-10, "policy_entropy/p75": 9.834766387939453e-06, "policy_entropy/var": 0.020050570368766785, "policy_error_vector_variance/max_squared_error": 2.0042059421539307, "policy_error_vector_variance/metric": 0.01793654076755047, "policy_loss": 0.015412211418151855, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.108415365219116, "policy_sharpness": 9.153827667236328, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.195859909057617, "reward": 0.8138021230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15172581374645233, "rewards/accuracy_reward": 0.8138021230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15172581374645233, "sentence_full_gradient_variance/max_squared_error": 139882.203125, "sentence_full_gradient_variance/metric": 1275.795166015625, "sentence_full_gradient_variance/p75": 49.63752365112305, "sentence_full_gradient_variance/p90": 64.20435333251953, "sentence_full_gradient_variance/p95": 64.20435333251953, "sentence_full_gradient_variance/p99": 51290.1171875, "state_level_variance/metric": 6.2730584144592285, "state_level_variance_full_gradient/metric": 144.15269470214844, "step": 68 }, { "accuracy_reward": 0.8463541865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1302083432674408, "action_level_variance/metric": 167.74826049804688, "action_level_variance_full_gradient/metric": 1998.601318359375, "adam_stats/lr_effective_max": 2.0709630916826427e-05, "adam_stats/lr_effective_mean": 1.1505719987869867e-11, "adam_stats/lr_effective_min": -2.0933206542395055e-05, "adam_stats/m_t_max": 0.0018736907513812184, "adam_stats/m_t_mean": 2.893065301212694e-11, "adam_stats/m_t_min": -0.002060393337160349, "adam_stats/v_t_max": 7.325108890654519e-05, "adam_stats/v_t_mean": 3.4678199981735824e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.0569586418569088, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.086644411087036, "all_logprobs": -0.03355448320508003, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.4375, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.0033721923828125, "all_logprobs/p25": -7.152557373046875e-07, "all_logprobs/p5": -0.0791015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.05031739920377731, "clip_ratio": 0.0, "completion_length": 515.0846557617188, "completion_length/correct": 472.31231689453125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 435.0, "completion_length/correct/min": 98.0, "completion_length/correct/p25": 326.25, "completion_length/correct/p75": 571.75, "completion_length/correct/var": 38964.87890625, "completion_length/incorrect": 750.6949462890625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 755.0, "completion_length/incorrect/min": 309.0, "completion_length/incorrect/p25": 488.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 70033.1875, "completion_length/max": 1024.0, "completion_length/median": 457.0, "completion_length/min": 98.0, "completion_length/p25": 340.0, "completion_length/p75": 642.25, "completion_length/var": 53744.05078125, "epoch": 0.8832, "feature_vector_variance/max_squared_error": 139945.625, "feature_vector_variance/metric": 28359.01953125, "generated_tokens/total": 32305204.0, "grad_norm": 0.19159166514873505, "grouped_std_rewards": 0.13085834681987762, "learning_rate": 3.978963279105821e-06, "loss": 0.057, "mean_logprobs": -0.034912109375, "mean_logprobs/var": 0.000946044921875, "num_completions/total": 52992, "per_sentence_gradient_norm": 1.5032453536987305, "per_sentence_gradient_norm/max": 250.34066772460938, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 40.62282943725586, "per_sentence_gradient_norm/var": 165.7042694091797, "per_token_feature_norm": 188.26780700683594, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 191.0, "per_token_feature_norm/min": 69.0, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 202.0, "per_token_feature_norm/var": 593.085693359375, "per_token_full_gradient_variance/max_squared_error": 226.92616271972656, "per_token_full_gradient_variance/variance": 0.038380298763513565, "per_token_gradient_norm": 2.262446403503418, "per_token_gradient_norm/max": 6624.60107421875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4346.15625, "per_token_policy_error_norm": 0.019068583846092224, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01760757900774479, "policy_entropy": 0.0369243398308754, "policy_entropy/max": 3.453125, "policy_entropy/median": 3.026798367500305e-08, "policy_entropy/min": 3.1170812458958252e-18, "policy_entropy/p25": 2.1827872842550278e-10, "policy_entropy/p75": 1.1146068572998047e-05, "policy_entropy/var": 0.02261391468346119, "policy_error_vector_variance/max_squared_error": 2.004514455795288, "policy_error_vector_variance/metric": 0.019028637558221817, "policy_loss": 0.0569586455821991, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.086644411087036, "policy_sharpness": 9.117799758911133, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.42343282699585, "reward": 0.8463541865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1302083432674408, "rewards/accuracy_reward": 0.8463541865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1302083432674408, "sentence_full_gradient_variance/max_squared_error": 884077.875, "sentence_full_gradient_variance/metric": 2276.89794921875, "sentence_full_gradient_variance/p75": 15.041175842285156, "sentence_full_gradient_variance/p90": 15.76565933227539, "sentence_full_gradient_variance/p95": 15.76565933227539, "sentence_full_gradient_variance/p99": 42308.0390625, "state_level_variance/metric": 18.90572166442871, "state_level_variance_full_gradient/metric": 278.296630859375, "step": 69 }, { "accuracy_reward": 0.7669271230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17898297309875488, "action_level_variance/metric": 149.90240478515625, "action_level_variance_full_gradient/metric": 4293.87646484375, "adam_stats/lr_effective_max": 2.109016895701643e-05, "adam_stats/lr_effective_mean": 1.002748405143361e-11, "adam_stats/lr_effective_min": -1.9740757124964148e-05, "adam_stats/m_t_max": 0.00140708580147475, "adam_stats/m_t_mean": 2.4553557104178836e-11, "adam_stats/m_t_min": -0.0014774617739021778, "adam_stats/v_t_max": 7.317784184124321e-05, "adam_stats/v_t_mean": 3.466233376714367e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.056593090295791626, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.5961661338806152, "all_logprobs": -0.030625836923718452, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.75, "all_logprobs/p1": -0.90234375, "all_logprobs/p10": -0.002471923828125, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04436488449573517, "clip_ratio": 0.0, "completion_length": 558.2474365234375, "completion_length/correct": 478.19696044921875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 442.0, "completion_length/correct/min": 179.0, "completion_length/correct/p25": 335.0, "completion_length/correct/p75": 567.0, "completion_length/correct/var": 33653.1953125, "completion_length/incorrect": 821.6536254882812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 957.0, "completion_length/incorrect/min": 303.0, "completion_length/incorrect/p25": 635.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 55879.015625, "completion_length/max": 1024.0, "completion_length/median": 484.0, "completion_length/min": 179.0, "completion_length/p25": 364.5, "completion_length/p75": 717.25, "completion_length/var": 59880.6171875, "epoch": 0.896, "feature_vector_variance/max_squared_error": 134201.078125, "feature_vector_variance/metric": 28191.271484375, "generated_tokens/total": 32733938.0, "grad_norm": 0.1282072514295578, "grouped_std_rewards": 0.13927370309829712, "learning_rate": 3.750000000000002e-06, "loss": 0.0566, "mean_logprobs": -0.031494140625, "mean_logprobs/var": 0.000377655029296875, "num_completions/total": 53760, "per_sentence_gradient_norm": 1.722543716430664, "per_sentence_gradient_norm/max": 162.6853790283203, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 43.14893341064453, "per_sentence_gradient_norm/var": 147.12680053710938, "per_token_feature_norm": 188.84742736816406, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 191.0, "per_token_feature_norm/min": 70.5, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 201.0, "per_token_feature_norm/var": 484.672119140625, "per_token_full_gradient_variance/max_squared_error": 187.1636962890625, "per_token_full_gradient_variance/variance": 0.03174590319395065, "per_token_gradient_norm": 2.0662145614624023, "per_token_gradient_norm/max": 5835.9580078125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3283.73388671875, "per_token_policy_error_norm": 0.01773592084646225, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.016347452998161316, "policy_entropy": 0.03340986371040344, "policy_entropy/max": 3.78125, "policy_entropy/median": 3.003515303134918e-08, "policy_entropy/min": 6.810144895924575e-19, "policy_entropy/p25": 2.319211489520967e-10, "policy_entropy/p75": 9.47713851928711e-06, "policy_entropy/var": 0.019211359322071075, "policy_error_vector_variance/max_squared_error": 2.0043370723724365, "policy_error_vector_variance/metric": 0.017705896869301796, "policy_loss": 0.056593090295791626, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.5961661338806152, "policy_sharpness": 9.167749404907227, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.085265159606934, "reward": 0.7669271230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17898297309875488, "rewards/accuracy_reward": 0.7669271230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17898297309875488, "sentence_full_gradient_variance/max_squared_error": 910256.0, "sentence_full_gradient_variance/metric": 4853.97314453125, "sentence_full_gradient_variance/p75": 112.08480834960938, "sentence_full_gradient_variance/p90": 189.1615447998047, "sentence_full_gradient_variance/p95": 189.1615447998047, "sentence_full_gradient_variance/p99": 133259.71875, "state_level_variance/metric": 15.93664836883545, "state_level_variance_full_gradient/metric": 560.0966186523438, "step": 70 }, { "accuracy_reward": 0.8229166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14591483771800995, "action_level_variance/metric": 59.962989807128906, "action_level_variance_full_gradient/metric": 2614.091796875, "adam_stats/lr_effective_max": 1.9119806893286295e-05, "adam_stats/lr_effective_mean": 4.3073492689282844e-11, "adam_stats/lr_effective_min": -1.8831733541446738e-05, "adam_stats/m_t_max": 0.0015883377054706216, "adam_stats/m_t_mean": 2.7315981074305107e-11, "adam_stats/m_t_min": -0.0017508582677692175, "adam_stats/v_t_max": 7.311187800951302e-05, "adam_stats/v_t_mean": 3.463876538031818e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.05216365307569504, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.108712911605835, "all_logprobs": -0.03070259839296341, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.875, "all_logprobs/p1": -0.8984375, "all_logprobs/p10": -0.002471923828125, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04417193681001663, "clip_ratio": 0.0, "completion_length": 522.2096557617188, "completion_length/correct": 478.5997009277344, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 431.0, "completion_length/correct/min": 95.0, "completion_length/correct/p25": 324.0, "completion_length/correct/p75": 629.0, "completion_length/correct/var": 39429.98046875, "completion_length/incorrect": 724.86767578125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 731.0, "completion_length/incorrect/min": 221.0, "completion_length/incorrect/p25": 538.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 71175.0234375, "completion_length/max": 1024.0, "completion_length/median": 464.0, "completion_length/min": 95.0, "completion_length/p25": 334.75, "completion_length/p75": 690.25, "completion_length/var": 53815.4609375, "epoch": 0.9088, "feature_vector_variance/max_squared_error": 130675.9375, "feature_vector_variance/metric": 28008.74609375, "generated_tokens/total": 33134996.0, "grad_norm": 0.1026138886809349, "grouped_std_rewards": 0.10977429151535034, "learning_rate": 3.525605518250964e-06, "loss": 0.0522, "mean_logprobs": -0.0322265625, "mean_logprobs/var": 0.0005645751953125, "num_completions/total": 54528, "per_sentence_gradient_norm": 1.1015000343322754, "per_sentence_gradient_norm/max": 108.05470275878906, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 32.72998809814453, "per_sentence_gradient_norm/var": 58.82628631591797, "per_token_feature_norm": 188.24961853027344, "per_token_feature_norm/max": 296.0, "per_token_feature_norm/median": 191.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 201.0, "per_token_feature_norm/var": 516.6107788085938, "per_token_full_gradient_variance/max_squared_error": 219.732666015625, "per_token_full_gradient_variance/variance": 0.021452724933624268, "per_token_gradient_norm": 1.3126983642578125, "per_token_gradient_norm/max": 5009.89306640625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1700.9644775390625, "per_token_policy_error_norm": 0.017651578411459923, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.016247015446424484, "policy_entropy": 0.03398236259818077, "policy_entropy/max": 3.53125, "policy_entropy/median": 3.306195139884949e-08, "policy_entropy/min": 5.929230630780102e-19, "policy_entropy/p25": 2.28283170145005e-10, "policy_entropy/p75": 1.0192394256591797e-05, "policy_entropy/var": 0.019925003871321678, "policy_error_vector_variance/max_squared_error": 2.0066797733306885, "policy_error_vector_variance/metric": 0.01762399449944496, "policy_loss": 0.052163656800985336, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.108713150024414, "policy_sharpness": 9.156929016113281, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.171346664428711, "reward": 0.8229166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14591483771800995, "rewards/accuracy_reward": 0.8229166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14591483771800995, "sentence_full_gradient_variance/max_squared_error": 1404165.125, "sentence_full_gradient_variance/metric": 2971.99609375, "sentence_full_gradient_variance/p75": 10.838412284851074, "sentence_full_gradient_variance/p90": 98.32798767089844, "sentence_full_gradient_variance/p95": 98.32798767089844, "sentence_full_gradient_variance/p99": 43739.3046875, "state_level_variance/metric": 6.348198413848877, "state_level_variance_full_gradient/metric": 357.904052734375, "step": 71 }, { "accuracy_reward": 0.8359375, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.13732481002807617, "action_level_variance/metric": 54.80581283569336, "action_level_variance_full_gradient/metric": 2049.72509765625, "adam_stats/lr_effective_max": 1.8471269868314266e-05, "adam_stats/lr_effective_mean": 5.0231322573646153e-11, "adam_stats/lr_effective_min": -1.815422911022324e-05, "adam_stats/m_t_max": 0.001600402407348156, "adam_stats/m_t_mean": 1.9496794803619544e-11, "adam_stats/m_t_min": -0.0018351719481870532, "adam_stats/v_t_max": 7.303927122848108e-05, "adam_stats/v_t_mean": 3.4641005342006537e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.06224649399518967, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.0818235874176025, "all_logprobs": -0.030827166512608528, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.90625, "all_logprobs/p1": -0.90234375, "all_logprobs/p10": -0.002716064453125, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04513952136039734, "clip_ratio": 0.0, "completion_length": 527.2630615234375, "completion_length/correct": 485.84735107421875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 439.0, "completion_length/correct/min": 138.0, "completion_length/correct/p25": 346.0, "completion_length/correct/p75": 599.75, "completion_length/correct/var": 36346.3046875, "completion_length/incorrect": 738.2857666015625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 711.0, "completion_length/incorrect/min": 216.0, "completion_length/incorrect/p25": 534.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 64548.9609375, "completion_length/max": 1024.0, "completion_length/median": 461.0, "completion_length/min": 138.0, "completion_length/p25": 363.0, "completion_length/p75": 666.0, "completion_length/var": 49646.22265625, "epoch": 0.9216, "feature_vector_variance/max_squared_error": 125199.71875, "feature_vector_variance/metric": 28137.177734375, "generated_tokens/total": 33539932.0, "grad_norm": 0.16872061789035797, "grouped_std_rewards": 0.10450761020183563, "learning_rate": 3.3060532239694e-06, "loss": -0.0622, "mean_logprobs": -0.0303955078125, "mean_logprobs/var": 0.000492095947265625, "num_completions/total": 55296, "per_sentence_gradient_norm": 0.9352582693099976, "per_sentence_gradient_norm/max": 116.77657318115234, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 29.180919647216797, "per_sentence_gradient_norm/var": 54.00141906738281, "per_token_feature_norm": 188.05201721191406, "per_token_feature_norm/max": 296.0, "per_token_feature_norm/median": 191.0, "per_token_feature_norm/min": 71.0, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 201.0, "per_token_feature_norm/var": 527.2772216796875, "per_token_full_gradient_variance/max_squared_error": 40.16450500488281, "per_token_full_gradient_variance/variance": 0.01172440592199564, "per_token_gradient_norm": 1.139359712600708, "per_token_gradient_norm/max": 2719.626708984375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1127.2645263671875, "per_token_policy_error_norm": 0.017659271135926247, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.016417859122157097, "policy_entropy": 0.03401104733347893, "policy_entropy/max": 3.484375, "policy_entropy/median": 2.9569491744041443e-08, "policy_entropy/min": 3.4220131069073734e-19, "policy_entropy/p25": 2.0190782379359007e-10, "policy_entropy/p75": 9.417533874511719e-06, "policy_entropy/var": 0.01993771456182003, "policy_error_vector_variance/max_squared_error": 2.0060484409332275, "policy_error_vector_variance/metric": 0.017619671300053596, "policy_loss": -0.06224649399518967, "policy_loss/max": 9.659052848815918, "policy_loss/median": 0.0, "policy_loss/min": -9.659052848815918, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.0818235874176025, "policy_sharpness": 9.155271530151367, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.178316593170166, "reward": 0.8359375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.13732481002807617, "rewards/accuracy_reward": 0.8359375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.13732481002807617, "sentence_full_gradient_variance/max_squared_error": 616245.5, "sentence_full_gradient_variance/metric": 2318.474609375, "sentence_full_gradient_variance/p75": 49.414146423339844, "sentence_full_gradient_variance/p90": 82.50782012939453, "sentence_full_gradient_variance/p95": 82.50782012939453, "sentence_full_gradient_variance/p99": 34711.125, "state_level_variance/metric": 6.038924217224121, "state_level_variance_full_gradient/metric": 268.74981689453125, "step": 72 }, { "accuracy_reward": 0.7981771230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16130046546459198, "action_level_variance/metric": 180.92881774902344, "action_level_variance_full_gradient/metric": 3168.81494140625, "adam_stats/lr_effective_max": 1.6738002159399912e-05, "adam_stats/lr_effective_mean": 6.879547626015281e-11, "adam_stats/lr_effective_min": -1.6489906556671485e-05, "adam_stats/m_t_max": 0.0019122399389743805, "adam_stats/m_t_mean": 1.938354685093735e-11, "adam_stats/m_t_min": -0.002104231622070074, "adam_stats/v_t_max": 7.308163185371086e-05, "adam_stats/v_t_mean": 3.4809616127062792e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.013145866803824902, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.410186767578125, "all_logprobs": -0.030633660033345222, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.25, "all_logprobs/p1": -0.8984375, "all_logprobs/p10": -0.0024871826171875, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.044028617441654205, "clip_ratio": 0.0, "completion_length": 555.8307495117188, "completion_length/correct": 508.1761779785156, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 482.0, "completion_length/correct/min": 173.0, "completion_length/correct/p25": 365.0, "completion_length/correct/p75": 601.0, "completion_length/correct/var": 37534.8515625, "completion_length/incorrect": 744.2967529296875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 748.0, "completion_length/incorrect/min": 181.0, "completion_length/incorrect/p25": 550.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 64664.46875, "completion_length/max": 1024.0, "completion_length/median": 509.0, "completion_length/min": 173.0, "completion_length/p25": 388.75, "completion_length/p75": 694.0, "completion_length/var": 51926.02734375, "epoch": 0.9344, "feature_vector_variance/max_squared_error": 129571.1328125, "feature_vector_variance/metric": 27987.109375, "generated_tokens/total": 33966812.0, "grad_norm": 0.2617335319519043, "grouped_std_rewards": 0.12296222895383835, "learning_rate": 3.0916106078064522e-06, "loss": -0.0131, "mean_logprobs": -0.03076171875, "mean_logprobs/var": 0.0003833770751953125, "num_completions/total": 56064, "per_sentence_gradient_norm": 1.6502641439437866, "per_sentence_gradient_norm/max": 249.6702880859375, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 45.59638214111328, "per_sentence_gradient_norm/var": 178.4377899169922, "per_token_feature_norm": 189.20069885253906, "per_token_feature_norm/max": 306.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 202.0, "per_token_feature_norm/var": 502.74603271484375, "per_token_full_gradient_variance/max_squared_error": 231.11407470703125, "per_token_full_gradient_variance/variance": 0.043511368334293365, "per_token_gradient_norm": 2.2841956615448, "per_token_gradient_norm/max": 6658.00244140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4418.8095703125, "per_token_policy_error_norm": 0.01764543727040291, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01618148386478424, "policy_entropy": 0.03386976942420006, "policy_entropy/max": 3.59375, "policy_entropy/median": 2.8870999813079834e-08, "policy_entropy/min": 3.7947076036992655e-19, "policy_entropy/p25": 1.9190338207408786e-10, "policy_entropy/p75": 9.834766387939453e-06, "policy_entropy/var": 0.0199916772544384, "policy_error_vector_variance/max_squared_error": 2.003338575363159, "policy_error_vector_variance/metric": 0.017623061314225197, "policy_loss": -0.013145874254405499, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.410186767578125, "policy_sharpness": 9.157456398010254, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.136026382446289, "reward": 0.7981771230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16130046546459198, "rewards/accuracy_reward": 0.7981771230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16130046546459198, "sentence_full_gradient_variance/max_squared_error": 742733.0, "sentence_full_gradient_variance/metric": 3577.71826171875, "sentence_full_gradient_variance/p75": 75.245849609375, "sentence_full_gradient_variance/p90": 150.0884552001953, "sentence_full_gradient_variance/p95": 150.0884552001953, "sentence_full_gradient_variance/p99": 67521.6640625, "state_level_variance/metric": 20.102127075195312, "state_level_variance_full_gradient/metric": 408.903076171875, "step": 73 }, { "accuracy_reward": 0.7161458730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2035459727048874, "action_level_variance/metric": 204.4163055419922, "action_level_variance_full_gradient/metric": 1192.736083984375, "adam_stats/lr_effective_max": 1.5128128325159196e-05, "adam_stats/lr_effective_mean": 4.350811724784798e-11, "adam_stats/lr_effective_min": -1.50168116306304e-05, "adam_stats/m_t_max": 0.0017835769103839993, "adam_stats/m_t_mean": 1.780148424501693e-11, "adam_stats/m_t_min": -0.0014513033675029874, "adam_stats/v_t_max": 7.302813173737377e-05, "adam_stats/v_t_mean": 3.481134217692139e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.056941092014312744, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.3020029067993164, "all_logprobs": -0.029391678050160408, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.0, "all_logprobs/p1": -0.8515625, "all_logprobs/p10": -0.00193023681640625, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.0537109375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.042360465973615646, "clip_ratio": 0.0, "completion_length": 581.08984375, "completion_length/correct": 482.7381896972656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 443.0, "completion_length/correct/min": 128.0, "completion_length/correct/p25": 334.5, "completion_length/correct/p75": 610.0, "completion_length/correct/var": 38677.0390625, "completion_length/incorrect": 829.2247314453125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 978.0, "completion_length/incorrect/min": 279.0, "completion_length/incorrect/p25": 615.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 55850.58203125, "completion_length/max": 1024.0, "completion_length/median": 519.0, "completion_length/min": 128.0, "completion_length/p25": 365.0, "completion_length/p75": 768.25, "completion_length/var": 67921.65625, "epoch": 0.9472, "feature_vector_variance/max_squared_error": 140315.71875, "feature_vector_variance/metric": 27851.341796875, "generated_tokens/total": 34413088.0, "grad_norm": 0.13101617991924286, "grouped_std_rewards": 0.09846079349517822, "learning_rate": 2.882538935057563e-06, "loss": -0.0569, "mean_logprobs": -0.0294189453125, "mean_logprobs/var": 0.0004634857177734375, "num_completions/total": 56832, "per_sentence_gradient_norm": 1.214787244796753, "per_sentence_gradient_norm/max": 345.6391296386719, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 19.56243133544922, "per_sentence_gradient_norm/var": 203.20516967773438, "per_token_feature_norm": 189.1320037841797, "per_token_feature_norm/max": 302.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 61.25, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 540.6355590820312, "per_token_full_gradient_variance/max_squared_error": 81.5265121459961, "per_token_full_gradient_variance/variance": 0.0167480930685997, "per_token_gradient_norm": 1.6730079650878906, "per_token_gradient_norm/max": 5591.32373046875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2164.546142578125, "per_token_policy_error_norm": 0.016903512179851532, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015638358891010284, "policy_entropy": 0.03232993185520172, "policy_entropy/max": 3.5, "policy_entropy/median": 2.130400389432907e-08, "policy_entropy/min": 8.74138001566438e-19, "policy_entropy/p25": 1.7826096154749393e-10, "policy_entropy/p75": 5.781650543212891e-06, "policy_entropy/var": 0.019208768382668495, "policy_error_vector_variance/max_squared_error": 2.0043129920959473, "policy_error_vector_variance/metric": 0.016881288960576057, "policy_loss": -0.056941092014312744, "policy_loss/max": 9.659051895141602, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.3020029067993164, "policy_sharpness": 9.203852653503418, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.90102481842041, "reward": 0.7161458730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2035459727048874, "rewards/accuracy_reward": 0.7161458730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2035459727048874, "sentence_full_gradient_variance/max_squared_error": 189415.921875, "sentence_full_gradient_variance/metric": 1344.0377197265625, "sentence_full_gradient_variance/p75": 36.92374038696289, "sentence_full_gradient_variance/p90": 82.31062316894531, "sentence_full_gradient_variance/p95": 82.31062316894531, "sentence_full_gradient_variance/p99": 43331.12890625, "state_level_variance/metric": 24.32976531982422, "state_level_variance_full_gradient/metric": 151.30174255371094, "step": 74 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.165145605802536, "action_level_variance/metric": 58.78113555908203, "action_level_variance_full_gradient/metric": 4491.2275390625, "adam_stats/lr_effective_max": 1.3664196558238473e-05, "adam_stats/lr_effective_mean": 2.970908588584287e-11, "adam_stats/lr_effective_min": -1.3881576705898624e-05, "adam_stats/m_t_max": 0.0012741034151986241, "adam_stats/m_t_mean": 7.395994407188855e-12, "adam_stats/m_t_min": -0.0011587669141590595, "adam_stats/v_t_max": 7.302502490347251e-05, "adam_stats/v_t_mean": 3.4793463683097103e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.07552927732467651, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.7742706537246704, "all_logprobs": -0.028856521472334862, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.25, "all_logprobs/p1": -0.828125, "all_logprobs/p10": -0.00193023681640625, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.052978515625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04145384579896927, "clip_ratio": 0.0, "completion_length": 537.0143432617188, "completion_length/correct": 458.0427551269531, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 430.0, "completion_length/correct/min": 154.0, "completion_length/correct/p25": 338.5, "completion_length/correct/p75": 534.25, "completion_length/correct/var": 29817.251953125, "completion_length/incorrect": 837.1062622070312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 966.0, "completion_length/incorrect/min": 207.0, "completion_length/incorrect/p25": 626.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 52552.23828125, "completion_length/max": 1024.0, "completion_length/median": 465.0, "completion_length/min": 154.0, "completion_length/p25": 357.75, "completion_length/p75": 661.25, "completion_length/var": 58220.9921875, "epoch": 0.96, "feature_vector_variance/max_squared_error": 142471.046875, "feature_vector_variance/metric": 27804.63671875, "generated_tokens/total": 34825516.0, "grad_norm": 0.07943108677864075, "grouped_std_rewards": 0.08947332948446274, "learning_rate": 2.6790929273509547e-06, "loss": -0.0755, "mean_logprobs": -0.029052734375, "mean_logprobs/var": 0.0003108978271484375, "num_completions/total": 57600, "per_sentence_gradient_norm": 0.9056682586669922, "per_sentence_gradient_norm/max": 116.35720825195312, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 24.37770652770996, "per_sentence_gradient_norm/var": 58.03646469116211, "per_token_feature_norm": 188.71981811523438, "per_token_feature_norm/max": 304.0, "per_token_feature_norm/median": 191.0, "per_token_feature_norm/min": 68.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 202.0, "per_token_feature_norm/var": 531.8097534179688, "per_token_full_gradient_variance/max_squared_error": 161.9886016845703, "per_token_full_gradient_variance/variance": 0.0153726851567626, "per_token_gradient_norm": 0.886742353439331, "per_token_gradient_norm/max": 5205.9716796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1328.2811279296875, "per_token_policy_error_norm": 0.01659359037876129, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015220025554299355, "policy_entropy": 0.032037440687417984, "policy_entropy/max": 3.453125, "policy_entropy/median": 2.561137080192566e-08, "policy_entropy/min": 1.8566962203814263e-18, "policy_entropy/p25": 2.0372681319713593e-10, "policy_entropy/p75": 6.973743438720703e-06, "policy_entropy/var": 0.018327074125409126, "policy_error_vector_variance/max_squared_error": 2.006571054458618, "policy_error_vector_variance/metric": 0.01657637394964695, "policy_loss": -0.07552927732467651, "policy_loss/max": 9.659051895141602, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.7742706537246704, "policy_sharpness": 9.193458557128906, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.933463096618652, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.165145605802536, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.165145605802536, "sentence_full_gradient_variance/max_squared_error": 1212841.875, "sentence_full_gradient_variance/metric": 5070.19775390625, "sentence_full_gradient_variance/p75": 112.7924575805664, "sentence_full_gradient_variance/p90": 311.13507080078125, "sentence_full_gradient_variance/p95": 311.13507080078125, "sentence_full_gradient_variance/p99": 44350.81640625, "state_level_variance/metric": 6.59611701965332, "state_level_variance_full_gradient/metric": 578.9703979492188, "step": 75 }, { "accuracy_reward": 0.78515625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1689058393239975, "action_level_variance/metric": 59.56815719604492, "action_level_variance_full_gradient/metric": 2848.587890625, "adam_stats/lr_effective_max": 1.1763660950236954e-05, "adam_stats/lr_effective_mean": 3.800421349148264e-12, "adam_stats/lr_effective_min": -1.3479024346452206e-05, "adam_stats/m_t_max": 0.000977801624685526, "adam_stats/m_t_mean": -5.6897971577318796e-12, "adam_stats/m_t_min": -0.0011019740486517549, "adam_stats/v_t_max": 7.296922558452934e-05, "adam_stats/v_t_mean": 3.4784844275825844e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.01312495768070221, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.942435383796692, "all_logprobs": -0.02839968353509903, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.375, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.00150299072265625, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.048828125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.0406063050031662, "clip_ratio": 0.0, "completion_length": 557.9609375, "completion_length/correct": 475.144287109375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 438.0, "completion_length/correct/min": 133.0, "completion_length/correct/p25": 323.0, "completion_length/correct/p75": 597.0, "completion_length/correct/var": 39242.234375, "completion_length/incorrect": 860.6181640625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 180.0, "completion_length/incorrect/p25": 693.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 54459.5859375, "completion_length/max": 1024.0, "completion_length/median": 501.0, "completion_length/min": 133.0, "completion_length/p25": 348.0, "completion_length/p75": 738.0, "completion_length/var": 67542.578125, "epoch": 0.9728, "feature_vector_variance/max_squared_error": 139487.9375, "feature_vector_variance/metric": 27717.025390625, "generated_tokens/total": 35254028.0, "grad_norm": 0.11824916303157806, "grouped_std_rewards": 0.12353219836950302, "learning_rate": 2.4815204523085656e-06, "loss": 0.0131, "mean_logprobs": -0.029541015625, "mean_logprobs/var": 0.0003299713134765625, "num_completions/total": 58368, "per_sentence_gradient_norm": 1.11305570602417, "per_sentence_gradient_norm/max": 112.2957763671875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 35.479209899902344, "per_sentence_gradient_norm/var": 58.405311584472656, "per_token_feature_norm": 190.24591064453125, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 66.5, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 482.8566589355469, "per_token_full_gradient_variance/max_squared_error": 78.8277816772461, "per_token_full_gradient_variance/variance": 0.021383507177233696, "per_token_gradient_norm": 1.4893313646316528, "per_token_gradient_norm/max": 4843.65966796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2000.5867919921875, "per_token_policy_error_norm": 0.016486626118421555, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01515877153724432, "policy_entropy": 0.031201329082250595, "policy_entropy/max": 3.65625, "policy_entropy/median": 1.9441358745098114e-08, "policy_entropy/min": 3.4558944247975454e-19, "policy_entropy/p25": 1.5825207810848951e-10, "policy_entropy/p75": 5.453824996948242e-06, "policy_entropy/var": 0.017667362466454506, "policy_error_vector_variance/max_squared_error": 2.0046985149383545, "policy_error_vector_variance/metric": 0.01646680384874344, "policy_loss": 0.013124948367476463, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.942435383796692, "policy_sharpness": 9.217986106872559, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.81109094619751, "reward": 0.78515625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1689058393239975, "rewards/accuracy_reward": 0.78515625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1689058393239975, "sentence_full_gradient_variance/max_squared_error": 1599685.25, "sentence_full_gradient_variance/metric": 3238.85546875, "sentence_full_gradient_variance/p75": 35.50508117675781, "sentence_full_gradient_variance/p90": 49.460487365722656, "sentence_full_gradient_variance/p95": 49.460487365722656, "sentence_full_gradient_variance/p99": 44722.7578125, "state_level_variance/metric": 6.272464752197266, "state_level_variance_full_gradient/metric": 390.26702880859375, "step": 76 }, { "accuracy_reward": 0.7955729365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16284869611263275, "action_level_variance/metric": 99.63695526123047, "action_level_variance_full_gradient/metric": 3167.940185546875, "adam_stats/lr_effective_max": 1.0153275979973841e-05, "adam_stats/lr_effective_mean": -3.282377895961336e-13, "adam_stats/lr_effective_min": -1.2010335922241211e-05, "adam_stats/m_t_max": 0.0008497015223838389, "adam_stats/m_t_mean": -6.100676387676973e-12, "adam_stats/m_t_min": -0.0009421855211257935, "adam_stats/v_t_max": 7.289674977073446e-05, "adam_stats/v_t_mean": 3.4761119763887516e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.09892304986715317, "advantages/max": 9.659051895141602, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.1239302158355713, "all_logprobs": -0.02880488894879818, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -14.5, "all_logprobs/p1": -0.828125, "all_logprobs/p10": -0.00193023681640625, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.05517578125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.041976626962423325, "clip_ratio": 0.0, "completion_length": 551.5494995117188, "completion_length/correct": 480.31097412109375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 450.0, "completion_length/correct/min": 121.0, "completion_length/correct/p25": 343.5, "completion_length/correct/p75": 578.5, "completion_length/correct/var": 33964.2890625, "completion_length/incorrect": 828.789794921875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 867.0, "completion_length/incorrect/min": 336.0, "completion_length/incorrect/p25": 668.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 43538.9375, "completion_length/max": 1024.0, "completion_length/median": 490.0, "completion_length/min": 121.0, "completion_length/p25": 363.0, "completion_length/p75": 719.25, "completion_length/var": 55643.328125, "epoch": 0.9856, "feature_vector_variance/max_squared_error": 142853.75, "feature_vector_variance/metric": 27926.154296875, "generated_tokens/total": 35677620.0, "grad_norm": 0.06941869109869003, "grouped_std_rewards": 0.12235349416732788, "learning_rate": 2.29006222155752e-06, "loss": 0.0989, "mean_logprobs": -0.0284423828125, "mean_logprobs/var": 0.0003337860107421875, "num_completions/total": 59136, "per_sentence_gradient_norm": 1.218902826309204, "per_sentence_gradient_norm/max": 177.72769165039062, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 29.52305030822754, "per_sentence_gradient_norm/var": 98.27919006347656, "per_token_feature_norm": 190.2283172607422, "per_token_feature_norm/max": 312.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 72.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 465.7263488769531, "per_token_full_gradient_variance/max_squared_error": 185.28001403808594, "per_token_full_gradient_variance/variance": 0.03572314232587814, "per_token_gradient_norm": 1.7082539796829224, "per_token_gradient_norm/max": 5709.9296875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3034.815673828125, "per_token_policy_error_norm": 0.016681279987096786, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015380836091935635, "policy_entropy": 0.03159809485077858, "policy_entropy/max": 3.484375, "policy_entropy/median": 2.3981556296348572e-08, "policy_entropy/min": 3.6083603553033194e-19, "policy_entropy/p25": 1.9190338207408786e-10, "policy_entropy/p75": 7.808208465576172e-06, "policy_entropy/var": 0.0174908135086298, "policy_error_vector_variance/max_squared_error": 2.003908634185791, "policy_error_vector_variance/metric": 0.016667021438479424, "policy_loss": 0.09892304986715317, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.1239302158355713, "policy_sharpness": 9.198870658874512, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.893713474273682, "reward": 0.7955729365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16284869611263275, "rewards/accuracy_reward": 0.7955729365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16284869611263275, "sentence_full_gradient_variance/max_squared_error": 1538802.375, "sentence_full_gradient_variance/metric": 3580.010498046875, "sentence_full_gradient_variance/p75": 91.35650634765625, "sentence_full_gradient_variance/p90": 194.70465087890625, "sentence_full_gradient_variance/p95": 194.70465087890625, "sentence_full_gradient_variance/p99": 23083.732421875, "state_level_variance/metric": 11.084357261657715, "state_level_variance_full_gradient/metric": 412.06976318359375, "step": 77 }, { "accuracy_reward": 0.8828125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1035894826054573, "action_level_variance/metric": 20.432592391967773, "action_level_variance_full_gradient/metric": 902.453125, "adam_stats/lr_effective_max": 1.0124497748620342e-05, "adam_stats/lr_effective_mean": -5.840263711011373e-13, "adam_stats/lr_effective_min": -1.0400514838693198e-05, "adam_stats/m_t_max": 0.00095069978851825, "adam_stats/m_t_mean": -1.2613088525015304e-11, "adam_stats/m_t_min": -0.0011892284965142608, "adam_stats/v_t_max": 7.282519800355658e-05, "adam_stats/v_t_mean": 3.4773791918879526e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.011245082132518291, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.836032509803772, "all_logprobs": -0.026512518525123596, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.875, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.00116729736328125, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.040283203125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03778928518295288, "clip_ratio": 0.0, "completion_length": 517.7265625, "completion_length/correct": 490.4321594238281, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 428.0, "completion_length/correct/min": 170.0, "completion_length/correct/p25": 344.25, "completion_length/correct/p75": 624.5, "completion_length/correct/var": 41364.15234375, "completion_length/incorrect": 723.344482421875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 649.0, "completion_length/incorrect/min": 212.0, "completion_length/incorrect/p25": 514.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 62537.78515625, "completion_length/max": 1024.0, "completion_length/median": 456.0, "completion_length/min": 170.0, "completion_length/p25": 354.0, "completion_length/p75": 660.25, "completion_length/var": 49386.671875, "epoch": 0.9984, "feature_vector_variance/max_squared_error": 137300.71875, "feature_vector_variance/metric": 27813.0859375, "generated_tokens/total": 36075232.0, "grad_norm": 0.13638128340244293, "grouped_std_rewards": 0.06851749867200851, "learning_rate": 2.104951497460118e-06, "loss": -0.0112, "mean_logprobs": -0.0274658203125, "mean_logprobs/var": 0.0003814697265625, "num_completions/total": 59904, "per_sentence_gradient_norm": 0.5209664106369019, "per_sentence_gradient_norm/max": 60.75081253051758, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 17.084745407104492, "per_sentence_gradient_norm/var": 20.187471389770508, "per_token_feature_norm": 189.2363739013672, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 202.0, "per_token_feature_norm/var": 477.2803649902344, "per_token_full_gradient_variance/max_squared_error": 40.20726776123047, "per_token_full_gradient_variance/variance": 0.007275890558958054, "per_token_gradient_norm": 0.6560913324356079, "per_token_gradient_norm/max": 3531.443359375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 721.9190063476562, "per_token_policy_error_norm": 0.015417581424117088, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.014252924360334873, "policy_entropy": 0.0291477981954813, "policy_entropy/max": 3.609375, "policy_entropy/median": 1.9907020032405853e-08, "policy_entropy/min": 8.74138001566438e-19, "policy_entropy/p25": 1.509761204943061e-10, "policy_entropy/p75": 5.424022674560547e-06, "policy_entropy/var": 0.01619502156972885, "policy_error_vector_variance/max_squared_error": 2.0029075145721436, "policy_error_vector_variance/metric": 0.015401361510157585, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.24948501586914, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.608096599578857, "reward": 0.8828125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1035894826054573, "rewards/accuracy_reward": 0.8828125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1035894826054573, "sentence_full_gradient_variance/max_squared_error": 225528.9375, "sentence_full_gradient_variance/metric": 1025.10302734375, "sentence_full_gradient_variance/p75": 12.92098617553711, "sentence_full_gradient_variance/p90": 24.98935890197754, "sentence_full_gradient_variance/p95": 24.98935890197754, "sentence_full_gradient_variance/p99": 6697.05029296875, "state_level_variance/metric": 2.3066959381103516, "state_level_variance_full_gradient/metric": 122.64984130859375, "step": 78 }, { "accuracy_reward": 0.8567708730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.12287458032369614, "action_level_variance/metric": 48.59912872314453, "action_level_variance_full_gradient/metric": 2504.50146484375, "adam_stats/lr_effective_max": 8.926968803280033e-06, "adam_stats/lr_effective_mean": -7.33559653992577e-12, "adam_stats/lr_effective_min": -9.032047273649368e-06, "adam_stats/m_t_max": 0.0008285628282465041, "adam_stats/m_t_mean": -1.081097017463506e-11, "adam_stats/m_t_min": -0.0010510414140298963, "adam_stats/v_t_max": 7.2763032221701e-05, "adam_stats/v_t_mean": 3.4746201141994115e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.03709567338228226, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.7687917947769165, "all_logprobs": -0.029659129679203033, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.125, "all_logprobs/p1": -0.83203125, "all_logprobs/p10": -0.002288818359375, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.042354051023721695, "clip_ratio": 0.0, "completion_length": 513.52734375, "completion_length/correct": 467.15045166015625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 415.0, "completion_length/correct/min": 101.0, "completion_length/correct/p25": 320.0, "completion_length/correct/p75": 579.5, "completion_length/correct/var": 40529.74609375, "completion_length/incorrect": 790.9454345703125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 898.0, "completion_length/incorrect/min": 301.0, "completion_length/incorrect/p25": 573.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 64601.6484375, "completion_length/max": 1024.0, "completion_length/median": 445.0, "completion_length/min": 101.0, "completion_length/p25": 333.0, "completion_length/p75": 645.25, "completion_length/var": 56780.37890625, "epoch": 1.0128, "feature_vector_variance/max_squared_error": 116966.3359375, "feature_vector_variance/metric": 27899.37109375, "generated_tokens/total": 36469624.0, "grad_norm": 0.0789218470454216, "grouped_std_rewards": 0.1059533879160881, "learning_rate": 1.9264138089195424e-06, "loss": 0.0371, "mean_logprobs": -0.0289306640625, "mean_logprobs/var": 0.0004100799560546875, "num_completions/total": 60672, "per_sentence_gradient_norm": 0.8971767425537109, "per_sentence_gradient_norm/max": 111.58689880371094, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 32.507083892822266, "per_sentence_gradient_norm/var": 47.85652160644531, "per_token_feature_norm": 189.30638122558594, "per_token_feature_norm/max": 298.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 67.5, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 202.0, "per_token_feature_norm/var": 523.5548706054688, "per_token_full_gradient_variance/max_squared_error": 112.04202270507812, "per_token_full_gradient_variance/variance": 0.017109464854002, "per_token_gradient_norm": 1.2713470458984375, "per_token_gradient_norm/max": 5273.7021484375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1667.519775390625, "per_token_policy_error_norm": 0.017057819291949272, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015702275559306145, "policy_entropy": 0.03301263600587845, "policy_entropy/max": 3.78125, "policy_entropy/median": 3.306195139884949e-08, "policy_entropy/min": 7.047314121155779e-19, "policy_entropy/p25": 2.510205376893282e-10, "policy_entropy/p75": 9.894371032714844e-06, "policy_entropy/var": 0.018830735236406326, "policy_error_vector_variance/max_squared_error": 2.0043606758117676, "policy_error_vector_variance/metric": 0.01704191043972969, "policy_loss": 0.03709567338228226, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -7.48191499710083, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.7687917947769165, "policy_sharpness": 9.169017791748047, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.062298774719238, "reward": 0.8567708730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.12287458032369614, "rewards/accuracy_reward": 0.8567708730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.12287458032369614, "sentence_full_gradient_variance/max_squared_error": 1214354.25, "sentence_full_gradient_variance/metric": 2820.116943359375, "sentence_full_gradient_variance/p75": 45.76980972290039, "sentence_full_gradient_variance/p90": 269.9893493652344, "sentence_full_gradient_variance/p95": 269.9893493652344, "sentence_full_gradient_variance/p99": 33004.78125, "state_level_variance/metric": 5.325438022613525, "state_level_variance_full_gradient/metric": 315.6160583496094, "step": 79 }, { "accuracy_reward": 0.7825521230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1703861802816391, "action_level_variance/metric": 44.005592346191406, "action_level_variance_full_gradient/metric": 1012.8994140625, "adam_stats/lr_effective_max": 8.02098020358244e-06, "adam_stats/lr_effective_mean": -2.0048710955289373e-12, "adam_stats/lr_effective_min": -8.406058441323694e-06, "adam_stats/m_t_max": 0.0008766105747781694, "adam_stats/m_t_mean": -8.38296532884586e-12, "adam_stats/m_t_min": -0.0010911369463428855, "adam_stats/v_t_max": 7.269458001246676e-05, "adam_stats/v_t_mean": 3.472406607044065e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.031143752858042717, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.0206878185272217, "all_logprobs": -0.028306597843766212, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.5, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.00193023681640625, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.05126953125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03977212309837341, "clip_ratio": 0.0, "completion_length": 566.15625, "completion_length/correct": 485.56903076171875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 441.0, "completion_length/correct/min": 138.0, "completion_length/correct/p25": 352.0, "completion_length/correct/p75": 563.0, "completion_length/correct/var": 37698.96484375, "completion_length/incorrect": 856.1737060546875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 291.0, "completion_length/incorrect/p25": 650.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 50554.14453125, "completion_length/max": 1024.0, "completion_length/median": 487.0, "completion_length/min": 138.0, "completion_length/p25": 372.75, "completion_length/p75": 761.0, "completion_length/var": 63834.1953125, "epoch": 1.0256, "feature_vector_variance/max_squared_error": 148401.578125, "feature_vector_variance/metric": 27878.0, "generated_tokens/total": 36904432.0, "grad_norm": 0.07511414587497711, "grouped_std_rewards": 0.09091828763484955, "learning_rate": 1.7546666766076658e-06, "loss": 0.0311, "mean_logprobs": -0.0283203125, "mean_logprobs/var": 0.0002841949462890625, "num_completions/total": 61440, "per_sentence_gradient_norm": 0.6196403503417969, "per_sentence_gradient_norm/max": 163.1257781982422, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 18.684036254882812, "per_sentence_gradient_norm/var": 43.67851257324219, "per_token_feature_norm": 190.6186065673828, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 72.0, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 494.4631652832031, "per_token_full_gradient_variance/max_squared_error": 216.26321411132812, "per_token_full_gradient_variance/variance": 0.011399435810744762, "per_token_gradient_norm": 0.8145317435264587, "per_token_gradient_norm/max": 6568.93212890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1070.889404296875, "per_token_policy_error_norm": 0.016346869990229607, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.014977073296904564, "policy_entropy": 0.031534019857645035, "policy_entropy/max": 3.34375, "policy_entropy/median": 2.0838342607021332e-08, "policy_entropy/min": 1.2451384324638215e-19, "policy_entropy/p25": 1.6916601452976465e-10, "policy_entropy/p75": 7.063150405883789e-06, "policy_entropy/var": 0.01795075088739395, "policy_error_vector_variance/max_squared_error": 2.0033936500549316, "policy_error_vector_variance/metric": 0.016332652419805527, "policy_loss": 0.031143754720687866, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -7.481915473937988, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.0206876993179321, "policy_sharpness": 9.202463150024414, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.867586135864258, "reward": 0.7825521230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1703861802816391, "rewards/accuracy_reward": 0.7825521230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1703861802816391, "sentence_full_gradient_variance/max_squared_error": 363265.34375, "sentence_full_gradient_variance/metric": 1149.4052734375, "sentence_full_gradient_variance/p75": 18.335010528564453, "sentence_full_gradient_variance/p90": 40.25109100341797, "sentence_full_gradient_variance/p95": 40.25109100341797, "sentence_full_gradient_variance/p99": 19802.849609375, "state_level_variance/metric": 5.170604705810547, "state_level_variance_full_gradient/metric": 136.50558471679688, "step": 80 }, { "accuracy_reward": 0.7747396230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17474570870399475, "action_level_variance/metric": 175.39593505859375, "action_level_variance_full_gradient/metric": 1720.2825927734375, "adam_stats/lr_effective_max": 7.3205865191994235e-06, "adam_stats/lr_effective_mean": 2.074443047894725e-12, "adam_stats/lr_effective_min": -7.212254331534496e-06, "adam_stats/m_t_max": 0.0010102458763867617, "adam_stats/m_t_mean": -7.763460881105022e-12, "adam_stats/m_t_min": -0.0011895428178831935, "adam_stats/v_t_max": 7.263274892466143e-05, "adam_stats/v_t_mean": 3.471068267882349e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.028856506571173668, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.4441218376159668, "all_logprobs": -0.030355127528309822, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.125, "all_logprobs/p1": -0.87890625, "all_logprobs/p10": -0.002471923828125, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.043854162096977234, "clip_ratio": 0.0, "completion_length": 546.9974365234375, "completion_length/correct": 483.7529602050781, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 441.0, "completion_length/correct/min": 135.0, "completion_length/correct/p25": 319.5, "completion_length/correct/p75": 609.5, "completion_length/correct/var": 45842.7734375, "completion_length/incorrect": 764.514404296875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 797.0, "completion_length/incorrect/min": 300.0, "completion_length/incorrect/p25": 533.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 61139.3203125, "completion_length/max": 1024.0, "completion_length/median": 491.0, "completion_length/min": 135.0, "completion_length/p25": 356.0, "completion_length/p75": 732.25, "completion_length/var": 62987.94140625, "epoch": 1.0384, "feature_vector_variance/max_squared_error": 146623.40625, "feature_vector_variance/metric": 27993.94140625, "generated_tokens/total": 37324524.0, "grad_norm": 0.10218194127082825, "grouped_std_rewards": 0.09648623317480087, "learning_rate": 1.5899193479495858e-06, "loss": 0.0289, "mean_logprobs": -0.030517578125, "mean_logprobs/var": 0.0004444122314453125, "num_completions/total": 62208, "per_sentence_gradient_norm": 1.201846718788147, "per_sentence_gradient_norm/max": 324.86358642578125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 36.55315017700195, "per_sentence_gradient_norm/var": 174.1782684326172, "per_token_feature_norm": 189.9331817626953, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 70.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 512.931396484375, "per_token_full_gradient_variance/max_squared_error": 183.89761352539062, "per_token_full_gradient_variance/variance": 0.029143454506993294, "per_token_gradient_norm": 1.7667464017868042, "per_token_gradient_norm/max": 6568.93212890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3630.64599609375, "per_token_policy_error_norm": 0.017333637923002243, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0158331748098135, "policy_entropy": 0.033936239778995514, "policy_entropy/max": 3.453125, "policy_entropy/median": 2.9802322387695312e-08, "policy_entropy/min": 2.8053731213062427e-18, "policy_entropy/p25": 2.2464519133791327e-10, "policy_entropy/p75": 9.059906005859375e-06, "policy_entropy/var": 0.020328376442193985, "policy_error_vector_variance/max_squared_error": 2.0069148540496826, "policy_error_vector_variance/metric": 0.017309103161096573, "policy_loss": 0.028856506571173668, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -7.481915473937988, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.4441218376159668, "policy_sharpness": 9.16593074798584, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.086091041564941, "reward": 0.7747396230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17474570870399475, "rewards/accuracy_reward": 0.7747396230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17474570870399475, "sentence_full_gradient_variance/max_squared_error": 385666.84375, "sentence_full_gradient_variance/metric": 1947.4588623046875, "sentence_full_gradient_variance/p75": 31.78798484802246, "sentence_full_gradient_variance/p90": 48.82391357421875, "sentence_full_gradient_variance/p95": 48.82391357421875, "sentence_full_gradient_variance/p99": 54359.015625, "state_level_variance/metric": 20.695636749267578, "state_level_variance_full_gradient/metric": 227.17623901367188, "step": 81 }, { "accuracy_reward": 0.8359375, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.13732482492923737, "action_level_variance/metric": 32.41179275512695, "action_level_variance_full_gradient/metric": 1528.9541015625, "adam_stats/lr_effective_max": 6.462119927164167e-06, "adam_stats/lr_effective_mean": 4.2715787504366e-12, "adam_stats/lr_effective_min": -6.577853582712123e-06, "adam_stats/m_t_max": 0.0009378314716741443, "adam_stats/m_t_mean": -1.7313867353707657e-12, "adam_stats/m_t_min": -0.0011144065065309405, "adam_stats/v_t_max": 7.256607932504267e-05, "adam_stats/v_t_mean": 3.4697895598401196e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.007849341258406639, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.5415030717849731, "all_logprobs": -0.027551451697945595, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -13.8125, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.00150299072265625, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.048583984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04013313353061676, "clip_ratio": 0.0, "completion_length": 545.4921875, "completion_length/correct": 494.0576171875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 465.0, "completion_length/correct/min": 149.0, "completion_length/correct/p25": 372.5, "completion_length/correct/p75": 587.0, "completion_length/correct/var": 31903.802734375, "completion_length/incorrect": 807.5635375976562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 887.0, "completion_length/incorrect/min": 267.0, "completion_length/incorrect/p25": 638.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 53741.703125, "completion_length/max": 1024.0, "completion_length/median": 497.0, "completion_length/min": 149.0, "completion_length/p25": 389.75, "completion_length/p75": 654.25, "completion_length/var": 48918.28515625, "epoch": 1.0512, "feature_vector_variance/max_squared_error": 121489.625, "feature_vector_variance/metric": 27763.33203125, "generated_tokens/total": 37743464.0, "grad_norm": 0.10926102101802826, "grouped_std_rewards": 0.08202111721038818, "learning_rate": 1.432372542187895e-06, "loss": 0.0078, "mean_logprobs": -0.028076171875, "mean_logprobs/var": 0.0002956390380859375, "num_completions/total": 62976, "per_sentence_gradient_norm": 0.6879371404647827, "per_sentence_gradient_norm/max": 94.96250915527344, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 21.913808822631836, "per_sentence_gradient_norm/var": 31.980178833007812, "per_token_feature_norm": 190.18545532226562, "per_token_feature_norm/max": 282.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 72.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 473.7177429199219, "per_token_full_gradient_variance/max_squared_error": 226.1012420654297, "per_token_full_gradient_variance/variance": 0.012502566911280155, "per_token_gradient_norm": 0.9075920581817627, "per_token_gradient_norm/max": 4902.88525390625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1261.4986572265625, "per_token_policy_error_norm": 0.0159503985196352, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.014811733737587929, "policy_entropy": 0.03023529052734375, "policy_entropy/max": 3.5, "policy_entropy/median": 2.6309862732887268e-08, "policy_entropy/min": 1.7143946852427039e-18, "policy_entropy/p25": 1.9736035028472543e-10, "policy_entropy/p75": 6.973743438720703e-06, "policy_entropy/var": 0.016621164977550507, "policy_error_vector_variance/max_squared_error": 2.0032999515533447, "policy_error_vector_variance/metric": 0.015935147181153297, "policy_loss": 0.007849341258406639, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.5415033102035522, "policy_sharpness": 9.222776412963867, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.747604846954346, "reward": 0.8359375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.13732482492923737, "rewards/accuracy_reward": 0.8359375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.13732482492923737, "sentence_full_gradient_variance/max_squared_error": 589372.0, "sentence_full_gradient_variance/metric": 1732.7021484375, "sentence_full_gradient_variance/p75": 33.901336669921875, "sentence_full_gradient_variance/p90": 57.05504608154297, "sentence_full_gradient_variance/p95": 57.05504608154297, "sentence_full_gradient_variance/p99": 16117.54296875, "state_level_variance/metric": 3.615882396697998, "state_level_variance_full_gradient/metric": 203.74807739257812, "step": 82 }, { "accuracy_reward": 0.7526041865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18643388152122498, "action_level_variance/metric": 28.810590744018555, "action_level_variance_full_gradient/metric": 1794.585205078125, "adam_stats/lr_effective_max": 5.751766366302036e-06, "adam_stats/lr_effective_mean": 1.879141460492395e-11, "adam_stats/lr_effective_min": -5.9540602705965284e-06, "adam_stats/m_t_max": 0.0006852994556538761, "adam_stats/m_t_mean": 3.027986932371829e-12, "adam_stats/m_t_min": -0.0008030756725929677, "adam_stats/v_t_max": 7.24988931324333e-05, "adam_stats/v_t_mean": 3.4701646937917996e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.07168923318386078, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.452359676361084, "all_logprobs": -0.02850411646068096, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.0625, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.00193023681640625, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.052001953125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04082169756293297, "clip_ratio": 0.0, "completion_length": 549.4557495117188, "completion_length/correct": 475.6885986328125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 451.0, "completion_length/correct/min": 176.0, "completion_length/correct/p25": 330.5, "completion_length/correct/p75": 591.0, "completion_length/correct/var": 32245.4375, "completion_length/incorrect": 773.8632202148438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 790.0, "completion_length/incorrect/min": 335.0, "completion_length/incorrect/p25": 604.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 46806.85546875, "completion_length/max": 1024.0, "completion_length/median": 512.0, "completion_length/min": 176.0, "completion_length/p25": 370.0, "completion_length/p75": 683.0, "completion_length/var": 52367.015625, "epoch": 1.064, "feature_vector_variance/max_squared_error": 120949.609375, "feature_vector_variance/metric": 27979.814453125, "generated_tokens/total": 38165444.0, "grad_norm": 0.1425739973783493, "grouped_std_rewards": 0.10788838565349579, "learning_rate": 1.282218205837188e-06, "loss": -0.0717, "mean_logprobs": -0.0283203125, "mean_logprobs/var": 0.00026702880859375, "num_completions/total": 63744, "per_sentence_gradient_norm": 0.8242558240890503, "per_sentence_gradient_norm/max": 68.13257598876953, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 27.360069274902344, "per_sentence_gradient_norm/var": 28.167869567871094, "per_token_feature_norm": 189.5809326171875, "per_token_feature_norm/max": 288.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 66.5, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 202.0, "per_token_feature_norm/var": 469.4793701171875, "per_token_full_gradient_variance/max_squared_error": 219.3925323486328, "per_token_full_gradient_variance/variance": 0.01697331853210926, "per_token_gradient_norm": 1.0914993286132812, "per_token_gradient_norm/max": 5008.810546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1579.28076171875, "per_token_policy_error_norm": 0.016517722979187965, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0153230344876647, "policy_entropy": 0.03127846121788025, "policy_entropy/max": 3.625, "policy_entropy/median": 2.782326191663742e-08, "policy_entropy/min": 7.894347068410079e-19, "policy_entropy/p25": 1.9917933968827128e-10, "policy_entropy/p75": 7.987022399902344e-06, "policy_entropy/var": 0.01727774180471897, "policy_error_vector_variance/max_squared_error": 2.0038681030273438, "policy_error_vector_variance/metric": 0.01650739274919033, "policy_loss": -0.07168923318386078, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.452359676361084, "policy_sharpness": 9.196937561035156, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.874277114868164, "reward": 0.7526041865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18643388152122498, "rewards/accuracy_reward": 0.7526041865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18643388152122498, "sentence_full_gradient_variance/max_squared_error": 737660.0, "sentence_full_gradient_variance/metric": 2024.2425537109375, "sentence_full_gradient_variance/p75": 31.176349639892578, "sentence_full_gradient_variance/p90": 132.6593017578125, "sentence_full_gradient_variance/p95": 132.6593017578125, "sentence_full_gradient_variance/p99": 50811.18359375, "state_level_variance/metric": 2.952683687210083, "state_level_variance_full_gradient/metric": 229.6573486328125, "step": 83 }, { "accuracy_reward": 0.76953125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17758414149284363, "action_level_variance/metric": 38.091732025146484, "action_level_variance_full_gradient/metric": 2942.86669921875, "adam_stats/lr_effective_max": 5.564338152908022e-06, "adam_stats/lr_effective_mean": 2.4694165115524136e-11, "adam_stats/lr_effective_min": -5.706690444640117e-06, "adam_stats/m_t_max": 0.0007627366576343775, "adam_stats/m_t_mean": -5.6061847872324044e-12, "adam_stats/m_t_min": -0.0008051655604504049, "adam_stats/v_t_max": 7.242642459459603e-05, "adam_stats/v_t_mean": 3.4708032888713936e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.009970207698643208, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.888224482536316, "all_logprobs": -0.030450768768787384, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.5, "all_logprobs/p1": -0.8984375, "all_logprobs/p10": -0.003173828125, "all_logprobs/p25": -8.344650268554688e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.043549906462430954, "clip_ratio": 0.0, "completion_length": 567.0443115234375, "completion_length/correct": 507.3536376953125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 488.0, "completion_length/correct/min": 132.0, "completion_length/correct/p25": 367.0, "completion_length/correct/p75": 625.5, "completion_length/correct/var": 32796.1484375, "completion_length/incorrect": 766.3502807617188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 777.0, "completion_length/incorrect/min": 145.0, "completion_length/incorrect/p25": 572.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 52507.03515625, "completion_length/max": 1024.0, "completion_length/median": 540.0, "completion_length/min": 132.0, "completion_length/p25": 393.0, "completion_length/p75": 693.25, "completion_length/var": 49188.57421875, "epoch": 1.0768, "feature_vector_variance/max_squared_error": 147394.15625, "feature_vector_variance/metric": 28083.8984375, "generated_tokens/total": 38600936.0, "grad_norm": 0.13870194554328918, "grouped_std_rewards": 0.10385192930698395, "learning_rate": 1.1396392788268054e-06, "loss": 0.01, "mean_logprobs": -0.031005859375, "mean_logprobs/var": 0.00058746337890625, "num_completions/total": 64512, "per_sentence_gradient_norm": 0.8887437582015991, "per_sentence_gradient_norm/max": 82.68476867675781, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 26.840686798095703, "per_sentence_gradient_norm/var": 37.350494384765625, "per_token_feature_norm": 190.098876953125, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 504.1027526855469, "per_token_full_gradient_variance/max_squared_error": 152.6376495361328, "per_token_full_gradient_variance/variance": 0.013613685965538025, "per_token_gradient_norm": 1.0089153051376343, "per_token_gradient_norm/max": 5610.962890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1363.261962890625, "per_token_policy_error_norm": 0.017490530386567116, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0160877276211977, "policy_entropy": 0.03401428461074829, "policy_entropy/max": 3.765625, "policy_entropy/median": 3.818422555923462e-08, "policy_entropy/min": 5.116079001415974e-19, "policy_entropy/p25": 2.764863893389702e-10, "policy_entropy/p75": 1.2755393981933594e-05, "policy_entropy/var": 0.019632980227470398, "policy_error_vector_variance/max_squared_error": 2.003082275390625, "policy_error_vector_variance/metric": 0.01747051626443863, "policy_loss": 0.009970211423933506, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.888224482536316, "policy_sharpness": 9.139504432678223, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.207947254180908, "reward": 0.76953125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17758414149284363, "rewards/accuracy_reward": 0.76953125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17758414149284363, "sentence_full_gradient_variance/max_squared_error": 1085503.25, "sentence_full_gradient_variance/metric": 3342.48779296875, "sentence_full_gradient_variance/p75": 55.14588165283203, "sentence_full_gradient_variance/p90": 73.59479522705078, "sentence_full_gradient_variance/p95": 73.59479522705078, "sentence_full_gradient_variance/p99": 48340.25, "state_level_variance/metric": 4.013407230377197, "state_level_variance_full_gradient/metric": 399.6209411621094, "step": 84 }, { "accuracy_reward": 0.80078125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15973863005638123, "action_level_variance/metric": 148.52674865722656, "action_level_variance_full_gradient/metric": 6560.5556640625, "adam_stats/lr_effective_max": 5.144409442436881e-06, "adam_stats/lr_effective_mean": 3.3177419145324905e-11, "adam_stats/lr_effective_min": -5.070679890195606e-06, "adam_stats/m_t_max": 0.0012204868253320456, "adam_stats/m_t_mean": -5.827222927280717e-13, "adam_stats/m_t_min": -0.0011812980519607663, "adam_stats/v_t_max": 7.242189167300239e-05, "adam_stats/v_t_mean": 3.4762353585959804e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.08071267604827881, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 3.125258684158325, "all_logprobs": -0.029949169605970383, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.125, "all_logprobs/p1": -0.8515625, "all_logprobs/p10": -0.00193023681640625, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.043518517166376114, "clip_ratio": 0.0, "completion_length": 521.8229370117188, "completion_length/correct": 459.3365783691406, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 418.0, "completion_length/correct/min": 90.0, "completion_length/correct/p25": 317.0, "completion_length/correct/p75": 541.0, "completion_length/correct/var": 37535.25, "completion_length/incorrect": 772.9934692382812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 835.0, "completion_length/incorrect/min": 215.0, "completion_length/incorrect/p25": 556.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 70169.2578125, "completion_length/max": 1024.0, "completion_length/median": 458.0, "completion_length/min": 90.0, "completion_length/p25": 347.0, "completion_length/p75": 650.0, "completion_length/var": 59668.734375, "epoch": 1.0896, "feature_vector_variance/max_squared_error": 157449.21875, "feature_vector_variance/metric": 27780.1875, "generated_tokens/total": 39001696.0, "grad_norm": 0.22307470440864563, "grouped_std_rewards": 0.1449311077594757, "learning_rate": 1.0048094716167097e-06, "loss": 0.0807, "mean_logprobs": -0.0301513671875, "mean_logprobs/var": 0.000446319580078125, "num_completions/total": 65280, "per_sentence_gradient_norm": 1.7816088199615479, "per_sentence_gradient_norm/max": 181.2491912841797, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 62.614784240722656, "per_sentence_gradient_norm/var": 145.54212951660156, "per_token_feature_norm": 189.9653778076172, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 71.5, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 503.6533508300781, "per_token_full_gradient_variance/max_squared_error": 123.31084442138672, "per_token_full_gradient_variance/variance": 0.028952913358807564, "per_token_gradient_norm": 2.014993906021118, "per_token_gradient_norm/max": 5300.60888671875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2884.646484375, "per_token_policy_error_norm": 0.01725623570382595, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015870535746216774, "policy_entropy": 0.03284088149666786, "policy_entropy/max": 3.828125, "policy_entropy/median": 2.293381839990616e-08, "policy_entropy/min": 2.2090619264392153e-18, "policy_entropy/p25": 1.709850039333105e-10, "policy_entropy/p75": 6.973743438720703e-06, "policy_entropy/var": 0.019236689433455467, "policy_error_vector_variance/max_squared_error": 2.0025289058685303, "policy_error_vector_variance/metric": 0.01724054291844368, "policy_loss": 0.08071267604827881, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 3.125258684158325, "policy_sharpness": 9.191482543945312, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.954697608947754, "reward": 0.80078125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15973863005638123, "rewards/accuracy_reward": 0.80078125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15973863005638123, "sentence_full_gradient_variance/max_squared_error": 1850831.75, "sentence_full_gradient_variance/metric": 7428.5537109375, "sentence_full_gradient_variance/p75": 150.12301635742188, "sentence_full_gradient_variance/p90": 163.0178985595703, "sentence_full_gradient_variance/p95": 163.0178985595703, "sentence_full_gradient_variance/p99": 161838.15625, "state_level_variance/metric": 15.553731918334961, "state_level_variance_full_gradient/metric": 867.997802734375, "step": 85 }, { "accuracy_reward": 0.8072916865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15577468276023865, "action_level_variance/metric": 116.59144592285156, "action_level_variance_full_gradient/metric": 1452.48486328125, "adam_stats/lr_effective_max": 4.705993887910154e-06, "adam_stats/lr_effective_mean": 2.0788334595400748e-11, "adam_stats/lr_effective_min": -4.426487066666596e-06, "adam_stats/m_t_max": 0.0019679341930896044, "adam_stats/m_t_mean": 1.1078857449497992e-11, "adam_stats/m_t_min": -0.002051937859505415, "adam_stats/v_t_max": 7.247286703204736e-05, "adam_stats/v_t_mean": 3.4780700455122604e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.060826338827610016, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.455268383026123, "all_logprobs": -0.03146906942129135, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.9375, "all_logprobs/p1": -0.96875, "all_logprobs/p10": -0.002471923828125, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.0484720878303051, "clip_ratio": 0.0, "completion_length": 542.6810302734375, "completion_length/correct": 474.69677734375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 452.0, "completion_length/correct/min": 158.0, "completion_length/correct/p25": 329.0, "completion_length/correct/p75": 593.0, "completion_length/correct/var": 31505.23046875, "completion_length/incorrect": 827.479736328125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 937.0, "completion_length/incorrect/min": 192.0, "completion_length/incorrect/p25": 623.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 54641.625, "completion_length/max": 1024.0, "completion_length/median": 490.0, "completion_length/min": 158.0, "completion_length/p25": 364.0, "completion_length/p75": 670.25, "completion_length/var": 55285.4375, "epoch": 1.1024, "feature_vector_variance/max_squared_error": 139792.3125, "feature_vector_variance/metric": 28014.126953125, "generated_tokens/total": 39418472.0, "grad_norm": 0.18915222585201263, "grouped_std_rewards": 0.11660876125097275, "learning_rate": 8.778930535580476e-07, "loss": 0.0608, "mean_logprobs": -0.03173828125, "mean_logprobs/var": 0.000675201416015625, "num_completions/total": 66048, "per_sentence_gradient_norm": 1.335120439529419, "per_sentence_gradient_norm/max": 184.08657836914062, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 29.937131881713867, "per_sentence_gradient_norm/var": 114.95857238769531, "per_token_feature_norm": 189.05809020996094, "per_token_feature_norm/max": 306.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 69.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 202.0, "per_token_feature_norm/var": 497.4306945800781, "per_token_full_gradient_variance/max_squared_error": 294.1576843261719, "per_token_full_gradient_variance/variance": 0.03160979598760605, "per_token_gradient_norm": 1.9896963834762573, "per_token_gradient_norm/max": 7057.58154296875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3355.124267578125, "per_token_policy_error_norm": 0.01790757291018963, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01669716089963913, "policy_entropy": 0.03414241597056389, "policy_entropy/max": 3.78125, "policy_entropy/median": 3.236345946788788e-08, "policy_entropy/min": 6.056285572868247e-20, "policy_entropy/p25": 2.2100721253082156e-10, "policy_entropy/p75": 1.0013580322265625e-05, "policy_entropy/var": 0.020914791151881218, "policy_error_vector_variance/max_squared_error": 2.004664182662964, "policy_error_vector_variance/metric": 0.01788346841931343, "policy_loss": 0.06082633137702942, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.455268383026123, "policy_sharpness": 9.162269592285156, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.125557899475098, "reward": 0.8072916865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15577468276023865, "rewards/accuracy_reward": 0.8072916865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15577468276023865, "sentence_full_gradient_variance/max_squared_error": 383904.28125, "sentence_full_gradient_variance/metric": 1650.624267578125, "sentence_full_gradient_variance/p75": 20.400115966796875, "sentence_full_gradient_variance/p90": 29.405498504638672, "sentence_full_gradient_variance/p95": 29.405498504638672, "sentence_full_gradient_variance/p99": 22437.13671875, "state_level_variance/metric": 12.926030158996582, "state_level_variance_full_gradient/metric": 198.139404296875, "step": 86 }, { "accuracy_reward": 0.79296875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16438335180282593, "action_level_variance/metric": 36.24317169189453, "action_level_variance_full_gradient/metric": 537.2294921875, "adam_stats/lr_effective_max": 3.975077561335638e-06, "adam_stats/lr_effective_mean": 1.0931414627657343e-11, "adam_stats/lr_effective_min": -3.7813217659277143e-06, "adam_stats/m_t_max": 0.0017484432319179177, "adam_stats/m_t_mean": 8.355535013881976e-12, "adam_stats/m_t_min": -0.0018158449092879891, "adam_stats/v_t_max": 7.2401475335937e-05, "adam_stats/v_t_mean": 3.4752745386307238e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.015760518610477448, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.2001334428787231, "all_logprobs": -0.02872755005955696, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.5, "all_logprobs/p1": -0.828125, "all_logprobs/p10": -0.00193023681640625, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.054931640625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.041183896362781525, "clip_ratio": 0.0, "completion_length": 549.9609375, "completion_length/correct": 469.8883361816406, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 421.0, "completion_length/correct/min": 139.0, "completion_length/correct/p25": 351.0, "completion_length/correct/p75": 585.0, "completion_length/correct/var": 34343.98828125, "completion_length/incorrect": 856.654052734375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 339.0, "completion_length/incorrect/p25": 722.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 49621.21484375, "completion_length/max": 1024.0, "completion_length/median": 482.0, "completion_length/min": 139.0, "completion_length/p25": 370.75, "completion_length/p75": 722.75, "completion_length/var": 62036.015625, "epoch": 1.1152, "feature_vector_variance/max_squared_error": 154022.15625, "feature_vector_variance/metric": 27610.59765625, "generated_tokens/total": 39840844.0, "grad_norm": 0.08929230272769928, "grouped_std_rewards": 0.10161110013723373, "learning_rate": 7.59044652756249e-07, "loss": -0.0158, "mean_logprobs": -0.02880859375, "mean_logprobs/var": 0.000316619873046875, "num_completions/total": 66816, "per_sentence_gradient_norm": 0.7936667203903198, "per_sentence_gradient_norm/max": 103.07134246826172, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 28.421598434448242, "per_sentence_gradient_norm/var": 35.65969467163086, "per_token_feature_norm": 189.85983276367188, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 69.5, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 513.116943359375, "per_token_full_gradient_variance/max_squared_error": 112.60688018798828, "per_token_full_gradient_variance/variance": 0.015035693533718586, "per_token_gradient_norm": 1.22019624710083, "per_token_gradient_norm/max": 3587.125244140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1376.3843994140625, "per_token_policy_error_norm": 0.016609756276011467, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015462425537407398, "policy_entropy": 0.03164740279316902, "policy_entropy/max": 3.203125, "policy_entropy/median": 3.306195139884949e-08, "policy_entropy/min": 1.2197274440461925e-18, "policy_entropy/p25": 2.473825588822365e-10, "policy_entropy/p75": 8.761882781982422e-06, "policy_entropy/var": 0.017703544348478317, "policy_error_vector_variance/max_squared_error": 2.005713701248169, "policy_error_vector_variance/metric": 0.016595931723713875, "policy_loss": -0.01576051488518715, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.200133204460144, "policy_sharpness": 9.19091510772705, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.927079677581787, "reward": 0.79296875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16438335180282593, "rewards/accuracy_reward": 0.79296875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16438335180282593, "sentence_full_gradient_variance/max_squared_error": 92198.8515625, "sentence_full_gradient_variance/metric": 599.74365234375, "sentence_full_gradient_variance/p75": 31.979162216186523, "sentence_full_gradient_variance/p90": 73.20433044433594, "sentence_full_gradient_variance/p95": 73.20433044433594, "sentence_full_gradient_variance/p99": 23500.80078125, "state_level_variance/metric": 3.94154691696167, "state_level_variance_full_gradient/metric": 62.51409149169922, "step": 87 }, { "accuracy_reward": 0.7630208730697632, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18105578422546387, "action_level_variance/metric": 69.09695434570312, "action_level_variance_full_gradient/metric": 4338.3046875, "adam_stats/lr_effective_max": 3.1444892556464765e-06, "adam_stats/lr_effective_mean": 8.862822802047088e-12, "adam_stats/lr_effective_min": -3.0702794902026653e-06, "adam_stats/m_t_max": 0.0020008450374007225, "adam_stats/m_t_mean": 7.03264486576205e-12, "adam_stats/m_t_min": -0.0017750679980963469, "adam_stats/v_t_max": 7.233337964862585e-05, "adam_stats/v_t_mean": 3.478172827878212e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.06727683544158936, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.4294044971466064, "all_logprobs": -0.03174816071987152, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.375, "all_logprobs/p1": -0.9462108612060547, "all_logprobs/p10": -0.00299072265625, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.06982421875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.046158093959093094, "clip_ratio": 0.0, "completion_length": 573.7994995117188, "completion_length/correct": 500.52557373046875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 467.0, "completion_length/correct/min": 160.0, "completion_length/correct/p25": 322.0, "completion_length/correct/p75": 642.75, "completion_length/correct/var": 43299.1171875, "completion_length/incorrect": 809.7252807617188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 889.0, "completion_length/incorrect/min": 216.0, "completion_length/incorrect/p25": 628.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 57800.14453125, "completion_length/max": 1024.0, "completion_length/median": 538.0, "completion_length/min": 160.0, "completion_length/p25": 361.75, "completion_length/p75": 751.25, "completion_length/var": 63974.41796875, "epoch": 1.1280000000000001, "feature_vector_variance/max_squared_error": 140844.8125, "feature_vector_variance/metric": 27966.626953125, "generated_tokens/total": 40281520.0, "grad_norm": 0.155808687210083, "grouped_std_rewards": 0.15873974561691284, "learning_rate": 6.484090676804927e-07, "loss": -0.0673, "mean_logprobs": -0.033203125, "mean_logprobs/var": 0.000598907470703125, "num_completions/total": 67584, "per_sentence_gradient_norm": 1.3982486724853516, "per_sentence_gradient_norm/max": 96.56819915771484, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 40.14397048950195, "per_sentence_gradient_norm/var": 67.22938537597656, "per_token_feature_norm": 189.75631713867188, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 565.7164306640625, "per_token_full_gradient_variance/max_squared_error": 109.90692138671875, "per_token_full_gradient_variance/variance": 0.023295434191823006, "per_token_gradient_norm": 1.5374454259872437, "per_token_gradient_norm/max": 5266.279296875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1821.230224609375, "per_token_policy_error_norm": 0.018175864592194557, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.016648875549435616, "policy_entropy": 0.035127729177474976, "policy_entropy/max": 3.609375, "policy_entropy/median": 3.259629011154175e-08, "policy_entropy/min": 1.0503208545953324e-18, "policy_entropy/p25": 2.4920154828578234e-10, "policy_entropy/p75": 1.0132789611816406e-05, "policy_entropy/var": 0.021021489053964615, "policy_error_vector_variance/max_squared_error": 2.0056614875793457, "policy_error_vector_variance/metric": 0.01815592683851719, "policy_loss": -0.06727684289216995, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.4294044971466064, "policy_sharpness": 9.143460273742676, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.247086524963379, "reward": 0.7630208730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18105578422546387, "rewards/accuracy_reward": 0.7630208730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18105578422546387, "sentence_full_gradient_variance/max_squared_error": 1724429.0, "sentence_full_gradient_variance/metric": 4885.7158203125, "sentence_full_gradient_variance/p75": 151.1637420654297, "sentence_full_gradient_variance/p90": 256.9216003417969, "sentence_full_gradient_variance/p95": 256.9216003417969, "sentence_full_gradient_variance/p99": 83647.7890625, "state_level_variance/metric": 6.752357006072998, "state_level_variance_full_gradient/metric": 547.411865234375, "step": 88 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16514559090137482, "action_level_variance/metric": 46.86294174194336, "action_level_variance_full_gradient/metric": 1107.651123046875, "adam_stats/lr_effective_max": 2.4613752884761197e-06, "adam_stats/lr_effective_mean": 7.820397975033533e-12, "adam_stats/lr_effective_min": -2.388319444435183e-06, "adam_stats/m_t_max": 0.0017397253541275859, "adam_stats/m_t_mean": 5.265699334899843e-12, "adam_stats/m_t_min": -0.0015369075117632747, "adam_stats/v_t_max": 7.226142042782158e-05, "adam_stats/v_t_mean": 3.4749529642663646e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.033165231347084045, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.4389866590499878, "all_logprobs": -0.03087686561048031, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.5625, "all_logprobs/p1": -0.8984375, "all_logprobs/p10": -0.0025482177734375, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.0634765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04520042985677719, "clip_ratio": 0.0, "completion_length": 566.2565307617188, "completion_length/correct": 507.09539794921875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 462.0, "completion_length/correct/min": 123.0, "completion_length/correct/p25": 357.75, "completion_length/correct/p75": 646.5, "completion_length/correct/var": 42415.1484375, "completion_length/incorrect": 791.0687866210938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 847.0, "completion_length/incorrect/min": 236.0, "completion_length/incorrect/p25": 577.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 60883.66015625, "completion_length/max": 1024.0, "completion_length/median": 512.0, "completion_length/min": 123.0, "completion_length/p25": 375.0, "completion_length/p75": 733.25, "completion_length/var": 59505.87109375, "epoch": 1.1408, "feature_vector_variance/max_squared_error": 148258.25, "feature_vector_variance/metric": 27771.8125, "generated_tokens/total": 40716408.0, "grad_norm": 0.043003764003515244, "grouped_std_rewards": 0.10113140940666199, "learning_rate": 5.461210907490952e-07, "loss": -0.0332, "mean_logprobs": -0.0322265625, "mean_logprobs/var": 0.00066375732421875, "num_completions/total": 68352, "per_sentence_gradient_norm": 0.8145145177841187, "per_sentence_gradient_norm/max": 113.54893493652344, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 23.266586303710938, "per_sentence_gradient_norm/var": 46.25973892211914, "per_token_feature_norm": 191.21295166015625, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 67.5, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 490.55023193359375, "per_token_full_gradient_variance/max_squared_error": 222.18394470214844, "per_token_full_gradient_variance/variance": 0.01551458053290844, "per_token_gradient_norm": 1.0432016849517822, "per_token_gradient_norm/max": 4990.71826171875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1321.8621826171875, "per_token_policy_error_norm": 0.017636267468333244, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.016213471069931984, "policy_entropy": 0.03428548201918602, "policy_entropy/max": 3.734375, "policy_entropy/median": 2.293381839990616e-08, "policy_entropy/min": 8.165397611531455e-19, "policy_entropy/p25": 1.5188561519607902e-10, "policy_entropy/p75": 9.417533874511719e-06, "policy_entropy/var": 0.02040298655629158, "policy_error_vector_variance/max_squared_error": 2.003894090652466, "policy_error_vector_variance/metric": 0.017612596973776817, "policy_loss": -0.033165235072374344, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.4389866590499878, "policy_sharpness": 9.152793884277344, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.164774417877197, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16514559090137482, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16514559090137482, "sentence_full_gradient_variance/max_squared_error": 250191.9375, "sentence_full_gradient_variance/metric": 1261.73193359375, "sentence_full_gradient_variance/p75": 8.672307968139648, "sentence_full_gradient_variance/p90": 8.94566822052002, "sentence_full_gradient_variance/p95": 8.94566822052002, "sentence_full_gradient_variance/p99": 22681.8125, "state_level_variance/metric": 5.249112129211426, "state_level_variance_full_gradient/metric": 154.080810546875, "step": 89 }, { "accuracy_reward": 0.8359375, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.13732481002807617, "action_level_variance/metric": 58.978851318359375, "action_level_variance_full_gradient/metric": 1214.042724609375, "adam_stats/lr_effective_max": 1.9921380953746848e-06, "adam_stats/lr_effective_mean": 5.650653122496463e-12, "adam_stats/lr_effective_min": -2.0141658296779497e-06, "adam_stats/m_t_max": 0.0017641170416027308, "adam_stats/m_t_mean": 9.349518555190617e-12, "adam_stats/m_t_min": -0.0016288832994177938, "adam_stats/v_t_max": 7.219519466161728e-05, "adam_stats/v_t_mean": 3.4725193640700036e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.03661075979471207, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.8912242650985718, "all_logprobs": -0.02888115681707859, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.375, "all_logprobs/p1": -0.828125, "all_logprobs/p10": -0.00193023681640625, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.05224609375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04175263270735741, "clip_ratio": 0.0, "completion_length": 551.0416870117188, "completion_length/correct": 507.0436096191406, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 467.0, "completion_length/correct/min": 137.0, "completion_length/correct/p25": 357.0, "completion_length/correct/p75": 620.5, "completion_length/correct/var": 40053.31640625, "completion_length/incorrect": 775.2222900390625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 816.0, "completion_length/incorrect/min": 345.0, "completion_length/incorrect/p25": 567.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 51904.546875, "completion_length/max": 1024.0, "completion_length/median": 500.0, "completion_length/min": 137.0, "completion_length/p25": 376.75, "completion_length/p75": 696.5, "completion_length/var": 51808.890625, "epoch": 1.1536, "feature_vector_variance/max_squared_error": 114366.328125, "feature_vector_variance/metric": 27737.669921875, "generated_tokens/total": 41139608.0, "grad_norm": 0.12354805320501328, "grouped_std_rewards": 0.1378316730260849, "learning_rate": 4.5230534410568764e-07, "loss": -0.0366, "mean_logprobs": -0.029541015625, "mean_logprobs/var": 0.0003814697265625, "num_completions/total": 69120, "per_sentence_gradient_norm": 1.1363630294799805, "per_sentence_gradient_norm/max": 125.00822448730469, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 38.62104415893555, "per_sentence_gradient_norm/var": 57.76274871826172, "per_token_feature_norm": 190.34861755371094, "per_token_feature_norm/max": 276.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 68.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 479.3587646484375, "per_token_full_gradient_variance/max_squared_error": 156.80543518066406, "per_token_full_gradient_variance/variance": 0.023781130090355873, "per_token_gradient_norm": 1.6073988676071167, "per_token_gradient_norm/max": 5114.58203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2253.478515625, "per_token_policy_error_norm": 0.016683965921401978, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015502308495342731, "policy_entropy": 0.03176591172814369, "policy_entropy/max": 3.484375, "policy_entropy/median": 2.759043127298355e-08, "policy_entropy/min": 1.3417001884508117e-18, "policy_entropy/p25": 2.419255906715989e-10, "policy_entropy/p75": 7.808208465576172e-06, "policy_entropy/var": 0.01788376271724701, "policy_error_vector_variance/max_squared_error": 2.0044665336608887, "policy_error_vector_variance/metric": 0.01666801981627941, "policy_loss": -0.036610763520002365, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.8912242650985718, "policy_sharpness": 9.19232177734375, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.926074028015137, "reward": 0.8359375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.13732481002807617, "rewards/accuracy_reward": 0.8359375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.13732481002807617, "sentence_full_gradient_variance/max_squared_error": 226531.03125, "sentence_full_gradient_variance/metric": 1371.2869873046875, "sentence_full_gradient_variance/p75": 28.67296600341797, "sentence_full_gradient_variance/p90": 46.25166320800781, "sentence_full_gradient_variance/p95": 46.25166320800781, "sentence_full_gradient_variance/p99": 43179.12890625, "state_level_variance/metric": 6.145047187805176, "state_level_variance_full_gradient/metric": 157.2442626953125, "step": 90 }, { "accuracy_reward": 0.7434896230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19096148014068604, "action_level_variance/metric": 34.943199157714844, "action_level_variance_full_gradient/metric": 1437.89599609375, "adam_stats/lr_effective_max": 1.714537461339205e-06, "adam_stats/lr_effective_mean": 6.803905854523007e-13, "adam_stats/lr_effective_min": -1.7775030300981598e-06, "adam_stats/m_t_max": 0.0015465066535398364, "adam_stats/m_t_mean": -1.335885612199772e-12, "adam_stats/m_t_min": -0.0009776247898116708, "adam_stats/v_t_max": 7.215453661046922e-05, "adam_stats/v_t_mean": 3.473019831792823e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.03562644496560097, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.598774790763855, "all_logprobs": -0.028914151713252068, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.75, "all_logprobs/p1": -0.828125, "all_logprobs/p10": -0.0021820068359375, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.054931640625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04130992665886879, "clip_ratio": 0.0, "completion_length": 559.7890625, "completion_length/correct": 474.1348571777344, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 443.0, "completion_length/correct/min": 80.0, "completion_length/correct/p25": 330.5, "completion_length/correct/p75": 585.0, "completion_length/correct/var": 34388.63671875, "completion_length/incorrect": 808.0558471679688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 863.0, "completion_length/incorrect/min": 336.0, "completion_length/incorrect/p25": 606.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 49224.28515625, "completion_length/max": 1024.0, "completion_length/median": 509.0, "completion_length/min": 80.0, "completion_length/p25": 374.0, "completion_length/p75": 710.25, "completion_length/var": 59427.74609375, "epoch": 1.1663999999999999, "feature_vector_variance/max_squared_error": 114931.65625, "feature_vector_variance/metric": 27699.421875, "generated_tokens/total": 41569524.0, "grad_norm": 0.1442537158727646, "grouped_std_rewards": 0.12699656188488007, "learning_rate": 3.6707612778634855e-07, "loss": -0.0356, "mean_logprobs": -0.029052734375, "mean_logprobs/var": 0.0002994537353515625, "num_completions/total": 69888, "per_sentence_gradient_norm": 0.9513980746269226, "per_sentence_gradient_norm/max": 70.90808868408203, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 29.758508682250977, "per_sentence_gradient_norm/var": 34.08241653442383, "per_token_feature_norm": 190.81410217285156, "per_token_feature_norm/max": 288.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 69.5, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 455.2473449707031, "per_token_full_gradient_variance/max_squared_error": 111.572509765625, "per_token_full_gradient_variance/variance": 0.015036245808005333, "per_token_gradient_norm": 1.2108970880508423, "per_token_gradient_norm/max": 3133.77392578125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1204.0006103515625, "per_token_policy_error_norm": 0.01686318963766098, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015816692262887955, "policy_entropy": 0.0314069539308548, "policy_entropy/max": 2.53125, "policy_entropy/median": 2.8870999813079834e-08, "policy_entropy/min": 2.4936649967166602e-18, "policy_entropy/p25": 1.837179297581315e-10, "policy_entropy/p75": 9.000301361083984e-06, "policy_entropy/var": 0.017002718523144722, "policy_error_vector_variance/max_squared_error": 2.0020322799682617, "policy_error_vector_variance/metric": 0.016855092719197273, "policy_loss": -0.03562643751502037, "policy_loss/max": 12.958681106567383, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.5987746715545654, "policy_sharpness": 9.182641983032227, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.942352771759033, "reward": 0.7434896230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19096148014068604, "rewards/accuracy_reward": 0.7434896230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19096148014068604, "sentence_full_gradient_variance/max_squared_error": 358072.0, "sentence_full_gradient_variance/metric": 1627.7698974609375, "sentence_full_gradient_variance/p75": 37.26297378540039, "sentence_full_gradient_variance/p90": 48.124183654785156, "sentence_full_gradient_variance/p95": 48.124183654785156, "sentence_full_gradient_variance/p99": 49688.234375, "state_level_variance/metric": 3.4991915225982666, "state_level_variance_full_gradient/metric": 189.87399291992188, "step": 91 }, { "accuracy_reward": 0.69921875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21058611571788788, "action_level_variance/metric": 69.25486755371094, "action_level_variance_full_gradient/metric": 1759.083984375, "adam_stats/lr_effective_max": 1.2935958011439652e-06, "adam_stats/lr_effective_mean": 1.2751370055338884e-12, "adam_stats/lr_effective_min": -1.3886949545849347e-06, "adam_stats/m_t_max": 0.0011200823355466127, "adam_stats/m_t_mean": -3.447945271309316e-12, "adam_stats/m_t_min": -0.00078602071153, "adam_stats/v_t_max": 7.209646719275042e-05, "adam_stats/v_t_mean": 3.4705651980743157e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.0072229234501719475, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.191959261894226, "all_logprobs": -0.037013206630945206, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -12.5, "all_logprobs/p1": -1.0625, "all_logprobs/p10": -0.005218505859375, "all_logprobs/p25": -1.0728836059570312e-06, "all_logprobs/p5": -0.10009765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.056966595351696014, "clip_ratio": 0.0, "completion_length": 579.31640625, "completion_length/correct": 475.71881103515625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 444.0, "completion_length/correct/min": 169.0, "completion_length/correct/p25": 330.0, "completion_length/correct/p75": 585.0, "completion_length/correct/var": 36521.55859375, "completion_length/incorrect": 820.1471557617188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 875.0, "completion_length/incorrect/min": 333.0, "completion_length/incorrect/p25": 669.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 41830.8828125, "completion_length/max": 1024.0, "completion_length/median": 529.0, "completion_length/min": 169.0, "completion_length/p25": 368.75, "completion_length/p75": 762.25, "completion_length/var": 63048.0625, "epoch": 1.1792, "feature_vector_variance/max_squared_error": 146870.3125, "feature_vector_variance/metric": 28538.8046875, "generated_tokens/total": 42014440.0, "grad_norm": 0.06299547106027603, "grouped_std_rewards": 0.12787367403507233, "learning_rate": 2.905372804626083e-07, "loss": 0.0072, "mean_logprobs": -0.036376953125, "mean_logprobs/var": 0.001495361328125, "num_completions/total": 70656, "per_sentence_gradient_norm": 1.1210708618164062, "per_sentence_gradient_norm/max": 156.5771026611328, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 29.917491912841797, "per_sentence_gradient_norm/var": 68.08672332763672, "per_token_feature_norm": 190.89781188964844, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 69.5, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 529.0776977539062, "per_token_full_gradient_variance/max_squared_error": 74.85964965820312, "per_token_full_gradient_variance/variance": 0.011325960978865623, "per_token_gradient_norm": 1.3634727001190186, "per_token_gradient_norm/max": 3856.4228515625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1305.6893310546875, "per_token_policy_error_norm": 0.020793016999959946, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.019036652520298958, "policy_entropy": 0.04088946059346199, "policy_entropy/max": 3.609375, "policy_entropy/median": 3.4458935260772705e-08, "policy_entropy/min": 2.0735366548785272e-18, "policy_entropy/p25": 2.4374458007514477e-10, "policy_entropy/p75": 1.5854835510253906e-05, "policy_entropy/var": 0.02723749727010727, "policy_error_vector_variance/max_squared_error": 2.005140542984009, "policy_error_vector_variance/metric": 0.02075251005589962, "policy_loss": 0.007222925778478384, "policy_loss/max": 12.958683013916016, "policy_loss/median": 0.0, "policy_loss/min": -12.958681106567383, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.191959261894226, "policy_sharpness": 9.05459976196289, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.790335178375244, "reward": 0.69921875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21058611571788788, "rewards/accuracy_reward": 0.69921875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21058611571788788, "sentence_full_gradient_variance/max_squared_error": 574730.75, "sentence_full_gradient_variance/metric": 1991.451904296875, "sentence_full_gradient_variance/p75": 48.011474609375, "sentence_full_gradient_variance/p90": 68.75626373291016, "sentence_full_gradient_variance/p95": 68.75626373291016, "sentence_full_gradient_variance/p99": 37815.07421875, "state_level_variance/metric": 7.477953910827637, "state_level_variance_full_gradient/metric": 232.36822509765625, "step": 92 }, { "accuracy_reward": 0.7565104365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18444257974624634, "action_level_variance/metric": 32.539703369140625, "action_level_variance_full_gradient/metric": 1121.3193359375, "adam_stats/lr_effective_max": 1.0245596513414057e-06, "adam_stats/lr_effective_mean": 1.2054767991112603e-12, "adam_stats/lr_effective_min": -9.95906702883076e-07, "adam_stats/m_t_max": 0.0013391898246482015, "adam_stats/m_t_mean": 4.4686225206258534e-12, "adam_stats/m_t_min": -0.0013042162172496319, "adam_stats/v_t_max": 7.21070755389519e-05, "adam_stats/v_t_mean": 3.4683551603659213e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.008185411803424358, "advantages/max": 4.319560527801514, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.9124557971954346, "all_logprobs": -0.0322299599647522, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -12.625, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.0030059814453125, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.06982421875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.048578131943941116, "clip_ratio": 0.0, "completion_length": 589.32421875, "completion_length/correct": 523.4974365234375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 486.0, "completion_length/correct/min": 170.0, "completion_length/correct/p25": 374.0, "completion_length/correct/p75": 636.0, "completion_length/correct/var": 37005.51953125, "completion_length/incorrect": 793.8449096679688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 862.0, "completion_length/incorrect/min": 277.0, "completion_length/incorrect/p25": 580.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 57817.31640625, "completion_length/max": 1024.0, "completion_length/median": 540.0, "completion_length/min": 170.0, "completion_length/p25": 394.0, "completion_length/p75": 750.25, "completion_length/var": 55484.69921875, "epoch": 1.192, "feature_vector_variance/max_squared_error": 136891.078125, "feature_vector_variance/metric": 28206.376953125, "generated_tokens/total": 42467040.0, "grad_norm": 0.07372075319290161, "grouped_std_rewards": 0.08890711516141891, "learning_rate": 2.2278205293002645e-07, "loss": 0.0082, "mean_logprobs": -0.0322265625, "mean_logprobs/var": 0.000606536865234375, "num_completions/total": 71424, "per_sentence_gradient_norm": 0.6165584325790405, "per_sentence_gradient_norm/max": 133.7151336669922, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 19.4737491607666, "per_sentence_gradient_norm/var": 32.20148849487305, "per_token_feature_norm": 191.5940704345703, "per_token_feature_norm/max": 302.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 68.5, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 474.3844909667969, "per_token_full_gradient_variance/max_squared_error": 164.5417022705078, "per_token_full_gradient_variance/variance": 0.011462656781077385, "per_token_gradient_norm": 0.7160468101501465, "per_token_gradient_norm/max": 5089.3759765625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 869.6436767578125, "per_token_policy_error_norm": 0.018377166241407394, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01708722487092018, "policy_entropy": 0.03520968556404114, "policy_entropy/max": 3.4375, "policy_entropy/median": 2.782326191663742e-08, "policy_entropy/min": 1.734723475976807e-18, "policy_entropy/p25": 2.2009771782904863e-10, "policy_entropy/p75": 1.049041748046875e-05, "policy_entropy/var": 0.02115335874259472, "policy_error_vector_variance/max_squared_error": 2.004999876022339, "policy_error_vector_variance/metric": 0.018354950472712517, "policy_loss": 0.008185414597392082, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -4.319561004638672, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.9124557971954346, "policy_sharpness": 9.13859748840332, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.270044326782227, "reward": 0.7565104365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18444257974624634, "rewards/accuracy_reward": 0.7565104365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18444257974624634, "sentence_full_gradient_variance/max_squared_error": 554370.4375, "sentence_full_gradient_variance/metric": 1276.5830078125, "sentence_full_gradient_variance/p75": 10.425423622131348, "sentence_full_gradient_variance/p90": 17.807308197021484, "sentence_full_gradient_variance/p95": 17.807308197021484, "sentence_full_gradient_variance/p99": 15179.587890625, "state_level_variance/metric": 3.7261321544647217, "state_level_variance_full_gradient/metric": 155.26368713378906, "step": 93 }, { "accuracy_reward": 0.8111979365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15335555374622345, "action_level_variance/metric": 94.87741088867188, "action_level_variance_full_gradient/metric": 1862.17724609375, "adam_stats/lr_effective_max": 7.19378533631243e-07, "adam_stats/lr_effective_mean": 9.154635599931127e-13, "adam_stats/lr_effective_min": -7.265717272275651e-07, "adam_stats/m_t_max": 0.001218292280100286, "adam_stats/m_t_mean": 2.8264808028810595e-12, "adam_stats/m_t_min": -0.0012762228725478053, "adam_stats/v_t_max": 7.203614222817123e-05, "adam_stats/v_t_mean": 3.465204252012244e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.02494657039642334, "advantages/max": 19.793392181396484, "advantages/median": -0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.173290729522705, "all_logprobs": -0.03165610507130623, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.5, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.0029449462890625, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.06982421875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04620172455906868, "clip_ratio": 0.0, "completion_length": 537.0729370117188, "completion_length/correct": 480.4542541503906, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 431.0, "completion_length/correct/min": 157.0, "completion_length/correct/p25": 319.5, "completion_length/correct/p75": 608.5, "completion_length/correct/var": 41111.20703125, "completion_length/incorrect": 780.3379516601562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 802.0, "completion_length/incorrect/min": 225.0, "completion_length/incorrect/p25": 627.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 56109.8046875, "completion_length/max": 1024.0, "completion_length/median": 486.0, "completion_length/min": 157.0, "completion_length/p25": 336.75, "completion_length/p75": 697.5, "completion_length/var": 57664.8125, "epoch": 1.2048, "feature_vector_variance/max_squared_error": 146084.84375, "feature_vector_variance/metric": 28067.92578125, "generated_tokens/total": 42879512.0, "grad_norm": 0.05580280348658562, "grouped_std_rewards": 0.07842773199081421, "learning_rate": 1.6389299449645734e-07, "loss": 0.0249, "mean_logprobs": -0.031494140625, "mean_logprobs/var": 0.000518798828125, "num_completions/total": 72192, "per_sentence_gradient_norm": 1.0710580348968506, "per_sentence_gradient_norm/max": 176.37576293945312, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 30.904033660888672, "per_sentence_gradient_norm/var": 93.85244750976562, "per_token_feature_norm": 190.78822326660156, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 65.5, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 491.1971435546875, "per_token_full_gradient_variance/max_squared_error": 137.33544921875, "per_token_full_gradient_variance/variance": 0.026677219197154045, "per_token_gradient_norm": 1.5020089149475098, "per_token_gradient_norm/max": 5360.91650390625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2672.421142578125, "per_token_policy_error_norm": 0.01821115054190159, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01684403233230114, "policy_entropy": 0.034726765006780624, "policy_entropy/max": 3.640625, "policy_entropy/median": 2.7939677238464355e-08, "policy_entropy/min": 1.1587410718438829e-18, "policy_entropy/p25": 2.0372681319713593e-10, "policy_entropy/p75": 9.715557098388672e-06, "policy_entropy/var": 0.020425742492079735, "policy_error_vector_variance/max_squared_error": 2.005218505859375, "policy_error_vector_variance/metric": 0.018194083124399185, "policy_loss": 0.02494657039642334, "policy_loss/max": 19.793392181396484, "policy_loss/median": 0.0, "policy_loss/min": -19.793392181396484, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.173290491104126, "policy_sharpness": 9.145638465881348, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.215653896331787, "reward": 0.8111979365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15335555374622345, "rewards/accuracy_reward": 0.8111979365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15335555374622345, "sentence_full_gradient_variance/max_squared_error": 953150.0, "sentence_full_gradient_variance/metric": 2109.36376953125, "sentence_full_gradient_variance/p75": 40.12620162963867, "sentence_full_gradient_variance/p90": 96.26681518554688, "sentence_full_gradient_variance/p95": 96.26681518554688, "sentence_full_gradient_variance/p99": 5782.51708984375, "state_level_variance/metric": 10.825273513793945, "state_level_variance_full_gradient/metric": 247.18663024902344, "step": 94 }, { "accuracy_reward": 0.8151041865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15090587735176086, "action_level_variance/metric": 87.97319030761719, "action_level_variance_full_gradient/metric": 787.4747314453125, "adam_stats/lr_effective_max": 4.85057057630911e-07, "adam_stats/lr_effective_mean": 6.007586463886716e-13, "adam_stats/lr_effective_min": -5.004393983654154e-07, "adam_stats/m_t_max": 0.0009820221457630396, "adam_stats/m_t_mean": 4.131489529830834e-13, "adam_stats/m_t_min": -0.0008830975857563317, "adam_stats/v_t_max": 7.19725139788352e-05, "adam_stats/v_t_mean": 3.462397252587679e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.00777426129207015, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.1529728174209595, "all_logprobs": -0.02797761745750904, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.25, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.00150299072265625, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.048583984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04144003987312317, "clip_ratio": 0.0, "completion_length": 554.32421875, "completion_length/correct": 487.08624267578125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 422.0, "completion_length/correct/min": 132.0, "completion_length/correct/p25": 308.25, "completion_length/correct/p75": 644.5, "completion_length/correct/var": 49610.0625, "completion_length/incorrect": 850.7394409179688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 972.0, "completion_length/incorrect/min": 345.0, "completion_length/incorrect/p25": 641.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 42400.25, "completion_length/max": 1024.0, "completion_length/median": 490.0, "completion_length/min": 132.0, "completion_length/p25": 332.0, "completion_length/p75": 768.25, "completion_length/var": 68176.3203125, "epoch": 1.2176, "feature_vector_variance/max_squared_error": 156075.53125, "feature_vector_variance/metric": 27483.06640625, "generated_tokens/total": 43305232.0, "grad_norm": 0.074695885181427, "grouped_std_rewards": 0.08701907843351364, "learning_rate": 1.1394185240843985e-07, "loss": -0.0078, "mean_logprobs": -0.029052734375, "mean_logprobs/var": 0.0003185272216796875, "num_completions/total": 72960, "per_sentence_gradient_norm": 0.7877984046936035, "per_sentence_gradient_norm/max": 239.6697998046875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 20.01032257080078, "per_sentence_gradient_norm/var": 87.46643829345703, "per_token_feature_norm": 190.27037048339844, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 74.5, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 489.65283203125, "per_token_full_gradient_variance/max_squared_error": 451.2679748535156, "per_token_full_gradient_variance/variance": 0.019902752712368965, "per_token_gradient_norm": 1.2164673805236816, "per_token_gradient_norm/max": 5769.77392578125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2265.8681640625, "per_token_policy_error_norm": 0.016155172139406204, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015190845355391502, "policy_entropy": 0.030239420011639595, "policy_entropy/max": 3.625, "policy_entropy/median": 2.246815711259842e-08, "policy_entropy/min": 1.5382118322138094e-18, "policy_entropy/p25": 1.7189449863508344e-10, "policy_entropy/p75": 6.198883056640625e-06, "policy_entropy/var": 0.0170237198472023, "policy_error_vector_variance/max_squared_error": 2.002004861831665, "policy_error_vector_variance/metric": 0.016144737601280212, "policy_loss": -0.007774258963763714, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -9.659051895141602, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.152972936630249, "policy_sharpness": 9.225289344787598, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.725771427154541, "reward": 0.8151041865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15090587735176086, "rewards/accuracy_reward": 0.8151041865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15090587735176086, "sentence_full_gradient_variance/max_squared_error": 332661.90625, "sentence_full_gradient_variance/metric": 887.3447265625, "sentence_full_gradient_variance/p75": 29.148435592651367, "sentence_full_gradient_variance/p90": 60.260623931884766, "sentence_full_gradient_variance/p95": 60.260623931884766, "sentence_full_gradient_variance/p99": 11964.533203125, "state_level_variance/metric": 10.485241889953613, "state_level_variance_full_gradient/metric": 99.87007141113281, "step": 95 }, { "accuracy_reward": 0.8059896230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15657426416873932, "action_level_variance/metric": 21.629749298095703, "action_level_variance_full_gradient/metric": 1177.5499267578125, "adam_stats/lr_effective_max": 3.099571586062666e-07, "adam_stats/lr_effective_mean": 3.4932058873109184e-13, "adam_stats/lr_effective_min": -2.998511092755507e-07, "adam_stats/m_t_max": 0.0009685062104836106, "adam_stats/m_t_mean": 2.587221019020869e-12, "adam_stats/m_t_min": -0.0007474855519831181, "adam_stats/v_t_max": 7.190251199062914e-05, "adam_stats/v_t_mean": 3.4595412472249176e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.012489864602684975, "advantages/max": 9.659051895141602, "advantages/median": 0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.9479197263717651, "all_logprobs": -0.02792838215827942, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.8125, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.00150299072265625, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.048583984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04027795419096947, "clip_ratio": 0.0, "completion_length": 540.171875, "completion_length/correct": 468.9046936035156, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 416.0, "completion_length/correct/min": 130.0, "completion_length/correct/p25": 318.5, "completion_length/correct/p75": 577.5, "completion_length/correct/var": 39445.63671875, "completion_length/incorrect": 836.2416381835938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 937.0, "completion_length/incorrect/min": 197.0, "completion_length/incorrect/p25": 702.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 58215.6015625, "completion_length/max": 1024.0, "completion_length/median": 456.0, "completion_length/min": 130.0, "completion_length/p25": 339.75, "completion_length/p75": 718.25, "completion_length/var": 64143.62109375, "epoch": 1.2304, "feature_vector_variance/max_squared_error": 142876.984375, "feature_vector_variance/metric": 27754.951171875, "generated_tokens/total": 43720084.0, "grad_norm": 0.05657722428441048, "grouped_std_rewards": 0.09508083760738373, "learning_rate": 7.298948443822229e-08, "loss": -0.0125, "mean_logprobs": -0.0274658203125, "mean_logprobs/var": 0.0005645751953125, "num_completions/total": 73728, "per_sentence_gradient_norm": 0.617752194404602, "per_sentence_gradient_norm/max": 61.579776763916016, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 20.183307647705078, "per_sentence_gradient_norm/var": 21.275833129882812, "per_token_feature_norm": 190.58096313476562, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 60.25, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 483.2680969238281, "per_token_full_gradient_variance/max_squared_error": 40.81024169921875, "per_token_full_gradient_variance/variance": 0.009434841573238373, "per_token_gradient_norm": 0.8559454679489136, "per_token_gradient_norm/max": 2639.637939453125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 728.7550048828125, "per_token_policy_error_norm": 0.01594540849328041, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.014502447098493576, "policy_entropy": 0.03143181651830673, "policy_entropy/max": 3.453125, "policy_entropy/median": 2.0256265997886658e-08, "policy_entropy/min": 1.2874900798265365e-18, "policy_entropy/p25": 1.6825651982799172e-10, "policy_entropy/p75": 5.543231964111328e-06, "policy_entropy/var": 0.018989142030477524, "policy_error_vector_variance/max_squared_error": 2.0070290565490723, "policy_error_vector_variance/metric": 0.015918977558612823, "policy_loss": -0.01248986180871725, "policy_loss/max": 9.659051895141602, "policy_loss/median": 0.0, "policy_loss/min": -9.659050941467285, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.9479196071624756, "policy_sharpness": 9.22118854522705, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.786490440368652, "reward": 0.8059896230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15657426416873932, "rewards/accuracy_reward": 0.8059896230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15657426416873932, "sentence_full_gradient_variance/max_squared_error": 346465.09375, "sentence_full_gradient_variance/metric": 1325.113037109375, "sentence_full_gradient_variance/p75": 42.79906463623047, "sentence_full_gradient_variance/p90": 54.15158462524414, "sentence_full_gradient_variance/p95": 54.15158462524414, "sentence_full_gradient_variance/p99": 22199.583984375, "state_level_variance/metric": 2.346544027328491, "state_level_variance_full_gradient/metric": 147.56301879882812, "step": 96 }, { "accuracy_reward": 0.7825521230697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1703861802816391, "action_level_variance/metric": 45.7636833190918, "action_level_variance_full_gradient/metric": 1808.7396240234375, "adam_stats/lr_effective_max": 1.7832726939559507e-07, "adam_stats/lr_effective_mean": 2.5355303794050155e-13, "adam_stats/lr_effective_min": -1.8077491859003203e-07, "adam_stats/m_t_max": 0.0006862542359158397, "adam_stats/m_t_mean": -1.1244657115161427e-11, "adam_stats/m_t_min": -0.0005503692664206028, "adam_stats/v_t_max": 7.194339559646323e-05, "adam_stats/v_t_mean": 3.4597598223828907e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.07125717401504517, "advantages/max": 12.9586820602417, "advantages/median": 0.0, "advantages/min": -12.9586820602417, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.01057767868042, "all_logprobs": -0.03285559266805649, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.375, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.003173828125, "all_logprobs/p25": -7.152557373046875e-07, "all_logprobs/p5": -0.07382774353027344, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04959830269217491, "clip_ratio": 0.0, "completion_length": 563.74609375, "completion_length/correct": 488.1014709472656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 459.0, "completion_length/correct/min": 161.0, "completion_length/correct/p25": 334.0, "completion_length/correct/p75": 613.0, "completion_length/correct/var": 37471.71484375, "completion_length/incorrect": 835.97607421875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 981.0, "completion_length/incorrect/min": 262.0, "completion_length/incorrect/p25": 676.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 49814.76953125, "completion_length/max": 1024.0, "completion_length/median": 506.0, "completion_length/min": 161.0, "completion_length/p25": 364.75, "completion_length/p75": 731.25, "completion_length/var": 60713.81640625, "epoch": 1.2432, "feature_vector_variance/max_squared_error": 141003.625, "feature_vector_variance/metric": 28048.30078125, "generated_tokens/total": 44153040.0, "grad_norm": 0.11981648951768875, "grouped_std_rewards": 0.16197949647903442, "learning_rate": 4.108578473795033e-08, "loss": -0.0713, "mean_logprobs": -0.0341796875, "mean_logprobs/var": 0.00098419189453125, "num_completions/total": 74496, "per_sentence_gradient_norm": 1.1961004734039307, "per_sentence_gradient_norm/max": 77.41229248046875, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 30.499357223510742, "per_sentence_gradient_norm/var": 44.39082336425781, "per_token_feature_norm": 190.08145141601562, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 498.09417724609375, "per_token_full_gradient_variance/max_squared_error": 95.27923583984375, "per_token_full_gradient_variance/variance": 0.020041372627019882, "per_token_gradient_norm": 1.5228639841079712, "per_token_gradient_norm/max": 3670.1416015625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1633.41552734375, "per_token_policy_error_norm": 0.01862647943198681, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.017139222472906113, "policy_entropy": 0.035997677594423294, "policy_entropy/max": 3.625, "policy_entropy/median": 2.922024577856064e-08, "policy_entropy/min": 2.862971361719535e-19, "policy_entropy/p25": 2.0736479200422764e-10, "policy_entropy/p75": 1.1086463928222656e-05, "policy_entropy/var": 0.022786101326346397, "policy_error_vector_variance/max_squared_error": 2.00417423248291, "policy_error_vector_variance/metric": 0.018585704267024994, "policy_loss": -0.07125716656446457, "policy_loss/max": 12.9586820602417, "policy_loss/median": 0.0, "policy_loss/min": -12.9586820602417, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.010577440261841, "policy_sharpness": 9.139013290405273, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.284911632537842, "reward": 0.7825521230697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1703861802816391, "rewards/accuracy_reward": 0.7825521230697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1703861802816391, "sentence_full_gradient_variance/max_squared_error": 219234.765625, "sentence_full_gradient_variance/metric": 2020.76806640625, "sentence_full_gradient_variance/p75": 54.87483215332031, "sentence_full_gradient_variance/p90": 173.49021911621094, "sentence_full_gradient_variance/p95": 173.49021911621094, "sentence_full_gradient_variance/p99": 55837.9453125, "state_level_variance/metric": 4.334959983825684, "state_level_variance_full_gradient/metric": 212.0284881591797, "step": 97 }, { "accuracy_reward": 0.8489583730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.12839527428150177, "action_level_variance/metric": 73.4534912109375, "action_level_variance_full_gradient/metric": 5618.4580078125, "adam_stats/lr_effective_max": 7.275740898649019e-08, "adam_stats/lr_effective_mean": 9.096402694671116e-14, "adam_stats/lr_effective_min": -7.23798052604252e-08, "adam_stats/m_t_max": 0.001130324206314981, "adam_stats/m_t_mean": -1.3594320114052039e-11, "adam_stats/m_t_min": -0.0008790307329036295, "adam_stats/v_t_max": 7.190907490439713e-05, "adam_stats/v_t_mean": 3.4578583486127856e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.1085357666015625, "advantages/max": 7.48191499710083, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.499936103820801, "all_logprobs": -0.03032013401389122, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -13.8125, "all_logprobs/p1": -0.8732032775878906, "all_logprobs/p10": -0.002471923828125, "all_logprobs/p25": -8.344650268554688e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04479158669710159, "clip_ratio": 0.0, "completion_length": 515.42578125, "completion_length/correct": 475.51226806640625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 448.0, "completion_length/correct/min": 166.0, "completion_length/correct/p25": 336.0, "completion_length/correct/p75": 579.0, "completion_length/correct/var": 34226.53515625, "completion_length/incorrect": 739.7672119140625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 764.0, "completion_length/incorrect/min": 206.0, "completion_length/incorrect/p25": 496.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 65349.34765625, "completion_length/max": 1024.0, "completion_length/median": 469.0, "completion_length/min": 166.0, "completion_length/p25": 352.0, "completion_length/p75": 633.25, "completion_length/var": 47814.2421875, "epoch": 1.256, "feature_vector_variance/max_squared_error": 140550.53125, "feature_vector_variance/metric": 27957.369140625, "generated_tokens/total": 44548888.0, "grad_norm": 0.07618419826030731, "grouped_std_rewards": 0.11810068786144257, "learning_rate": 1.8269623051318517e-08, "loss": 0.1085, "mean_logprobs": -0.03076171875, "mean_logprobs/var": 0.0004425048828125, "num_completions/total": 75264, "per_sentence_gradient_norm": 1.2394115924835205, "per_sentence_gradient_norm/max": 103.90141296386719, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 41.85806655883789, "per_sentence_gradient_norm/var": 72.01111602783203, "per_token_feature_norm": 190.4860076904297, "per_token_feature_norm/max": 304.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 70.5, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 487.3561096191406, "per_token_full_gradient_variance/max_squared_error": 162.11590576171875, "per_token_full_gradient_variance/variance": 0.023855922743678093, "per_token_gradient_norm": 1.518203854560852, "per_token_gradient_norm/max": 5187.87890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2176.642578125, "per_token_policy_error_norm": 0.01745210960507393, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01622314564883709, "policy_entropy": 0.03315744549036026, "policy_entropy/max": 3.46875, "policy_entropy/median": 3.725290298461914e-08, "policy_entropy/min": 1.3417001884508117e-18, "policy_entropy/p25": 2.7284841053187847e-10, "policy_entropy/p75": 1.245737075805664e-05, "policy_entropy/var": 0.018701096996665, "policy_error_vector_variance/max_squared_error": 2.0041892528533936, "policy_error_vector_variance/metric": 0.017443500459194183, "policy_loss": 0.1085357666015625, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -7.48191499710083, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.499936103820801, "policy_sharpness": 9.158427238464355, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.1078290939331055, "reward": 0.8489583730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.12839527428150177, "rewards/accuracy_reward": 0.8489583730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.12839527428150177, "sentence_full_gradient_variance/max_squared_error": 1821632.0, "sentence_full_gradient_variance/metric": 6370.9208984375, "sentence_full_gradient_variance/p75": 79.54423522949219, "sentence_full_gradient_variance/p90": 255.48097229003906, "sentence_full_gradient_variance/p95": 255.48097229003906, "sentence_full_gradient_variance/p99": 50943.65234375, "state_level_variance/metric": 7.726024627685547, "state_level_variance_full_gradient/metric": 752.4620361328125, "step": 98 }, { "accuracy_reward": 0.8020833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15895263850688934, "action_level_variance/metric": 70.80473327636719, "action_level_variance_full_gradient/metric": 1641.0438232421875, "adam_stats/lr_effective_max": 1.844696839725657e-08, "adam_stats/lr_effective_mean": 2.0315807413087694e-14, "adam_stats/lr_effective_min": -1.8832871262475237e-08, "adam_stats/m_t_max": 0.0014475897187367082, "adam_stats/m_t_mean": -1.9864060049612497e-11, "adam_stats/m_t_min": -0.001022570300847292, "adam_stats/v_t_max": 7.184041169239208e-05, "adam_stats/v_t_mean": 3.4565718343149143e-12, "adam_stats/v_t_min": 0.0, "advantages": -0.006811360828578472, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -19.793392181396484, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 2.6152408123016357, "all_logprobs": -0.03231552988290787, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.3125, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.00360107421875, "all_logprobs/p25": -8.344650268554688e-07, "all_logprobs/p5": -0.0791015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04707633703947067, "clip_ratio": 0.0, "completion_length": 553.1185302734375, "completion_length/correct": 488.8035583496094, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 469.0, "completion_length/correct/min": 169.0, "completion_length/correct/p25": 356.0, "completion_length/correct/p75": 597.25, "completion_length/correct/var": 32396.4296875, "completion_length/incorrect": 813.76318359375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 874.0, "completion_length/incorrect/min": 273.0, "completion_length/incorrect/p25": 615.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 49664.76953125, "completion_length/max": 1024.0, "completion_length/median": 512.0, "completion_length/min": 169.0, "completion_length/p25": 382.75, "completion_length/p75": 679.25, "completion_length/var": 52539.0234375, "epoch": 1.2688, "feature_vector_variance/max_squared_error": 141680.515625, "feature_vector_variance/metric": 27978.6328125, "generated_tokens/total": 44973684.0, "grad_norm": 0.08381044864654541, "grouped_std_rewards": 0.12103675305843353, "learning_rate": 4.568797356781784e-09, "loss": 0.0068, "mean_logprobs": -0.03125, "mean_logprobs/var": 0.00048065185546875, "num_completions/total": 76032, "per_sentence_gradient_norm": 1.1561708450317383, "per_sentence_gradient_norm/max": 124.46831512451172, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 36.482601165771484, "per_sentence_gradient_norm/var": 69.55856323242188, "per_token_feature_norm": 190.711669921875, "per_token_feature_norm/max": 304.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 62.25, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 488.9446105957031, "per_token_full_gradient_variance/max_squared_error": 183.8382110595703, "per_token_full_gradient_variance/variance": 0.028139842674136162, "per_token_gradient_norm": 1.5548065900802612, "per_token_gradient_norm/max": 5054.73779296875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 2289.176025390625, "per_token_policy_error_norm": 0.01836892031133175, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.016837652772665024, "policy_entropy": 0.036113087087869644, "policy_entropy/max": 3.8125, "policy_entropy/median": 3.4691765904426575e-08, "policy_entropy/min": 4.445228907190568e-18, "policy_entropy/p25": 2.4374458007514477e-10, "policy_entropy/p75": 1.2576580047607422e-05, "policy_entropy/var": 0.02234623394906521, "policy_error_vector_variance/max_squared_error": 2.0038671493530273, "policy_error_vector_variance/metric": 0.01835831254720688, "policy_loss": 0.006811360828578472, "policy_loss/max": 19.793394088745117, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 2.6152408123016357, "policy_sharpness": 9.117114067077637, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.358609199523926, "reward": 0.8020833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15895263850688934, "rewards/accuracy_reward": 0.8020833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15895263850688934, "sentence_full_gradient_variance/max_squared_error": 243629.515625, "sentence_full_gradient_variance/metric": 1835.2054443359375, "sentence_full_gradient_variance/p75": 110.08567810058594, "sentence_full_gradient_variance/p90": 135.29202270507812, "sentence_full_gradient_variance/p95": 135.29202270507812, "sentence_full_gradient_variance/p99": 51788.98828125, "state_level_variance/metric": 7.592954158782959, "state_level_variance_full_gradient/metric": 194.16162109375, "step": 99 }, { "accuracy_reward": 0.8111979365348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.15335553884506226, "action_level_variance/metric": 67.60980224609375, "action_level_variance_full_gradient/metric": 1910.340087890625, "adam_stats/lr_effective_max": 0.0, "adam_stats/lr_effective_mean": 0.0, "adam_stats/lr_effective_min": 0.0, "adam_stats/m_t_max": 0.0016906026285141706, "adam_stats/m_t_mean": -2.6288466195567572e-11, "adam_stats/m_t_min": -0.0011144758900627494, "adam_stats/v_t_max": 7.182401895988733e-05, "adam_stats/v_t_mean": 3.4560349373990995e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0594317764043808, "advantages/max": 19.793392181396484, "advantages/median": 0.0, "advantages/min": -9.659051895141602, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 1.451844573020935, "all_logprobs": -0.03207896649837494, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.25, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.003173828125, "all_logprobs/p25": -9.5367431640625e-07, "all_logprobs/p5": -0.072265625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.046994276344776154, "clip_ratio": 0.0, "completion_length": 545.9791870117188, "completion_length/correct": 496.7223205566406, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 436.0, "completion_length/correct/min": 170.0, "completion_length/correct/p25": 339.0, "completion_length/correct/p75": 630.5, "completion_length/correct/var": 39510.28125, "completion_length/incorrect": 757.61376953125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 776.0, "completion_length/incorrect/min": 219.0, "completion_length/incorrect/p25": 546.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 64592.1875, "completion_length/max": 1024.0, "completion_length/median": 488.0, "completion_length/min": 170.0, "completion_length/p25": 355.75, "completion_length/p75": 690.25, "completion_length/var": 54605.8046875, "epoch": 1.2816, "feature_vector_variance/max_squared_error": 140812.96875, "feature_vector_variance/metric": 28308.533203125, "generated_tokens/total": 45392996.0, "grad_norm": 0.13870655000209808, "grouped_std_rewards": 0.09034831821918488, "learning_rate": 0.0, "loss": -0.0594, "mean_logprobs": -0.033447265625, "mean_logprobs/var": 0.000629425048828125, "num_completions/total": 76800, "per_sentence_gradient_norm": 0.9746565818786621, "per_sentence_gradient_norm/max": 158.52667236328125, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 37.07760238647461, "per_sentence_gradient_norm/var": 66.74675750732422, "per_token_feature_norm": 189.65811157226562, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 202.0, "per_token_feature_norm/var": 498.2579650878906, "per_token_full_gradient_variance/max_squared_error": 142.3223876953125, "per_token_full_gradient_variance/variance": 0.015865039080381393, "per_token_gradient_norm": 1.1519023180007935, "per_token_gradient_norm/max": 5026.43896484375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1484.85205078125, "per_token_policy_error_norm": 0.01844283752143383, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.017060741782188416, "policy_entropy": 0.03525524213910103, "policy_entropy/max": 3.796875, "policy_entropy/median": 5.2386894822120667e-08, "policy_entropy/min": 6.979551485375435e-19, "policy_entropy/p25": 3.9472070056945086e-10, "policy_entropy/p75": 1.436471939086914e-05, "policy_entropy/var": 0.020780984312295914, "policy_error_vector_variance/max_squared_error": 2.004514455795288, "policy_error_vector_variance/metric": 0.018424199894070625, "policy_loss": -0.0594317764043808, "policy_loss/max": 9.659051895141602, "policy_loss/median": 0.0, "policy_loss/min": -19.793394088745117, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 1.4518444538116455, "policy_sharpness": 9.132195472717285, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.295224666595459, "reward": 0.8111979365348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.15335553884506226, "rewards/accuracy_reward": 0.8111979365348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.15335553884506226, "sentence_full_gradient_variance/max_squared_error": 471708.46875, "sentence_full_gradient_variance/metric": 2160.71875, "sentence_full_gradient_variance/p75": 78.64983367919922, "sentence_full_gradient_variance/p90": 85.98413848876953, "sentence_full_gradient_variance/p95": 85.98413848876953, "sentence_full_gradient_variance/p99": 44114.52734375, "state_level_variance/metric": 7.580231189727783, "state_level_variance_full_gradient/metric": 250.37887573242188, "step": 100 }, { "adam_stats/lr_effective_max": 0.0, "adam_stats/lr_effective_mean": 0.0, "adam_stats/lr_effective_min": 0.0, "adam_stats/m_t_max": 0.0016906026285141706, "adam_stats/m_t_mean": -2.6288466195567572e-11, "adam_stats/m_t_min": -0.0011144758900627494, "adam_stats/v_t_max": 7.182401895988733e-05, "adam_stats/v_t_mean": 3.4560349373990995e-12, "adam_stats/v_t_min": 0.0, "epoch": 1.2816, "step": 100, "total_flos": 0.0, "train_loss": -0.0027308084536343813, "train_runtime": 72287.8954, "train_samples_per_second": 1.062, "train_steps_per_second": 0.001 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 24, "trial_name": null, "trial_params": null }