{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.16, "eval_steps": 10, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "accuracy_reward": 0.5520833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24989035725593567, "action_level_variance/metric": 65.57559204101562, "action_level_variance_full_gradient/metric": 586.177490234375, "adam_stats/lr_effective_max": 4.743436875287443e-06, "adam_stats/lr_effective_mean": 3.996759173618969e-11, "adam_stats/lr_effective_min": -4.7434400585189e-06, "adam_stats/m_t_max": 0.013329396955668926, "adam_stats/m_t_mean": 1.5761092431176849e-10, "adam_stats/m_t_min": -0.017717335373163223, "adam_stats/v_t_max": 3.138997999485582e-05, "adam_stats/v_t_mean": 5.185702670225389e-13, "adam_stats/v_t_min": 0.0, "advantages": 0.5520833730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.24989035725593567, "all_logprobs": -0.1549258977174759, "all_logprobs/max": 0.0, "all_logprobs/median": -2.5272369384765625e-05, "all_logprobs/min": -13.375, "all_logprobs/p1": -2.59375, "all_logprobs/p10": -0.427734375, "all_logprobs/p25": -0.0159912109375, "all_logprobs/p5": -0.9765625, "all_logprobs/p75": -4.76837158203125e-07, "all_logprobs/var": 0.26769405603408813, "clip_ratio": 0.0, "completion_length": 637.2708740234375, "completion_length/correct": 482.0566101074219, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 435.0, "completion_length/correct/min": 202.0, "completion_length/correct/p25": 334.0, "completion_length/correct/p75": 523.0, "completion_length/correct/var": 50256.20703125, "completion_length/incorrect": 828.5814208984375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 21.0, "completion_length/incorrect/p25": 781.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 95711.015625, "completion_length/max": 1024.0, "completion_length/median": 523.0, "completion_length/min": 21.0, "completion_length/p25": 363.75, "completion_length/p75": 1024.0, "completion_length/var": 99829.703125, "epoch": 0.0016, "feature_vector_variance/max_squared_error": 91169.515625, "feature_vector_variance/metric": 24799.556640625, "generated_tokens/total": 61178.0, "grad_norm": 2.3590943813323975, "learning_rate": 1.5e-06, "loss": -0.5521, "mean_logprobs": -0.17578125, "mean_logprobs/var": 0.060302734375, "num_completions/total": 96, "per_sentence_gradient_norm": 9.49322509765625, "per_sentence_gradient_norm/max": 37.098121643066406, "per_sentence_gradient_norm/median": 7.641116619110107, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 16.53400421142578, "per_sentence_gradient_norm/p85": 22.34661293029785, "per_sentence_gradient_norm/p90": 24.84002685546875, "per_sentence_gradient_norm/p95": 28.43463897705078, "per_sentence_gradient_norm/p99": 31.18088150024414, "per_sentence_gradient_norm/var": 106.55781555175781, "per_token_feature_norm": 160.70457458496094, "per_token_feature_norm/max": 330.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 62.25, "per_token_feature_norm/p25": 123.5, "per_token_feature_norm/p75": 191.0, "per_token_feature_norm/var": 2264.09521484375, "per_token_full_gradient_variance/max_squared_error": 3.0729262828826904, "per_token_full_gradient_variance/variance": 0.008951449766755104, "per_token_gradient_norm": 7.5556206703186035, "per_token_gradient_norm/max": 407.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1152.4373779296875, "per_token_policy_error_norm": 0.08051766455173492, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06755160540342331, "policy_entropy": 0.17055420577526093, "policy_entropy/max": 3.65625, "policy_entropy/median": 0.0003108978271484375, "policy_entropy/min": 1.9468870959826745e-12, "policy_entropy/p25": 8.58306884765625e-06, "policy_entropy/p75": 0.08935546875, "policy_entropy/var": 0.1420246958732605, "policy_error_vector_variance/max_squared_error": 2.012315034866333, "policy_error_vector_variance/metric": 0.08006816357374191, "policy_loss": -0.5520833730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.24989035725593567, "policy_sharpness": 7.119263172149658, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.034374952316284, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.090181350708008, "reward": 0.5520833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24989035725593567, "rewards/accuracy_reward": 0.5520833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24989035725593567, "sentence_full_gradient_variance/max_squared_error": 3765.688232421875, "sentence_full_gradient_variance/metric": 1375.7093505859375, "sentence_full_gradient_variance/p75": 1265.6181640625, "sentence_full_gradient_variance/p90": 2422.34375, "sentence_full_gradient_variance/p95": 2878.111572265625, "sentence_full_gradient_variance/p99": 3484.031982421875, "state_level_variance/metric": 52.43911361694336, "state_level_variance_full_gradient/metric": 789.5319213867188, "step": 1 }, { "accuracy_reward": 0.5833333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24561403691768646, "action_level_variance/metric": 125.26174926757812, "action_level_variance_full_gradient/metric": 682.9362182617188, "adam_stats/lr_effective_max": 1.2765986866725143e-05, "adam_stats/lr_effective_mean": 4.656810700942593e-11, "adam_stats/lr_effective_min": -1.2765889550792053e-05, "adam_stats/m_t_max": 0.014315794222056866, "adam_stats/m_t_mean": 1.8794019118750782e-10, "adam_stats/m_t_min": -0.01865556463599205, "adam_stats/v_t_max": 3.20929721056018e-05, "adam_stats/v_t_mean": 5.989012996469867e-13, "adam_stats/v_t_min": 0.0, "advantages": 0.5833333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.24561403691768646, "all_logprobs": -0.1573808491230011, "all_logprobs/max": 0.0, "all_logprobs/median": -2.849102020263672e-05, "all_logprobs/min": -7.75, "all_logprobs/p1": -2.65625, "all_logprobs/p10": -0.43359375, "all_logprobs/p25": -0.016632080078125, "all_logprobs/p5": -1.0, "all_logprobs/p75": -7.152557373046875e-07, "all_logprobs/var": 0.2609635889530182, "clip_ratio": 0.0, "completion_length": 626.25, "completion_length/correct": 554.3035888671875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 490.0, "completion_length/correct/min": 193.0, "completion_length/correct/p25": 386.5, "completion_length/correct/p75": 693.75, "completion_length/correct/var": 46151.34375, "completion_length/incorrect": 726.9750366210938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 38.0, "completion_length/incorrect/p25": 473.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 134433.96875, "completion_length/max": 1024.0, "completion_length/median": 525.0, "completion_length/min": 38.0, "completion_length/p25": 388.25, "completion_length/p75": 1013.5, "completion_length/var": 89230.96875, "epoch": 0.0032, "feature_vector_variance/max_squared_error": 92389.0078125, "feature_vector_variance/metric": 24192.873046875, "generated_tokens/total": 121298.0, "grad_norm": 0.47953829169273376, "learning_rate": 3e-06, "loss": -0.5833, "mean_logprobs": -0.189453125, "mean_logprobs/var": 0.029296875, "num_completions/total": 192, "per_sentence_gradient_norm": 12.322691917419434, "per_sentence_gradient_norm/max": 43.98616409301758, "per_sentence_gradient_norm/median": 12.045063972473145, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 21.675640106201172, "per_sentence_gradient_norm/p85": 25.604753494262695, "per_sentence_gradient_norm/p90": 28.94076156616211, "per_sentence_gradient_norm/p95": 31.928468704223633, "per_sentence_gradient_norm/p99": 40.21195983886719, "per_sentence_gradient_norm/var": 149.62686157226562, "per_token_feature_norm": 160.12371826171875, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 149.0, "per_token_feature_norm/min": 62.5, "per_token_feature_norm/p25": 121.0, "per_token_feature_norm/p75": 191.0, "per_token_feature_norm/var": 2463.46923828125, "per_token_full_gradient_variance/max_squared_error": 2.0633437633514404, "per_token_full_gradient_variance/variance": 0.012623072601854801, "per_token_gradient_norm": 10.929417610168457, "per_token_gradient_norm/max": 413.875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1667.029052734375, "per_token_policy_error_norm": 0.0819171816110611, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0685625970363617, "policy_entropy": 0.1747380793094635, "policy_entropy/max": 3.71875, "policy_entropy/median": 0.000347137451171875, "policy_entropy/min": 6.52811138479592e-14, "policy_entropy/p25": 1.2278556823730469e-05, "policy_entropy/p75": 0.09130859375, "policy_entropy/var": 0.14669381082057953, "policy_error_vector_variance/max_squared_error": 2.0111491680145264, "policy_error_vector_variance/metric": 0.08160345256328583, "policy_loss": -0.5833333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.24561403691768646, "policy_sharpness": 7.152019500732422, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.0196046829223633, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.189763069152832, "reward": 0.5833333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24561403691768646, "rewards/accuracy_reward": 0.5833333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24561403691768646, "sentence_full_gradient_variance/max_squared_error": 4274.55712890625, "sentence_full_gradient_variance/metric": 1103.071533203125, "sentence_full_gradient_variance/p75": 1097.642578125, "sentence_full_gradient_variance/p90": 2106.4013671875, "sentence_full_gradient_variance/p95": 2515.486083984375, "sentence_full_gradient_variance/p99": 3394.974853515625, "state_level_variance/metric": 41.960968017578125, "state_level_variance_full_gradient/metric": 420.13531494140625, "step": 2 }, { "accuracy_reward": 0.6354166865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.23410086333751678, "action_level_variance/metric": 80.47393798828125, "action_level_variance_full_gradient/metric": 549.3681640625, "adam_stats/lr_effective_max": 2.234960084024351e-05, "adam_stats/lr_effective_mean": -3.8091335641254886e-11, "adam_stats/lr_effective_min": -2.2350332073983736e-05, "adam_stats/m_t_max": 0.01984311081469059, "adam_stats/m_t_mean": 3.3942021415711565e-10, "adam_stats/m_t_min": -0.02776208147406578, "adam_stats/v_t_max": 4.287566844141111e-05, "adam_stats/v_t_mean": 1.1062834676112132e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.6354166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.23410086333751678, "all_logprobs": -0.14948494732379913, "all_logprobs/max": 0.0, "all_logprobs/median": -2.0742416381835938e-05, "all_logprobs/min": -10.5, "all_logprobs/p1": -2.5068750381469727, "all_logprobs/p10": -0.3953123092651367, "all_logprobs/p25": -0.01220703125, "all_logprobs/p5": -0.9765625, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.25220224261283875, "clip_ratio": 0.0, "completion_length": 628.71875, "completion_length/correct": 637.0983276367188, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 585.0, "completion_length/correct/min": 246.0, "completion_length/correct/p25": 427.0, "completion_length/correct/p75": 848.0, "completion_length/correct/var": 65387.05859375, "completion_length/incorrect": 614.1143188476562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 749.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 200.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 152916.984375, "completion_length/max": 1024.0, "completion_length/median": 595.0, "completion_length/min": 2.0, "completion_length/p25": 407.5, "completion_length/p75": 1011.0, "completion_length/var": 96148.9453125, "epoch": 0.0048, "feature_vector_variance/max_squared_error": 92206.875, "feature_vector_variance/metric": 25111.83203125, "generated_tokens/total": 181655.0, "grad_norm": 1.399181842803955, "learning_rate": 4.5e-06, "loss": -0.6354, "mean_logprobs": -0.2158203125, "mean_logprobs/var": 0.162109375, "num_completions/total": 288, "per_sentence_gradient_norm": 10.984062194824219, "per_sentence_gradient_norm/max": 35.67982864379883, "per_sentence_gradient_norm/median": 11.57533073425293, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 17.326705932617188, "per_sentence_gradient_norm/p85": 21.111553192138672, "per_sentence_gradient_norm/p90": 24.498876571655273, "per_sentence_gradient_norm/p95": 27.605539321899414, "per_sentence_gradient_norm/p99": 33.86392593383789, "per_sentence_gradient_norm/var": 100.51630401611328, "per_token_feature_norm": 161.52076721191406, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 61.25, "per_token_feature_norm/p25": 123.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 2382.918701171875, "per_token_full_gradient_variance/max_squared_error": 2.6490533351898193, "per_token_full_gradient_variance/variance": 0.012139114551246166, "per_token_gradient_norm": 10.652451515197754, "per_token_gradient_norm/max": 407.8125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1562.185302734375, "per_token_policy_error_norm": 0.07812295854091644, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06542951613664627, "policy_entropy": 0.1663353443145752, "policy_entropy/max": 3.640625, "policy_entropy/median": 0.000255584716796875, "policy_entropy/min": 9.903189379656396e-14, "policy_entropy/p25": 6.3478946685791016e-06, "policy_entropy/p75": 0.0712890625, "policy_entropy/var": 0.1406816989183426, "policy_error_vector_variance/max_squared_error": 2.0103423595428467, "policy_error_vector_variance/metric": 0.07777471095323563, "policy_loss": -0.6354166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.23410086333751678, "policy_sharpness": 7.270443916320801, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.369140625, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.81234359741211, "reward": 0.6354166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.23410086333751678, "rewards/accuracy_reward": 0.6354166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.23410086333751678, "sentence_full_gradient_variance/max_squared_error": 4031.721435546875, "sentence_full_gradient_variance/metric": 1075.525146484375, "sentence_full_gradient_variance/p75": 1300.9754638671875, "sentence_full_gradient_variance/p90": 2118.332763671875, "sentence_full_gradient_variance/p95": 2920.947265625, "sentence_full_gradient_variance/p99": 3945.80126953125, "state_level_variance/metric": 31.695878982543945, "state_level_variance_full_gradient/metric": 526.156982421875, "step": 3 }, { "accuracy_reward": 0.7708333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17850877344608307, "action_level_variance/metric": 183.33074951171875, "action_level_variance_full_gradient/metric": 1038.39599609375, "adam_stats/lr_effective_max": 3.283171827206388e-05, "adam_stats/lr_effective_mean": -2.4598950654208807e-10, "adam_stats/lr_effective_min": -3.282213947386481e-05, "adam_stats/m_t_max": 0.014192074537277222, "adam_stats/m_t_mean": 2.320087039153762e-10, "adam_stats/m_t_min": -0.017943816259503365, "adam_stats/v_t_max": 6.185643724165857e-05, "adam_stats/v_t_mean": 1.5434432161162692e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.7708333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.17850877344608307, "all_logprobs": -0.15718112885951996, "all_logprobs/max": 0.0, "all_logprobs/median": -1.9073486328125e-05, "all_logprobs/min": -11.4375, "all_logprobs/p1": -2.6875, "all_logprobs/p10": -0.421875, "all_logprobs/p25": -0.015960693359375, "all_logprobs/p5": -1.0, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.27130645513534546, "clip_ratio": 0.0, "completion_length": 600.5208740234375, "completion_length/correct": 567.3378295898438, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 498.0, "completion_length/correct/min": 40.0, "completion_length/correct/p25": 311.75, "completion_length/correct/p75": 780.0, "completion_length/correct/var": 82504.96875, "completion_length/incorrect": 712.1364135742188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 985.0, "completion_length/incorrect/min": 22.0, "completion_length/incorrect/p25": 291.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 166101.921875, "completion_length/max": 1024.0, "completion_length/median": 571.0, "completion_length/min": 22.0, "completion_length/p25": 306.5, "completion_length/p75": 994.75, "completion_length/var": 103858.5546875, "epoch": 0.0064, "feature_vector_variance/max_squared_error": 97145.921875, "feature_vector_variance/metric": 25787.30078125, "generated_tokens/total": 239305.0, "grad_norm": 1.0053987503051758, "learning_rate": 6e-06, "loss": -0.7708, "mean_logprobs": -0.1767578125, "mean_logprobs/var": 0.0196533203125, "num_completions/total": 384, "per_sentence_gradient_norm": 16.159828186035156, "per_sentence_gradient_norm/max": 87.6875, "per_sentence_gradient_norm/median": 15.268604278564453, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 8.787548065185547, "per_sentence_gradient_norm/p75": 23.31141471862793, "per_sentence_gradient_norm/p85": 25.086261749267578, "per_sentence_gradient_norm/p90": 28.30315399169922, "per_sentence_gradient_norm/p95": 36.51607131958008, "per_sentence_gradient_norm/p99": 52.00395584106445, "per_sentence_gradient_norm/var": 188.1351318359375, "per_token_feature_norm": 163.90188598632812, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 156.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 125.5, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 2315.03857421875, "per_token_full_gradient_variance/max_squared_error": 3.085036039352417, "per_token_full_gradient_variance/variance": 0.016322001814842224, "per_token_gradient_norm": 13.786334991455078, "per_token_gradient_norm/max": 394.65625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1983.989990234375, "per_token_policy_error_norm": 0.08057296276092529, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06701833009719849, "policy_entropy": 0.17530383169651031, "policy_entropy/max": 3.65625, "policy_entropy/median": 0.00023746490478515625, "policy_entropy/min": 6.838973831690964e-14, "policy_entropy/p25": 5.602836608886719e-06, "policy_entropy/p75": 0.08837890625, "policy_entropy/var": 0.15744002163410187, "policy_error_vector_variance/max_squared_error": 2.013417959213257, "policy_error_vector_variance/metric": 0.08007515966892242, "policy_loss": -0.7708333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.17850877344608307, "policy_sharpness": 7.180308818817139, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.0700042247772217, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.125783920288086, "reward": 0.7708333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17850877344608307, "rewards/accuracy_reward": 0.7708333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17850877344608307, "sentence_full_gradient_variance/max_squared_error": 5775.2265625, "sentence_full_gradient_variance/metric": 1439.8275146484375, "sentence_full_gradient_variance/p75": 1791.747314453125, "sentence_full_gradient_variance/p90": 2412.8232421875, "sentence_full_gradient_variance/p95": 2599.676513671875, "sentence_full_gradient_variance/p99": 4339.04296875, "state_level_variance/metric": 28.10291862487793, "state_level_variance_full_gradient/metric": 401.4313659667969, "step": 4 }, { "accuracy_reward": 0.625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.23684212565422058, "action_level_variance/metric": 108.32626342773438, "action_level_variance_full_gradient/metric": 629.672119140625, "adam_stats/lr_effective_max": 4.386681757750921e-05, "adam_stats/lr_effective_mean": -5.966155347536528e-10, "adam_stats/lr_effective_min": -4.3805743189295754e-05, "adam_stats/m_t_max": 0.009085164405405521, "adam_stats/m_t_mean": 4.4400420839973975e-11, "adam_stats/m_t_min": -0.005640056449919939, "adam_stats/v_t_max": 8.895601058611646e-05, "adam_stats/v_t_mean": 2.0277599044327133e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.625, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.23684212565422058, "all_logprobs": -0.14774225652217865, "all_logprobs/max": 0.0, "all_logprobs/median": -7.987022399902344e-06, "all_logprobs/min": -8.375, "all_logprobs/p1": -2.5625, "all_logprobs/p10": -0.388671875, "all_logprobs/p25": -0.0111083984375, "all_logprobs/p5": -0.9375, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.2533728778362274, "clip_ratio": 0.0, "completion_length": 590.0416870117188, "completion_length/correct": 408.183349609375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 337.0, "completion_length/correct/min": 148.0, "completion_length/correct/p25": 269.0, "completion_length/correct/p75": 453.5, "completion_length/correct/var": 51026.2578125, "completion_length/incorrect": 893.138916015625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 114.0, "completion_length/incorrect/p25": 819.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 42304.578125, "completion_length/max": 1024.0, "completion_length/median": 481.0, "completion_length/min": 114.0, "completion_length/p25": 289.25, "completion_length/p75": 997.0, "completion_length/var": 102976.859375, "epoch": 0.008, "feature_vector_variance/max_squared_error": 89883.1796875, "feature_vector_variance/metric": 25681.423828125, "generated_tokens/total": 295949.0, "grad_norm": 1.3095223903656006, "learning_rate": 7.5e-06, "loss": -0.625, "mean_logprobs": -0.1552734375, "mean_logprobs/var": 0.006622314453125, "num_completions/total": 480, "per_sentence_gradient_norm": 12.978553771972656, "per_sentence_gradient_norm/max": 53.917991638183594, "per_sentence_gradient_norm/median": 12.67796516418457, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 20.25406837463379, "per_sentence_gradient_norm/p85": 27.256420135498047, "per_sentence_gradient_norm/p90": 29.594135284423828, "per_sentence_gradient_norm/p95": 34.20004653930664, "per_sentence_gradient_norm/p99": 49.613494873046875, "per_sentence_gradient_norm/var": 167.5721435546875, "per_token_feature_norm": 161.27783203125, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 65.0, "per_token_feature_norm/p25": 124.0, "per_token_feature_norm/p75": 191.0, "per_token_feature_norm/var": 2302.661376953125, "per_token_full_gradient_variance/max_squared_error": 2.506481885910034, "per_token_full_gradient_variance/variance": 0.010252884589135647, "per_token_gradient_norm": 8.715559959411621, "per_token_gradient_norm/max": 406.0625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1325.7021484375, "per_token_policy_error_norm": 0.07614801079034805, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06355822086334229, "policy_entropy": 0.16376902163028717, "policy_entropy/max": 3.71875, "policy_entropy/median": 0.0001068115234375, "policy_entropy/min": 1.2789769243681803e-13, "policy_entropy/p25": 3.844499588012695e-06, "policy_entropy/p75": 0.06640625, "policy_entropy/var": 0.13680870831012726, "policy_error_vector_variance/max_squared_error": 2.0178463459014893, "policy_error_vector_variance/metric": 0.07592270523309708, "policy_loss": -0.625, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.23684212565422058, "policy_sharpness": 7.408853054046631, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.632751226425171, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.474851608276367, "reward": 0.625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.23684212565422058, "rewards/accuracy_reward": 0.625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.23684212565422058, "sentence_full_gradient_variance/max_squared_error": 3405.451904296875, "sentence_full_gradient_variance/metric": 1697.692138671875, "sentence_full_gradient_variance/p75": 2091.5576171875, "sentence_full_gradient_variance/p90": 2311.4794921875, "sentence_full_gradient_variance/p95": 2624.11376953125, "sentence_full_gradient_variance/p99": 3134.722412109375, "state_level_variance/metric": 77.49940490722656, "state_level_variance_full_gradient/metric": 1068.0201416015625, "step": 5 }, { "accuracy_reward": 0.7395833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19462719559669495, "action_level_variance/metric": 101.6297836303711, "action_level_variance_full_gradient/metric": 892.167724609375, "adam_stats/lr_effective_max": 5.5080898164305836e-05, "adam_stats/lr_effective_mean": -5.358946619793414e-10, "adam_stats/lr_effective_min": -5.507968307938427e-05, "adam_stats/m_t_max": 0.009739148430526257, "adam_stats/m_t_mean": -5.058398578478007e-12, "adam_stats/m_t_min": -0.007456422317773104, "adam_stats/v_t_max": 8.979092672234401e-05, "adam_stats/v_t_mean": 2.1183365391669318e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.7395833730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.19462719559669495, "all_logprobs": -0.15943387150764465, "all_logprobs/max": 0.0, "all_logprobs/median": -1.7523765563964844e-05, "all_logprobs/min": -8.5, "all_logprobs/p1": -2.5, "all_logprobs/p10": -0.466796875, "all_logprobs/p25": -0.0257568359375, "all_logprobs/p5": -1.0078125, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.2479802966117859, "clip_ratio": 0.0, "completion_length": 548.8333740234375, "completion_length/correct": 456.2676086425781, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 411.0, "completion_length/correct/min": 178.0, "completion_length/correct/p25": 308.5, "completion_length/correct/p75": 517.5, "completion_length/correct/var": 43688.484375, "completion_length/incorrect": 811.719970703125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 980.0, "completion_length/incorrect/min": 24.0, "completion_length/incorrect/p25": 653.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 75046.625, "completion_length/max": 1024.0, "completion_length/median": 462.0, "completion_length/min": 24.0, "completion_length/p25": 345.0, "completion_length/p75": 708.75, "completion_length/var": 75741.109375, "epoch": 0.0096, "feature_vector_variance/max_squared_error": 94201.8046875, "feature_vector_variance/metric": 25290.408203125, "generated_tokens/total": 348637.0, "grad_norm": 0.556908130645752, "learning_rate": 9e-06, "loss": -0.7396, "mean_logprobs": -0.162109375, "mean_logprobs/var": 0.0030670166015625, "num_completions/total": 576, "per_sentence_gradient_norm": 15.456159591674805, "per_sentence_gradient_norm/max": 46.031864166259766, "per_sentence_gradient_norm/median": 17.35538101196289, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 23.636308670043945, "per_sentence_gradient_norm/p85": 26.305265426635742, "per_sentence_gradient_norm/p90": 28.34222984313965, "per_sentence_gradient_norm/p95": 33.182891845703125, "per_sentence_gradient_norm/p99": 39.124412536621094, "per_sentence_gradient_norm/var": 127.6876449584961, "per_token_feature_norm": 163.50079345703125, "per_token_feature_norm/max": 322.0, "per_token_feature_norm/median": 156.0, "per_token_feature_norm/min": 63.0, "per_token_feature_norm/p25": 126.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 2162.515380859375, "per_token_full_gradient_variance/max_squared_error": 2.891162633895874, "per_token_full_gradient_variance/variance": 0.014241340570151806, "per_token_gradient_norm": 12.616556167602539, "per_token_gradient_norm/max": 397.75, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1770.0777587890625, "per_token_policy_error_norm": 0.08452247828245163, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06960076093673706, "policy_entropy": 0.17832566797733307, "policy_entropy/max": 3.65625, "policy_entropy/median": 0.000225067138671875, "policy_entropy/min": 4.04121180963557e-14, "policy_entropy/p25": 4.947185516357422e-06, "policy_entropy/p75": 0.12451171875, "policy_entropy/var": 0.137643501162529, "policy_error_vector_variance/max_squared_error": 2.0059001445770264, "policy_error_vector_variance/metric": 0.08432231843471527, "policy_loss": -0.7395833730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.19462719559669495, "policy_sharpness": 7.118005752563477, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 2.933227300643921, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 14.089131355285645, "reward": 0.7395833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19462719559669495, "rewards/accuracy_reward": 0.7395833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19462719559669495, "sentence_full_gradient_variance/max_squared_error": 2511.060791015625, "sentence_full_gradient_variance/metric": 1299.0467529296875, "sentence_full_gradient_variance/p75": 2438.154296875, "sentence_full_gradient_variance/p90": 2438.154296875, "sentence_full_gradient_variance/p95": 2438.154296875, "sentence_full_gradient_variance/p99": 2441.93017578125, "state_level_variance/metric": 40.8343620300293, "state_level_variance_full_gradient/metric": 406.8789978027344, "step": 6 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1894736886024475, "action_level_variance/metric": 35.0031852722168, "action_level_variance_full_gradient/metric": 479.840576171875, "adam_stats/lr_effective_max": 6.647619011346251e-05, "adam_stats/lr_effective_mean": -6.269100238931458e-10, "adam_stats/lr_effective_min": -6.62305683363229e-05, "adam_stats/m_t_max": 0.008818765170872211, "adam_stats/m_t_mean": 8.902824658041375e-12, "adam_stats/m_t_min": -0.0058382973074913025, "adam_stats/v_t_max": 8.998491102829576e-05, "adam_stats/v_t_mean": 2.2108974807172332e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.75, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.75, "advantages/p75": 1.0, "advantages/var": 0.1894736886024475, "all_logprobs": -0.10002879053354263, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-06, "all_logprobs/min": -7.65625, "all_logprobs/p1": -2.015625, "all_logprobs/p10": -0.201171875, "all_logprobs/p25": -0.001708984375, "all_logprobs/p5": -0.6171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.14980179071426392, "clip_ratio": 0.0, "completion_length": 622.7083740234375, "completion_length/correct": 544.8611450195312, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 474.0, "completion_length/correct/min": 184.0, "completion_length/correct/p25": 325.75, "completion_length/correct/p75": 732.25, "completion_length/correct/var": 62667.05078125, "completion_length/incorrect": 856.25, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 335.0, "completion_length/incorrect/p25": 769.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 62294.9765625, "completion_length/max": 1024.0, "completion_length/median": 581.0, "completion_length/min": 184.0, "completion_length/p25": 348.25, "completion_length/p75": 893.25, "completion_length/var": 80289.265625, "epoch": 0.0112, "feature_vector_variance/max_squared_error": 104412.0703125, "feature_vector_variance/metric": 23830.6171875, "generated_tokens/total": 408417.0, "grad_norm": 0.5001587867736816, "learning_rate": 1.05e-05, "loss": -0.75, "mean_logprobs": -0.10107421875, "mean_logprobs/var": 0.0014190673828125, "num_completions/total": 672, "per_sentence_gradient_norm": 9.341876983642578, "per_sentence_gradient_norm/max": 26.192832946777344, "per_sentence_gradient_norm/median": 9.528319358825684, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 4.769090175628662, "per_sentence_gradient_norm/p75": 13.795180320739746, "per_sentence_gradient_norm/p85": 15.416234016418457, "per_sentence_gradient_norm/p90": 17.46799659729004, "per_sentence_gradient_norm/p95": 19.109947204589844, "per_sentence_gradient_norm/p99": 21.835256576538086, "per_sentence_gradient_norm/var": 42.13678741455078, "per_token_feature_norm": 155.62924194335938, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 60.25, "per_token_feature_norm/p25": 124.5, "per_token_feature_norm/p75": 182.0, "per_token_feature_norm/var": 1575.377685546875, "per_token_full_gradient_variance/max_squared_error": 1.4774857759475708, "per_token_full_gradient_variance/variance": 0.008702874183654785, "per_token_gradient_norm": 7.902462482452393, "per_token_gradient_norm/max": 383.171875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 996.5313720703125, "per_token_policy_error_norm": 0.05464908108115196, "per_token_policy_error_norm/max": 1.9921875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04682914912700653, "policy_entropy": 0.11161145567893982, "policy_entropy/max": 3.625, "policy_entropy/median": 1.8715858459472656e-05, "policy_entropy/min": 1.6653345369377348e-14, "policy_entropy/p25": 8.381903171539307e-07, "policy_entropy/p75": 0.012939453125, "policy_entropy/var": 0.08089924603700638, "policy_error_vector_variance/max_squared_error": 2.000345230102539, "policy_error_vector_variance/metric": 0.054574400186538696, "policy_loss": -0.75, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -0.75, "policy_loss/var": 0.1894736886024475, "policy_sharpness": 7.947611331939697, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.74609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.157611846923828, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.1894736886024475, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1894736886024475, "sentence_full_gradient_variance/max_squared_error": 3408.32470703125, "sentence_full_gradient_variance/metric": 1318.8060302734375, "sentence_full_gradient_variance/p75": 1905.1341552734375, "sentence_full_gradient_variance/p90": 2323.338623046875, "sentence_full_gradient_variance/p95": 2787.17822265625, "sentence_full_gradient_variance/p99": 3341.355712890625, "state_level_variance/metric": 12.076446533203125, "state_level_variance_full_gradient/metric": 838.9654541015625, "step": 7 }, { "accuracy_reward": 0.65625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2279605269432068, "action_level_variance/metric": 37.15428161621094, "action_level_variance_full_gradient/metric": 451.6731872558594, "adam_stats/lr_effective_max": 7.771278615109622e-05, "adam_stats/lr_effective_mean": -6.844059763366772e-10, "adam_stats/lr_effective_min": -7.774774712743238e-05, "adam_stats/m_t_max": 0.010097533464431763, "adam_stats/m_t_mean": -4.1103634385031285e-11, "adam_stats/m_t_min": -0.007360502611845732, "adam_stats/v_t_max": 9.026222687680274e-05, "adam_stats/v_t_mean": 2.5688542005380954e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.65625, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.2279605269432068, "all_logprobs": -0.08851872384548187, "all_logprobs/max": 0.0, "all_logprobs/median": -4.76837158203125e-07, "all_logprobs/min": -10.0625, "all_logprobs/p1": -1.8984375, "all_logprobs/p10": -0.1494140625, "all_logprobs/p25": -0.000553131103515625, "all_logprobs/p5": -0.5546875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.12891219556331635, "clip_ratio": 0.0, "completion_length": 637.6354370117188, "completion_length/correct": 554.3016357421875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 494.0, "completion_length/correct/min": 114.0, "completion_length/correct/p25": 391.0, "completion_length/correct/p75": 711.5, "completion_length/correct/var": 48392.40625, "completion_length/incorrect": 796.727294921875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 188.0, "completion_length/incorrect/p25": 494.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 106244.6953125, "completion_length/max": 1024.0, "completion_length/median": 558.0, "completion_length/min": 114.0, "completion_length/p25": 402.5, "completion_length/p75": 994.0, "completion_length/var": 80767.3828125, "epoch": 0.0128, "feature_vector_variance/max_squared_error": 112171.0859375, "feature_vector_variance/metric": 24870.4609375, "generated_tokens/total": 469630.0, "grad_norm": 0.8633704781532288, "learning_rate": 1.2e-05, "loss": -0.6562, "mean_logprobs": -0.09716796875, "mean_logprobs/var": 0.00262451171875, "num_completions/total": 768, "per_sentence_gradient_norm": 7.837907791137695, "per_sentence_gradient_norm/max": 39.6926383972168, "per_sentence_gradient_norm/median": 8.711915969848633, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 11.657690048217773, "per_sentence_gradient_norm/p85": 13.310232162475586, "per_sentence_gradient_norm/p90": 16.075965881347656, "per_sentence_gradient_norm/p95": 18.681060791015625, "per_sentence_gradient_norm/p99": 28.294530868530273, "per_sentence_gradient_norm/var": 51.85896301269531, "per_token_feature_norm": 160.15977478027344, "per_token_feature_norm/max": 320.0, "per_token_feature_norm/median": 157.0, "per_token_feature_norm/min": 64.0, "per_token_feature_norm/p25": 129.0, "per_token_feature_norm/p75": 188.0, "per_token_feature_norm/var": 1528.244140625, "per_token_full_gradient_variance/max_squared_error": 1.1390169858932495, "per_token_full_gradient_variance/variance": 0.007089408580213785, "per_token_gradient_norm": 6.568002223968506, "per_token_gradient_norm/max": 355.78125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 835.8424072265625, "per_token_policy_error_norm": 0.04936419427394867, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04290662333369255, "policy_entropy": 0.09811588376760483, "policy_entropy/max": 3.40625, "policy_entropy/median": 7.510185241699219e-06, "policy_entropy/min": 1.0746958878371515e-13, "policy_entropy/p25": 3.5390257835388184e-07, "policy_entropy/p75": 0.00482177734375, "policy_entropy/var": 0.06687535345554352, "policy_error_vector_variance/max_squared_error": 2.007657766342163, "policy_error_vector_variance/metric": 0.049324844032526016, "policy_loss": -0.65625, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.2279605269432068, "policy_sharpness": 8.158766746520996, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 7.5, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.33189582824707, "reward": 0.65625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2279605269432068, "rewards/accuracy_reward": 0.65625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2279605269432068, "sentence_full_gradient_variance/max_squared_error": 5195.0634765625, "sentence_full_gradient_variance/metric": 1162.3424072265625, "sentence_full_gradient_variance/p75": 1363.0440673828125, "sentence_full_gradient_variance/p90": 1945.6224365234375, "sentence_full_gradient_variance/p95": 2134.762451171875, "sentence_full_gradient_variance/p99": 2695.601318359375, "state_level_variance/metric": 20.518653869628906, "state_level_variance_full_gradient/metric": 710.6691284179688, "step": 8 }, { "accuracy_reward": 0.78125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17269736528396606, "action_level_variance/metric": 8.567049980163574, "action_level_variance_full_gradient/metric": 246.8193817138672, "adam_stats/lr_effective_max": 8.816900663077831e-05, "adam_stats/lr_effective_mean": -7.153331260667528e-10, "adam_stats/lr_effective_min": -8.833342872094363e-05, "adam_stats/m_t_max": 0.005858511198312044, "adam_stats/m_t_mean": 2.756485794827568e-12, "adam_stats/m_t_min": -0.007711869198828936, "adam_stats/v_t_max": 9.277552453568205e-05, "adam_stats/v_t_mean": 2.837151086981793e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.78125, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.17269736528396606, "all_logprobs": -0.06855661422014236, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -8.5625, "all_logprobs/p1": -1.630312442779541, "all_logprobs/p10": -0.07177734375, "all_logprobs/p25": -5.14984130859375e-05, "all_logprobs/p5": -0.3515625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1033305898308754, "clip_ratio": 0.0, "completion_length": 525.6979370117188, "completion_length/correct": 402.5333557128906, "completion_length/correct/max": 1003.0, "completion_length/correct/median": 343.0, "completion_length/correct/min": 199.0, "completion_length/correct/p25": 289.0, "completion_length/correct/p75": 466.5, "completion_length/correct/var": 28106.818359375, "completion_length/incorrect": 965.5714721679688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 750.0, "completion_length/incorrect/p25": 920.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 10065.7578125, "completion_length/max": 1024.0, "completion_length/median": 409.0, "completion_length/min": 199.0, "completion_length/p25": 306.0, "completion_length/p75": 751.75, "completion_length/var": 78759.9609375, "epoch": 0.0144, "feature_vector_variance/max_squared_error": 109967.9375, "feature_vector_variance/metric": 26660.75, "generated_tokens/total": 520097.0, "grad_norm": 0.7698550224304199, "learning_rate": 1.3500000000000001e-05, "loss": -0.7812, "mean_logprobs": -0.06982421875, "mean_logprobs/var": 0.00102996826171875, "num_completions/total": 864, "per_sentence_gradient_norm": 6.8918585777282715, "per_sentence_gradient_norm/max": 16.5216007232666, "per_sentence_gradient_norm/median": 7.962216377258301, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 5.19706392288208, "per_sentence_gradient_norm/p75": 9.937297821044922, "per_sentence_gradient_norm/p85": 10.846299171447754, "per_sentence_gradient_norm/p90": 11.471418380737305, "per_sentence_gradient_norm/p95": 13.765853881835938, "per_sentence_gradient_norm/p99": 14.981086730957031, "per_sentence_gradient_norm/var": 18.82402229309082, "per_token_feature_norm": 171.59925842285156, "per_token_feature_norm/max": 292.0, "per_token_feature_norm/median": 172.0, "per_token_feature_norm/min": 70.0, "per_token_feature_norm/p25": 143.0, "per_token_feature_norm/p75": 200.0, "per_token_feature_norm/var": 1411.4990234375, "per_token_full_gradient_variance/max_squared_error": 1.01852548122406, "per_token_full_gradient_variance/variance": 0.00610711332410574, "per_token_gradient_norm": 5.197245121002197, "per_token_gradient_norm/max": 363.59375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 649.404296875, "per_token_policy_error_norm": 0.03813522309064865, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03394502028822899, "policy_entropy": 0.07487653940916061, "policy_entropy/max": 3.015625, "policy_entropy/median": 1.5422701835632324e-06, "policy_entropy/min": 1.4988010832439613e-14, "policy_entropy/p25": 4.842877388000488e-08, "policy_entropy/p75": 0.000568389892578125, "policy_entropy/var": 0.05105002596974373, "policy_error_vector_variance/max_squared_error": 1.9950414896011353, "policy_error_vector_variance/metric": 0.03812417760491371, "policy_loss": -0.78125, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.17269736528396606, "policy_sharpness": 8.517199516296387, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.682750701904297, "reward": 0.78125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17269736528396606, "rewards/accuracy_reward": 0.78125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17269736528396606, "sentence_full_gradient_variance/max_squared_error": 3461.443603515625, "sentence_full_gradient_variance/metric": 1287.348388671875, "sentence_full_gradient_variance/p75": 1598.90283203125, "sentence_full_gradient_variance/p90": 3461.443603515625, "sentence_full_gradient_variance/p95": 3461.443603515625, "sentence_full_gradient_variance/p99": 3461.443603515625, "state_level_variance/metric": 12.143749237060547, "state_level_variance_full_gradient/metric": 1040.529052734375, "step": 9 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1666666716337204, "action_level_variance/metric": 4.096006393432617, "action_level_variance_full_gradient/metric": 144.37423706054688, "adam_stats/lr_effective_max": 9.940489690052345e-05, "adam_stats/lr_effective_mean": -6.465435409275244e-10, "adam_stats/lr_effective_min": -9.966846846509725e-05, "adam_stats/m_t_max": 0.011837320402264595, "adam_stats/m_t_mean": 6.907718147486364e-11, "adam_stats/m_t_min": -0.011693155393004417, "adam_stats/v_t_max": 0.00011084428842877969, "adam_stats/v_t_mean": 3.2173354692216494e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.7916666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.1666666716337204, "all_logprobs": -0.05231146886944771, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.25, "all_logprobs/p1": -1.328125, "all_logprobs/p10": -0.032958984375, "all_logprobs/p25": -1.1086463928222656e-05, "all_logprobs/p5": -0.251953125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.07477839291095734, "clip_ratio": 0.0, "completion_length": 571.6771240234375, "completion_length/correct": 494.6447448730469, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 489.0, "completion_length/correct/min": 132.0, "completion_length/correct/p25": 252.25, "completion_length/correct/p75": 682.0, "completion_length/correct/var": 55983.6484375, "completion_length/incorrect": 864.4000244140625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 408.0, "completion_length/incorrect/p25": 678.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 44053.83203125, "completion_length/max": 1024.0, "completion_length/median": 552.0, "completion_length/min": 132.0, "completion_length/p25": 289.25, "completion_length/p75": 741.5, "completion_length/var": 75794.875, "epoch": 0.016, "feature_vector_variance/max_squared_error": 130186.8046875, "feature_vector_variance/metric": 27232.693359375, "generated_tokens/total": 574978.0, "grad_norm": 0.8945251107215881, "learning_rate": 1.5e-05, "loss": -0.7917, "mean_logprobs": -0.053466796875, "mean_logprobs/var": 0.0004825592041015625, "num_completions/total": 960, "per_sentence_gradient_norm": 5.583071708679199, "per_sentence_gradient_norm/max": 12.316662788391113, "per_sentence_gradient_norm/median": 6.046398162841797, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 4.456374168395996, "per_sentence_gradient_norm/p75": 7.995277404785156, "per_sentence_gradient_norm/p85": 8.616241455078125, "per_sentence_gradient_norm/p90": 9.316627502441406, "per_sentence_gradient_norm/p95": 9.99951171875, "per_sentence_gradient_norm/p99": 12.019734382629395, "per_sentence_gradient_norm/var": 11.372522354125977, "per_token_feature_norm": 179.05191040039062, "per_token_feature_norm/max": 304.0, "per_token_feature_norm/median": 182.0, "per_token_feature_norm/min": 72.0, "per_token_feature_norm/p25": 152.0, "per_token_feature_norm/p75": 206.0, "per_token_feature_norm/var": 1350.1334228515625, "per_token_full_gradient_variance/max_squared_error": 1.071797251701355, "per_token_full_gradient_variance/variance": 0.006178185809403658, "per_token_gradient_norm": 4.681027412414551, "per_token_gradient_norm/max": 359.125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 598.5772705078125, "per_token_policy_error_norm": 0.02972162328660488, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.02703368477523327, "policy_entropy": 0.05815049633383751, "policy_entropy/max": 3.0, "policy_entropy/median": 4.153698682785034e-07, "policy_entropy/min": 1.5265566588595902e-15, "policy_entropy/p25": 1.1350493878126144e-08, "policy_entropy/p75": 0.00014209747314453125, "policy_entropy/var": 0.03467747941613197, "policy_error_vector_variance/max_squared_error": 2.007847547531128, "policy_error_vector_variance/metric": 0.02970934472978115, "policy_loss": -0.7916666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.1666666716337204, "policy_sharpness": 8.7080078125, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 7.591718673706055, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1666666716337204, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1666666716337204, "sentence_full_gradient_variance/max_squared_error": 3084.7099609375, "sentence_full_gradient_variance/metric": 1240.6094970703125, "sentence_full_gradient_variance/p75": 2404.828125, "sentence_full_gradient_variance/p90": 2422.1083984375, "sentence_full_gradient_variance/p95": 2422.1083984375, "sentence_full_gradient_variance/p99": 2455.271484375, "state_level_variance/metric": 8.367331504821777, "state_level_variance_full_gradient/metric": 1096.235107421875, "step": 10 }, { "accuracy_reward": 0.7083333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2087719589471817, "action_level_variance/metric": 5.365599632263184, "action_level_variance_full_gradient/metric": 111.57150268554688, "adam_stats/lr_effective_max": 0.00010013843711931258, "adam_stats/lr_effective_mean": -4.1554967800117026e-10, "adam_stats/lr_effective_min": -9.997909364756197e-05, "adam_stats/m_t_max": 0.02194279059767723, "adam_stats/m_t_mean": 1.9330438638665015e-10, "adam_stats/m_t_min": -0.026619335636496544, "adam_stats/v_t_max": 0.00013663960271514952, "adam_stats/v_t_mean": 3.7252258569542285e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.7083333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.2087719589471817, "all_logprobs": -0.03621437028050423, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.25, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.009136945009231567, "all_logprobs/p25": -1.430511474609375e-06, "all_logprobs/p5": -0.11279296875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.05150386691093445, "clip_ratio": 0.0, "completion_length": 606.6041870117188, "completion_length/correct": 506.2205810546875, "completion_length/correct/max": 1009.0, "completion_length/correct/median": 389.0, "completion_length/correct/min": 151.0, "completion_length/correct/p25": 280.0, "completion_length/correct/p75": 734.0, "completion_length/correct/var": 70068.5, "completion_length/incorrect": 850.3928833007812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 839.0, "completion_length/incorrect/min": 522.0, "completion_length/incorrect/p25": 694.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 37259.0625, "completion_length/max": 1024.0, "completion_length/median": 619.0, "completion_length/min": 151.0, "completion_length/p25": 339.25, "completion_length/p75": 842.75, "completion_length/var": 84736.1484375, "epoch": 0.0176, "feature_vector_variance/max_squared_error": 113332.6640625, "feature_vector_variance/metric": 28611.9140625, "generated_tokens/total": 633212.0, "grad_norm": 1.7473827600479126, "learning_rate": 1.4995431202643219e-05, "loss": -0.7083, "mean_logprobs": -0.0390625, "mean_logprobs/var": 0.000431060791015625, "num_completions/total": 1056, "per_sentence_gradient_norm": 4.0042572021484375, "per_sentence_gradient_norm/max": 15.242703437805176, "per_sentence_gradient_norm/median": 4.074686527252197, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 5.839934825897217, "per_sentence_gradient_norm/p85": 6.969700813293457, "per_sentence_gradient_norm/p90": 7.930027484893799, "per_sentence_gradient_norm/p95": 9.638992309570312, "per_sentence_gradient_norm/p99": 14.185281753540039, "per_sentence_gradient_norm/var": 11.301508903503418, "per_token_feature_norm": 192.2179412841797, "per_token_feature_norm/max": 292.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 78.5, "per_token_feature_norm/p25": 173.0, "per_token_feature_norm/p75": 214.0, "per_token_feature_norm/var": 1051.0391845703125, "per_token_full_gradient_variance/max_squared_error": 0.7819811105728149, "per_token_full_gradient_variance/variance": 0.00470166327431798, "per_token_gradient_norm": 3.147686719894409, "per_token_gradient_norm/max": 343.1640625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 417.2275085449219, "per_token_policy_error_norm": 0.020671918988227844, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.018716422840952873, "policy_entropy": 0.0411674790084362, "policy_entropy/max": 2.453125, "policy_entropy/median": 8.242204785346985e-08, "policy_entropy/min": 2.393918396847994e-16, "policy_entropy/p25": 2.35741026699543e-09, "policy_entropy/p75": 2.205371856689453e-05, "policy_entropy/var": 0.02311147004365921, "policy_error_vector_variance/max_squared_error": 2.006598711013794, "policy_error_vector_variance/metric": 0.02066034823656082, "policy_loss": -0.7083333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.2087719589471817, "policy_sharpness": 8.981894493103027, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.096639156341553, "reward": 0.7083333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2087719589471817, "rewards/accuracy_reward": 0.7083333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2087719589471817, "sentence_full_gradient_variance/max_squared_error": 4122.5693359375, "sentence_full_gradient_variance/metric": 1341.5584716796875, "sentence_full_gradient_variance/p75": 1616.00830078125, "sentence_full_gradient_variance/p90": 2238.042724609375, "sentence_full_gradient_variance/p95": 2381.467041015625, "sentence_full_gradient_variance/p99": 4094.072509765625, "state_level_variance/metric": 7.078782558441162, "state_level_variance_full_gradient/metric": 1229.987060546875, "step": 11 }, { "accuracy_reward": 0.6875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21710528433322906, "action_level_variance/metric": 4.196975231170654, "action_level_variance_full_gradient/metric": 57.80980682373047, "adam_stats/lr_effective_max": 0.00010042815847555175, "adam_stats/lr_effective_mean": -2.968305601314114e-10, "adam_stats/lr_effective_min": -9.993695130106062e-05, "adam_stats/m_t_max": 0.017652688547968864, "adam_stats/m_t_mean": 1.7813488184525994e-10, "adam_stats/m_t_min": -0.02358052134513855, "adam_stats/v_t_max": 0.00013651716290041804, "adam_stats/v_t_mean": 4.209032861829387e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.6875, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.21710528433322906, "all_logprobs": -0.03511261194944382, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.5, "all_logprobs/p1": -1.0, "all_logprobs/p10": -0.0067138671875, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.10009765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.050834137946367264, "clip_ratio": 0.0, "completion_length": 588.6979370117188, "completion_length/correct": 480.0151672363281, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 452.0, "completion_length/correct/min": 110.0, "completion_length/correct/p25": 236.5, "completion_length/correct/p75": 730.0, "completion_length/correct/var": 70988.203125, "completion_length/incorrect": 827.800048828125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 960.0, "completion_length/incorrect/min": 283.0, "completion_length/incorrect/p25": 693.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 60235.19921875, "completion_length/max": 1024.0, "completion_length/median": 650.0, "completion_length/min": 110.0, "completion_length/p25": 289.0, "completion_length/p75": 809.0, "completion_length/var": 93218.28125, "epoch": 0.0192, "feature_vector_variance/max_squared_error": 135771.71875, "feature_vector_variance/metric": 30025.029296875, "generated_tokens/total": 689727.0, "grad_norm": 1.3279757499694824, "learning_rate": 1.4981730376948682e-05, "loss": -0.6875, "mean_logprobs": -0.03515625, "mean_logprobs/var": 0.00048828125, "num_completions/total": 1152, "per_sentence_gradient_norm": 3.010693073272705, "per_sentence_gradient_norm/max": 17.77301597595215, "per_sentence_gradient_norm/median": 2.6494669914245605, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 4.125319480895996, "per_sentence_gradient_norm/p85": 6.348448276519775, "per_sentence_gradient_norm/p90": 7.269320011138916, "per_sentence_gradient_norm/p95": 7.948740005493164, "per_sentence_gradient_norm/p99": 10.795391082763672, "per_sentence_gradient_norm/var": 9.598917007446289, "per_token_feature_norm": 199.7120819091797, "per_token_feature_norm/max": 298.0, "per_token_feature_norm/median": 201.0, "per_token_feature_norm/min": 82.0, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 219.0, "per_token_feature_norm/var": 864.9633178710938, "per_token_full_gradient_variance/max_squared_error": 1.4926090240478516, "per_token_full_gradient_variance/variance": 0.0035488733556121588, "per_token_gradient_norm": 2.17610502243042, "per_token_gradient_norm/max": 342.6171875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 316.2865905761719, "per_token_policy_error_norm": 0.020026106387376785, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0186576209962368, "policy_entropy": 0.039029184728860855, "policy_entropy/max": 3.421875, "policy_entropy/median": 4.0745362639427185e-08, "policy_entropy/min": 5.3939058081153846e-18, "policy_entropy/p25": 8.512870408594608e-10, "policy_entropy/p75": 1.0311603546142578e-05, "policy_entropy/var": 0.023290833458304405, "policy_error_vector_variance/max_squared_error": 2.001560926437378, "policy_error_vector_variance/metric": 0.020013781264424324, "policy_loss": -0.6875, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.21710528433322906, "policy_sharpness": 9.044166564941406, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.797149658203125, "reward": 0.6875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21710528433322906, "rewards/accuracy_reward": 0.6875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21710528433322906, "sentence_full_gradient_variance/max_squared_error": 3717.044677734375, "sentence_full_gradient_variance/metric": 1291.791259765625, "sentence_full_gradient_variance/p75": 1429.6278076171875, "sentence_full_gradient_variance/p90": 3450.583251953125, "sentence_full_gradient_variance/p95": 3646.131103515625, "sentence_full_gradient_variance/p99": 3703.549560546875, "state_level_variance/metric": 6.356263637542725, "state_level_variance_full_gradient/metric": 1233.9813232421875, "step": 12 }, { "accuracy_reward": 0.8854166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.10252192616462708, "action_level_variance/metric": 1.2150514125823975, "action_level_variance_full_gradient/metric": 158.42247009277344, "adam_stats/lr_effective_max": 0.00010040959023172036, "adam_stats/lr_effective_mean": -3.142050231108584e-10, "adam_stats/lr_effective_min": -9.976472210837528e-05, "adam_stats/m_t_max": 0.010315833613276482, "adam_stats/m_t_mean": 8.830080416855068e-11, "adam_stats/m_t_min": -0.012140435166656971, "adam_stats/v_t_max": 0.00014462888066191226, "adam_stats/v_t_mean": 4.398503696684264e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.8854166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.10252192616462708, "all_logprobs": -0.022831445559859276, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.8125, "all_logprobs/p1": -0.69140625, "all_logprobs/p10": -0.000518798828125, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.02337646484375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03253808617591858, "clip_ratio": 0.0, "completion_length": 557.8229370117188, "completion_length/correct": 509.91766357421875, "completion_length/correct/max": 933.0, "completion_length/correct/median": 471.0, "completion_length/correct/min": 277.0, "completion_length/correct/p25": 407.0, "completion_length/correct/p75": 607.0, "completion_length/correct/var": 26544.455078125, "completion_length/incorrect": 928.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 952.0, "completion_length/incorrect/min": 759.0, "completion_length/incorrect/p25": 864.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 9593.400390625, "completion_length/max": 1024.0, "completion_length/median": 483.0, "completion_length/min": 277.0, "completion_length/p25": 420.0, "completion_length/p75": 713.0, "completion_length/var": 42400.828125, "epoch": 0.0208, "feature_vector_variance/max_squared_error": 115077.7265625, "feature_vector_variance/metric": 28758.39453125, "generated_tokens/total": 743278.0, "grad_norm": 0.6912662386894226, "learning_rate": 1.495891421526205e-05, "loss": -0.8854, "mean_logprobs": -0.022216796875, "mean_logprobs/var": 0.00010585784912109375, "num_completions/total": 1248, "per_sentence_gradient_norm": 2.962367534637451, "per_sentence_gradient_norm/max": 9.167826652526855, "per_sentence_gradient_norm/median": 3.1882591247558594, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 2.121004104614258, "per_sentence_gradient_norm/p75": 3.9408488273620605, "per_sentence_gradient_norm/p85": 4.331238746643066, "per_sentence_gradient_norm/p90": 4.7354631423950195, "per_sentence_gradient_norm/p95": 5.035143852233887, "per_sentence_gradient_norm/p99": 6.233034610748291, "per_sentence_gradient_norm/var": 2.5478224754333496, "per_token_feature_norm": 201.861328125, "per_token_feature_norm/max": 298.0, "per_token_feature_norm/median": 202.0, "per_token_feature_norm/min": 88.0, "per_token_feature_norm/p25": 188.0, "per_token_feature_norm/p75": 219.0, "per_token_feature_norm/var": 707.345458984375, "per_token_full_gradient_variance/max_squared_error": 0.5279205441474915, "per_token_full_gradient_variance/variance": 0.004521642345935106, "per_token_gradient_norm": 2.7312936782836914, "per_token_gradient_norm/max": 337.890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 390.2888488769531, "per_token_policy_error_norm": 0.013541449792683125, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.012808948755264282, "policy_entropy": 0.02435128763318062, "policy_entropy/max": 1.6796875, "policy_entropy/median": 1.792795956134796e-08, "policy_entropy/min": 2.3245294578089215e-16, "policy_entropy/p25": 4.18367562815547e-10, "policy_entropy/p75": 2.3990869522094727e-06, "policy_entropy/var": 0.012512567453086376, "policy_error_vector_variance/max_squared_error": 1.988722562789917, "policy_error_vector_variance/metric": 0.013534238561987877, "policy_loss": -0.8854166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.10252192616462708, "policy_sharpness": 9.331881523132324, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.090082168579102, "reward": 0.8854166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.10252192616462708, "rewards/accuracy_reward": 0.8854166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.10252192616462708, "sentence_full_gradient_variance/max_squared_error": 1952.11474609375, "sentence_full_gradient_variance/metric": 776.0484619140625, "sentence_full_gradient_variance/p75": 1408.6063232421875, "sentence_full_gradient_variance/p90": 1793.954345703125, "sentence_full_gradient_variance/p95": 1793.96630859375, "sentence_full_gradient_variance/p99": 1942.415771484375, "state_level_variance/metric": 1.5906678438186646, "state_level_variance_full_gradient/metric": 617.6259765625, "step": 13 }, { "accuracy_reward": 0.6875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21710528433322906, "action_level_variance/metric": 2.6204347610473633, "action_level_variance_full_gradient/metric": 175.2250213623047, "adam_stats/lr_effective_max": 9.983986819861457e-05, "adam_stats/lr_effective_mean": -6.467805180321307e-10, "adam_stats/lr_effective_min": -9.993959247367457e-05, "adam_stats/m_t_max": 0.009037444368004799, "adam_stats/m_t_mean": 7.0675965080369e-11, "adam_stats/m_t_min": -0.01251639798283577, "adam_stats/v_t_max": 0.00014453352196142077, "adam_stats/v_t_mean": 4.671708332681934e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.6875, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.21710528433322906, "all_logprobs": -0.020757151767611504, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.0625, "all_logprobs/p1": -0.6328125, "all_logprobs/p10": -0.00017452239990234375, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.0181884765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.02708730474114418, "clip_ratio": 0.0, "completion_length": 660.53125, "completion_length/correct": 554.9242553710938, "completion_length/correct/max": 1022.0, "completion_length/correct/median": 511.0, "completion_length/correct/min": 254.0, "completion_length/correct/p25": 431.0, "completion_length/correct/p75": 671.0, "completion_length/correct/var": 32612.47265625, "completion_length/incorrect": 892.86669921875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 321.0, "completion_length/incorrect/p25": 944.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 60562.328125, "completion_length/max": 1024.0, "completion_length/median": 594.0, "completion_length/min": 254.0, "completion_length/p25": 466.75, "completion_length/p75": 962.5, "completion_length/var": 65595.765625, "epoch": 0.0224, "feature_vector_variance/max_squared_error": 122721.9296875, "feature_vector_variance/metric": 27775.21875, "generated_tokens/total": 806689.0, "grad_norm": 0.7978067994117737, "learning_rate": 1.4927010515561777e-05, "loss": -0.6875, "mean_logprobs": -0.0211181640625, "mean_logprobs/var": 0.0001659393310546875, "num_completions/total": 1344, "per_sentence_gradient_norm": 2.2511954307556152, "per_sentence_gradient_norm/max": 8.548160552978516, "per_sentence_gradient_norm/median": 2.5180892944335938, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.1964478492736816, "per_sentence_gradient_norm/p85": 3.5274813175201416, "per_sentence_gradient_norm/p90": 4.646113395690918, "per_sentence_gradient_norm/p95": 6.088399887084961, "per_sentence_gradient_norm/p99": 8.142067909240723, "per_sentence_gradient_norm/var": 4.132708549499512, "per_token_feature_norm": 205.61434936523438, "per_token_feature_norm/max": 290.0, "per_token_feature_norm/median": 205.0, "per_token_feature_norm/min": 79.5, "per_token_feature_norm/p25": 191.0, "per_token_feature_norm/p75": 221.0, "per_token_feature_norm/var": 514.0474243164062, "per_token_full_gradient_variance/max_squared_error": 1.4413224458694458, "per_token_full_gradient_variance/variance": 0.003122388618066907, "per_token_gradient_norm": 1.935487985610962, "per_token_gradient_norm/max": 297.5, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 273.1777038574219, "per_token_policy_error_norm": 0.01218461524695158, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01086847297847271, "policy_entropy": 0.023590149357914925, "policy_entropy/max": 2.53125, "policy_entropy/median": 9.022187441587448e-09, "policy_entropy/min": 5.767955557622884e-17, "policy_entropy/p25": 1.8189894035458565e-10, "policy_entropy/p75": 8.940696716308594e-07, "policy_entropy/var": 0.013992421329021454, "policy_error_vector_variance/max_squared_error": 1.9836434125900269, "policy_error_vector_variance/metric": 0.012177756987512112, "policy_loss": -0.6875, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.21710528433322906, "policy_sharpness": 9.395709991455078, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.8161184787750244, "reward": 0.6875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21710528433322906, "rewards/accuracy_reward": 0.6875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21710528433322906, "sentence_full_gradient_variance/max_squared_error": 3480.5517578125, "sentence_full_gradient_variance/metric": 824.406494140625, "sentence_full_gradient_variance/p75": 925.6949462890625, "sentence_full_gradient_variance/p90": 1259.3756103515625, "sentence_full_gradient_variance/p95": 2075.11865234375, "sentence_full_gradient_variance/p99": 3262.919189453125, "state_level_variance/metric": 1.9601227045059204, "state_level_variance_full_gradient/metric": 649.1815795898438, "step": 14 }, { "accuracy_reward": 0.8541666865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.12587718665599823, "action_level_variance/metric": 1.9738068580627441, "action_level_variance_full_gradient/metric": 182.25482177734375, "adam_stats/lr_effective_max": 9.964442142518237e-05, "adam_stats/lr_effective_mean": -7.990096362320287e-10, "adam_stats/lr_effective_min": -0.00010080893844133243, "adam_stats/m_t_max": 0.009261993691325188, "adam_stats/m_t_mean": 1.2595782750146611e-10, "adam_stats/m_t_min": -0.013571887277066708, "adam_stats/v_t_max": 0.00014490450848825276, "adam_stats/v_t_mean": 4.866205962489323e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.8541666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.12587718665599823, "all_logprobs": -0.020835429430007935, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.53125, "all_logprobs/p1": -0.5858203172683716, "all_logprobs/p10": -0.000335693359375, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.018658429384231567, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.028981028124690056, "clip_ratio": 0.0, "completion_length": 508.375, "completion_length/correct": 461.48779296875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 450.0, "completion_length/correct/min": 210.0, "completion_length/correct/p25": 339.25, "completion_length/correct/p75": 569.0, "completion_length/correct/var": 30513.4140625, "completion_length/incorrect": 783.0000610351562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 754.0, "completion_length/incorrect/min": 303.0, "completion_length/incorrect/p25": 586.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 77282.15625, "completion_length/max": 1024.0, "completion_length/median": 463.0, "completion_length/min": 210.0, "completion_length/p25": 340.75, "completion_length/p75": 587.25, "completion_length/var": 49604.1015625, "epoch": 0.024, "feature_vector_variance/max_squared_error": 114781.0859375, "feature_vector_variance/metric": 27919.869140625, "generated_tokens/total": 855493.0, "grad_norm": 0.7618879675865173, "learning_rate": 1.488605814759156e-05, "loss": -0.8542, "mean_logprobs": -0.0203857421875, "mean_logprobs/var": 9.632110595703125e-05, "num_completions/total": 1440, "per_sentence_gradient_norm": 2.7219810485839844, "per_sentence_gradient_norm/max": 6.290472030639648, "per_sentence_gradient_norm/median": 2.837303638458252, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.6167407035827637, "per_sentence_gradient_norm/p75": 3.7653582096099854, "per_sentence_gradient_norm/p85": 4.23042106628418, "per_sentence_gradient_norm/p90": 4.546991348266602, "per_sentence_gradient_norm/p95": 5.22141170501709, "per_sentence_gradient_norm/p99": 6.1140666007995605, "per_sentence_gradient_norm/var": 2.6864283084869385, "per_token_feature_norm": 196.45852661132812, "per_token_feature_norm/max": 274.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 84.5, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 414.2362976074219, "per_token_full_gradient_variance/max_squared_error": 0.6883414387702942, "per_token_full_gradient_variance/variance": 0.004450193606317043, "per_token_gradient_norm": 2.5493202209472656, "per_token_gradient_norm/max": 337.5, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 345.1825866699219, "per_token_policy_error_norm": 0.012259657494723797, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011453308165073395, "policy_entropy": 0.02285408228635788, "policy_entropy/max": 3.03125, "policy_entropy/median": 1.3154931366443634e-08, "policy_entropy/min": 1.5395670849294163e-17, "policy_entropy/p25": 1.709850039333105e-10, "policy_entropy/p75": 2.3283064365386963e-06, "policy_entropy/var": 0.011774715967476368, "policy_error_vector_variance/max_squared_error": 1.9839688539505005, "policy_error_vector_variance/metric": 0.012252316810190678, "policy_loss": -0.8541666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.12587718665599823, "policy_sharpness": 9.365391731262207, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.912745237350464, "reward": 0.8541666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.12587718665599823, "rewards/accuracy_reward": 0.8541666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.12587718665599823, "sentence_full_gradient_variance/max_squared_error": 2554.218505859375, "sentence_full_gradient_variance/metric": 960.3873901367188, "sentence_full_gradient_variance/p75": 1910.846435546875, "sentence_full_gradient_variance/p90": 2070.91015625, "sentence_full_gradient_variance/p95": 2186.091796875, "sentence_full_gradient_variance/p99": 2444.3193359375, "state_level_variance/metric": 1.0160332918167114, "state_level_variance_full_gradient/metric": 778.1325073242188, "step": 15 }, { "accuracy_reward": 0.8125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1539473831653595, "action_level_variance/metric": 1.038564682006836, "action_level_variance_full_gradient/metric": 92.02091979980469, "adam_stats/lr_effective_max": 9.838990808930248e-05, "adam_stats/lr_effective_mean": -2.909345264701102e-10, "adam_stats/lr_effective_min": -9.957340807886794e-05, "adam_stats/m_t_max": 0.0075035784393548965, "adam_stats/m_t_mean": -2.8327147919005036e-11, "adam_stats/m_t_min": -0.00563243729993701, "adam_stats/v_t_max": 0.00015958708536345512, "adam_stats/v_t_mean": 5.366404803169855e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.8125, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.1539473831653595, "all_logprobs": -0.015453856438398361, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.125, "all_logprobs/p1": -0.392578125, "all_logprobs/p10": -3.4332275390625e-05, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.0067138671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.02234630659222603, "clip_ratio": 0.0, "completion_length": 535.6666870117188, "completion_length/correct": 438.1538391113281, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 414.0, "completion_length/correct/min": 99.0, "completion_length/correct/p25": 217.0, "completion_length/correct/p75": 592.5, "completion_length/correct/var": 66529.5390625, "completion_length/incorrect": 958.2222290039062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 18.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 56814.06640625, "completion_length/max": 1024.0, "completion_length/median": 431.0, "completion_length/min": 18.0, "completion_length/p25": 301.5, "completion_length/p75": 866.75, "completion_length/var": 105728.9921875, "epoch": 0.0256, "feature_vector_variance/max_squared_error": 103327.6953125, "feature_vector_variance/metric": 26710.587890625, "generated_tokens/total": 906917.0, "grad_norm": 2.774850368499756, "learning_rate": 1.4836107005503543e-05, "loss": -0.8125, "mean_logprobs": -0.020263671875, "mean_logprobs/var": 0.00135040283203125, "num_completions/total": 1536, "per_sentence_gradient_norm": 2.1924445629119873, "per_sentence_gradient_norm/max": 5.715519428253174, "per_sentence_gradient_norm/median": 2.2671117782592773, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.1340365409851074, "per_sentence_gradient_norm/p75": 3.057879686355591, "per_sentence_gradient_norm/p85": 3.72683048248291, "per_sentence_gradient_norm/p90": 4.064645290374756, "per_sentence_gradient_norm/p95": 4.446006774902344, "per_sentence_gradient_norm/p99": 5.269911289215088, "per_sentence_gradient_norm/var": 2.075329542160034, "per_token_feature_norm": 199.335205078125, "per_token_feature_norm/max": 266.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 88.0, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 213.0, "per_token_feature_norm/var": 429.3045959472656, "per_token_full_gradient_variance/max_squared_error": 0.6195434927940369, "per_token_full_gradient_variance/variance": 0.002912163268774748, "per_token_gradient_norm": 1.6900120973587036, "per_token_gradient_norm/max": 292.125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 220.5580596923828, "per_token_policy_error_norm": 0.008726736530661583, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.008205077610909939, "policy_entropy": 0.017751963809132576, "policy_entropy/max": 1.8515625, "policy_entropy/median": 4.452886059880257e-09, "policy_entropy/min": 1.485356976305141e-17, "policy_entropy/p25": 8.685674401931465e-11, "policy_entropy/p75": 5.811452865600586e-07, "policy_entropy/var": 0.009296610951423645, "policy_error_vector_variance/max_squared_error": 2.0035879611968994, "policy_error_vector_variance/metric": 0.008718459866940975, "policy_loss": -0.8125, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.1539473831653595, "policy_sharpness": 9.514228820800781, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.0752506256103516, "reward": 0.8125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1539473831653595, "rewards/accuracy_reward": 0.8125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1539473831653595, "sentence_full_gradient_variance/max_squared_error": 3670.94287109375, "sentence_full_gradient_variance/metric": 1499.5517578125, "sentence_full_gradient_variance/p75": 1980.6971435546875, "sentence_full_gradient_variance/p90": 2525.60546875, "sentence_full_gradient_variance/p95": 3670.94287109375, "sentence_full_gradient_variance/p99": 3670.94287109375, "state_level_variance/metric": 1.2490553855895996, "state_level_variance_full_gradient/metric": 1407.531005859375, "step": 16 }, { "accuracy_reward": 0.5729166865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24725878238677979, "action_level_variance/metric": 1.817776083946228, "action_level_variance_full_gradient/metric": 127.57858276367188, "adam_stats/lr_effective_max": 9.813284850679338e-05, "adam_stats/lr_effective_mean": -1.1557052537192192e-10, "adam_stats/lr_effective_min": -9.778292587725446e-05, "adam_stats/m_t_max": 0.013722456991672516, "adam_stats/m_t_mean": 9.334243794567598e-11, "adam_stats/m_t_min": -0.01621653139591217, "adam_stats/v_t_max": 0.0001878517068689689, "adam_stats/v_t_mean": 5.82313971347892e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.5729166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.24725878238677979, "all_logprobs": -0.022896215319633484, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.5, "all_logprobs/p1": -0.69140625, "all_logprobs/p10": -0.00055694580078125, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.02978515625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.031147433444857597, "clip_ratio": 0.0, "completion_length": 575.1875, "completion_length/correct": 417.16363525390625, "completion_length/correct/max": 943.0, "completion_length/correct/median": 378.0, "completion_length/correct/min": 212.0, "completion_length/correct/p25": 244.0, "completion_length/correct/p75": 520.0, "completion_length/correct/var": 44783.58203125, "completion_length/incorrect": 787.1707153320312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 818.0, "completion_length/incorrect/min": 339.0, "completion_length/incorrect/p25": 643.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 64131.29296875, "completion_length/max": 1024.0, "completion_length/median": 446.0, "completion_length/min": 212.0, "completion_length/p25": 274.0, "completion_length/p75": 809.0, "completion_length/var": 86309.6015625, "epoch": 0.0272, "feature_vector_variance/max_squared_error": 104055.3671875, "feature_vector_variance/metric": 27365.93359375, "generated_tokens/total": 962135.0, "grad_norm": 1.969389796257019, "learning_rate": 1.4777217947069972e-05, "loss": -0.5729, "mean_logprobs": -0.0216064453125, "mean_logprobs/var": 0.0001544952392578125, "num_completions/total": 1632, "per_sentence_gradient_norm": 1.5867996215820312, "per_sentence_gradient_norm/max": 6.10418176651001, "per_sentence_gradient_norm/median": 0.9461761713027954, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.0118765830993652, "per_sentence_gradient_norm/p85": 3.605234146118164, "per_sentence_gradient_norm/p90": 4.047341346740723, "per_sentence_gradient_norm/p95": 4.360373020172119, "per_sentence_gradient_norm/p99": 6.059333801269531, "per_sentence_gradient_norm/var": 2.9580070972442627, "per_token_feature_norm": 196.2088165283203, "per_token_feature_norm/max": 270.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 82.5, "per_token_feature_norm/p25": 184.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 402.2176208496094, "per_token_full_gradient_variance/max_squared_error": 0.5049923062324524, "per_token_full_gradient_variance/variance": 0.0020304627250880003, "per_token_gradient_norm": 1.235309362411499, "per_token_gradient_norm/max": 287.8828125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 174.3235626220703, "per_token_policy_error_norm": 0.0134718157351017, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.012691115029156208, "policy_entropy": 0.025239648297429085, "policy_entropy/max": 1.5625, "policy_entropy/median": 1.932494342327118e-08, "policy_entropy/min": 3.361026734705064e-17, "policy_entropy/p25": 2.3646862246096134e-10, "policy_entropy/p75": 4.26173210144043e-06, "policy_entropy/var": 0.013120451010763645, "policy_error_vector_variance/max_squared_error": 2.0026023387908936, "policy_error_vector_variance/metric": 0.013463904149830341, "policy_loss": -0.5729166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.24725878238677979, "policy_sharpness": 9.313688278198242, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.172406196594238, "reward": 0.5729166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24725878238677979, "rewards/accuracy_reward": 0.5729166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24725878238677979, "sentence_full_gradient_variance/max_squared_error": 2740.04345703125, "sentence_full_gradient_variance/metric": 1440.747802734375, "sentence_full_gradient_variance/p75": 1676.422607421875, "sentence_full_gradient_variance/p90": 2357.43701171875, "sentence_full_gradient_variance/p95": 2646.513671875, "sentence_full_gradient_variance/p99": 2682.500244140625, "state_level_variance/metric": 1.4581536054611206, "state_level_variance_full_gradient/metric": 1313.1693115234375, "step": 17 }, { "accuracy_reward": 0.6979166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21304824948310852, "action_level_variance/metric": 3.487668991088867, "action_level_variance_full_gradient/metric": 265.70892333984375, "adam_stats/lr_effective_max": 9.782792767509818e-05, "adam_stats/lr_effective_mean": -7.095916076949038e-10, "adam_stats/lr_effective_min": -9.868081542663276e-05, "adam_stats/m_t_max": 0.007077009882777929, "adam_stats/m_t_mean": 3.431590775426763e-11, "adam_stats/m_t_min": -0.010925300419330597, "adam_stats/v_t_max": 0.0001900959323393181, "adam_stats/v_t_mean": 6.209880699936665e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.6979166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.21304824948310852, "all_logprobs": -0.02949291653931141, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.40625, "all_logprobs/p1": -0.8794922828674316, "all_logprobs/p10": -0.00118255615234375, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.048583984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04451855272054672, "clip_ratio": 0.0, "completion_length": 599.8541870117188, "completion_length/correct": 575.34326171875, "completion_length/correct/max": 1010.0, "completion_length/correct/median": 551.0, "completion_length/correct/min": 322.0, "completion_length/correct/p25": 420.0, "completion_length/correct/p75": 674.0, "completion_length/correct/var": 35446.07421875, "completion_length/incorrect": 656.4827270507812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 602.0, "completion_length/incorrect/min": 207.0, "completion_length/incorrect/p25": 500.0, "completion_length/incorrect/p75": 840.0, "completion_length/incorrect/var": 67315.9765625, "completion_length/max": 1024.0, "completion_length/median": 576.0, "completion_length/min": 207.0, "completion_length/p25": 427.75, "completion_length/p75": 687.25, "completion_length/var": 45868.8203125, "epoch": 0.0288, "feature_vector_variance/max_squared_error": 133173.71875, "feature_vector_variance/metric": 28372.123046875, "generated_tokens/total": 1019721.0, "grad_norm": 0.9514789581298828, "learning_rate": 1.4709462719537392e-05, "loss": -0.6979, "mean_logprobs": -0.03076171875, "mean_logprobs/var": 0.00058746337890625, "num_completions/total": 1728, "per_sentence_gradient_norm": 2.3261260986328125, "per_sentence_gradient_norm/max": 10.989334106445312, "per_sentence_gradient_norm/median": 2.3166236877441406, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.3492236137390137, "per_sentence_gradient_norm/p85": 4.1339111328125, "per_sentence_gradient_norm/p90": 4.55232048034668, "per_sentence_gradient_norm/p95": 5.638106346130371, "per_sentence_gradient_norm/p99": 7.576062202453613, "per_sentence_gradient_norm/var": 4.208441257476807, "per_token_feature_norm": 195.6666717529297, "per_token_feature_norm/max": 290.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 84.0, "per_token_feature_norm/p25": 184.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 438.8687438964844, "per_token_full_gradient_variance/max_squared_error": 0.7684510946273804, "per_token_full_gradient_variance/variance": 0.0036784757394343615, "per_token_gradient_norm": 2.2745871543884277, "per_token_gradient_norm/max": 284.859375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 301.4239196777344, "per_token_policy_error_norm": 0.0166985634714365, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015224020928144455, "policy_entropy": 0.032592739909887314, "policy_entropy/max": 2.578125, "policy_entropy/median": 2.130400389432907e-08, "policy_entropy/min": 1.2956215961201778e-17, "policy_entropy/p25": 2.1464074961841106e-10, "policy_entropy/p75": 7.808208465576172e-06, "policy_entropy/var": 0.02174447849392891, "policy_error_vector_variance/max_squared_error": 1.9876235723495483, "policy_error_vector_variance/metric": 0.01668381318449974, "policy_loss": -0.6979166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.21304824948310852, "policy_sharpness": 9.221996307373047, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.852928638458252, "reward": 0.6979166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21304824948310852, "rewards/accuracy_reward": 0.6979166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21304824948310852, "sentence_full_gradient_variance/max_squared_error": 2587.312744140625, "sentence_full_gradient_variance/metric": 751.65087890625, "sentence_full_gradient_variance/p75": 915.603759765625, "sentence_full_gradient_variance/p90": 1536.202392578125, "sentence_full_gradient_variance/p95": 1682.224365234375, "sentence_full_gradient_variance/p99": 2576.1494140625, "state_level_variance/metric": 1.2140649557113647, "state_level_variance_full_gradient/metric": 485.94195556640625, "step": 18 }, { "accuracy_reward": 0.7083333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20877192914485931, "action_level_variance/metric": 1.3071269989013672, "action_level_variance_full_gradient/metric": 179.87445068359375, "adam_stats/lr_effective_max": 9.623785445000976e-05, "adam_stats/lr_effective_mean": -1.308586017323421e-10, "adam_stats/lr_effective_min": -9.82282217592001e-05, "adam_stats/m_t_max": 0.008290679194033146, "adam_stats/m_t_mean": 6.052832540737185e-11, "adam_stats/m_t_min": -0.011438692919909954, "adam_stats/v_t_max": 0.00019065756350755692, "adam_stats/v_t_mean": 6.613112835118784e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.7083333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.20877192914485931, "all_logprobs": -0.01890667714178562, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.5625, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -7.486343383789062e-05, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.00860595703125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.027441997081041336, "clip_ratio": 0.0, "completion_length": 557.2396240234375, "completion_length/correct": 453.308837890625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 455.0, "completion_length/correct/min": 204.0, "completion_length/correct/p25": 276.75, "completion_length/correct/p75": 598.0, "completion_length/correct/var": 35084.0078125, "completion_length/incorrect": 809.6428833007812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 446.0, "completion_length/incorrect/p25": 490.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 73054.234375, "completion_length/max": 1024.0, "completion_length/median": 501.0, "completion_length/min": 204.0, "completion_length/p25": 349.25, "completion_length/p75": 691.0, "completion_length/var": 72014.828125, "epoch": 0.0304, "feature_vector_variance/max_squared_error": 81207.7421875, "feature_vector_variance/metric": 27514.341796875, "generated_tokens/total": 1073216.0, "grad_norm": 1.068534016609192, "learning_rate": 1.4632923872213653e-05, "loss": -0.7083, "mean_logprobs": -0.01953125, "mean_logprobs/var": 8.916854858398438e-05, "num_completions/total": 1824, "per_sentence_gradient_norm": 2.0492444038391113, "per_sentence_gradient_norm/max": 7.907660961151123, "per_sentence_gradient_norm/median": 2.219489336013794, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.154902219772339, "per_sentence_gradient_norm/p85": 3.480919122695923, "per_sentence_gradient_norm/p90": 3.6519901752471924, "per_sentence_gradient_norm/p95": 4.341643333435059, "per_sentence_gradient_norm/p99": 7.566076755523682, "per_sentence_gradient_norm/var": 2.795301914215088, "per_token_feature_norm": 193.67478942871094, "per_token_feature_norm/max": 274.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 88.5, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 205.0, "per_token_feature_norm/var": 373.3233947753906, "per_token_full_gradient_variance/max_squared_error": 0.5066419243812561, "per_token_full_gradient_variance/variance": 0.0026065781712532043, "per_token_gradient_norm": 1.681670904159546, "per_token_gradient_norm/max": 289.0078125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 233.4036102294922, "per_token_policy_error_norm": 0.01132776029407978, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010797660797834396, "policy_entropy": 0.01943100057542324, "policy_entropy/max": 2.0625, "policy_entropy/median": 9.953510016202927e-09, "policy_entropy/min": 7.386127300057499e-19, "policy_entropy/p25": 9.458744898438454e-11, "policy_entropy/p75": 1.4789402484893799e-06, "policy_entropy/var": 0.010069433599710464, "policy_error_vector_variance/max_squared_error": 1.9833661317825317, "policy_error_vector_variance/metric": 0.011324908584356308, "policy_loss": -0.7083333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.20877192914485931, "policy_sharpness": 9.460067749023438, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.371443748474121, "reward": 0.7083333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20877192914485931, "rewards/accuracy_reward": 0.7083333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20877192914485931, "sentence_full_gradient_variance/max_squared_error": 2917.49072265625, "sentence_full_gradient_variance/metric": 1215.814453125, "sentence_full_gradient_variance/p75": 1736.1048583984375, "sentence_full_gradient_variance/p90": 2515.123046875, "sentence_full_gradient_variance/p95": 2730.756591796875, "sentence_full_gradient_variance/p99": 2881.52783203125, "state_level_variance/metric": 1.769943118095398, "state_level_variance_full_gradient/metric": 1035.9403076171875, "step": 19 }, { "accuracy_reward": 0.7604166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18410086631774902, "action_level_variance/metric": 0.9285461902618408, "action_level_variance_full_gradient/metric": 298.89471435546875, "adam_stats/lr_effective_max": 9.58472301135771e-05, "adam_stats/lr_effective_mean": -3.8359423970568685e-10, "adam_stats/lr_effective_min": -9.673673775978386e-05, "adam_stats/m_t_max": 0.006989352870732546, "adam_stats/m_t_mean": -4.331859523865056e-11, "adam_stats/m_t_min": -0.005098333116620779, "adam_stats/v_t_max": 0.00020358411711640656, "adam_stats/v_t_mean": 7.039503095024324e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.7604166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.18410086631774902, "all_logprobs": -0.01821148209273815, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.96875, "all_logprobs/p1": -0.5234375, "all_logprobs/p10": -0.00010919570922851562, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.009765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.025912368670105934, "clip_ratio": 0.0, "completion_length": 606.71875, "completion_length/correct": 524.76708984375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 480.0, "completion_length/correct/min": 242.0, "completion_length/correct/p25": 385.0, "completion_length/correct/p75": 662.0, "completion_length/correct/var": 29468.4296875, "completion_length/incorrect": 866.8261108398438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 237.0, "completion_length/incorrect/p25": 713.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 58322.609375, "completion_length/max": 1024.0, "completion_length/median": 606.0, "completion_length/min": 237.0, "completion_length/p25": 388.0, "completion_length/p75": 739.75, "completion_length/var": 57380.8515625, "epoch": 0.032, "feature_vector_variance/max_squared_error": 91223.5390625, "feature_vector_variance/metric": 26230.876953125, "generated_tokens/total": 1131461.0, "grad_norm": 1.1340385675430298, "learning_rate": 1.4547694655894313e-05, "loss": -0.7604, "mean_logprobs": -0.01806640625, "mean_logprobs/var": 6.723403930664062e-05, "num_completions/total": 1920, "per_sentence_gradient_norm": 1.9504364728927612, "per_sentence_gradient_norm/max": 4.987678050994873, "per_sentence_gradient_norm/median": 2.052321434020996, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.8926503658294678, "per_sentence_gradient_norm/p75": 3.098804473876953, "per_sentence_gradient_norm/p85": 3.3984017372131348, "per_sentence_gradient_norm/p90": 3.7377195358276367, "per_sentence_gradient_norm/p95": 4.218216896057129, "per_sentence_gradient_norm/p99": 4.57138729095459, "per_sentence_gradient_norm/var": 1.9786031246185303, "per_token_feature_norm": 188.68182373046875, "per_token_feature_norm/max": 262.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 83.5, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 198.0, "per_token_feature_norm/var": 282.0351867675781, "per_token_full_gradient_variance/max_squared_error": 0.5969656705856323, "per_token_full_gradient_variance/variance": 0.0028962090145796537, "per_token_gradient_norm": 1.7007070779800415, "per_token_gradient_norm/max": 292.578125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 231.7816619873047, "per_token_policy_error_norm": 0.010778520256280899, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010431783273816109, "policy_entropy": 0.019445421174168587, "policy_entropy/max": 2.328125, "policy_entropy/median": 1.3620592653751373e-08, "policy_entropy/min": 8.294146619514109e-18, "policy_entropy/p25": 9.822542779147625e-11, "policy_entropy/p75": 3.2633543014526367e-06, "policy_entropy/var": 0.010232146829366684, "policy_error_vector_variance/max_squared_error": 1.9864741563796997, "policy_error_vector_variance/metric": 0.010776471346616745, "policy_loss": -0.7604166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.18410086631774902, "policy_sharpness": 9.454683303833008, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.3590636253356934, "reward": 0.7604166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18410086631774902, "rewards/accuracy_reward": 0.7604166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18410086631774902, "sentence_full_gradient_variance/max_squared_error": 2034.2882080078125, "sentence_full_gradient_variance/metric": 876.2467041015625, "sentence_full_gradient_variance/p75": 1805.5198974609375, "sentence_full_gradient_variance/p90": 1805.5198974609375, "sentence_full_gradient_variance/p95": 1805.5198974609375, "sentence_full_gradient_variance/p99": 1817.027099609375, "state_level_variance/metric": 1.24965238571167, "state_level_variance_full_gradient/metric": 577.3519897460938, "step": 20 }, { "accuracy_reward": 0.7395833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19462722539901733, "action_level_variance/metric": 1.4828357696533203, "action_level_variance_full_gradient/metric": 223.45989990234375, "adam_stats/lr_effective_max": 9.504338959231973e-05, "adam_stats/lr_effective_mean": 5.3422960499815986e-11, "adam_stats/lr_effective_min": -9.622451034374535e-05, "adam_stats/m_t_max": 0.007666312623769045, "adam_stats/m_t_mean": -1.9250224678191152e-11, "adam_stats/m_t_min": -0.010353675112128258, "adam_stats/v_t_max": 0.00021636755263898522, "adam_stats/v_t_mean": 7.400454381245591e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.7395833730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.19462722539901733, "all_logprobs": -0.0184634979814291, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.75, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.0002040863037109375, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.01416015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.02271651290357113, "clip_ratio": 0.0, "completion_length": 494.71875, "completion_length/correct": 376.7464599609375, "completion_length/correct/max": 634.0, "completion_length/correct/median": 376.0, "completion_length/correct/min": 192.0, "completion_length/correct/p25": 243.5, "completion_length/correct/p75": 485.5, "completion_length/correct/var": 18574.9921875, "completion_length/incorrect": 829.760009765625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 309.0, "completion_length/incorrect/p25": 645.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 66021.109375, "completion_length/max": 1024.0, "completion_length/median": 432.0, "completion_length/min": 192.0, "completion_length/p25": 267.0, "completion_length/p75": 596.0, "completion_length/var": 70307.4921875, "epoch": 0.0336, "feature_vector_variance/max_squared_error": 126766.5546875, "feature_vector_variance/metric": 26446.931640625, "generated_tokens/total": 1178954.0, "grad_norm": 1.0283081531524658, "learning_rate": 1.4453878909250906e-05, "loss": -0.7396, "mean_logprobs": -0.019775390625, "mean_logprobs/var": 0.00011777877807617188, "num_completions/total": 2016, "per_sentence_gradient_norm": 2.3405778408050537, "per_sentence_gradient_norm/max": 7.334765911102295, "per_sentence_gradient_norm/median": 2.1805150508880615, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.8280727863311768, "per_sentence_gradient_norm/p85": 4.490243434906006, "per_sentence_gradient_norm/p90": 4.681454658508301, "per_sentence_gradient_norm/p95": 4.973042011260986, "per_sentence_gradient_norm/p99": 6.723264217376709, "per_sentence_gradient_norm/var": 3.489403009414673, "per_token_feature_norm": 188.85409545898438, "per_token_feature_norm/max": 288.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 90.5, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 198.0, "per_token_feature_norm/var": 311.1824645996094, "per_token_full_gradient_variance/max_squared_error": 0.5278702974319458, "per_token_full_gradient_variance/variance": 0.002776028588414192, "per_token_gradient_norm": 1.6977758407592773, "per_token_gradient_norm/max": 288.203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 209.4752197265625, "per_token_policy_error_norm": 0.010933518409729004, "per_token_policy_error_norm/max": 1.9765625, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009615164250135422, "policy_entropy": 0.021560916677117348, "policy_entropy/max": 2.25, "policy_entropy/median": 5.564652383327484e-08, "policy_entropy/min": 2.3987973066241786e-18, "policy_entropy/p25": 4.69299266114831e-10, "policy_entropy/p75": 5.8710575103759766e-06, "policy_entropy/var": 0.011523821391165257, "policy_error_vector_variance/max_squared_error": 1.982831597328186, "policy_error_vector_variance/metric": 0.010925250127911568, "policy_loss": -0.7395833730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.19462722539901733, "policy_sharpness": 9.403401374816895, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.6599907875061035, "reward": 0.7395833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19462722539901733, "rewards/accuracy_reward": 0.7395833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19462722539901733, "sentence_full_gradient_variance/max_squared_error": 2327.97265625, "sentence_full_gradient_variance/metric": 1260.142333984375, "sentence_full_gradient_variance/p75": 2327.942138671875, "sentence_full_gradient_variance/p90": 2327.942138671875, "sentence_full_gradient_variance/p95": 2327.942138671875, "sentence_full_gradient_variance/p99": 2327.943603515625, "state_level_variance/metric": 2.351534843444824, "state_level_variance_full_gradient/metric": 1036.682373046875, "step": 21 }, { "accuracy_reward": 0.8229166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14725877344608307, "action_level_variance/metric": 1.5586903095245361, "action_level_variance_full_gradient/metric": 149.14849853515625, "adam_stats/lr_effective_max": 9.412822691956535e-05, "adam_stats/lr_effective_mean": 4.4292641776522146e-10, "adam_stats/lr_effective_min": -9.644238889450207e-05, "adam_stats/m_t_max": 0.00840827263891697, "adam_stats/m_t_mean": -1.8414179903114558e-11, "adam_stats/m_t_min": -0.009452681057155132, "adam_stats/v_t_max": 0.0002170094958273694, "adam_stats/v_t_mean": 7.701550334970886e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.8229166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.14725877344608307, "all_logprobs": -0.022061968222260475, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -12.75, "all_logprobs/p1": -0.69140625, "all_logprobs/p10": -0.000431060791015625, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.02001953125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.035940419882535934, "clip_ratio": 0.0, "completion_length": 445.76043701171875, "completion_length/correct": 372.1012878417969, "completion_length/correct/max": 784.0, "completion_length/correct/median": 394.0, "completion_length/correct/min": 184.0, "completion_length/correct/p25": 248.5, "completion_length/correct/p75": 449.5, "completion_length/correct/var": 15993.169921875, "completion_length/incorrect": 788.058837890625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 857.0, "completion_length/incorrect/min": 374.0, "completion_length/incorrect/p25": 575.0, "completion_length/incorrect/p75": 982.0, "completion_length/incorrect/var": 54871.8046875, "completion_length/max": 1024.0, "completion_length/median": 410.0, "completion_length/min": 184.0, "completion_length/p25": 266.0, "completion_length/p75": 512.5, "completion_length/var": 47851.609375, "epoch": 0.0352, "feature_vector_variance/max_squared_error": 77065.3203125, "feature_vector_variance/metric": 26659.056640625, "generated_tokens/total": 1221747.0, "grad_norm": 0.9819366931915283, "learning_rate": 1.4351590932319506e-05, "loss": -0.8229, "mean_logprobs": -0.0213623046875, "mean_logprobs/var": 0.00016880035400390625, "num_completions/total": 2112, "per_sentence_gradient_norm": 2.4536192417144775, "per_sentence_gradient_norm/max": 8.208606719970703, "per_sentence_gradient_norm/median": 2.3966774940490723, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.858302891254425, "per_sentence_gradient_norm/p75": 3.740978717803955, "per_sentence_gradient_norm/p85": 4.437868595123291, "per_sentence_gradient_norm/p90": 4.748236179351807, "per_sentence_gradient_norm/p95": 5.061720848083496, "per_sentence_gradient_norm/p99": 6.192534923553467, "per_sentence_gradient_norm/var": 3.203883171081543, "per_token_feature_norm": 188.783203125, "per_token_feature_norm/max": 260.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 87.5, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 287.70025634765625, "per_token_full_gradient_variance/max_squared_error": 0.4761597216129303, "per_token_full_gradient_variance/variance": 0.0033074459061026573, "per_token_gradient_norm": 2.0810863971710205, "per_token_gradient_norm/max": 285.640625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 277.42742919921875, "per_token_policy_error_norm": 0.012725184671580791, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.012018030509352684, "policy_entropy": 0.02344077080488205, "policy_entropy/max": 1.7421875, "policy_entropy/median": 6.51925802230835e-08, "policy_entropy/min": 4.9060148304969076e-18, "policy_entropy/p25": 6.475602276623249e-10, "policy_entropy/p75": 9.5367431640625e-06, "policy_entropy/var": 0.012235420756042004, "policy_error_vector_variance/max_squared_error": 2.002983331680298, "policy_error_vector_variance/metric": 0.012717816047370434, "policy_loss": -0.8229166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.14725877344608307, "policy_sharpness": 9.343352317810059, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.993143320083618, "reward": 0.8229166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14725877344608307, "rewards/accuracy_reward": 0.8229166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14725877344608307, "sentence_full_gradient_variance/max_squared_error": 1788.5029296875, "sentence_full_gradient_variance/metric": 709.314453125, "sentence_full_gradient_variance/p75": 1139.6123046875, "sentence_full_gradient_variance/p90": 1788.478759765625, "sentence_full_gradient_variance/p95": 1788.478759765625, "sentence_full_gradient_variance/p99": 1788.47998046875, "state_level_variance/metric": 1.970896601676941, "state_level_variance_full_gradient/metric": 560.166015625, "step": 22 }, { "accuracy_reward": 0.875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.11052631586790085, "action_level_variance/metric": 1.71317458152771, "action_level_variance_full_gradient/metric": 175.0991668701172, "adam_stats/lr_effective_max": 9.150314872385934e-05, "adam_stats/lr_effective_mean": 2.7391802714937796e-10, "adam_stats/lr_effective_min": -9.17147845029831e-05, "adam_stats/m_t_max": 0.010916796512901783, "adam_stats/m_t_mean": 3.219172844959317e-11, "adam_stats/m_t_min": -0.014076565392315388, "adam_stats/v_t_max": 0.00022082567738834769, "adam_stats/v_t_mean": 8.20666364725886e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.875, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.11052631586790085, "all_logprobs": -0.02074512466788292, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.5625, "all_logprobs/p1": -0.6317191123962402, "all_logprobs/p10": -0.0002460479736328125, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.015054315328598022, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.030393086373806, "clip_ratio": 0.0, "completion_length": 473.0, "completion_length/correct": 461.2261962890625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 434.0, "completion_length/correct/min": 153.0, "completion_length/correct/p25": 265.0, "completion_length/correct/p75": 611.5, "completion_length/correct/var": 48383.81640625, "completion_length/incorrect": 555.4166870117188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 346.0, "completion_length/incorrect/min": 324.0, "completion_length/incorrect/p25": 334.25, "completion_length/incorrect/p75": 799.0, "completion_length/incorrect/var": 93293.359375, "completion_length/max": 1024.0, "completion_length/median": 401.0, "completion_length/min": 153.0, "completion_length/p25": 274.5, "completion_length/p75": 614.5, "completion_length/var": 54055.13671875, "epoch": 0.0368, "feature_vector_variance/max_squared_error": 123143.1015625, "feature_vector_variance/metric": 26872.951171875, "generated_tokens/total": 1267155.0, "grad_norm": 3.9980275630950928, "learning_rate": 1.4240955347243754e-05, "loss": -0.875, "mean_logprobs": -0.02197265625, "mean_logprobs/var": 0.0001621246337890625, "num_completions/total": 2208, "per_sentence_gradient_norm": 2.9070346355438232, "per_sentence_gradient_norm/max": 6.835420608520508, "per_sentence_gradient_norm/median": 2.7871274948120117, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.357481598854065, "per_sentence_gradient_norm/p75": 4.3539886474609375, "per_sentence_gradient_norm/p85": 5.361295700073242, "per_sentence_gradient_norm/p90": 5.7464447021484375, "per_sentence_gradient_norm/p95": 6.161093235015869, "per_sentence_gradient_norm/p99": 6.731619358062744, "per_sentence_gradient_norm/var": 3.92995548248291, "per_token_feature_norm": 187.15145874023438, "per_token_feature_norm/max": 296.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 87.0, "per_token_feature_norm/p25": 178.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 261.20928955078125, "per_token_full_gradient_variance/max_squared_error": 0.5923542380332947, "per_token_full_gradient_variance/variance": 0.004443208687007427, "per_token_gradient_norm": 2.6375648975372314, "per_token_gradient_norm/max": 326.25, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 354.40667724609375, "per_token_policy_error_norm": 0.012143629603087902, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011321972124278545, "policy_entropy": 0.022337624803185463, "policy_entropy/max": 2.625, "policy_entropy/median": 3.725290298461914e-08, "policy_entropy/min": 1.3823577699190182e-18, "policy_entropy/p25": 2.1714186004828662e-10, "policy_entropy/p75": 8.046627044677734e-06, "policy_entropy/var": 0.011965611018240452, "policy_error_vector_variance/max_squared_error": 1.9866764545440674, "policy_error_vector_variance/metric": 0.012141930870711803, "policy_loss": -0.875, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.11052631586790085, "policy_sharpness": 9.386476516723633, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.784083127975464, "reward": 0.875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.11052631586790085, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.11052631586790085, "sentence_full_gradient_variance/max_squared_error": 2424.65478515625, "sentence_full_gradient_variance/metric": 589.921142578125, "sentence_full_gradient_variance/p75": 971.6443481445312, "sentence_full_gradient_variance/p90": 1016.2410278320312, "sentence_full_gradient_variance/p95": 2064.648681640625, "sentence_full_gradient_variance/p99": 2413.9541015625, "state_level_variance/metric": 2.6072630882263184, "state_level_variance_full_gradient/metric": 414.822021484375, "step": 23 }, { "accuracy_reward": 0.7291666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19956141710281372, "action_level_variance/metric": 0.7493284940719604, "action_level_variance_full_gradient/metric": 288.5623474121094, "adam_stats/lr_effective_max": 9.086643694899976e-05, "adam_stats/lr_effective_mean": 2.7507018884875833e-10, "adam_stats/lr_effective_min": -9.154957660939544e-05, "adam_stats/m_t_max": 0.012911345809698105, "adam_stats/m_t_mean": 5.0000725781984556e-11, "adam_stats/m_t_min": -0.019093617796897888, "adam_stats/v_t_max": 0.0002273338905069977, "adam_stats/v_t_mean": 8.51518248273786e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.7291666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.19956141710281372, "all_logprobs": -0.01592841185629368, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.25, "all_logprobs/p1": -0.470703125, "all_logprobs/p10": -7.60077964514494e-05, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.00592041015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.021975986659526825, "clip_ratio": 0.0, "completion_length": 551.84375, "completion_length/correct": 477.0428466796875, "completion_length/correct/max": 1020.0, "completion_length/correct/median": 459.0, "completion_length/correct/min": 227.0, "completion_length/correct/p25": 341.25, "completion_length/correct/p75": 581.0, "completion_length/correct/var": 36636.82421875, "completion_length/incorrect": 753.2307739257812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 710.0, "completion_length/incorrect/min": 218.0, "completion_length/incorrect/p25": 666.5, "completion_length/incorrect/p75": 1011.75, "completion_length/incorrect/var": 63914.50390625, "completion_length/max": 1024.0, "completion_length/median": 485.0, "completion_length/min": 218.0, "completion_length/p25": 344.75, "completion_length/p75": 693.0, "completion_length/var": 58652.01171875, "epoch": 0.0384, "feature_vector_variance/max_squared_error": 84577.9453125, "feature_vector_variance/metric": 26684.4921875, "generated_tokens/total": 1320132.0, "grad_norm": 0.9114542007446289, "learning_rate": 1.4122106946441953e-05, "loss": -0.7292, "mean_logprobs": -0.015869140625, "mean_logprobs/var": 5.7220458984375e-05, "num_completions/total": 2304, "per_sentence_gradient_norm": 1.5951570272445679, "per_sentence_gradient_norm/max": 3.8960273265838623, "per_sentence_gradient_norm/median": 1.6758246421813965, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.494011402130127, "per_sentence_gradient_norm/p85": 2.978198528289795, "per_sentence_gradient_norm/p90": 3.1845571994781494, "per_sentence_gradient_norm/p95": 3.35292911529541, "per_sentence_gradient_norm/p99": 3.8908531665802, "per_sentence_gradient_norm/var": 1.4652976989746094, "per_token_feature_norm": 187.47079467773438, "per_token_feature_norm/max": 262.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 86.0, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 310.1325988769531, "per_token_full_gradient_variance/max_squared_error": 0.8081739544868469, "per_token_full_gradient_variance/variance": 0.0027149252127856016, "per_token_gradient_norm": 1.3554184436798096, "per_token_gradient_norm/max": 292.03125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 172.47396850585938, "per_token_policy_error_norm": 0.009253868833184242, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.008781563490629196, "policy_entropy": 0.017843080684542656, "policy_entropy/max": 2.09375, "policy_entropy/median": 1.5133991837501526e-08, "policy_entropy/min": 3.63207727782644e-18, "policy_entropy/p25": 8.685674401931465e-11, "policy_entropy/p75": 2.60770320892334e-06, "policy_entropy/var": 0.009462224319577217, "policy_error_vector_variance/max_squared_error": 1.9843863248825073, "policy_error_vector_variance/metric": 0.009252350777387619, "policy_loss": -0.7291666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.19956141710281372, "policy_sharpness": 9.490751266479492, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.1100683212280273, "reward": 0.7291666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19956141710281372, "rewards/accuracy_reward": 0.7291666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19956141710281372, "sentence_full_gradient_variance/max_squared_error": 2185.847412109375, "sentence_full_gradient_variance/metric": 1156.2283935546875, "sentence_full_gradient_variance/p75": 2131.791015625, "sentence_full_gradient_variance/p90": 2131.791015625, "sentence_full_gradient_variance/p95": 2131.791015625, "sentence_full_gradient_variance/p99": 2185.847412109375, "state_level_variance/metric": 0.866587221622467, "state_level_variance_full_gradient/metric": 867.6659545898438, "step": 24 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18947365880012512, "action_level_variance/metric": 0.648493230342865, "action_level_variance_full_gradient/metric": 115.80767059326172, "adam_stats/lr_effective_max": 9.0169305622112e-05, "adam_stats/lr_effective_mean": 8.220925051816153e-10, "adam_stats/lr_effective_min": -9.181876521324739e-05, "adam_stats/m_t_max": 0.012024067342281342, "adam_stats/m_t_mean": 1.2072058630518967e-11, "adam_stats/m_t_min": -0.011404609307646751, "adam_stats/v_t_max": 0.00023044695262797177, "adam_stats/v_t_mean": 9.02121606582984e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.75, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.75, "advantages/p75": 1.0, "advantages/var": 0.18947365880012512, "all_logprobs": -0.01556422933936119, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.5, "all_logprobs/p1": -0.396484375, "all_logprobs/p10": -5.230901297181845e-05, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.00408935546875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.024186469614505768, "clip_ratio": 0.0, "completion_length": 512.5729370117188, "completion_length/correct": 406.47222900390625, "completion_length/correct/max": 717.0, "completion_length/correct/median": 373.0, "completion_length/correct/min": 197.0, "completion_length/correct/p25": 311.0, "completion_length/correct/p75": 455.0, "completion_length/correct/var": 18164.619140625, "completion_length/incorrect": 830.875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 868.0, "completion_length/incorrect/min": 434.0, "completion_length/incorrect/p25": 638.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 49007.24609375, "completion_length/max": 1024.0, "completion_length/median": 424.0, "completion_length/min": 197.0, "completion_length/p25": 322.25, "completion_length/p75": 656.75, "completion_length/var": 59568.140625, "epoch": 0.04, "feature_vector_variance/max_squared_error": 75606.46875, "feature_vector_variance/metric": 26709.07421875, "generated_tokens/total": 1369339.0, "grad_norm": 3.2272462844848633, "learning_rate": 1.3995190528383292e-05, "loss": -0.75, "mean_logprobs": -0.01519775390625, "mean_logprobs/var": 9.5367431640625e-05, "num_completions/total": 2400, "per_sentence_gradient_norm": 1.5125845670700073, "per_sentence_gradient_norm/max": 7.392582893371582, "per_sentence_gradient_norm/median": 1.435705304145813, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.39963895082473755, "per_sentence_gradient_norm/p75": 2.1841726303100586, "per_sentence_gradient_norm/p85": 2.778928756713867, "per_sentence_gradient_norm/p90": 3.0767791271209717, "per_sentence_gradient_norm/p95": 3.6464650630950928, "per_sentence_gradient_norm/p99": 5.383396625518799, "per_sentence_gradient_norm/var": 1.8123186826705933, "per_token_feature_norm": 187.53538513183594, "per_token_feature_norm/max": 258.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 93.0, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 251.37440490722656, "per_token_full_gradient_variance/max_squared_error": 0.4667612612247467, "per_token_full_gradient_variance/variance": 0.0019987651612609625, "per_token_gradient_norm": 1.1143858432769775, "per_token_gradient_norm/max": 296.71875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 149.2452850341797, "per_token_policy_error_norm": 0.009085462428629398, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009024658240377903, "policy_entropy": 0.01622636616230011, "policy_entropy/max": 2.1875, "policy_entropy/median": 2.3166649043560028e-08, "policy_entropy/min": 8.605854744103691e-19, "policy_entropy/p25": 1.355147105641663e-10, "policy_entropy/p75": 2.682209014892578e-06, "policy_entropy/var": 0.008697683922946453, "policy_error_vector_variance/max_squared_error": 1.9978611469268799, "policy_error_vector_variance/metric": 0.009078281000256538, "policy_loss": -0.75, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -0.75, "policy_loss/var": 0.18947365880012512, "policy_sharpness": 9.530314445495605, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 2.8926193714141846, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.18947365880012512, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18947365880012512, "sentence_full_gradient_variance/max_squared_error": 1937.6251220703125, "sentence_full_gradient_variance/metric": 1124.308837890625, "sentence_full_gradient_variance/p75": 1729.460693359375, "sentence_full_gradient_variance/p90": 1729.460693359375, "sentence_full_gradient_variance/p95": 1729.460693359375, "sentence_full_gradient_variance/p99": 1927.907470703125, "state_level_variance/metric": 1.3374640941619873, "state_level_variance_full_gradient/metric": 1008.5011596679688, "step": 25 }, { "accuracy_reward": 0.9895833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": NaN, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.0104166679084301, "action_level_variance/metric": 0.6368333101272583, "action_level_variance_full_gradient/metric": 79.3081283569336, "adam_stats/lr_effective_max": 9.055612463271245e-05, "adam_stats/lr_effective_mean": 1.0483521828419384e-09, "adam_stats/lr_effective_min": -9.179494372801855e-05, "adam_stats/m_t_max": 0.010593908838927746, "adam_stats/m_t_mean": 4.9421529368931516e-11, "adam_stats/m_t_min": -0.015550310723483562, "adam_stats/v_t_max": 0.00023301082546822727, "adam_stats/v_t_mean": 9.517537799541564e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.9895833730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.0104166679084301, "all_logprobs": -0.014741583727300167, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.25, "all_logprobs/p1": -0.38671875, "all_logprobs/p10": -7.724761962890625e-05, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.005218505859375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.02038617432117462, "clip_ratio": 0.0, "completion_length": 360.63543701171875, "completion_length/correct": 355.5473937988281, "completion_length/correct/max": 1007.0, "completion_length/correct/median": 307.0, "completion_length/correct/min": 184.0, "completion_length/correct/p25": 225.0, "completion_length/correct/p75": 416.0, "completion_length/correct/var": 29270.05859375, "completion_length/incorrect": 844.0, "completion_length/incorrect/max": 844.0, "completion_length/incorrect/median": 844.0, "completion_length/incorrect/min": 844.0, "completion_length/incorrect/p25": 844.0, "completion_length/incorrect/p75": 844.0, "completion_length/incorrect/var": NaN, "completion_length/max": 1007.0, "completion_length/median": 307.0, "completion_length/min": 184.0, "completion_length/p25": 225.5, "completion_length/p75": 421.5, "completion_length/var": 31447.220703125, "epoch": 0.0416, "feature_vector_variance/max_squared_error": 73454.7265625, "feature_vector_variance/metric": 27356.51953125, "generated_tokens/total": 1403960.0, "grad_norm": 4.00884485244751, "learning_rate": 1.3860360721173195e-05, "loss": -0.9896, "mean_logprobs": -0.0167236328125, "mean_logprobs/var": 0.00010347366333007812, "num_completions/total": 2496, "per_sentence_gradient_norm": 2.4967803955078125, "per_sentence_gradient_norm/max": 6.594139575958252, "per_sentence_gradient_norm/median": 2.1787614822387695, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.5360310077667236, "per_sentence_gradient_norm/p75": 3.313337564468384, "per_sentence_gradient_norm/p85": 3.815992832183838, "per_sentence_gradient_norm/p90": 4.177639961242676, "per_sentence_gradient_norm/p95": 4.821144104003906, "per_sentence_gradient_norm/p99": 5.438652038574219, "per_sentence_gradient_norm/var": 1.578698992729187, "per_token_feature_norm": 188.1514892578125, "per_token_feature_norm/max": 258.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 85.5, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 306.9408264160156, "per_token_full_gradient_variance/max_squared_error": 0.5435266494750977, "per_token_full_gradient_variance/variance": 0.0038966129068285227, "per_token_gradient_norm": 2.201805353164673, "per_token_gradient_norm/max": 272.4140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 283.07244873046875, "per_token_policy_error_norm": 0.008674344047904015, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.008350392803549767, "policy_entropy": 0.016354184597730637, "policy_entropy/max": 1.609375, "policy_entropy/median": 9.546056389808655e-09, "policy_entropy/min": 2.72405795836983e-18, "policy_entropy/p25": 5.0249582272954285e-11, "policy_entropy/p75": 2.3990869522094727e-06, "policy_entropy/var": 0.00797013659030199, "policy_error_vector_variance/max_squared_error": 1.990935206413269, "policy_error_vector_variance/metric": 0.008663717657327652, "policy_loss": -0.9895833730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.0104166679084301, "policy_sharpness": 9.509317398071289, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 2.9275732040405273, "reward": 0.9895833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.0104166679084301, "rewards/accuracy_reward": 0.9895833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.0104166679084301, "sentence_full_gradient_variance/max_squared_error": 2401.3671875, "sentence_full_gradient_variance/metric": 752.4237060546875, "sentence_full_gradient_variance/p75": 1081.8338623046875, "sentence_full_gradient_variance/p90": 1917.322265625, "sentence_full_gradient_variance/p95": 2401.30908203125, "sentence_full_gradient_variance/p99": 2401.343994140625, "state_level_variance/metric": 1.096391201019287, "state_level_variance_full_gradient/metric": 673.115478515625, "step": 26 }, { "accuracy_reward": 0.5833333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24561403691768646, "action_level_variance/metric": 1.4471197128295898, "action_level_variance_full_gradient/metric": 89.61497497558594, "adam_stats/lr_effective_max": 8.784762758295983e-05, "adam_stats/lr_effective_mean": 9.701149883412086e-10, "adam_stats/lr_effective_min": -8.876957144821063e-05, "adam_stats/m_t_max": 0.014243029057979584, "adam_stats/m_t_mean": 1.2816071814913954e-10, "adam_stats/m_t_min": -0.017556199803948402, "adam_stats/v_t_max": 0.00023404581588692963, "adam_stats/v_t_mean": 1.0027379100663314e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.5833333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.24561403691768646, "all_logprobs": -0.022611940279603004, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -14.75, "all_logprobs/p1": -0.6833202838897705, "all_logprobs/p10": -0.00015926361083984375, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.01416015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.046946510672569275, "clip_ratio": 0.0, "completion_length": 576.7708740234375, "completion_length/correct": 369.33929443359375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 354.0, "completion_length/correct/min": 147.0, "completion_length/correct/p25": 290.75, "completion_length/correct/p75": 470.5, "completion_length/correct/var": 22896.12109375, "completion_length/incorrect": 867.1749877929688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 236.0, "completion_length/incorrect/p25": 662.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 56670.30078125, "completion_length/max": 1024.0, "completion_length/median": 490.0, "completion_length/min": 147.0, "completion_length/p25": 328.5, "completion_length/p75": 1024.0, "completion_length/var": 97393.3828125, "epoch": 0.0432, "feature_vector_variance/max_squared_error": 139371.0, "feature_vector_variance/metric": 26782.052734375, "generated_tokens/total": 1459330.0, "grad_norm": 10.91492748260498, "learning_rate": 1.3717781794162813e-05, "loss": -0.5833, "mean_logprobs": -0.024169921875, "mean_logprobs/var": 0.00024127960205078125, "num_completions/total": 2592, "per_sentence_gradient_norm": 1.8247394561767578, "per_sentence_gradient_norm/max": 9.043802261352539, "per_sentence_gradient_norm/median": 1.1839463710784912, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.041207790374756, "per_sentence_gradient_norm/p85": 4.243175506591797, "per_sentence_gradient_norm/p90": 4.658143043518066, "per_sentence_gradient_norm/p95": 5.254398345947266, "per_sentence_gradient_norm/p99": 7.0375847816467285, "per_sentence_gradient_norm/var": 4.120632171630859, "per_token_feature_norm": 186.65892028808594, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 86.0, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 237.43939208984375, "per_token_full_gradient_variance/max_squared_error": 1.0267339944839478, "per_token_full_gradient_variance/variance": 0.001968286233022809, "per_token_gradient_norm": 1.2456127405166626, "per_token_gradient_norm/max": 332.3046875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 186.24591064453125, "per_token_policy_error_norm": 0.01280173473060131, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.013159386813640594, "policy_entropy": 0.021132269874215126, "policy_entropy/max": 3.015625, "policy_entropy/median": 1.3096723705530167e-08, "policy_entropy/min": 1.3688052427629493e-18, "policy_entropy/p25": 8.321876521222293e-11, "policy_entropy/p75": 2.9355287551879883e-06, "policy_entropy/var": 0.011721648275852203, "policy_error_vector_variance/max_squared_error": 1.9987188577651978, "policy_error_vector_variance/metric": 0.012796806171536446, "policy_loss": -0.5833333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.24561403691768646, "policy_sharpness": 9.421380043029785, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.5908546447753906, "reward": 0.5833333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24561403691768646, "rewards/accuracy_reward": 0.5833333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24561403691768646, "sentence_full_gradient_variance/max_squared_error": 2708.05615234375, "sentence_full_gradient_variance/metric": 1422.737060546875, "sentence_full_gradient_variance/p75": 1755.580078125, "sentence_full_gradient_variance/p90": 2072.52001953125, "sentence_full_gradient_variance/p95": 2541.230224609375, "sentence_full_gradient_variance/p99": 2565.19287109375, "state_level_variance/metric": 3.0670676231384277, "state_level_variance_full_gradient/metric": 1333.1217041015625, "step": 27 }, { "accuracy_reward": 0.625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2368421107530594, "action_level_variance/metric": 0.9236360788345337, "action_level_variance_full_gradient/metric": 136.30694580078125, "adam_stats/lr_effective_max": 8.801485819276422e-05, "adam_stats/lr_effective_mean": 8.593878386697895e-10, "adam_stats/lr_effective_min": -8.937768143368885e-05, "adam_stats/m_t_max": 0.026460465043783188, "adam_stats/m_t_mean": 1.2073118893507484e-10, "adam_stats/m_t_min": -0.02485637180507183, "adam_stats/v_t_max": 0.00024201240739785135, "adam_stats/v_t_mean": 1.0529578944873386e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.625, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.2368421107530594, "all_logprobs": -0.018841395154595375, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.25, "all_logprobs/p1": -0.474609375, "all_logprobs/p10": -7.486343383789062e-05, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.006744384765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03360382467508316, "clip_ratio": 0.0, "completion_length": 667.5208740234375, "completion_length/correct": 587.9500122070312, "completion_length/correct/max": 985.0, "completion_length/correct/median": 528.0, "completion_length/correct/min": 290.0, "completion_length/correct/p25": 424.0, "completion_length/correct/p75": 760.0, "completion_length/correct/var": 36901.47265625, "completion_length/incorrect": 800.138916015625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 898.0, "completion_length/incorrect/min": 289.0, "completion_length/incorrect/p25": 618.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 70863.890625, "completion_length/max": 1024.0, "completion_length/median": 667.0, "completion_length/min": 289.0, "completion_length/p25": 428.75, "completion_length/p75": 877.0, "completion_length/var": 59689.109375, "epoch": 0.0448, "feature_vector_variance/max_squared_error": 73363.015625, "feature_vector_variance/metric": 26472.69921875, "generated_tokens/total": 1523412.0, "grad_norm": 3.3645589351654053, "learning_rate": 1.3567627457812107e-05, "loss": -0.625, "mean_logprobs": -0.01953125, "mean_logprobs/var": 8.726119995117188e-05, "num_completions/total": 2688, "per_sentence_gradient_norm": 1.7296111583709717, "per_sentence_gradient_norm/max": 5.509543418884277, "per_sentence_gradient_norm/median": 2.0568175315856934, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.87499737739563, "per_sentence_gradient_norm/p85": 3.248425006866455, "per_sentence_gradient_norm/p90": 3.5654380321502686, "per_sentence_gradient_norm/p95": 4.19293212890625, "per_sentence_gradient_norm/p99": 5.427924633026123, "per_sentence_gradient_norm/var": 2.420577049255371, "per_token_feature_norm": 184.77166748046875, "per_token_feature_norm/max": 272.0, "per_token_feature_norm/median": 186.0, "per_token_feature_norm/min": 89.0, "per_token_feature_norm/p25": 177.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 208.05247497558594, "per_token_full_gradient_variance/max_squared_error": 1.0976908206939697, "per_token_full_gradient_variance/variance": 0.00223166448995471, "per_token_gradient_norm": 1.4609674215316772, "per_token_gradient_norm/max": 283.828125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 208.9548797607422, "per_token_policy_error_norm": 0.010943376459181309, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01144515909254551, "policy_entropy": 0.017950721085071564, "policy_entropy/max": 2.390625, "policy_entropy/median": 1.2631062418222427e-08, "policy_entropy/min": 1.4501204056993622e-18, "policy_entropy/p25": 8.86757334228605e-11, "policy_entropy/p75": 1.9818544387817383e-06, "policy_entropy/var": 0.009245148859918118, "policy_error_vector_variance/max_squared_error": 1.9962290525436401, "policy_error_vector_variance/metric": 0.010927196592092514, "policy_loss": -0.625, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.2368421107530594, "policy_sharpness": 9.484565734863281, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.1953909397125244, "reward": 0.625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2368421107530594, "rewards/accuracy_reward": 0.625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2368421107530594, "sentence_full_gradient_variance/max_squared_error": 3192.892822265625, "sentence_full_gradient_variance/metric": 863.4285888671875, "sentence_full_gradient_variance/p75": 1054.3798828125, "sentence_full_gradient_variance/p90": 2080.06298828125, "sentence_full_gradient_variance/p95": 2138.84521484375, "sentence_full_gradient_variance/p99": 2359.49609375, "state_level_variance/metric": 1.7314701080322266, "state_level_variance_full_gradient/metric": 727.12158203125, "step": 28 }, { "accuracy_reward": 0.7604166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18410088121891022, "action_level_variance/metric": 1.257580280303955, "action_level_variance_full_gradient/metric": 206.33750915527344, "adam_stats/lr_effective_max": 8.701244223630056e-05, "adam_stats/lr_effective_mean": 7.530863710414337e-10, "adam_stats/lr_effective_min": -8.682946645421907e-05, "adam_stats/m_t_max": 0.01436476781964302, "adam_stats/m_t_mean": 7.564562171102907e-11, "adam_stats/m_t_min": -0.0130698811262846, "adam_stats/v_t_max": 0.00025058427127078176, "adam_stats/v_t_mean": 1.1040463682165935e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.7604166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.18410088121891022, "all_logprobs": -0.017782343551516533, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.25, "all_logprobs/p1": -0.47638678550720215, "all_logprobs/p10": -8.158688433468342e-05, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.0079345703125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.030215494334697723, "clip_ratio": 0.0, "completion_length": 550.1041870117188, "completion_length/correct": 493.75341796875, "completion_length/correct/max": 993.0, "completion_length/correct/median": 394.0, "completion_length/correct/min": 212.0, "completion_length/correct/p25": 363.0, "completion_length/correct/p75": 670.0, "completion_length/correct/var": 44077.71484375, "completion_length/incorrect": 728.95654296875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 895.0, "completion_length/incorrect/min": 307.0, "completion_length/incorrect/p25": 376.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 96888.6796875, "completion_length/max": 1024.0, "completion_length/median": 403.0, "completion_length/min": 212.0, "completion_length/p25": 359.5, "completion_length/p75": 794.5, "completion_length/var": 66028.1953125, "epoch": 0.0464, "feature_vector_variance/max_squared_error": 88320.7578125, "feature_vector_variance/metric": 26921.603515625, "generated_tokens/total": 1576222.0, "grad_norm": 6.3659772872924805, "learning_rate": 1.3410080652050414e-05, "loss": -0.7604, "mean_logprobs": -0.01904296875, "mean_logprobs/var": 0.00014591217041015625, "num_completions/total": 2784, "per_sentence_gradient_norm": 2.12886905670166, "per_sentence_gradient_norm/max": 7.800070285797119, "per_sentence_gradient_norm/median": 2.0360522270202637, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.8474001884460449, "per_sentence_gradient_norm/p75": 3.0047218799591064, "per_sentence_gradient_norm/p85": 3.6109795570373535, "per_sentence_gradient_norm/p90": 4.282419204711914, "per_sentence_gradient_norm/p95": 5.028200626373291, "per_sentence_gradient_norm/p99": 7.461233615875244, "per_sentence_gradient_norm/var": 2.914354085922241, "per_token_feature_norm": 186.08538818359375, "per_token_feature_norm/max": 252.0, "per_token_feature_norm/median": 186.0, "per_token_feature_norm/min": 87.0, "per_token_feature_norm/p25": 178.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 242.13796997070312, "per_token_full_gradient_variance/max_squared_error": 0.6330313086509705, "per_token_full_gradient_variance/variance": 0.0029894241597503424, "per_token_gradient_norm": 1.79819655418396, "per_token_gradient_norm/max": 304.0234375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 241.72561645507812, "per_token_policy_error_norm": 0.010298860259354115, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010117635130882263, "policy_entropy": 0.01821892149746418, "policy_entropy/max": 1.8359375, "policy_entropy/median": 2.4330802261829376e-08, "policy_entropy/min": 3.409307612698559e-20, "policy_entropy/p25": 1.3915268937125802e-10, "policy_entropy/p75": 3.1888484954833984e-06, "policy_entropy/var": 0.009343491867184639, "policy_error_vector_variance/max_squared_error": 1.9997332096099854, "policy_error_vector_variance/metric": 0.010295568034052849, "policy_loss": -0.7604166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.18410088121891022, "policy_sharpness": 9.472848892211914, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.265812873840332, "reward": 0.7604166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18410088121891022, "rewards/accuracy_reward": 0.7604166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18410088121891022, "sentence_full_gradient_variance/max_squared_error": 3589.034423828125, "sentence_full_gradient_variance/metric": 860.95068359375, "sentence_full_gradient_variance/p75": 1155.17626953125, "sentence_full_gradient_variance/p90": 1651.139404296875, "sentence_full_gradient_variance/p95": 2190.608154296875, "sentence_full_gradient_variance/p99": 2572.209228515625, "state_level_variance/metric": 1.9457600116729736, "state_level_variance_full_gradient/metric": 654.61328125, "step": 29 }, { "accuracy_reward": 0.6041666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24166668951511383, "action_level_variance/metric": 1.9296703338623047, "action_level_variance_full_gradient/metric": 299.5917663574219, "adam_stats/lr_effective_max": 8.752837311476469e-05, "adam_stats/lr_effective_mean": 8.550704588827784e-10, "adam_stats/lr_effective_min": -8.478518429910764e-05, "adam_stats/m_t_max": 0.011696604080498219, "adam_stats/m_t_mean": 2.1790473939131694e-11, "adam_stats/m_t_min": -0.011218108236789703, "adam_stats/v_t_max": 0.00025494411238469183, "adam_stats/v_t_mean": 1.1528964415086218e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6041666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.24166668951511383, "all_logprobs": -0.019954096525907516, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.5, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.00010728836059570312, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.008630365133285522, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03402724117040634, "clip_ratio": 0.0, "completion_length": 508.26043701171875, "completion_length/correct": 369.9137878417969, "completion_length/correct/max": 899.0, "completion_length/correct/median": 355.0, "completion_length/correct/min": 140.0, "completion_length/correct/p25": 249.25, "completion_length/correct/p75": 458.5, "completion_length/correct/var": 29045.16796875, "completion_length/incorrect": 719.4210815429688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 849.0, "completion_length/incorrect/min": 197.0, "completion_length/incorrect/p25": 383.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 105211.5546875, "completion_length/max": 1024.0, "completion_length/median": 376.0, "completion_length/min": 140.0, "completion_length/p25": 273.75, "completion_length/p75": 723.25, "completion_length/var": 87925.09375, "epoch": 0.048, "feature_vector_variance/max_squared_error": 72978.7265625, "feature_vector_variance/metric": 27181.078125, "generated_tokens/total": 1625015.0, "grad_norm": 3.0921785831451416, "learning_rate": 1.3245333323392335e-05, "loss": -0.6042, "mean_logprobs": -0.02197265625, "mean_logprobs/var": 0.0001850128173828125, "num_completions/total": 2880, "per_sentence_gradient_norm": 1.835782527923584, "per_sentence_gradient_norm/max": 9.798725128173828, "per_sentence_gradient_norm/median": 1.6023173332214355, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.0446419715881348, "per_sentence_gradient_norm/p85": 3.727512836456299, "per_sentence_gradient_norm/p90": 4.386249542236328, "per_sentence_gradient_norm/p95": 5.282849311828613, "per_sentence_gradient_norm/p99": 6.080296039581299, "per_sentence_gradient_norm/var": 3.651338815689087, "per_token_feature_norm": 185.9906768798828, "per_token_feature_norm/max": 253.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 92.5, "per_token_feature_norm/p25": 178.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 261.4459533691406, "per_token_full_gradient_variance/max_squared_error": 0.5219939947128296, "per_token_full_gradient_variance/variance": 0.0021286727860569954, "per_token_gradient_norm": 1.2932764291763306, "per_token_gradient_norm/max": 311.8515625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 182.62217712402344, "per_token_policy_error_norm": 0.011331191286444664, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011262917891144753, "policy_entropy": 0.02004864811897278, "policy_entropy/max": 2.15625, "policy_entropy/median": 2.6309862732887268e-08, "policy_entropy/min": 5.285485590866834e-18, "policy_entropy/p25": 2.1827872842550278e-10, "policy_entropy/p75": 3.680586814880371e-06, "policy_entropy/var": 0.011971303261816502, "policy_error_vector_variance/max_squared_error": 1.9980707168579102, "policy_error_vector_variance/metric": 0.011325339786708355, "policy_loss": -0.6041666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.24166668951511383, "policy_sharpness": 9.456090927124023, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.365805149078369, "reward": 0.6041666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24166668951511383, "rewards/accuracy_reward": 0.6041666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24166668951511383, "sentence_full_gradient_variance/max_squared_error": 3183.654296875, "sentence_full_gradient_variance/metric": 1592.981201171875, "sentence_full_gradient_variance/p75": 2080.685302734375, "sentence_full_gradient_variance/p90": 2080.685302734375, "sentence_full_gradient_variance/p95": 2527.40283203125, "sentence_full_gradient_variance/p99": 3171.2822265625, "state_level_variance/metric": 2.099828004837036, "state_level_variance_full_gradient/metric": 1293.38916015625, "step": 30 }, { "accuracy_reward": 0.7083333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20877192914485931, "action_level_variance/metric": 1.8601621389389038, "action_level_variance_full_gradient/metric": 289.99993896484375, "adam_stats/lr_effective_max": 8.657100988784805e-05, "adam_stats/lr_effective_mean": 1.0249051607402748e-09, "adam_stats/lr_effective_min": -8.314933074871078e-05, "adam_stats/m_t_max": 0.008643757551908493, "adam_stats/m_t_mean": 1.7952700090417828e-11, "adam_stats/m_t_min": -0.012116923928260803, "adam_stats/v_t_max": 0.0002554922830313444, "adam_stats/v_t_mean": 1.2032409790752041e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.7083333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.20877192914485931, "all_logprobs": -0.019871758297085762, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.0, "all_logprobs/p1": -0.516171932220459, "all_logprobs/p10": -0.00010347366333007812, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.00860595703125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03161460906267166, "clip_ratio": 0.0, "completion_length": 485.35418701171875, "completion_length/correct": 442.48529052734375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 420.0, "completion_length/correct/min": 207.0, "completion_length/correct/p25": 315.75, "completion_length/correct/p75": 485.25, "completion_length/correct/var": 38119.8984375, "completion_length/incorrect": 589.4642944335938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 521.0, "completion_length/incorrect/min": 269.0, "completion_length/incorrect/p25": 368.75, "completion_length/incorrect/p75": 779.25, "completion_length/incorrect/var": 60140.77734375, "completion_length/max": 1024.0, "completion_length/median": 433.0, "completion_length/min": 207.0, "completion_length/p25": 321.25, "completion_length/p75": 582.5, "completion_length/var": 48487.26953125, "epoch": 0.0496, "feature_vector_variance/max_squared_error": 70752.5078125, "feature_vector_variance/metric": 27380.751953125, "generated_tokens/total": 1671609.0, "grad_norm": 3.273651361465454, "learning_rate": 1.3073586191080456e-05, "loss": -0.7083, "mean_logprobs": -0.0206298828125, "mean_logprobs/var": 0.00021076202392578125, "num_completions/total": 2976, "per_sentence_gradient_norm": 1.8186848163604736, "per_sentence_gradient_norm/max": 8.127693176269531, "per_sentence_gradient_norm/median": 1.3861982822418213, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.7709343433380127, "per_sentence_gradient_norm/p85": 3.9293007850646973, "per_sentence_gradient_norm/p90": 4.430187225341797, "per_sentence_gradient_norm/p95": 5.410035133361816, "per_sentence_gradient_norm/p99": 6.827005386352539, "per_sentence_gradient_norm/var": 3.540353775024414, "per_token_feature_norm": 185.3906707763672, "per_token_feature_norm/max": 264.0, "per_token_feature_norm/median": 186.0, "per_token_feature_norm/min": 87.0, "per_token_feature_norm/p25": 178.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 245.0056915283203, "per_token_full_gradient_variance/max_squared_error": 0.6095245480537415, "per_token_full_gradient_variance/variance": 0.0024720404762774706, "per_token_gradient_norm": 1.6400384902954102, "per_token_gradient_norm/max": 286.2421875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 217.64251708984375, "per_token_policy_error_norm": 0.01155321579426527, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011568731628358364, "policy_entropy": 0.019837645813822746, "policy_entropy/max": 2.171875, "policy_entropy/median": 2.4796463549137115e-08, "policy_entropy/min": 8.40256683676266e-18, "policy_entropy/p25": 1.9281287677586079e-10, "policy_entropy/p75": 3.203749656677246e-06, "policy_entropy/var": 0.010579580441117287, "policy_error_vector_variance/max_squared_error": 1.982302188873291, "policy_error_vector_variance/metric": 0.011543696746230125, "policy_loss": -0.7083333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.20877192914485931, "policy_sharpness": 9.46081256866455, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.352494239807129, "reward": 0.7083333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20877192914485931, "rewards/accuracy_reward": 0.7083333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20877192914485931, "sentence_full_gradient_variance/max_squared_error": 2867.597900390625, "sentence_full_gradient_variance/metric": 1142.447265625, "sentence_full_gradient_variance/p75": 1525.8350830078125, "sentence_full_gradient_variance/p90": 1706.2930908203125, "sentence_full_gradient_variance/p95": 2706.23291015625, "sentence_full_gradient_variance/p99": 2867.597900390625, "state_level_variance/metric": 2.046363592147827, "state_level_variance_full_gradient/metric": 852.4473876953125, "step": 31 }, { "accuracy_reward": 0.6875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21710526943206787, "action_level_variance/metric": 1.5718516111373901, "action_level_variance_full_gradient/metric": 179.75120544433594, "adam_stats/lr_effective_max": 8.48375420900993e-05, "adam_stats/lr_effective_mean": 8.493385994512437e-10, "adam_stats/lr_effective_min": -8.303120557684451e-05, "adam_stats/m_t_max": 0.01036232989281416, "adam_stats/m_t_mean": 2.138742481727629e-11, "adam_stats/m_t_min": -0.013431834988296032, "adam_stats/v_t_max": 0.00025626071146689355, "adam_stats/v_t_mean": 1.2490317807811735e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6875, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.21710526943206787, "all_logprobs": -0.017239084467291832, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.875, "all_logprobs/p1": -0.474609375, "all_logprobs/p10": -3.528594970703125e-05, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.00445556640625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.029167689383029938, "clip_ratio": 0.0, "completion_length": 567.1146240234375, "completion_length/correct": 545.4242553710938, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 570.0, "completion_length/correct/min": 169.0, "completion_length/correct/p25": 439.0, "completion_length/correct/p75": 624.0, "completion_length/correct/var": 31165.017578125, "completion_length/incorrect": 614.8333740234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 335.0, "completion_length/incorrect/min": 211.0, "completion_length/incorrect/p25": 260.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 135089.65625, "completion_length/max": 1024.0, "completion_length/median": 570.0, "completion_length/min": 169.0, "completion_length/p25": 388.75, "completion_length/p75": 701.75, "completion_length/var": 63607.26953125, "epoch": 0.0512, "feature_vector_variance/max_squared_error": 136235.015625, "feature_vector_variance/metric": 27161.142578125, "generated_tokens/total": 1726052.0, "grad_norm": 1.8158560991287231, "learning_rate": 1.2895048502539883e-05, "loss": -0.6875, "mean_logprobs": -0.0189208984375, "mean_logprobs/var": 0.0001811981201171875, "num_completions/total": 3072, "per_sentence_gradient_norm": 1.6766389608383179, "per_sentence_gradient_norm/max": 9.0156888961792, "per_sentence_gradient_norm/median": 1.437819242477417, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.5084080696105957, "per_sentence_gradient_norm/p85": 2.8793272972106934, "per_sentence_gradient_norm/p90": 3.441074848175049, "per_sentence_gradient_norm/p95": 4.533603191375732, "per_sentence_gradient_norm/p99": 7.173570156097412, "per_sentence_gradient_norm/var": 2.9037222862243652, "per_token_feature_norm": 184.81605529785156, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 186.0, "per_token_feature_norm/min": 90.0, "per_token_feature_norm/p25": 177.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 266.7093505859375, "per_token_full_gradient_variance/max_squared_error": 1.6543262004852295, "per_token_full_gradient_variance/variance": 0.0027188840322196484, "per_token_gradient_norm": 1.5426067113876343, "per_token_gradient_norm/max": 344.09375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 218.62698364257812, "per_token_policy_error_norm": 0.00971418060362339, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.00942198932170868, "policy_entropy": 0.0178788211196661, "policy_entropy/max": 3.140625, "policy_entropy/median": 6.05359673500061e-09, "policy_entropy/min": 1.5924219408380846e-19, "policy_entropy/p25": 3.865352482534945e-11, "policy_entropy/p75": 1.0132789611816406e-06, "policy_entropy/var": 0.01124450284987688, "policy_error_vector_variance/max_squared_error": 2.000464916229248, "policy_error_vector_variance/metric": 0.009705682285130024, "policy_loss": -0.6875, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.21710526943206787, "policy_sharpness": 9.52500057220459, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 2.9705467224121094, "reward": 0.6875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21710526943206787, "rewards/accuracy_reward": 0.6875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21710526943206787, "sentence_full_gradient_variance/max_squared_error": 3270.5947265625, "sentence_full_gradient_variance/metric": 694.8263549804688, "sentence_full_gradient_variance/p75": 850.40234375, "sentence_full_gradient_variance/p90": 1444.830078125, "sentence_full_gradient_variance/p95": 1564.456787109375, "sentence_full_gradient_variance/p99": 3077.501953125, "state_level_variance/metric": 1.6342962980270386, "state_level_variance_full_gradient/metric": 515.0751342773438, "step": 32 }, { "accuracy_reward": 0.6458333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2311403751373291, "action_level_variance/metric": 0.34633347392082214, "action_level_variance_full_gradient/metric": 86.98683166503906, "adam_stats/lr_effective_max": 8.007795258890837e-05, "adam_stats/lr_effective_mean": 6.129771135121587e-10, "adam_stats/lr_effective_min": -8.039874228416011e-05, "adam_stats/m_t_max": 0.0076311733573675156, "adam_stats/m_t_mean": 4.2216598272748485e-11, "adam_stats/m_t_min": -0.013495183549821377, "adam_stats/v_t_max": 0.0002578298153821379, "adam_stats/v_t_mean": 1.2783556331552592e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6458333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.2311403751373291, "all_logprobs": -0.014920377172529697, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.25, "all_logprobs/p1": -0.38671875, "all_logprobs/p10": -1.6689300537109375e-05, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.00193023681640625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03140197694301605, "clip_ratio": 0.0, "completion_length": 577.5625, "completion_length/correct": 530.3870849609375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 521.0, "completion_length/correct/min": 258.0, "completion_length/correct/p25": 310.0, "completion_length/correct/p75": 666.0, "completion_length/correct/var": 47460.05078125, "completion_length/incorrect": 663.5882568359375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 673.0, "completion_length/incorrect/min": 98.0, "completion_length/incorrect/p25": 563.0, "completion_length/incorrect/p75": 874.25, "completion_length/incorrect/var": 85620.125, "completion_length/max": 1024.0, "completion_length/median": 625.0, "completion_length/min": 98.0, "completion_length/p25": 324.0, "completion_length/p75": 791.0, "completion_length/var": 64317.08984375, "epoch": 0.0528, "feature_vector_variance/max_squared_error": 105142.7734375, "feature_vector_variance/metric": 27117.4609375, "generated_tokens/total": 1781498.0, "grad_norm": 0.8354498147964478, "learning_rate": 1.270993777844248e-05, "loss": -0.6458, "mean_logprobs": -0.0205078125, "mean_logprobs/var": 0.00133514404296875, "num_completions/total": 3168, "per_sentence_gradient_norm": 0.9701495170593262, "per_sentence_gradient_norm/max": 4.082172393798828, "per_sentence_gradient_norm/median": 0.7674193978309631, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 1.5565677881240845, "per_sentence_gradient_norm/p85": 2.0697574615478516, "per_sentence_gradient_norm/p90": 2.379206657409668, "per_sentence_gradient_norm/p95": 2.7240817546844482, "per_sentence_gradient_norm/p99": 3.019512414932251, "per_sentence_gradient_norm/var": 0.9130706191062927, "per_token_feature_norm": 186.41477966308594, "per_token_feature_norm/max": 268.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 86.5, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 248.46714782714844, "per_token_full_gradient_variance/max_squared_error": 0.782799482345581, "per_token_full_gradient_variance/variance": 0.0015344582498073578, "per_token_gradient_norm": 0.962958037853241, "per_token_gradient_norm/max": 325.875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 130.69952392578125, "per_token_policy_error_norm": 0.008290547877550125, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.008459961973130703, "policy_entropy": 0.014095034450292587, "policy_entropy/max": 3.328125, "policy_entropy/median": 1.0593794286251068e-08, "policy_entropy/min": 4.797594613248357e-18, "policy_entropy/p25": 7.548806024715304e-11, "policy_entropy/p75": 1.30385160446167e-06, "policy_entropy/var": 0.007446228060871363, "policy_error_vector_variance/max_squared_error": 1.9989932775497437, "policy_error_vector_variance/metric": 0.00828042533248663, "policy_loss": -0.6458333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.2311403751373291, "policy_sharpness": 9.592808723449707, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 2.536665439605713, "reward": 0.6458333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2311403751373291, "rewards/accuracy_reward": 0.6458333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2311403751373291, "sentence_full_gradient_variance/max_squared_error": 2546.066650390625, "sentence_full_gradient_variance/metric": 1022.720703125, "sentence_full_gradient_variance/p75": 1333.9453125, "sentence_full_gradient_variance/p90": 1863.47216796875, "sentence_full_gradient_variance/p95": 2147.640869140625, "sentence_full_gradient_variance/p99": 2480.24609375, "state_level_variance/metric": 0.6551101803779602, "state_level_variance_full_gradient/metric": 935.7340087890625, "step": 33 }, { "accuracy_reward": 0.8958333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.09429825097322464, "action_level_variance/metric": 0.7859346270561218, "action_level_variance_full_gradient/metric": 61.24829864501953, "adam_stats/lr_effective_max": 7.946848927531391e-05, "adam_stats/lr_effective_mean": 9.361230679516552e-10, "adam_stats/lr_effective_min": -8.038422674871981e-05, "adam_stats/m_t_max": 0.008761203847825527, "adam_stats/m_t_mean": 6.653232131892395e-11, "adam_stats/m_t_min": -0.028281202539801598, "adam_stats/v_t_max": 0.0002588233910501003, "adam_stats/v_t_mean": 1.31950032497552e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.8958333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.09429825097322464, "all_logprobs": -0.01469393353909254, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.5625, "all_logprobs/p1": -0.35451173782348633, "all_logprobs/p10": -1.0371208190917969e-05, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.00193023681640625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.024739595130085945, "clip_ratio": 0.0, "completion_length": 517.1875, "completion_length/correct": 493.2209167480469, "completion_length/correct/max": 1017.0, "completion_length/correct/median": 412.0, "completion_length/correct/min": 235.0, "completion_length/correct/p25": 352.0, "completion_length/correct/p75": 593.0, "completion_length/correct/var": 50892.40625, "completion_length/incorrect": 723.2999877929688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 710.0, "completion_length/incorrect/min": 464.0, "completion_length/incorrect/p25": 508.0, "completion_length/incorrect/p75": 864.0, "completion_length/incorrect/var": 46559.34765625, "completion_length/max": 1024.0, "completion_length/median": 446.0, "completion_length/min": 235.0, "completion_length/p25": 355.25, "completion_length/p75": 693.5, "completion_length/var": 54938.00390625, "epoch": 0.0544, "feature_vector_variance/max_squared_error": 72977.328125, "feature_vector_variance/metric": 26644.759765625, "generated_tokens/total": 1831148.0, "grad_norm": 1.766753077507019, "learning_rate": 1.2518479547691437e-05, "loss": -0.8958, "mean_logprobs": -0.014404296875, "mean_logprobs/var": 9.34600830078125e-05, "num_completions/total": 3264, "per_sentence_gradient_norm": 1.795206069946289, "per_sentence_gradient_norm/max": 4.932981967926025, "per_sentence_gradient_norm/median": 1.7020817995071411, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.049083948135376, "per_sentence_gradient_norm/p75": 2.411003589630127, "per_sentence_gradient_norm/p85": 2.8466434478759766, "per_sentence_gradient_norm/p90": 3.337432384490967, "per_sentence_gradient_norm/p95": 3.920246124267578, "per_sentence_gradient_norm/p99": 4.7273850440979, "per_sentence_gradient_norm/var": 1.338384985923767, "per_token_feature_norm": 185.96661376953125, "per_token_feature_norm/max": 270.0, "per_token_feature_norm/median": 186.0, "per_token_feature_norm/min": 85.5, "per_token_feature_norm/p25": 178.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 232.32669067382812, "per_token_full_gradient_variance/max_squared_error": 0.6006714105606079, "per_token_full_gradient_variance/variance": 0.002776314737275243, "per_token_gradient_norm": 1.6823641061782837, "per_token_gradient_norm/max": 270.0703125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 230.31307983398438, "per_token_policy_error_norm": 0.00861609447747469, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.008725421503186226, "policy_entropy": 0.014314430765807629, "policy_entropy/max": 3.0625, "policy_entropy/median": 3.725290298461914e-09, "policy_entropy/min": 4.824699667560495e-18, "policy_entropy/p25": 3.092281986027956e-11, "policy_entropy/p75": 5.802139639854431e-07, "policy_entropy/var": 0.007495641242712736, "policy_error_vector_variance/max_squared_error": 1.9973385334014893, "policy_error_vector_variance/metric": 0.008609125390648842, "policy_loss": -0.8958333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.09429825097322464, "policy_sharpness": 9.591099739074707, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 2.558938503265381, "reward": 0.8958333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.09429825097322464, "rewards/accuracy_reward": 0.8958333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.09429825097322464, "sentence_full_gradient_variance/max_squared_error": 2917.967041015625, "sentence_full_gradient_variance/metric": 845.6061401367188, "sentence_full_gradient_variance/p75": 1338.2662353515625, "sentence_full_gradient_variance/p90": 1717.5849609375, "sentence_full_gradient_variance/p95": 2310.277587890625, "sentence_full_gradient_variance/p99": 2917.967041015625, "state_level_variance/metric": 0.6946369409561157, "state_level_variance_full_gradient/metric": 784.3577880859375, "step": 34 }, { "accuracy_reward": 0.71875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20427630841732025, "action_level_variance/metric": 1.7124385833740234, "action_level_variance_full_gradient/metric": 303.2326354980469, "adam_stats/lr_effective_max": 7.74314466980286e-05, "adam_stats/lr_effective_mean": 4.367887718181862e-10, "adam_stats/lr_effective_min": -7.754366379231215e-05, "adam_stats/m_t_max": 0.009185132570564747, "adam_stats/m_t_mean": 6.320160367279115e-11, "adam_stats/m_t_min": -0.019243303686380386, "adam_stats/v_t_max": 0.00025916658341884613, "adam_stats/v_t_mean": 1.3452901986843457e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.71875, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.20427630841732025, "all_logprobs": -0.02091512456536293, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.96875, "all_logprobs/p1": -0.6020312309265137, "all_logprobs/p10": -0.000301361083984375, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.01618652045726776, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.02881605364382267, "clip_ratio": 0.0, "completion_length": 408.21875, "completion_length/correct": 357.5797119140625, "completion_length/correct/max": 780.0, "completion_length/correct/median": 327.0, "completion_length/correct/min": 206.0, "completion_length/correct/p25": 259.0, "completion_length/correct/p75": 431.0, "completion_length/correct/var": 18555.896484375, "completion_length/incorrect": 537.629638671875, "completion_length/incorrect/max": 977.0, "completion_length/incorrect/median": 483.0, "completion_length/incorrect/min": 240.0, "completion_length/incorrect/p25": 397.0, "completion_length/incorrect/p75": 729.0, "completion_length/incorrect/var": 44048.859375, "completion_length/max": 977.0, "completion_length/median": 379.0, "completion_length/min": 206.0, "completion_length/p25": 270.0, "completion_length/p75": 477.0, "completion_length/var": 31959.81640625, "epoch": 0.056, "feature_vector_variance/max_squared_error": 82079.1015625, "feature_vector_variance/metric": 28383.078125, "generated_tokens/total": 1870337.0, "grad_norm": 0.9013814330101013, "learning_rate": 1.2320907072649045e-05, "loss": -0.7188, "mean_logprobs": -0.0208740234375, "mean_logprobs/var": 0.000194549560546875, "num_completions/total": 3360, "per_sentence_gradient_norm": 2.004556179046631, "per_sentence_gradient_norm/max": 7.032506942749023, "per_sentence_gradient_norm/median": 1.4785540103912354, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.2714500427246094, "per_sentence_gradient_norm/p85": 4.340193271636963, "per_sentence_gradient_norm/p90": 5.0398993492126465, "per_sentence_gradient_norm/p95": 6.022871971130371, "per_sentence_gradient_norm/p99": 6.877802848815918, "per_sentence_gradient_norm/var": 3.8658902645111084, "per_token_feature_norm": 186.93377685546875, "per_token_feature_norm/max": 250.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 89.0, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 234.4046630859375, "per_token_full_gradient_variance/max_squared_error": 0.6022670865058899, "per_token_full_gradient_variance/variance": 0.0027845154982060194, "per_token_gradient_norm": 1.7672148942947388, "per_token_gradient_norm/max": 294.890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 243.97691345214844, "per_token_policy_error_norm": 0.012556786648929119, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011922289617359638, "policy_entropy": 0.02219315990805626, "policy_entropy/max": 1.6171875, "policy_entropy/median": 6.565824151039124e-08, "policy_entropy/min": 9.275010772434589e-20, "policy_entropy/p25": 4.4019543565809727e-10, "policy_entropy/p75": 9.357929229736328e-06, "policy_entropy/var": 0.01129695400595665, "policy_error_vector_variance/max_squared_error": 1.9746392965316772, "policy_error_vector_variance/metric": 0.01254983339458704, "policy_loss": -0.71875, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.20427630841732025, "policy_sharpness": 9.374055862426758, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.8333792686462402, "reward": 0.71875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20427630841732025, "rewards/accuracy_reward": 0.71875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20427630841732025, "sentence_full_gradient_variance/max_squared_error": 2375.946533203125, "sentence_full_gradient_variance/metric": 1030.4912109375, "sentence_full_gradient_variance/p75": 1931.687744140625, "sentence_full_gradient_variance/p90": 1931.687744140625, "sentence_full_gradient_variance/p95": 2042.7740478515625, "sentence_full_gradient_variance/p99": 2375.946533203125, "state_level_variance/metric": 2.538804054260254, "state_level_variance_full_gradient/metric": 727.258544921875, "step": 35 }, { "accuracy_reward": 0.7395833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19462722539901733, "action_level_variance/metric": 1.8349111080169678, "action_level_variance_full_gradient/metric": 168.90966796875, "adam_stats/lr_effective_max": 7.65349977882579e-05, "adam_stats/lr_effective_mean": 4.701440348142683e-10, "adam_stats/lr_effective_min": -7.669923797948286e-05, "adam_stats/m_t_max": 0.005803663283586502, "adam_stats/m_t_mean": 5.160509172763561e-11, "adam_stats/m_t_min": -0.018356570973992348, "adam_stats/v_t_max": 0.00025931309210136533, "adam_stats/v_t_mean": 1.3601855752831682e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.7395833730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.19462722539901733, "all_logprobs": -0.015408234670758247, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.25, "all_logprobs/p1": -0.474609375, "all_logprobs/p10": -2.777576446533203e-05, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.0034332275390625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.021206041797995567, "clip_ratio": 0.0, "completion_length": 586.8333740234375, "completion_length/correct": 502.57745361328125, "completion_length/correct/max": 999.0, "completion_length/correct/median": 440.0, "completion_length/correct/min": 135.0, "completion_length/correct/p25": 387.5, "completion_length/correct/p75": 648.5, "completion_length/correct/var": 36747.3359375, "completion_length/incorrect": 826.1199951171875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 270.0, "completion_length/incorrect/p25": 691.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 62972.19140625, "completion_length/max": 1024.0, "completion_length/median": 534.0, "completion_length/min": 135.0, "completion_length/p25": 399.0, "completion_length/p75": 748.0, "completion_length/var": 63359.27734375, "epoch": 0.0576, "feature_vector_variance/max_squared_error": 73943.28125, "feature_vector_variance/metric": 26808.544921875, "generated_tokens/total": 1926673.0, "grad_norm": 0.698124349117279, "learning_rate": 1.2117461064942437e-05, "loss": -0.7396, "mean_logprobs": -0.015869140625, "mean_logprobs/var": 0.00012683868408203125, "num_completions/total": 3456, "per_sentence_gradient_norm": 1.6233198642730713, "per_sentence_gradient_norm/max": 7.694370746612549, "per_sentence_gradient_norm/median": 1.3941134214401245, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.260282516479492, "per_sentence_gradient_norm/p85": 2.8443260192871094, "per_sentence_gradient_norm/p90": 3.7133469581604004, "per_sentence_gradient_norm/p95": 4.6426520347595215, "per_sentence_gradient_norm/p99": 6.463584899902344, "per_sentence_gradient_norm/var": 2.398916721343994, "per_token_feature_norm": 186.9038848876953, "per_token_feature_norm/max": 258.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 92.5, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 251.80043029785156, "per_token_full_gradient_variance/max_squared_error": 0.5277659893035889, "per_token_full_gradient_variance/variance": 0.0022399097215384245, "per_token_gradient_norm": 1.3830630779266357, "per_token_gradient_norm/max": 282.09375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 189.1575164794922, "per_token_policy_error_norm": 0.009252555668354034, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.00857929140329361, "policy_entropy": 0.016522295773029327, "policy_entropy/max": 1.734375, "policy_entropy/median": 3.987224772572517e-09, "policy_entropy/min": 1.3891340334970526e-18, "policy_entropy/p25": 5.093170329928398e-11, "policy_entropy/p75": 7.227063179016113e-07, "policy_entropy/var": 0.008820260874927044, "policy_error_vector_variance/max_squared_error": 1.9968246221542358, "policy_error_vector_variance/metric": 0.00924818217754364, "policy_loss": -0.7395833730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.19462722539901733, "policy_sharpness": 9.545028686523438, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 2.8434908390045166, "reward": 0.7395833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19462722539901733, "rewards/accuracy_reward": 0.7395833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19462722539901733, "sentence_full_gradient_variance/max_squared_error": 2614.217041015625, "sentence_full_gradient_variance/metric": 772.154296875, "sentence_full_gradient_variance/p75": 1059.10888671875, "sentence_full_gradient_variance/p90": 1457.83056640625, "sentence_full_gradient_variance/p95": 1587.30859375, "sentence_full_gradient_variance/p99": 2518.169189453125, "state_level_variance/metric": 0.838233232498169, "state_level_variance_full_gradient/metric": 603.24462890625, "step": 36 }, { "accuracy_reward": 0.7291666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19956143200397491, "action_level_variance/metric": 0.9345414638519287, "action_level_variance_full_gradient/metric": 104.6841049194336, "adam_stats/lr_effective_max": 7.386595098068938e-05, "adam_stats/lr_effective_mean": 2.2931363752309863e-10, "adam_stats/lr_effective_min": -7.295924297068268e-05, "adam_stats/m_t_max": 0.008287262171506882, "adam_stats/m_t_mean": 5.365844921167984e-11, "adam_stats/m_t_min": -0.017393717542290688, "adam_stats/v_t_max": 0.0002608791401144117, "adam_stats/v_t_mean": 1.377341036362667e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.7291666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.19956143200397491, "all_logprobs": -0.01769796758890152, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.65625, "all_logprobs/p1": -0.49847662448883057, "all_logprobs/p10": -7.972703315317631e-05, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.00860595703125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.024989623576402664, "clip_ratio": 0.0, "completion_length": 566.4479370117188, "completion_length/correct": 497.8714294433594, "completion_length/correct/max": 887.0, "completion_length/correct/median": 459.0, "completion_length/correct/min": 257.0, "completion_length/correct/p25": 375.25, "completion_length/correct/p75": 621.75, "completion_length/correct/var": 28483.3359375, "completion_length/incorrect": 751.0769653320312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 694.0, "completion_length/incorrect/min": 393.0, "completion_length/incorrect/p25": 562.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 47939.515625, "completion_length/max": 1024.0, "completion_length/median": 527.0, "completion_length/min": 257.0, "completion_length/p25": 422.0, "completion_length/p75": 708.5, "completion_length/var": 46098.04296875, "epoch": 0.0592, "feature_vector_variance/max_squared_error": 72431.421875, "feature_vector_variance/metric": 27763.88671875, "generated_tokens/total": 1981052.0, "grad_norm": 0.7081826329231262, "learning_rate": 1.1908389392193549e-05, "loss": -0.7292, "mean_logprobs": -0.0162353515625, "mean_logprobs/var": 9.584426879882812e-05, "num_completions/total": 3552, "per_sentence_gradient_norm": 1.5739352703094482, "per_sentence_gradient_norm/max": 5.350212097167969, "per_sentence_gradient_norm/median": 1.5924537181854248, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.4949121475219727, "per_sentence_gradient_norm/p85": 2.9164371490478516, "per_sentence_gradient_norm/p90": 3.3088817596435547, "per_sentence_gradient_norm/p95": 3.8191800117492676, "per_sentence_gradient_norm/p99": 4.483549118041992, "per_sentence_gradient_norm/var": 1.786605715751648, "per_token_feature_norm": 187.14306640625, "per_token_feature_norm/max": 258.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 94.0, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 224.79010009765625, "per_token_full_gradient_variance/max_squared_error": 0.4776228368282318, "per_token_full_gradient_variance/variance": 0.0024467778857797384, "per_token_gradient_norm": 1.438916802406311, "per_token_gradient_norm/max": 294.859375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 190.38858032226562, "per_token_policy_error_norm": 0.010447686538100243, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010084971785545349, "policy_entropy": 0.01907033659517765, "policy_entropy/max": 2.203125, "policy_entropy/median": 1.9907020032405853e-08, "policy_entropy/min": 1.5449880957918438e-18, "policy_entropy/p25": 1.0550138540565968e-10, "policy_entropy/p75": 3.516674041748047e-06, "policy_entropy/var": 0.010197666473686695, "policy_error_vector_variance/max_squared_error": 1.9844783544540405, "policy_error_vector_variance/metric": 0.01044514775276184, "policy_loss": -0.7291666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.19956143200397491, "policy_sharpness": 9.471638679504395, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.310279130935669, "reward": 0.7291666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19956143200397491, "rewards/accuracy_reward": 0.7291666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19956143200397491, "sentence_full_gradient_variance/max_squared_error": 2176.744873046875, "sentence_full_gradient_variance/metric": 757.8991088867188, "sentence_full_gradient_variance/p75": 1250.295166015625, "sentence_full_gradient_variance/p90": 1250.295166015625, "sentence_full_gradient_variance/p95": 1919.9591064453125, "sentence_full_gradient_variance/p99": 2077.257080078125, "state_level_variance/metric": 1.0366597175598145, "state_level_variance_full_gradient/metric": 653.215087890625, "step": 37 }, { "accuracy_reward": 0.8333333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14035087823867798, "action_level_variance/metric": 1.1033883094787598, "action_level_variance_full_gradient/metric": 104.9048080444336, "adam_stats/lr_effective_max": 7.002902566455305e-05, "adam_stats/lr_effective_mean": 1.240112179612396e-10, "adam_stats/lr_effective_min": -6.973775452934206e-05, "adam_stats/m_t_max": 0.005475176498293877, "adam_stats/m_t_mean": 2.943168972424637e-11, "adam_stats/m_t_min": -0.015446912497282028, "adam_stats/v_t_max": 0.0002612224780023098, "adam_stats/v_t_mean": 1.4147777567530273e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.8333333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.14035087823867798, "all_logprobs": -0.017086265608668327, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.25, "all_logprobs/p1": -0.52734375, "all_logprobs/p10": -5.841255187988281e-05, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.0067138671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.023080753162503242, "clip_ratio": 0.0, "completion_length": 454.6145935058594, "completion_length/correct": 410.8500061035156, "completion_length/correct/max": 1010.0, "completion_length/correct/median": 306.0, "completion_length/correct/min": 173.0, "completion_length/correct/p25": 230.75, "completion_length/correct/p75": 576.75, "completion_length/correct/var": 49566.07421875, "completion_length/incorrect": 673.4375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 585.0, "completion_length/incorrect/min": 309.0, "completion_length/incorrect/p25": 405.75, "completion_length/incorrect/p75": 1004.5, "completion_length/incorrect/var": 75334.53125, "completion_length/max": 1024.0, "completion_length/median": 359.0, "completion_length/min": 173.0, "completion_length/p25": 254.5, "completion_length/p75": 601.75, "completion_length/var": 62790.53515625, "epoch": 0.0608, "feature_vector_variance/max_squared_error": 72045.40625, "feature_vector_variance/metric": 27230.79296875, "generated_tokens/total": 2024695.0, "grad_norm": 1.176957368850708, "learning_rate": 1.1693946776030601e-05, "loss": -0.8333, "mean_logprobs": -0.0184326171875, "mean_logprobs/var": 0.00010442733764648438, "num_completions/total": 3648, "per_sentence_gradient_norm": 2.228367805480957, "per_sentence_gradient_norm/max": 8.431888580322266, "per_sentence_gradient_norm/median": 2.257169008255005, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.2302743196487427, "per_sentence_gradient_norm/p75": 2.926043748855591, "per_sentence_gradient_norm/p85": 3.697172164916992, "per_sentence_gradient_norm/p90": 4.100707054138184, "per_sentence_gradient_norm/p95": 4.506948471069336, "per_sentence_gradient_norm/p99": 6.087619304656982, "per_sentence_gradient_norm/var": 2.3760507106781006, "per_token_feature_norm": 186.44644165039062, "per_token_feature_norm/max": 256.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 91.5, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 227.05416870117188, "per_token_full_gradient_variance/max_squared_error": 0.518624484539032, "per_token_full_gradient_variance/variance": 0.0029019771609455347, "per_token_gradient_norm": 1.9263981580734253, "per_token_gradient_norm/max": 275.4921875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 255.67544555664062, "per_token_policy_error_norm": 0.010238835588097572, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009469358250498772, "policy_entropy": 0.018509140238165855, "policy_entropy/max": 1.6328125, "policy_entropy/median": 1.3620592653751373e-08, "policy_entropy/min": 8.565197162635485e-18, "policy_entropy/p25": 1.1596057447604835e-10, "policy_entropy/p75": 1.866370439529419e-06, "policy_entropy/var": 0.0096846092492342, "policy_error_vector_variance/max_squared_error": 1.9976855516433716, "policy_error_vector_variance/metric": 0.010235658846795559, "policy_loss": -0.8333333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.14035087823867798, "policy_sharpness": 9.491487503051758, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.154214859008789, "reward": 0.8333333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14035087823867798, "rewards/accuracy_reward": 0.8333333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14035087823867798, "sentence_full_gradient_variance/max_squared_error": 2301.3388671875, "sentence_full_gradient_variance/metric": 1218.7095947265625, "sentence_full_gradient_variance/p75": 1584.175537109375, "sentence_full_gradient_variance/p90": 2301.338134765625, "sentence_full_gradient_variance/p95": 2301.338134765625, "sentence_full_gradient_variance/p99": 2301.338134765625, "state_level_variance/metric": 1.5118199586868286, "state_level_variance_full_gradient/metric": 1113.8046875, "step": 38 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1894736886024475, "action_level_variance/metric": 0.6744102239608765, "action_level_variance_full_gradient/metric": 74.64990234375, "adam_stats/lr_effective_max": 6.952830153750256e-05, "adam_stats/lr_effective_mean": 2.701377177505293e-10, "adam_stats/lr_effective_min": -6.910342926857993e-05, "adam_stats/m_t_max": 0.007263672538101673, "adam_stats/m_t_mean": 1.1061957773395026e-11, "adam_stats/m_t_min": -0.013758397661149502, "adam_stats/v_t_max": 0.00026505906134843826, "adam_stats/v_t_mean": 1.4579259674518674e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.75, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.75, "advantages/p75": 1.0, "advantages/var": 0.1894736886024475, "all_logprobs": -0.016483113169670105, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.0, "all_logprobs/p1": -0.474609375, "all_logprobs/p10": -7.486343383789062e-05, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.00592041015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.023020604625344276, "clip_ratio": 0.0, "completion_length": 544.4375, "completion_length/correct": 484.9861145019531, "completion_length/correct/max": 978.0, "completion_length/correct/median": 478.0, "completion_length/correct/min": 154.0, "completion_length/correct/p25": 327.0, "completion_length/correct/p75": 602.25, "completion_length/correct/var": 34594.74609375, "completion_length/incorrect": 722.7916870117188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 652.0, "completion_length/incorrect/min": 355.0, "completion_length/incorrect/p25": 611.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 46349.3046875, "completion_length/max": 1024.0, "completion_length/median": 542.0, "completion_length/min": 154.0, "completion_length/p25": 407.5, "completion_length/p75": 659.5, "completion_length/var": 47791.44921875, "epoch": 0.0624, "feature_vector_variance/max_squared_error": 71139.1328125, "feature_vector_variance/metric": 27539.630859375, "generated_tokens/total": 2076961.0, "grad_norm": 1.1670317649841309, "learning_rate": 1.1474394481749037e-05, "loss": -0.75, "mean_logprobs": -0.0167236328125, "mean_logprobs/var": 6.341934204101562e-05, "num_completions/total": 3744, "per_sentence_gradient_norm": 1.9309749603271484, "per_sentence_gradient_norm/max": 5.089372634887695, "per_sentence_gradient_norm/median": 2.080116033554077, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.5565152168273926, "per_sentence_gradient_norm/p75": 3.011300563812256, "per_sentence_gradient_norm/p85": 3.463982582092285, "per_sentence_gradient_norm/p90": 3.6742238998413086, "per_sentence_gradient_norm/p95": 3.9954800605773926, "per_sentence_gradient_norm/p99": 4.828018665313721, "per_sentence_gradient_norm/var": 1.9516023397445679, "per_token_feature_norm": 185.89341735839844, "per_token_feature_norm/max": 258.0, "per_token_feature_norm/median": 186.0, "per_token_feature_norm/min": 88.0, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 210.19393920898438, "per_token_full_gradient_variance/max_squared_error": 0.5973175168037415, "per_token_full_gradient_variance/variance": 0.0025923452340066433, "per_token_gradient_norm": 1.681553840637207, "per_token_gradient_norm/max": 298.6875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 224.77423095703125, "per_token_policy_error_norm": 0.009684085845947266, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009264307096600533, "policy_entropy": 0.018016308546066284, "policy_entropy/max": 1.609375, "policy_entropy/median": 9.546056389808655e-09, "policy_entropy/min": 3.076423664427619e-18, "policy_entropy/p25": 6.775735528208315e-11, "policy_entropy/p75": 1.9222497940063477e-06, "policy_entropy/var": 0.009285345673561096, "policy_error_vector_variance/max_squared_error": 1.983730435371399, "policy_error_vector_variance/metric": 0.009674551896750927, "policy_loss": -0.75, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -0.75, "policy_loss/var": 0.1894736886024475, "policy_sharpness": 9.492218017578125, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.1077356338500977, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.1894736886024475, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1894736886024475, "sentence_full_gradient_variance/max_squared_error": 3223.5693359375, "sentence_full_gradient_variance/metric": 745.6162109375, "sentence_full_gradient_variance/p75": 981.4596557617188, "sentence_full_gradient_variance/p90": 1378.298828125, "sentence_full_gradient_variance/p95": 2724.7685546875, "sentence_full_gradient_variance/p99": 3223.5693359375, "state_level_variance/metric": 1.4630881547927856, "state_level_variance_full_gradient/metric": 670.9663696289062, "step": 39 }, { "accuracy_reward": 0.8020833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16041667759418488, "action_level_variance/metric": 0.8663412928581238, "action_level_variance_full_gradient/metric": 98.2168197631836, "adam_stats/lr_effective_max": 6.772133201593533e-05, "adam_stats/lr_effective_mean": 3.5061267777969363e-10, "adam_stats/lr_effective_min": -6.894980469951406e-05, "adam_stats/m_t_max": 0.004842342808842659, "adam_stats/m_t_mean": 3.182240235211076e-11, "adam_stats/m_t_min": -0.015116090886294842, "adam_stats/v_t_max": 0.00026544753927737474, "adam_stats/v_t_mean": 1.500365109929902e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.8020833730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.16041667759418488, "all_logprobs": -0.01507792342454195, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.5, "all_logprobs/p1": -0.3908594250679016, "all_logprobs/p10": -2.658367156982422e-05, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.003173828125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.023247964680194855, "clip_ratio": 0.0, "completion_length": 500.9270935058594, "completion_length/correct": 445.6363525390625, "completion_length/correct/max": 929.0, "completion_length/correct/median": 354.0, "completion_length/correct/min": 231.0, "completion_length/correct/p25": 294.0, "completion_length/correct/p75": 525.0, "completion_length/correct/var": 45758.33984375, "completion_length/incorrect": 725.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 732.0, "completion_length/incorrect/min": 431.0, "completion_length/incorrect/p25": 524.5, "completion_length/incorrect/p75": 964.5, "completion_length/incorrect/var": 54782.0, "completion_length/max": 1024.0, "completion_length/median": 403.0, "completion_length/min": 231.0, "completion_length/p25": 309.5, "completion_length/p75": 639.75, "completion_length/var": 59505.98828125, "epoch": 0.064, "feature_vector_variance/max_squared_error": 72400.953125, "feature_vector_variance/metric": 27728.314453125, "generated_tokens/total": 2125050.0, "grad_norm": 1.1030175685882568, "learning_rate": 1.125e-05, "loss": -0.8021, "mean_logprobs": -0.0142822265625, "mean_logprobs/var": 5.364418029785156e-05, "num_completions/total": 3840, "per_sentence_gradient_norm": 1.6183236837387085, "per_sentence_gradient_norm/max": 4.205391883850098, "per_sentence_gradient_norm/median": 1.7706358432769775, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.7636888027191162, "per_sentence_gradient_norm/p75": 2.3868799209594727, "per_sentence_gradient_norm/p85": 2.7408480644226074, "per_sentence_gradient_norm/p90": 2.8956222534179688, "per_sentence_gradient_norm/p95": 3.504148006439209, "per_sentence_gradient_norm/p99": 3.7866508960723877, "per_sentence_gradient_norm/var": 1.2132083177566528, "per_token_feature_norm": 186.3468780517578, "per_token_feature_norm/max": 252.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 86.0, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 221.1864471435547, "per_token_full_gradient_variance/max_squared_error": 0.5324125289916992, "per_token_full_gradient_variance/variance": 0.0025012283585965633, "per_token_gradient_norm": 1.5219917297363281, "per_token_gradient_norm/max": 299.4140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 212.26856994628906, "per_token_policy_error_norm": 0.008771496824920177, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.008532751351594925, "policy_entropy": 0.015738412737846375, "policy_entropy/max": 2.0625, "policy_entropy/median": 6.810296326875687e-09, "policy_entropy/min": 4.960224939121183e-18, "policy_entropy/p25": 5.184119800105691e-11, "policy_entropy/p75": 1.1771917343139648e-06, "policy_entropy/var": 0.008920799009501934, "policy_error_vector_variance/max_squared_error": 1.985790729522705, "policy_error_vector_variance/metric": 0.008748822845518589, "policy_loss": -0.8020833730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.16041667759418488, "policy_sharpness": 9.554205894470215, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 2.777477979660034, "reward": 0.8020833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16041667759418488, "rewards/accuracy_reward": 0.8020833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16041667759418488, "sentence_full_gradient_variance/max_squared_error": 2128.35400390625, "sentence_full_gradient_variance/metric": 1066.12646484375, "sentence_full_gradient_variance/p75": 1810.604736328125, "sentence_full_gradient_variance/p90": 1810.604736328125, "sentence_full_gradient_variance/p95": 1882.053955078125, "sentence_full_gradient_variance/p99": 2128.35400390625, "state_level_variance/metric": 0.48275139927864075, "state_level_variance_full_gradient/metric": 967.9096069335938, "step": 40 }, { "accuracy_reward": 0.6458333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2311403602361679, "action_level_variance/metric": 0.4446828067302704, "action_level_variance_full_gradient/metric": 59.50379943847656, "adam_stats/lr_effective_max": 6.603475048905239e-05, "adam_stats/lr_effective_mean": 2.87434576140555e-10, "adam_stats/lr_effective_min": -6.395835225703195e-05, "adam_stats/m_t_max": 0.006951494608074427, "adam_stats/m_t_mean": 5.347692080825972e-11, "adam_stats/m_t_min": -0.012927582487463951, "adam_stats/v_t_max": 0.0002694138674996793, "adam_stats/v_t_mean": 1.550127734506468e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6458333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.2311403602361679, "all_logprobs": -0.01816771924495697, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.25, "all_logprobs/p1": -0.5234375, "all_logprobs/p10": -0.000152587890625, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.01104736328125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.02547517791390419, "clip_ratio": 0.0, "completion_length": 598.0208740234375, "completion_length/correct": 493.7257995605469, "completion_length/correct/max": 901.0, "completion_length/correct/median": 467.0, "completion_length/correct/min": 235.0, "completion_length/correct/p25": 369.25, "completion_length/correct/p75": 554.75, "completion_length/correct/var": 33583.3515625, "completion_length/incorrect": 788.2058715820312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 759.0, "completion_length/incorrect/min": 466.0, "completion_length/incorrect/p25": 666.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 37739.25390625, "completion_length/max": 1024.0, "completion_length/median": 542.0, "completion_length/min": 235.0, "completion_length/p25": 439.75, "completion_length/p75": 783.75, "completion_length/var": 54717.62109375, "epoch": 0.0656, "feature_vector_variance/max_squared_error": 74470.90625, "feature_vector_variance/metric": 27359.86328125, "generated_tokens/total": 2182460.0, "grad_norm": 5.554397106170654, "learning_rate": 1.1021036720894182e-05, "loss": -0.6458, "mean_logprobs": -0.017333984375, "mean_logprobs/var": 8.058547973632812e-05, "num_completions/total": 3936, "per_sentence_gradient_norm": 1.540247917175293, "per_sentence_gradient_norm/max": 5.168964385986328, "per_sentence_gradient_norm/median": 1.5721909999847412, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.6952362060546875, "per_sentence_gradient_norm/p85": 3.1246817111968994, "per_sentence_gradient_norm/p90": 3.4522125720977783, "per_sentence_gradient_norm/p95": 3.6758079528808594, "per_sentence_gradient_norm/p99": 4.668148040771484, "per_sentence_gradient_norm/var": 2.1245884895324707, "per_token_feature_norm": 185.86180114746094, "per_token_feature_norm/max": 248.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 98.0, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 203.90673828125, "per_token_full_gradient_variance/max_squared_error": 0.5267978310585022, "per_token_full_gradient_variance/variance": 0.002324500819668174, "per_token_gradient_norm": 1.3261278867721558, "per_token_gradient_norm/max": 294.984375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 171.61460876464844, "per_token_policy_error_norm": 0.010618658736348152, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009999032132327557, "policy_entropy": 0.020256705582141876, "policy_entropy/max": 2.40625, "policy_entropy/median": 1.5133991837501526e-08, "policy_entropy/min": 6.166399856011306e-19, "policy_entropy/p25": 1.07775122160092e-10, "policy_entropy/p75": 2.473592758178711e-06, "policy_entropy/var": 0.010651406832039356, "policy_error_vector_variance/max_squared_error": 2.0009818077087402, "policy_error_vector_variance/metric": 0.010605630464851856, "policy_loss": -0.6458333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.2311403602361679, "policy_sharpness": 9.428431510925293, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.523688554763794, "reward": 0.6458333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2311403602361679, "rewards/accuracy_reward": 0.6458333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2311403602361679, "sentence_full_gradient_variance/max_squared_error": 2208.61767578125, "sentence_full_gradient_variance/metric": 1004.175048828125, "sentence_full_gradient_variance/p75": 1275.5882568359375, "sentence_full_gradient_variance/p90": 1930.190673828125, "sentence_full_gradient_variance/p95": 2208.61767578125, "sentence_full_gradient_variance/p99": 2208.61767578125, "state_level_variance/metric": 1.8691197633743286, "state_level_variance_full_gradient/metric": 944.6710815429688, "step": 41 }, { "accuracy_reward": 0.8854166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.10252193361520767, "action_level_variance/metric": 0.5650101900100708, "action_level_variance_full_gradient/metric": 113.74291229248047, "adam_stats/lr_effective_max": 6.396941898856312e-05, "adam_stats/lr_effective_mean": 1.3414010180401448e-10, "adam_stats/lr_effective_min": -6.415415555238724e-05, "adam_stats/m_t_max": 0.00787390861660242, "adam_stats/m_t_mean": 7.164275422910649e-11, "adam_stats/m_t_min": -0.011721799150109291, "adam_stats/v_t_max": 0.00026967673329636455, "adam_stats/v_t_mean": 1.5744780479387543e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.8854166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.10252193361520767, "all_logprobs": -0.014375852420926094, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.0, "all_logprobs/p1": -0.38671875, "all_logprobs/p10": -3.123283386230469e-05, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.002471923828125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.01934369094669819, "clip_ratio": 0.0, "completion_length": 477.76043701171875, "completion_length/correct": 461.6941223144531, "completion_length/correct/max": 942.0, "completion_length/correct/median": 406.0, "completion_length/correct/min": 264.0, "completion_length/correct/p25": 369.0, "completion_length/correct/p75": 522.0, "completion_length/correct/var": 20860.623046875, "completion_length/incorrect": 601.9091186523438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 507.0, "completion_length/incorrect/min": 486.0, "completion_length/incorrect/p25": 501.5, "completion_length/incorrect/p75": 534.0, "completion_length/incorrect/var": 43864.296875, "completion_length/max": 1024.0, "completion_length/median": 431.0, "completion_length/min": 264.0, "completion_length/p25": 369.75, "completion_length/p75": 525.75, "completion_length/var": 25078.078125, "epoch": 0.0672, "feature_vector_variance/max_squared_error": 73454.3984375, "feature_vector_variance/metric": 27085.765625, "generated_tokens/total": 2228325.0, "grad_norm": 0.8324281573295593, "learning_rate": 1.078778360091808e-05, "loss": -0.8854, "mean_logprobs": -0.01495361328125, "mean_logprobs/var": 6.4849853515625e-05, "num_completions/total": 4032, "per_sentence_gradient_norm": 2.1261744499206543, "per_sentence_gradient_norm/max": 5.142817497253418, "per_sentence_gradient_norm/median": 2.1202034950256348, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.4252066612243652, "per_sentence_gradient_norm/p75": 2.940223217010498, "per_sentence_gradient_norm/p85": 3.3664493560791016, "per_sentence_gradient_norm/p90": 3.973708391189575, "per_sentence_gradient_norm/p95": 4.2230916023254395, "per_sentence_gradient_norm/p99": 5.002615928649902, "per_sentence_gradient_norm/var": 1.6248682737350464, "per_token_feature_norm": 187.08653259277344, "per_token_feature_norm/max": 246.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 92.5, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 224.6251983642578, "per_token_full_gradient_variance/max_squared_error": 0.5850151777267456, "per_token_full_gradient_variance/variance": 0.003254967974498868, "per_token_gradient_norm": 1.9876513481140137, "per_token_gradient_norm/max": 273.984375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 258.1626892089844, "per_token_policy_error_norm": 0.008539753034710884, "per_token_policy_error_norm/max": 1.9921875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.007820827886462212, "policy_entropy": 0.015992095693945885, "policy_entropy/max": 1.40625, "policy_entropy/median": 9.953510016202927e-09, "policy_entropy/min": 8.029872339970767e-19, "policy_entropy/p25": 9.231371222995222e-11, "policy_entropy/p75": 1.3113021850585938e-06, "policy_entropy/var": 0.00836977083235979, "policy_error_vector_variance/max_squared_error": 1.9925984144210815, "policy_error_vector_variance/metric": 0.008524482138454914, "policy_loss": -0.8854166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.10252193361520767, "policy_sharpness": 9.549112319946289, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 2.7898974418640137, "reward": 0.8854166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.10252193361520767, "rewards/accuracy_reward": 0.8854166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.10252193361520767, "sentence_full_gradient_variance/max_squared_error": 1716.2862548828125, "sentence_full_gradient_variance/metric": 565.8397827148438, "sentence_full_gradient_variance/p75": 982.2715454101562, "sentence_full_gradient_variance/p90": 1716.2606201171875, "sentence_full_gradient_variance/p95": 1716.2606201171875, "sentence_full_gradient_variance/p99": 1716.261962890625, "state_level_variance/metric": 1.2147914171218872, "state_level_variance_full_gradient/metric": 452.09686279296875, "step": 42 }, { "accuracy_reward": 0.8125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1539473682641983, "action_level_variance/metric": 1.6047897338867188, "action_level_variance_full_gradient/metric": 277.0709228515625, "adam_stats/lr_effective_max": 5.934419823461212e-05, "adam_stats/lr_effective_mean": 1.2674372662502265e-10, "adam_stats/lr_effective_min": -6.0798804042860866e-05, "adam_stats/m_t_max": 0.006062459200620651, "adam_stats/m_t_mean": 4.259054914301785e-11, "adam_stats/m_t_min": -0.010766377672553062, "adam_stats/v_t_max": 0.0002739352348726243, "adam_stats/v_t_mean": 1.6245194425779097e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.8125, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.1539473682641983, "all_logprobs": -0.01651999168097973, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.0, "all_logprobs/p1": -0.3974023461341858, "all_logprobs/p10": -7.104873657226562e-05, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.0067138671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.025475086644291878, "clip_ratio": 0.0, "completion_length": 489.10418701171875, "completion_length/correct": 450.28204345703125, "completion_length/correct/max": 938.0, "completion_length/correct/median": 448.0, "completion_length/correct/min": 182.0, "completion_length/correct/p25": 361.5, "completion_length/correct/p75": 518.0, "completion_length/correct/var": 23559.48046875, "completion_length/incorrect": 657.3333129882812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 606.0, "completion_length/incorrect/min": 430.0, "completion_length/incorrect/p25": 565.25, "completion_length/incorrect/p75": 637.0, "completion_length/incorrect/var": 31874.0, "completion_length/max": 1024.0, "completion_length/median": 460.0, "completion_length/min": 182.0, "completion_length/p25": 371.25, "completion_length/p75": 584.5, "completion_length/var": 31399.107421875, "epoch": 0.0688, "feature_vector_variance/max_squared_error": 70696.625, "feature_vector_variance/metric": 27067.56640625, "generated_tokens/total": 2275279.0, "grad_norm": 3.7731916904449463, "learning_rate": 1.0550524823068504e-05, "loss": -0.8125, "mean_logprobs": -0.016845703125, "mean_logprobs/var": 0.00012683868408203125, "num_completions/total": 4128, "per_sentence_gradient_norm": 2.0089306831359863, "per_sentence_gradient_norm/max": 7.784542083740234, "per_sentence_gradient_norm/median": 2.155395269393921, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.0374873876571655, "per_sentence_gradient_norm/p75": 2.7038092613220215, "per_sentence_gradient_norm/p85": 3.056589126586914, "per_sentence_gradient_norm/p90": 3.7882344722747803, "per_sentence_gradient_norm/p95": 4.625087738037109, "per_sentence_gradient_norm/p99": 5.8974738121032715, "per_sentence_gradient_norm/var": 2.1877496242523193, "per_token_feature_norm": 185.00222778320312, "per_token_feature_norm/max": 272.0, "per_token_feature_norm/median": 186.0, "per_token_feature_norm/min": 91.0, "per_token_feature_norm/p25": 178.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 222.97056579589844, "per_token_full_gradient_variance/max_squared_error": 0.4936189651489258, "per_token_full_gradient_variance/variance": 0.0031283271964639425, "per_token_gradient_norm": 1.8074613809585571, "per_token_gradient_norm/max": 285.7265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 245.59307861328125, "per_token_policy_error_norm": 0.009476902894675732, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.00932889524847269, "policy_entropy": 0.017660779878497124, "policy_entropy/max": 1.828125, "policy_entropy/median": 2.0838342607021332e-08, "policy_entropy/min": 6.335806445462167e-19, "policy_entropy/p25": 1.6916601452976465e-10, "policy_entropy/p75": 2.726912498474121e-06, "policy_entropy/var": 0.009344763122498989, "policy_error_vector_variance/max_squared_error": 1.9996232986450195, "policy_error_vector_variance/metric": 0.009470273740589619, "policy_loss": -0.8125, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.1539473682641983, "policy_sharpness": 9.500216484069824, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.0759010314941406, "reward": 0.8125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1539473682641983, "rewards/accuracy_reward": 0.8125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1539473682641983, "sentence_full_gradient_variance/max_squared_error": 3086.79248046875, "sentence_full_gradient_variance/metric": 1016.6434936523438, "sentence_full_gradient_variance/p75": 1589.0169677734375, "sentence_full_gradient_variance/p90": 2013.015625, "sentence_full_gradient_variance/p95": 3014.44970703125, "sentence_full_gradient_variance/p99": 3086.79248046875, "state_level_variance/metric": 0.8299303650856018, "state_level_variance_full_gradient/metric": 739.5726318359375, "step": 43 }, { "accuracy_reward": 0.7083333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20877191424369812, "action_level_variance/metric": 0.842496395111084, "action_level_variance_full_gradient/metric": 83.37053680419922, "adam_stats/lr_effective_max": 5.910735126235522e-05, "adam_stats/lr_effective_mean": -9.896604369341588e-11, "adam_stats/lr_effective_min": -6.217474583536386e-05, "adam_stats/m_t_max": 0.01007047202438116, "adam_stats/m_t_mean": 5.721127044333585e-11, "adam_stats/m_t_min": -0.011114177294075489, "adam_stats/v_t_max": 0.00027750671142712235, "adam_stats/v_t_mean": 1.662986241768305e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.7083333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.20877191424369812, "all_logprobs": -0.020019907504320145, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.5625, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.00012302398681640625, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.0113525390625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.028704073280096054, "clip_ratio": 0.0, "completion_length": 578.875, "completion_length/correct": 465.9117736816406, "completion_length/correct/max": 985.0, "completion_length/correct/median": 451.0, "completion_length/correct/min": 194.0, "completion_length/correct/p25": 356.25, "completion_length/correct/p75": 527.0, "completion_length/correct/var": 33284.17578125, "completion_length/incorrect": 853.2142944335938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 994.0, "completion_length/incorrect/min": 543.0, "completion_length/incorrect/p25": 653.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 38986.1015625, "completion_length/max": 1024.0, "completion_length/median": 516.0, "completion_length/min": 194.0, "completion_length/p25": 394.25, "completion_length/p75": 712.5, "completion_length/var": 65870.828125, "epoch": 0.0704, "feature_vector_variance/max_squared_error": 72869.1875, "feature_vector_variance/metric": 26892.24609375, "generated_tokens/total": 2330851.0, "grad_norm": 0.9960381984710693, "learning_rate": 1.0309549450619342e-05, "loss": -0.7083, "mean_logprobs": -0.019775390625, "mean_logprobs/var": 7.486343383789062e-05, "num_completions/total": 4224, "per_sentence_gradient_norm": 1.895886778831482, "per_sentence_gradient_norm/max": 5.144503593444824, "per_sentence_gradient_norm/median": 2.0409395694732666, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.874298095703125, "per_sentence_gradient_norm/p85": 3.666062831878662, "per_sentence_gradient_norm/p90": 3.9437341690063477, "per_sentence_gradient_norm/p95": 4.225760459899902, "per_sentence_gradient_norm/p99": 4.979068756103516, "per_sentence_gradient_norm/var": 2.2512013912200928, "per_token_feature_norm": 186.14613342285156, "per_token_feature_norm/max": 251.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 91.5, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 220.21897888183594, "per_token_full_gradient_variance/max_squared_error": 0.4402497410774231, "per_token_full_gradient_variance/variance": 0.00217621773481369, "per_token_gradient_norm": 1.5054357051849365, "per_token_gradient_norm/max": 275.625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 198.8873748779297, "per_token_policy_error_norm": 0.011693479493260384, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011100295931100845, "policy_entropy": 0.02167021855711937, "policy_entropy/max": 1.9296875, "policy_entropy/median": 2.6426278054714203e-08, "policy_entropy/min": 5.421010862427522e-19, "policy_entropy/p25": 2.034994395216927e-10, "policy_entropy/p75": 3.7848949432373047e-06, "policy_entropy/var": 0.011867321096360683, "policy_error_vector_variance/max_squared_error": 1.9833136796951294, "policy_error_vector_variance/metric": 0.01168489083647728, "policy_loss": -0.7083333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.20877191424369812, "policy_sharpness": 9.433545112609863, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.5759005546569824, "reward": 0.7083333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20877191424369812, "rewards/accuracy_reward": 0.7083333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20877191424369812, "sentence_full_gradient_variance/max_squared_error": 2792.030517578125, "sentence_full_gradient_variance/metric": 1264.3751220703125, "sentence_full_gradient_variance/p75": 2180.682373046875, "sentence_full_gradient_variance/p90": 2180.682373046875, "sentence_full_gradient_variance/p95": 2399.671875, "sentence_full_gradient_variance/p99": 2756.020751953125, "state_level_variance/metric": 1.6260734796524048, "state_level_variance_full_gradient/metric": 1181.004638671875, "step": 44 }, { "accuracy_reward": 0.8333333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14035087823867798, "action_level_variance/metric": 0.5528892874717712, "action_level_variance_full_gradient/metric": 113.12904357910156, "adam_stats/lr_effective_max": 6.042794484528713e-05, "adam_stats/lr_effective_mean": -1.216936829084858e-10, "adam_stats/lr_effective_min": -6.171823770273477e-05, "adam_stats/m_t_max": 0.013123811222612858, "adam_stats/m_t_mean": 3.089005787271226e-11, "adam_stats/m_t_min": -0.012844071723520756, "adam_stats/v_t_max": 0.0002808950957842171, "adam_stats/v_t_mean": 1.6975424538268058e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.8333333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.14035087823867798, "all_logprobs": -0.014675467275083065, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -12.5, "all_logprobs/p1": -0.38671875, "all_logprobs/p10": -2.1696090698242188e-05, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.00193023681640625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.024417206645011902, "clip_ratio": 0.0, "completion_length": 634.1979370117188, "completion_length/correct": 574.2000122070312, "completion_length/correct/max": 1002.0, "completion_length/correct/median": 576.0, "completion_length/correct/min": 275.0, "completion_length/correct/p25": 453.25, "completion_length/correct/p75": 689.75, "completion_length/correct/var": 33205.17578125, "completion_length/incorrect": 934.1875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 733.0, "completion_length/incorrect/p25": 798.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 14960.029296875, "completion_length/max": 1024.0, "completion_length/median": 621.0, "completion_length/min": 275.0, "completion_length/p25": 463.75, "completion_length/p75": 774.5, "completion_length/var": 48163.04296875, "epoch": 0.072, "feature_vector_variance/max_squared_error": 82544.7890625, "feature_vector_variance/metric": 26618.720703125, "generated_tokens/total": 2391734.0, "grad_norm": 0.9357803463935852, "learning_rate": 1.0065151074942516e-05, "loss": -0.8333, "mean_logprobs": -0.01483154296875, "mean_logprobs/var": 5.5789947509765625e-05, "num_completions/total": 4320, "per_sentence_gradient_norm": 1.8369197845458984, "per_sentence_gradient_norm/max": 4.6673903465271, "per_sentence_gradient_norm/median": 1.891176700592041, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.308905839920044, "per_sentence_gradient_norm/p75": 2.4839277267456055, "per_sentence_gradient_norm/p85": 2.800163984298706, "per_sentence_gradient_norm/p90": 3.0947775840759277, "per_sentence_gradient_norm/p95": 3.50618839263916, "per_sentence_gradient_norm/p99": 4.326479434967041, "per_sentence_gradient_norm/var": 1.2484123706817627, "per_token_feature_norm": 183.85372924804688, "per_token_feature_norm/max": 250.0, "per_token_feature_norm/median": 185.0, "per_token_feature_norm/min": 91.5, "per_token_feature_norm/p25": 177.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 211.36978149414062, "per_token_full_gradient_variance/max_squared_error": 0.5581227540969849, "per_token_full_gradient_variance/variance": 0.002709728665649891, "per_token_gradient_norm": 1.6637190580368042, "per_token_gradient_norm/max": 279.515625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 227.23617553710938, "per_token_policy_error_norm": 0.008573372848331928, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.008380644954741001, "policy_entropy": 0.01479231845587492, "policy_entropy/max": 3.03125, "policy_entropy/median": 1.525040715932846e-08, "policy_entropy/min": 8.809142651444724e-19, "policy_entropy/p25": 1.2187229003757238e-10, "policy_entropy/p75": 1.4901161193847656e-06, "policy_entropy/var": 0.007808806840330362, "policy_error_vector_variance/max_squared_error": 1.972198724746704, "policy_error_vector_variance/metric": 0.008530151098966599, "policy_loss": -0.8333333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.14035087823867798, "policy_sharpness": 9.58220100402832, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 2.5877249240875244, "reward": 0.8333333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14035087823867798, "rewards/accuracy_reward": 0.8333333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14035087823867798, "sentence_full_gradient_variance/max_squared_error": 2129.216796875, "sentence_full_gradient_variance/metric": 642.9505615234375, "sentence_full_gradient_variance/p75": 1119.08935546875, "sentence_full_gradient_variance/p90": 1140.81103515625, "sentence_full_gradient_variance/p95": 2039.28271484375, "sentence_full_gradient_variance/p99": 2129.216796875, "state_level_variance/metric": 0.8199599981307983, "state_level_variance_full_gradient/metric": 529.821533203125, "step": 45 }, { "accuracy_reward": 0.6979166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21304824948310852, "action_level_variance/metric": 1.5919636487960815, "action_level_variance_full_gradient/metric": 186.33322143554688, "adam_stats/lr_effective_max": 5.706811134587042e-05, "adam_stats/lr_effective_mean": 5.447591596569046e-12, "adam_stats/lr_effective_min": -5.7463927078060806e-05, "adam_stats/m_t_max": 0.008772021159529686, "adam_stats/m_t_mean": 7.026588946107415e-12, "adam_stats/m_t_min": -0.007978208363056183, "adam_stats/v_t_max": 0.0002837536158040166, "adam_stats/v_t_mean": 1.7454556897056328e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6979166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.21304824948310852, "all_logprobs": -0.021167336031794548, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.8125, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.00020313262939453125, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.0143890380859375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03300774469971657, "clip_ratio": 0.0, "completion_length": 559.2291870117188, "completion_length/correct": 520.9403076171875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 493.0, "completion_length/correct/min": 215.0, "completion_length/correct/p25": 362.0, "completion_length/correct/p75": 624.0, "completion_length/correct/var": 39561.54296875, "completion_length/incorrect": 647.6896362304688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 601.0, "completion_length/incorrect/min": 311.0, "completion_length/incorrect/p25": 366.0, "completion_length/incorrect/p75": 894.0, "completion_length/incorrect/var": 67235.6484375, "completion_length/max": 1024.0, "completion_length/median": 510.0, "completion_length/min": 215.0, "completion_length/p25": 362.0, "completion_length/p75": 729.0, "completion_length/var": 50724.390625, "epoch": 0.0736, "feature_vector_variance/max_squared_error": 88727.0625, "feature_vector_variance/metric": 27415.98046875, "generated_tokens/total": 2445420.0, "grad_norm": 1.8474754095077515, "learning_rate": 9.817627457812105e-06, "loss": -0.6979, "mean_logprobs": -0.02197265625, "mean_logprobs/var": 0.00014400482177734375, "num_completions/total": 4416, "per_sentence_gradient_norm": 1.9805779457092285, "per_sentence_gradient_norm/max": 7.310485363006592, "per_sentence_gradient_norm/median": 2.146596908569336, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.205789566040039, "per_sentence_gradient_norm/p85": 3.6153404712677, "per_sentence_gradient_norm/p90": 3.885064125061035, "per_sentence_gradient_norm/p95": 4.128162384033203, "per_sentence_gradient_norm/p99": 6.636090278625488, "per_sentence_gradient_norm/var": 2.6521058082580566, "per_token_feature_norm": 186.47222900390625, "per_token_feature_norm/max": 253.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 90.0, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 208.38101196289062, "per_token_full_gradient_variance/max_squared_error": 0.8030413389205933, "per_token_full_gradient_variance/variance": 0.0028868289664387703, "per_token_gradient_norm": 1.7871159315109253, "per_token_gradient_norm/max": 269.6484375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 237.45965576171875, "per_token_policy_error_norm": 0.012074478901922703, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011549987830221653, "policy_entropy": 0.022581882774829865, "policy_entropy/max": 3.046875, "policy_entropy/median": 1.909211277961731e-08, "policy_entropy/min": 4.228388472693467e-18, "policy_entropy/p25": 1.3278622645884752e-10, "policy_entropy/p75": 3.516674041748047e-06, "policy_entropy/var": 0.013332737609744072, "policy_error_vector_variance/max_squared_error": 1.9994038343429565, "policy_error_vector_variance/metric": 0.012036148458719254, "policy_loss": -0.6979166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.21304824948310852, "policy_sharpness": 9.4060697555542, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.6688601970672607, "reward": 0.6979166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21304824948310852, "rewards/accuracy_reward": 0.6979166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21304824948310852, "sentence_full_gradient_variance/max_squared_error": 3182.8984375, "sentence_full_gradient_variance/metric": 903.4691772460938, "sentence_full_gradient_variance/p75": 1264.279052734375, "sentence_full_gradient_variance/p90": 1925.773193359375, "sentence_full_gradient_variance/p95": 2796.987548828125, "sentence_full_gradient_variance/p99": 3134.373779296875, "state_level_variance/metric": 1.343467116355896, "state_level_variance_full_gradient/metric": 717.1359252929688, "step": 46 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1666666865348816, "action_level_variance/metric": 2.157860517501831, "action_level_variance_full_gradient/metric": 161.4264373779297, "adam_stats/lr_effective_max": 5.523979780264199e-05, "adam_stats/lr_effective_mean": 6.760574738917668e-11, "adam_stats/lr_effective_min": -5.8467012422624975e-05, "adam_stats/m_t_max": 0.008030375465750694, "adam_stats/m_t_mean": 3.088030178788337e-11, "adam_stats/m_t_min": -0.006273708771914244, "adam_stats/v_t_max": 0.00028355317772366107, "adam_stats/v_t_mean": 1.7894033477183768e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.7916666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.1666666865348816, "all_logprobs": -0.0218729916960001, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.875, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.00022077560424804688, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.01416015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03751344978809357, "clip_ratio": 0.0, "completion_length": 510.47918701171875, "completion_length/correct": 484.25, "completion_length/correct/max": 983.0, "completion_length/correct/median": 384.0, "completion_length/correct/min": 218.0, "completion_length/correct/p25": 279.0, "completion_length/correct/p75": 652.5, "completion_length/correct/var": 54340.94140625, "completion_length/incorrect": 610.1500244140625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 635.0, "completion_length/incorrect/min": 178.0, "completion_length/incorrect/p25": 233.75, "completion_length/incorrect/p75": 921.25, "completion_length/incorrect/var": 112563.1875, "completion_length/max": 1024.0, "completion_length/median": 424.0, "completion_length/min": 178.0, "completion_length/p25": 276.0, "completion_length/p75": 691.0, "completion_length/var": 68055.1796875, "epoch": 0.0752, "feature_vector_variance/max_squared_error": 87498.828125, "feature_vector_variance/metric": 27683.763671875, "generated_tokens/total": 2494426.0, "grad_norm": 2.0327517986297607, "learning_rate": 9.567280168627493e-06, "loss": -0.7917, "mean_logprobs": -0.0262451171875, "mean_logprobs/var": 0.000354766845703125, "num_completions/total": 4512, "per_sentence_gradient_norm": 2.7175068855285645, "per_sentence_gradient_norm/max": 9.219343185424805, "per_sentence_gradient_norm/median": 2.2213685512542725, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.3823200762271881, "per_sentence_gradient_norm/p75": 4.123540878295898, "per_sentence_gradient_norm/p85": 5.205772399902344, "per_sentence_gradient_norm/p90": 5.7977824211120605, "per_sentence_gradient_norm/p95": 6.606970310211182, "per_sentence_gradient_norm/p99": 9.177910804748535, "per_sentence_gradient_norm/var": 5.347815990447998, "per_token_feature_norm": 187.9308319091797, "per_token_feature_norm/max": 253.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 93.5, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 249.05418395996094, "per_token_full_gradient_variance/max_squared_error": 1.326046109199524, "per_token_full_gradient_variance/variance": 0.003833732334896922, "per_token_gradient_norm": 2.3363304138183594, "per_token_gradient_norm/max": 313.4375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 322.4098205566406, "per_token_policy_error_norm": 0.012141291052103043, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.012055441737174988, "policy_entropy": 0.0228023212403059, "policy_entropy/max": 3.09375, "policy_entropy/median": 4.237517714500427e-08, "policy_entropy/min": 2.2090619264392153e-18, "policy_entropy/p25": 3.055902197957039e-10, "policy_entropy/p75": 6.198883056640625e-06, "policy_entropy/var": 0.014737091027200222, "policy_error_vector_variance/max_squared_error": 1.9876649379730225, "policy_error_vector_variance/metric": 0.012118428014218807, "policy_loss": -0.7916666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.1666666865348816, "policy_sharpness": 9.390731811523438, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.8386826515197754, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1666666865348816, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1666666865348816, "sentence_full_gradient_variance/max_squared_error": 2770.582275390625, "sentence_full_gradient_variance/metric": 1210.114501953125, "sentence_full_gradient_variance/p75": 1582.739013671875, "sentence_full_gradient_variance/p90": 2540.22265625, "sentence_full_gradient_variance/p95": 2557.540283203125, "sentence_full_gradient_variance/p99": 2690.893798828125, "state_level_variance/metric": 3.713435173034668, "state_level_variance_full_gradient/metric": 1048.6881103515625, "step": 47 }, { "accuracy_reward": 0.8125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1539473682641983, "action_level_variance/metric": 3.5007433891296387, "action_level_variance_full_gradient/metric": 317.00653076171875, "adam_stats/lr_effective_max": 5.652839899994433e-05, "adam_stats/lr_effective_mean": -3.43534228841591e-11, "adam_stats/lr_effective_min": -5.8991790865547955e-05, "adam_stats/m_t_max": 0.007293034344911575, "adam_stats/m_t_mean": -1.6364180843719822e-11, "adam_stats/m_t_min": -0.007478184532374144, "adam_stats/v_t_max": 0.00028837862191721797, "adam_stats/v_t_mean": 1.8351790573301052e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.8125, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.1539473682641983, "all_logprobs": -0.02718992345035076, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -16.25, "all_logprobs/p1": -0.8125, "all_logprobs/p10": -0.0010223388671875, "all_logprobs/p25": -1.0728836059570312e-06, "all_logprobs/p5": -0.036865234375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.05233834311366081, "clip_ratio": 0.0, "completion_length": 532.7291870117188, "completion_length/correct": 502.9102478027344, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 474.0, "completion_length/correct/min": 253.0, "completion_length/correct/p25": 364.5, "completion_length/correct/p75": 569.5, "completion_length/correct/var": 27845.513671875, "completion_length/incorrect": 661.9444580078125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 577.0, "completion_length/incorrect/min": 271.0, "completion_length/incorrect/p25": 370.5, "completion_length/incorrect/p75": 1004.25, "completion_length/incorrect/var": 89902.40625, "completion_length/max": 1024.0, "completion_length/median": 478.0, "completion_length/min": 253.0, "completion_length/p25": 362.25, "completion_length/p75": 661.0, "completion_length/var": 42550.9375, "epoch": 0.0768, "feature_vector_variance/max_squared_error": 136312.34375, "feature_vector_variance/metric": 28055.4375, "generated_tokens/total": 2545568.0, "grad_norm": 1.8580960035324097, "learning_rate": 9.314414216997507e-06, "loss": -0.8125, "mean_logprobs": -0.02685546875, "mean_logprobs/var": 0.000152587890625, "num_completions/total": 4608, "per_sentence_gradient_norm": 2.9365477561950684, "per_sentence_gradient_norm/max": 10.722333908081055, "per_sentence_gradient_norm/median": 3.143480062484741, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 2.1658833026885986, "per_sentence_gradient_norm/p75": 3.995393753051758, "per_sentence_gradient_norm/p85": 4.252707481384277, "per_sentence_gradient_norm/p90": 4.891226291656494, "per_sentence_gradient_norm/p95": 5.495639324188232, "per_sentence_gradient_norm/p99": 8.129636764526367, "per_sentence_gradient_norm/var": 3.7890729904174805, "per_token_feature_norm": 186.19937133789062, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 96.5, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 199.71240234375, "per_token_full_gradient_variance/max_squared_error": 2.252875804901123, "per_token_full_gradient_variance/variance": 0.00451288977637887, "per_token_gradient_norm": 2.769946336746216, "per_token_gradient_norm/max": 329.0625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 383.5994567871094, "per_token_policy_error_norm": 0.015256273560225964, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015250944532454014, "policy_entropy": 0.027338791638612747, "policy_entropy/max": 3.328125, "policy_entropy/median": 1.3317912817001343e-07, "policy_entropy/min": 2.524158182817815e-19, "policy_entropy/p25": 8.36735125631094e-10, "policy_entropy/p75": 1.704692840576172e-05, "policy_entropy/var": 0.015510604716837406, "policy_error_vector_variance/max_squared_error": 2.000354051589966, "policy_error_vector_variance/metric": 0.015210550278425217, "policy_loss": -0.8125, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.1539473682641983, "policy_sharpness": 9.230786323547363, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.706415176391602, "reward": 0.8125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1539473682641983, "rewards/accuracy_reward": 0.8125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1539473682641983, "sentence_full_gradient_variance/max_squared_error": 2011.702392578125, "sentence_full_gradient_variance/metric": 798.78076171875, "sentence_full_gradient_variance/p75": 1738.3424072265625, "sentence_full_gradient_variance/p90": 1774.2825927734375, "sentence_full_gradient_variance/p95": 1774.2882080078125, "sentence_full_gradient_variance/p99": 1926.1856689453125, "state_level_variance/metric": 0.7488580346107483, "state_level_variance_full_gradient/metric": 481.774169921875, "step": 48 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1894736886024475, "action_level_variance/metric": 2.4969663619995117, "action_level_variance_full_gradient/metric": 342.77752685546875, "adam_stats/lr_effective_max": 5.6785265769576654e-05, "adam_stats/lr_effective_mean": -3.748235662820454e-12, "adam_stats/lr_effective_min": -5.684776260750368e-05, "adam_stats/m_t_max": 0.007728947792202234, "adam_stats/m_t_mean": -2.3593603199617563e-12, "adam_stats/m_t_min": -0.009804444387555122, "adam_stats/v_t_max": 0.00028853630647063255, "adam_stats/v_t_mean": 1.8835164330988086e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.75, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.75, "advantages/p75": 1.0, "advantages/var": 0.1894736886024475, "all_logprobs": -0.02241550199687481, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.28125, "all_logprobs/p1": -0.69140625, "all_logprobs/p10": -0.000431060791015625, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.0181884765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03357083350419998, "clip_ratio": 0.0, "completion_length": 531.2291870117188, "completion_length/correct": 488.4583435058594, "completion_length/correct/max": 918.0, "completion_length/correct/median": 462.0, "completion_length/correct/min": 129.0, "completion_length/correct/p25": 297.25, "completion_length/correct/p75": 684.25, "completion_length/correct/var": 36467.04296875, "completion_length/incorrect": 659.5416870117188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 659.0, "completion_length/incorrect/min": 325.0, "completion_length/incorrect/p25": 450.0, "completion_length/incorrect/p75": 828.25, "completion_length/incorrect/var": 45575.0390625, "completion_length/max": 1024.0, "completion_length/median": 525.0, "completion_length/min": 129.0, "completion_length/p25": 362.75, "completion_length/p75": 697.5, "completion_length/var": 43834.07421875, "epoch": 0.0784, "feature_vector_variance/max_squared_error": 84847.5234375, "feature_vector_variance/metric": 27551.9375, "generated_tokens/total": 2596566.0, "grad_norm": 3.3292007446289062, "learning_rate": 9.059337681133194e-06, "loss": -0.75, "mean_logprobs": -0.023193359375, "mean_logprobs/var": 0.0003604888916015625, "num_completions/total": 4704, "per_sentence_gradient_norm": 2.2720868587493896, "per_sentence_gradient_norm/max": 9.899618148803711, "per_sentence_gradient_norm/median": 2.293595552444458, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.8100029230117798, "per_sentence_gradient_norm/p75": 2.967742681503296, "per_sentence_gradient_norm/p85": 3.861401319503784, "per_sentence_gradient_norm/p90": 4.446817398071289, "per_sentence_gradient_norm/p95": 5.319915771484375, "per_sentence_gradient_norm/p99": 7.0417962074279785, "per_sentence_gradient_norm/var": 3.2774384021759033, "per_token_feature_norm": 186.74427795410156, "per_token_feature_norm/max": 254.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 85.0, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 214.4494171142578, "per_token_full_gradient_variance/max_squared_error": 0.7873191833496094, "per_token_full_gradient_variance/variance": 0.0036228869576007128, "per_token_gradient_norm": 2.0188331604003906, "per_token_gradient_norm/max": 283.53125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 277.9879150390625, "per_token_policy_error_norm": 0.012899327091872692, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.012625137344002724, "policy_entropy": 0.023916658014059067, "policy_entropy/max": 2.40625, "policy_entropy/median": 2.8638169169425964e-08, "policy_entropy/min": 4.0826988057657276e-19, "policy_entropy/p25": 1.646185410209e-10, "policy_entropy/p75": 5.066394805908203e-06, "policy_entropy/var": 0.014270620420575142, "policy_error_vector_variance/max_squared_error": 1.9865927696228027, "policy_error_vector_variance/metric": 0.012894127517938614, "policy_loss": -0.75, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -0.75, "policy_loss/var": 0.1894736886024475, "policy_sharpness": 9.341110229492188, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.084949970245361, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.1894736886024475, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1894736886024475, "sentence_full_gradient_variance/max_squared_error": 4077.67919921875, "sentence_full_gradient_variance/metric": 931.6591796875, "sentence_full_gradient_variance/p75": 1321.2200927734375, "sentence_full_gradient_variance/p90": 1531.5325927734375, "sentence_full_gradient_variance/p95": 2475.9482421875, "sentence_full_gradient_variance/p99": 3072.709228515625, "state_level_variance/metric": 1.1546757221221924, "state_level_variance_full_gradient/metric": 588.881591796875, "step": 49 }, { "accuracy_reward": 0.8333333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14035087823867798, "action_level_variance/metric": 5.956131935119629, "action_level_variance_full_gradient/metric": 419.36370849609375, "adam_stats/lr_effective_max": 5.2458824939094484e-05, "adam_stats/lr_effective_mean": -6.091174786782005e-11, "adam_stats/lr_effective_min": -5.2245795814087614e-05, "adam_stats/m_t_max": 0.010033042170107365, "adam_stats/m_t_mean": -1.6159006424598665e-11, "adam_stats/m_t_min": -0.018238838762044907, "adam_stats/v_t_max": 0.000291242147795856, "adam_stats/v_t_mean": 1.9311042348535423e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.8333333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.14035087823867798, "all_logprobs": -0.01834099553525448, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.25, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.0002383231185376644, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.01104736328125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.025674471631646156, "clip_ratio": 0.0, "completion_length": 397.3125, "completion_length/correct": 387.2749938964844, "completion_length/correct/max": 793.0, "completion_length/correct/median": 384.0, "completion_length/correct/min": 32.0, "completion_length/correct/p25": 302.75, "completion_length/correct/p75": 473.75, "completion_length/correct/var": 20322.833984375, "completion_length/incorrect": 447.5, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 502.0, "completion_length/incorrect/min": 6.0, "completion_length/incorrect/p25": 63.75, "completion_length/incorrect/p75": 524.75, "completion_length/incorrect/var": 127334.3984375, "completion_length/max": 1024.0, "completion_length/median": 396.0, "completion_length/min": 6.0, "completion_length/p25": 289.0, "completion_length/p75": 492.5, "completion_length/var": 37514.53515625, "epoch": 0.08, "feature_vector_variance/max_squared_error": 87290.53125, "feature_vector_variance/metric": 28249.12890625, "generated_tokens/total": 2634708.0, "grad_norm": 5.496613502502441, "learning_rate": 8.80236133250198e-06, "loss": -0.8333, "mean_logprobs": -0.03271484375, "mean_logprobs/var": 0.004180908203125, "num_completions/total": 4800, "per_sentence_gradient_norm": 2.9541358947753906, "per_sentence_gradient_norm/max": 18.53143310546875, "per_sentence_gradient_norm/median": 2.6242740154266357, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.275794506072998, "per_sentence_gradient_norm/p75": 3.9331462383270264, "per_sentence_gradient_norm/p85": 4.602686882019043, "per_sentence_gradient_norm/p90": 5.03636360168457, "per_sentence_gradient_norm/p95": 6.7795634269714355, "per_sentence_gradient_norm/p99": 18.53143310546875, "per_sentence_gradient_norm/var": 8.579838752746582, "per_token_feature_norm": 185.66973876953125, "per_token_feature_norm/max": 253.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 88.0, "per_token_feature_norm/p25": 178.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 277.6553955078125, "per_token_full_gradient_variance/max_squared_error": 0.628515899181366, "per_token_full_gradient_variance/variance": 0.003855591407045722, "per_token_gradient_norm": 2.412412166595459, "per_token_gradient_norm/max": 298.828125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 315.8729553222656, "per_token_policy_error_norm": 0.010730205103754997, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009606201201677322, "policy_entropy": 0.02120746672153473, "policy_entropy/max": 1.578125, "policy_entropy/median": 3.329478204250336e-08, "policy_entropy/min": 1.1384122811097797e-18, "policy_entropy/p25": 7.548806024715304e-11, "policy_entropy/p75": 7.033348083496094e-06, "policy_entropy/var": 0.011774244718253613, "policy_error_vector_variance/max_squared_error": 1.986985683441162, "policy_error_vector_variance/metric": 0.010697972029447556, "policy_loss": -0.8333333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.14035087823867798, "policy_sharpness": 9.402856826782227, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.751615285873413, "reward": 0.8333333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14035087823867798, "rewards/accuracy_reward": 0.8333333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14035087823867798, "sentence_full_gradient_variance/max_squared_error": 4653.7041015625, "sentence_full_gradient_variance/metric": 1000.31591796875, "sentence_full_gradient_variance/p75": 1979.8609619140625, "sentence_full_gradient_variance/p90": 1979.88671875, "sentence_full_gradient_variance/p95": 2450.82470703125, "sentence_full_gradient_variance/p99": 4653.7041015625, "state_level_variance/metric": 3.5769269466400146, "state_level_variance_full_gradient/metric": 580.9521484375, "step": 50 }, { "accuracy_reward": 0.8333333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14035087823867798, "action_level_variance/metric": 1.7566356658935547, "action_level_variance_full_gradient/metric": 206.67098999023438, "adam_stats/lr_effective_max": 5.238016456132755e-05, "adam_stats/lr_effective_mean": -1.9288848296383776e-10, "adam_stats/lr_effective_min": -5.220137245487422e-05, "adam_stats/m_t_max": 0.011598513461649418, "adam_stats/m_t_mean": -5.818472439966804e-12, "adam_stats/m_t_min": -0.020559534430503845, "adam_stats/v_t_max": 0.0002940046542789787, "adam_stats/v_t_mean": 1.9646352253377408e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.8333333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.14035087823867798, "all_logprobs": -0.0187021866440773, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.25, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.00018978118896484375, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.01104736328125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.02587861754000187, "clip_ratio": 0.0, "completion_length": 487.9583435058594, "completion_length/correct": 426.13751220703125, "completion_length/correct/max": 1005.0, "completion_length/correct/median": 395.0, "completion_length/correct/min": 154.0, "completion_length/correct/p25": 232.5, "completion_length/correct/p75": 517.25, "completion_length/correct/var": 41544.57421875, "completion_length/incorrect": 797.0625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1007.0, "completion_length/incorrect/min": 171.0, "completion_length/incorrect/p25": 640.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 85101.6640625, "completion_length/max": 1024.0, "completion_length/median": 483.0, "completion_length/min": 154.0, "completion_length/p25": 245.25, "completion_length/p75": 663.5, "completion_length/var": 67294.921875, "epoch": 0.0816, "feature_vector_variance/max_squared_error": 77180.15625, "feature_vector_variance/metric": 27003.90625, "generated_tokens/total": 2681552.0, "grad_norm": 1.1309945583343506, "learning_rate": 8.543798257200491e-06, "loss": -0.8333, "mean_logprobs": -0.0203857421875, "mean_logprobs/var": 0.00011539459228515625, "num_completions/total": 4896, "per_sentence_gradient_norm": 2.6399054527282715, "per_sentence_gradient_norm/max": 8.008747100830078, "per_sentence_gradient_norm/median": 2.7574563026428223, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.6509442329406738, "per_sentence_gradient_norm/p75": 3.7168331146240234, "per_sentence_gradient_norm/p85": 4.10945987701416, "per_sentence_gradient_norm/p90": 4.500798225402832, "per_sentence_gradient_norm/p95": 5.442570209503174, "per_sentence_gradient_norm/p99": 7.016907215118408, "per_sentence_gradient_norm/var": 2.885000705718994, "per_token_feature_norm": 186.92919921875, "per_token_feature_norm/max": 254.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 95.5, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 225.79730224609375, "per_token_full_gradient_variance/max_squared_error": 0.7926872968673706, "per_token_full_gradient_variance/variance": 0.003602803684771061, "per_token_gradient_norm": 2.1913440227508545, "per_token_gradient_norm/max": 277.3125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 291.787109375, "per_token_policy_error_norm": 0.011071404442191124, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010249230079352856, "policy_entropy": 0.020456815138459206, "policy_entropy/max": 1.875, "policy_entropy/median": 1.618172973394394e-08, "policy_entropy/min": 1.3064636178450328e-17, "policy_entropy/p25": 1.0084022505907342e-10, "policy_entropy/p75": 3.159046173095703e-06, "policy_entropy/var": 0.010775617323815823, "policy_error_vector_variance/max_squared_error": 1.9991085529327393, "policy_error_vector_variance/metric": 0.011059438809752464, "policy_loss": -0.8333333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.14035087823867798, "policy_sharpness": 9.40796947479248, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.6804018020629883, "reward": 0.8333333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14035087823867798, "rewards/accuracy_reward": 0.8333333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14035087823867798, "sentence_full_gradient_variance/max_squared_error": 2732.59326171875, "sentence_full_gradient_variance/metric": 1080.7259521484375, "sentence_full_gradient_variance/p75": 1946.0411376953125, "sentence_full_gradient_variance/p90": 2732.5673828125, "sentence_full_gradient_variance/p95": 2732.5673828125, "sentence_full_gradient_variance/p99": 2732.568603515625, "state_level_variance/metric": 1.4377009868621826, "state_level_variance_full_gradient/metric": 874.0548706054688, "step": 51 }, { "accuracy_reward": 0.625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.23684212565422058, "action_level_variance/metric": 1.4395735263824463, "action_level_variance_full_gradient/metric": 199.52085876464844, "adam_stats/lr_effective_max": 4.9370763008482754e-05, "adam_stats/lr_effective_mean": -2.4692781153135e-10, "adam_stats/lr_effective_min": -4.934842581860721e-05, "adam_stats/m_t_max": 0.010419963859021664, "adam_stats/m_t_mean": 4.439551157253696e-11, "adam_stats/m_t_min": -0.021937333047389984, "adam_stats/v_t_max": 0.0003001581644639373, "adam_stats/v_t_mean": 2.014114049153637e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.625, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.23684212565422058, "all_logprobs": -0.017983322963118553, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -10.5, "all_logprobs/p1": -0.474609375, "all_logprobs/p10": -9.584426879882812e-05, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.0067138671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03392805904150009, "clip_ratio": 0.0, "completion_length": 547.6354370117188, "completion_length/correct": 401.2833557128906, "completion_length/correct/max": 950.0, "completion_length/correct/median": 335.0, "completion_length/correct/min": 184.0, "completion_length/correct/p25": 286.0, "completion_length/correct/p75": 509.5, "completion_length/correct/var": 29590.001953125, "completion_length/incorrect": 791.5555419921875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 802.0, "completion_length/incorrect/min": 24.0, "completion_length/incorrect/p25": 625.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 61913.5703125, "completion_length/max": 1024.0, "completion_length/median": 509.0, "completion_length/min": 24.0, "completion_length/p25": 311.25, "completion_length/p75": 758.75, "completion_length/var": 77261.203125, "epoch": 0.0832, "feature_vector_variance/max_squared_error": 76666.9609375, "feature_vector_variance/metric": 28278.443359375, "generated_tokens/total": 2734125.0, "grad_norm": 5.545817852020264, "learning_rate": 8.283963474507402e-06, "loss": -0.625, "mean_logprobs": -0.0206298828125, "mean_logprobs/var": 0.00019359588623046875, "num_completions/total": 4992, "per_sentence_gradient_norm": 1.7654556035995483, "per_sentence_gradient_norm/max": 7.35231351852417, "per_sentence_gradient_norm/median": 1.7636269330978394, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.985701084136963, "per_sentence_gradient_norm/p85": 3.546560049057007, "per_sentence_gradient_norm/p90": 4.034918308258057, "per_sentence_gradient_norm/p95": 4.703458786010742, "per_sentence_gradient_norm/p99": 6.561004161834717, "per_sentence_gradient_norm/var": 3.0865702629089355, "per_token_feature_norm": 185.4241943359375, "per_token_feature_norm/max": 264.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 93.5, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 291.9898376464844, "per_token_full_gradient_variance/max_squared_error": 0.5605823993682861, "per_token_full_gradient_variance/variance": 0.0021940332371741533, "per_token_gradient_norm": 1.149945616722107, "per_token_gradient_norm/max": 277.0703125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 154.91302490234375, "per_token_policy_error_norm": 0.010009221732616425, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009690173901617527, "policy_entropy": 0.01860843040049076, "policy_entropy/max": 1.96875, "policy_entropy/median": 8.009374141693115e-08, "policy_entropy/min": 3.1340219048409113e-19, "policy_entropy/p25": 4.802132025361061e-10, "policy_entropy/p75": 8.761882781982422e-06, "policy_entropy/var": 0.010473052971065044, "policy_error_vector_variance/max_squared_error": 2.0013110637664795, "policy_error_vector_variance/metric": 0.010004478506743908, "policy_loss": -0.625, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.23684212565422058, "policy_sharpness": 9.459824562072754, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.3772730827331543, "reward": 0.625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.23684212565422058, "rewards/accuracy_reward": 0.625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.23684212565422058, "sentence_full_gradient_variance/max_squared_error": 2808.49951171875, "sentence_full_gradient_variance/metric": 1350.208251953125, "sentence_full_gradient_variance/p75": 1794.133056640625, "sentence_full_gradient_variance/p90": 2027.4075927734375, "sentence_full_gradient_variance/p95": 2521.54345703125, "sentence_full_gradient_variance/p99": 2684.293212890625, "state_level_variance/metric": 1.9579548835754395, "state_level_variance_full_gradient/metric": 1150.6875, "step": 52 }, { "accuracy_reward": 0.8645833730697632, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.11831139773130417, "action_level_variance/metric": 1.1117792129516602, "action_level_variance_full_gradient/metric": 145.12185668945312, "adam_stats/lr_effective_max": 4.9449994548922405e-05, "adam_stats/lr_effective_mean": -1.9559209807340494e-10, "adam_stats/lr_effective_min": -4.9322003178531304e-05, "adam_stats/m_t_max": 0.010793983936309814, "adam_stats/m_t_mean": 6.94017482372189e-11, "adam_stats/m_t_min": -0.025114694610238075, "adam_stats/v_t_max": 0.0003004304599016905, "adam_stats/v_t_mean": 2.0300108816151408e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.8645833730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.11831139773130417, "all_logprobs": -0.015944622457027435, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.5, "all_logprobs/p1": -0.396484375, "all_logprobs/p10": -4.2438507080078125e-05, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.004791259765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.023250238969922066, "clip_ratio": 0.0, "completion_length": 538.03125, "completion_length/correct": 493.7228698730469, "completion_length/correct/max": 998.0, "completion_length/correct/median": 482.0, "completion_length/correct/min": 237.0, "completion_length/correct/p25": 299.5, "completion_length/correct/p75": 614.5, "completion_length/correct/var": 48104.71484375, "completion_length/incorrect": 820.923095703125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 6.0, "completion_length/incorrect/p25": 682.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 100419.75, "completion_length/max": 1024.0, "completion_length/median": 518.0, "completion_length/min": 6.0, "completion_length/p25": 315.25, "completion_length/p75": 649.5, "completion_length/var": 66872.984375, "epoch": 0.0848, "feature_vector_variance/max_squared_error": 79810.8359375, "feature_vector_variance/metric": 26682.486328125, "generated_tokens/total": 2785776.0, "grad_norm": 0.7473664879798889, "learning_rate": 8.02317355308094e-06, "loss": -0.8646, "mean_logprobs": -0.0294189453125, "mean_logprobs/var": 0.011474609375, "num_completions/total": 5088, "per_sentence_gradient_norm": 2.513883590698242, "per_sentence_gradient_norm/max": 8.836280822753906, "per_sentence_gradient_norm/median": 2.4753763675689697, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.5017865896224976, "per_sentence_gradient_norm/p75": 3.497621536254883, "per_sentence_gradient_norm/p85": 4.138235092163086, "per_sentence_gradient_norm/p90": 4.6247148513793945, "per_sentence_gradient_norm/p95": 5.347699165344238, "per_sentence_gradient_norm/p99": 7.409102439880371, "per_sentence_gradient_norm/var": 3.143873691558838, "per_token_feature_norm": 187.9955291748047, "per_token_feature_norm/max": 253.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 99.0, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 242.5562286376953, "per_token_full_gradient_variance/max_squared_error": 0.7855527997016907, "per_token_full_gradient_variance/variance": 0.0034186069387942553, "per_token_gradient_norm": 1.9894858598709106, "per_token_gradient_norm/max": 285.078125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 268.758056640625, "per_token_policy_error_norm": 0.009317493066191673, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009238405153155327, "policy_entropy": 0.017116213217377663, "policy_entropy/max": 1.78125, "policy_entropy/median": 1.4842953532934189e-08, "policy_entropy/min": 1.2265037076242269e-18, "policy_entropy/p25": 1.7507773009128869e-10, "policy_entropy/p75": 1.5720725059509277e-06, "policy_entropy/var": 0.008815428242087364, "policy_error_vector_variance/max_squared_error": 1.9853614568710327, "policy_error_vector_variance/metric": 0.009311433881521225, "policy_loss": -0.8645833730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.11831139773130417, "policy_sharpness": 9.513509750366211, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.0633697509765625, "reward": 0.8645833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.11831139773130417, "rewards/accuracy_reward": 0.8645833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.11831139773130417, "sentence_full_gradient_variance/max_squared_error": 2682.119140625, "sentence_full_gradient_variance/metric": 1173.19482421875, "sentence_full_gradient_variance/p75": 1547.0467529296875, "sentence_full_gradient_variance/p90": 2081.5849609375, "sentence_full_gradient_variance/p95": 2371.512451171875, "sentence_full_gradient_variance/p99": 2682.110595703125, "state_level_variance/metric": 2.3327109813690186, "state_level_variance_full_gradient/metric": 1028.072998046875, "step": 53 }, { "accuracy_reward": 0.6979166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21304824948310852, "action_level_variance/metric": 1.1498490571975708, "action_level_variance_full_gradient/metric": 55.63908386230469, "adam_stats/lr_effective_max": 4.802102193934843e-05, "adam_stats/lr_effective_mean": -1.8359830322722814e-10, "adam_stats/lr_effective_min": -4.826782242162153e-05, "adam_stats/m_t_max": 0.011039848439395428, "adam_stats/m_t_mean": 9.216549051727085e-11, "adam_stats/m_t_min": -0.02606947161257267, "adam_stats/v_t_max": 0.000300714309560135, "adam_stats/v_t_mean": 2.068877014149706e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6979166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.21304824948310852, "all_logprobs": -0.022212671115994453, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.625, "all_logprobs/p1": -0.6861330270767212, "all_logprobs/p10": -0.0004901885986328125, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.0184326171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03311982750892639, "clip_ratio": 0.0, "completion_length": 529.6458740234375, "completion_length/correct": 364.64178466796875, "completion_length/correct/max": 1023.0, "completion_length/correct/median": 360.0, "completion_length/correct/min": 171.0, "completion_length/correct/p25": 240.5, "completion_length/correct/p75": 395.0, "completion_length/correct/var": 32644.3828125, "completion_length/incorrect": 910.862060546875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 429.0, "completion_length/incorrect/p25": 844.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 36745.765625, "completion_length/max": 1024.0, "completion_length/median": 391.0, "completion_length/min": 171.0, "completion_length/p25": 281.5, "completion_length/p75": 853.5, "completion_length/var": 97073.9375, "epoch": 0.0864, "feature_vector_variance/max_squared_error": 77525.5859375, "feature_vector_variance/metric": 27395.3046875, "generated_tokens/total": 2836622.0, "grad_norm": 1.070591926574707, "learning_rate": 7.76174622526876e-06, "loss": -0.6979, "mean_logprobs": -0.021484375, "mean_logprobs/var": 0.00011491775512695312, "num_completions/total": 5184, "per_sentence_gradient_norm": 2.102778434753418, "per_sentence_gradient_norm/max": 7.268653869628906, "per_sentence_gradient_norm/median": 1.9907034635543823, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.5477206707000732, "per_sentence_gradient_norm/p85": 4.13662576675415, "per_sentence_gradient_norm/p90": 4.410449504852295, "per_sentence_gradient_norm/p95": 5.124215126037598, "per_sentence_gradient_norm/p99": 5.923561096191406, "per_sentence_gradient_norm/var": 3.2764744758605957, "per_token_feature_norm": 187.4049072265625, "per_token_feature_norm/max": 264.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 96.5, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 229.99917602539062, "per_token_full_gradient_variance/max_squared_error": 0.5415383577346802, "per_token_full_gradient_variance/variance": 0.0025465558283030987, "per_token_gradient_norm": 1.4526617527008057, "per_token_gradient_norm/max": 297.703125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 197.5286407470703, "per_token_policy_error_norm": 0.012859909795224667, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.012087705545127392, "policy_entropy": 0.02398996241390705, "policy_entropy/max": 2.75, "policy_entropy/median": 4.912726581096649e-08, "policy_entropy/min": 2.087089182034596e-18, "policy_entropy/p25": 4.4929038267582655e-10, "policy_entropy/p75": 7.033348083496094e-06, "policy_entropy/var": 0.013305137865245342, "policy_error_vector_variance/max_squared_error": 2.001767873764038, "policy_error_vector_variance/metric": 0.012838801369071007, "policy_loss": -0.6979166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.21304824948310852, "policy_sharpness": 9.329471588134766, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.114963531494141, "reward": 0.6979166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21304824948310852, "rewards/accuracy_reward": 0.6979166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21304824948310852, "sentence_full_gradient_variance/max_squared_error": 3483.879638671875, "sentence_full_gradient_variance/metric": 1441.310546875, "sentence_full_gradient_variance/p75": 1672.4404296875, "sentence_full_gradient_variance/p90": 2753.2431640625, "sentence_full_gradient_variance/p95": 2858.052490234375, "sentence_full_gradient_variance/p99": 3289.860107421875, "state_level_variance/metric": 2.4395198822021484, "state_level_variance_full_gradient/metric": 1385.67138671875, "step": 54 }, { "accuracy_reward": 0.6458333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2311403602361679, "action_level_variance/metric": 1.707336187362671, "action_level_variance_full_gradient/metric": 216.583984375, "adam_stats/lr_effective_max": 4.737305425805971e-05, "adam_stats/lr_effective_mean": -2.1614965373117911e-10, "adam_stats/lr_effective_min": -4.6046156057855114e-05, "adam_stats/m_t_max": 0.009813209064304829, "adam_stats/m_t_mean": 8.802315126787974e-11, "adam_stats/m_t_min": -0.022302856668829918, "adam_stats/v_t_max": 0.0003004746395163238, "adam_stats/v_t_mean": 2.0717833698613575e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6458333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.2311403602361679, "all_logprobs": -0.020380323752760887, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -4.84375, "all_logprobs/p1": -0.58203125, "all_logprobs/p10": -0.00055694580078125, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.019287109375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.028218558058142662, "clip_ratio": 0.0, "completion_length": 525.84375, "completion_length/correct": 431.1128845214844, "completion_length/correct/max": 943.0, "completion_length/correct/median": 366.0, "completion_length/correct/min": 230.0, "completion_length/correct/p25": 311.25, "completion_length/correct/p75": 490.5, "completion_length/correct/var": 31390.69140625, "completion_length/incorrect": 698.5882568359375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 663.0, "completion_length/incorrect/min": 333.0, "completion_length/incorrect/p25": 451.75, "completion_length/incorrect/p75": 941.25, "completion_length/incorrect/var": 63620.43359375, "completion_length/max": 1024.0, "completion_length/median": 437.0, "completion_length/min": 230.0, "completion_length/p25": 357.0, "completion_length/p75": 657.0, "completion_length/var": 58792.34765625, "epoch": 0.088, "feature_vector_variance/max_squared_error": 77409.0859375, "feature_vector_variance/metric": 27584.404296875, "generated_tokens/total": 2887103.0, "grad_norm": 0.4800097942352295, "learning_rate": 7.5e-06, "loss": -0.6458, "mean_logprobs": -0.0198974609375, "mean_logprobs/var": 0.0001239776611328125, "num_completions/total": 5280, "per_sentence_gradient_norm": 1.6324827671051025, "per_sentence_gradient_norm/max": 6.156660079956055, "per_sentence_gradient_norm/median": 1.730351209640503, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.4242472648620605, "per_sentence_gradient_norm/p85": 3.245640754699707, "per_sentence_gradient_norm/p90": 3.8191967010498047, "per_sentence_gradient_norm/p95": 4.42536735534668, "per_sentence_gradient_norm/p99": 5.278796672821045, "per_sentence_gradient_norm/var": 2.3880205154418945, "per_token_feature_norm": 187.9383087158203, "per_token_feature_norm/max": 248.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 93.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 213.6561737060547, "per_token_full_gradient_variance/max_squared_error": 0.5727161765098572, "per_token_full_gradient_variance/variance": 0.002795176347717643, "per_token_gradient_norm": 1.3834816217422485, "per_token_gradient_norm/max": 297.3046875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 187.76515197753906, "per_token_policy_error_norm": 0.011865115724503994, "per_token_policy_error_norm/max": 1.9375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011539807543158531, "policy_entropy": 0.02279931865632534, "policy_entropy/max": 1.640625, "policy_entropy/median": 3.306195139884949e-08, "policy_entropy/min": 1.2739375526704677e-18, "policy_entropy/p25": 1.7007550923153758e-10, "policy_entropy/p75": 7.450580596923828e-06, "policy_entropy/var": 0.011732648126780987, "policy_error_vector_variance/max_squared_error": 1.9431719779968262, "policy_error_vector_variance/metric": 0.011870667338371277, "policy_loss": -0.6458333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.2311403602361679, "policy_sharpness": 9.319785118103027, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.0754475593566895, "reward": 0.6458333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2311403602361679, "rewards/accuracy_reward": 0.6458333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2311403602361679, "sentence_full_gradient_variance/max_squared_error": 3174.673095703125, "sentence_full_gradient_variance/metric": 1026.4840087890625, "sentence_full_gradient_variance/p75": 1163.50537109375, "sentence_full_gradient_variance/p90": 1744.102294921875, "sentence_full_gradient_variance/p95": 2978.7861328125, "sentence_full_gradient_variance/p99": 3077.915283203125, "state_level_variance/metric": 0.9482470154762268, "state_level_variance_full_gradient/metric": 809.9000244140625, "step": 55 }, { "accuracy_reward": 0.71875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20427630841732025, "action_level_variance/metric": 0.6249637603759766, "action_level_variance_full_gradient/metric": 74.90141296386719, "adam_stats/lr_effective_max": 4.4781019823858514e-05, "adam_stats/lr_effective_mean": -1.9442833454341724e-10, "adam_stats/lr_effective_min": -4.587259900290519e-05, "adam_stats/m_t_max": 0.009177122265100479, "adam_stats/m_t_mean": 2.5930211103308842e-11, "adam_stats/m_t_min": -0.014652647078037262, "adam_stats/v_t_max": 0.0003048471116926521, "adam_stats/v_t_mean": 2.0995415475622003e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.71875, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.20427630841732025, "all_logprobs": -0.017427368089556694, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.0, "all_logprobs/p1": -0.4976956844329834, "all_logprobs/p10": -0.0001697540283203125, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.00860595703125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.02223648503422737, "clip_ratio": 0.0, "completion_length": 605.7708740234375, "completion_length/correct": 518.9420166015625, "completion_length/correct/max": 969.0, "completion_length/correct/median": 536.0, "completion_length/correct/min": 153.0, "completion_length/correct/p25": 357.0, "completion_length/correct/p75": 666.0, "completion_length/correct/var": 58040.8515625, "completion_length/incorrect": 827.6666870117188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 897.0, "completion_length/incorrect/min": 408.0, "completion_length/incorrect/p25": 680.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 51332.921875, "completion_length/max": 1024.0, "completion_length/median": 562.0, "completion_length/min": 153.0, "completion_length/p25": 410.25, "completion_length/p75": 842.5, "completion_length/var": 75063.8046875, "epoch": 0.0896, "feature_vector_variance/max_squared_error": 76190.3515625, "feature_vector_variance/metric": 26977.970703125, "generated_tokens/total": 2945257.0, "grad_norm": 0.8491635322570801, "learning_rate": 7.238253774731245e-06, "loss": -0.7188, "mean_logprobs": -0.01708984375, "mean_logprobs/var": 8.630752563476562e-05, "num_completions/total": 5376, "per_sentence_gradient_norm": 1.5213931798934937, "per_sentence_gradient_norm/max": 5.445921421051025, "per_sentence_gradient_norm/median": 1.5323195457458496, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.319075107574463, "per_sentence_gradient_norm/p85": 2.805887222290039, "per_sentence_gradient_norm/p90": 3.013038158416748, "per_sentence_gradient_norm/p95": 3.7261123657226562, "per_sentence_gradient_norm/p99": 4.570451259613037, "per_sentence_gradient_norm/var": 1.5832171440124512, "per_token_feature_norm": 186.0119171142578, "per_token_feature_norm/max": 253.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 93.5, "per_token_feature_norm/p25": 178.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 218.0665283203125, "per_token_full_gradient_variance/max_squared_error": 0.49370187520980835, "per_token_full_gradient_variance/variance": 0.002279876731336117, "per_token_gradient_norm": 1.191630244255066, "per_token_gradient_norm/max": 270.9375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 157.2744140625, "per_token_policy_error_norm": 0.010284217074513435, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009299582801759243, "policy_entropy": 0.019955433905124664, "policy_entropy/max": 2.640625, "policy_entropy/median": 3.306195139884949e-08, "policy_entropy/min": 2.710505431213761e-19, "policy_entropy/p25": 2.455635694786906e-10, "policy_entropy/p75": 3.933906555175781e-06, "policy_entropy/var": 0.010933784767985344, "policy_error_vector_variance/max_squared_error": 1.990471601486206, "policy_error_vector_variance/metric": 0.010274739004671574, "policy_loss": -0.71875, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.20427630841732025, "policy_sharpness": 9.432076454162598, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.4509732723236084, "reward": 0.71875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20427630841732025, "rewards/accuracy_reward": 0.71875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20427630841732025, "sentence_full_gradient_variance/max_squared_error": 4684.57080078125, "sentence_full_gradient_variance/metric": 1076.068603515625, "sentence_full_gradient_variance/p75": 926.3267211914062, "sentence_full_gradient_variance/p90": 3186.6357421875, "sentence_full_gradient_variance/p95": 4684.56640625, "sentence_full_gradient_variance/p99": 4684.57080078125, "state_level_variance/metric": 1.1125985383987427, "state_level_variance_full_gradient/metric": 1001.1671142578125, "step": 56 }, { "accuracy_reward": 0.6458333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2311403602361679, "action_level_variance/metric": 0.8255153894424438, "action_level_variance_full_gradient/metric": 50.607025146484375, "adam_stats/lr_effective_max": 4.250190977472812e-05, "adam_stats/lr_effective_mean": -1.3586763658590684e-10, "adam_stats/lr_effective_min": -4.219054608256556e-05, "adam_stats/m_t_max": 0.007896513678133488, "adam_stats/m_t_mean": 1.7422705639313918e-11, "adam_stats/m_t_min": -0.016776250675320625, "adam_stats/v_t_max": 0.0003052246756851673, "adam_stats/v_t_mean": 2.1240037506586873e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6458333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.2311403602361679, "all_logprobs": -0.019686127081513405, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.25, "all_logprobs/p1": -0.6328125, "all_logprobs/p10": -0.00020761461928486824, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.01416015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.027649791911244392, "clip_ratio": 0.0, "completion_length": 616.9166870117188, "completion_length/correct": 480.20965576171875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 411.0, "completion_length/correct/min": 255.0, "completion_length/correct/p25": 350.5, "completion_length/correct/p75": 596.5, "completion_length/correct/var": 34769.08203125, "completion_length/incorrect": 866.2058715820312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 450.0, "completion_length/incorrect/p25": 660.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 38467.62109375, "completion_length/max": 1024.0, "completion_length/median": 523.0, "completion_length/min": 255.0, "completion_length/p25": 391.0, "completion_length/p75": 871.25, "completion_length/var": 70126.171875, "epoch": 0.0912, "feature_vector_variance/max_squared_error": 88118.5703125, "feature_vector_variance/metric": 27243.505859375, "generated_tokens/total": 3004481.0, "grad_norm": 0.7835422158241272, "learning_rate": 6.976826446919061e-06, "loss": -0.6458, "mean_logprobs": -0.0194091796875, "mean_logprobs/var": 9.012222290039062e-05, "num_completions/total": 5472, "per_sentence_gradient_norm": 1.628315806388855, "per_sentence_gradient_norm/max": 6.0715718269348145, "per_sentence_gradient_norm/median": 1.6094914674758911, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.522822856903076, "per_sentence_gradient_norm/p85": 3.0116710662841797, "per_sentence_gradient_norm/p90": 3.334352970123291, "per_sentence_gradient_norm/p95": 4.126893520355225, "per_sentence_gradient_norm/p99": 5.781766891479492, "per_sentence_gradient_norm/var": 2.2900683879852295, "per_token_feature_norm": 185.92750549316406, "per_token_feature_norm/max": 266.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 93.0, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 230.3102569580078, "per_token_full_gradient_variance/max_squared_error": 0.8633973598480225, "per_token_full_gradient_variance/variance": 0.001999646658077836, "per_token_gradient_norm": 1.2035366296768188, "per_token_gradient_norm/max": 311.8203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 164.28158569335938, "per_token_policy_error_norm": 0.011720234528183937, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01102044153958559, "policy_entropy": 0.021116003394126892, "policy_entropy/max": 2.046875, "policy_entropy/median": 1.641456037759781e-08, "policy_entropy/min": 1.2265037076242269e-18, "policy_entropy/p25": 1.0959411156363785e-10, "policy_entropy/p75": 3.382563591003418e-06, "policy_entropy/var": 0.011240486986935139, "policy_error_vector_variance/max_squared_error": 1.9963836669921875, "policy_error_vector_variance/metric": 0.011712208390235901, "policy_loss": -0.6458333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.2311403602361679, "policy_sharpness": 9.401419639587402, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.6918551921844482, "reward": 0.6458333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2311403602361679, "rewards/accuracy_reward": 0.6458333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2311403602361679, "sentence_full_gradient_variance/max_squared_error": 2680.637451171875, "sentence_full_gradient_variance/metric": 1086.596923828125, "sentence_full_gradient_variance/p75": 1263.2708740234375, "sentence_full_gradient_variance/p90": 1926.009521484375, "sentence_full_gradient_variance/p95": 2364.31787109375, "sentence_full_gradient_variance/p99": 2581.117919921875, "state_level_variance/metric": 1.6842410564422607, "state_level_variance_full_gradient/metric": 1035.9898681640625, "step": 57 }, { "accuracy_reward": 0.6770833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.22094300389289856, "action_level_variance/metric": 1.9168548583984375, "action_level_variance_full_gradient/metric": 356.16229248046875, "adam_stats/lr_effective_max": 3.935094355256297e-05, "adam_stats/lr_effective_mean": -1.722277528926064e-11, "adam_stats/lr_effective_min": -4.004435322713107e-05, "adam_stats/m_t_max": 0.007681099697947502, "adam_stats/m_t_mean": 1.9128108819099765e-11, "adam_stats/m_t_min": -0.020713860169053078, "adam_stats/v_t_max": 0.0003051449020858854, "adam_stats/v_t_mean": 2.13372305934989e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6770833730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.22094300389289856, "all_logprobs": -0.022507643327116966, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.5, "all_logprobs/p1": -0.6594922542572021, "all_logprobs/p10": -0.000530242919921875, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.020648181438446045, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.032458074390888214, "clip_ratio": 0.0, "completion_length": 522.0625, "completion_length/correct": 477.78460693359375, "completion_length/correct/max": 984.0, "completion_length/correct/median": 462.0, "completion_length/correct/min": 187.0, "completion_length/correct/p25": 352.0, "completion_length/correct/p75": 572.0, "completion_length/correct/var": 35386.296875, "completion_length/incorrect": 614.9031982421875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 609.0, "completion_length/incorrect/min": 225.0, "completion_length/incorrect/p25": 325.0, "completion_length/incorrect/p75": 749.0, "completion_length/incorrect/var": 68668.2890625, "completion_length/max": 1024.0, "completion_length/median": 477.0, "completion_length/min": 187.0, "completion_length/p25": 340.0, "completion_length/p75": 613.25, "completion_length/var": 49677.98046875, "epoch": 0.0928, "feature_vector_variance/max_squared_error": 79068.46875, "feature_vector_variance/metric": 27744.310546875, "generated_tokens/total": 3054599.0, "grad_norm": 0.6344282031059265, "learning_rate": 6.7160365254926005e-06, "loss": -0.6771, "mean_logprobs": -0.02197265625, "mean_logprobs/var": 0.00010967254638671875, "num_completions/total": 5568, "per_sentence_gradient_norm": 1.9904260635375977, "per_sentence_gradient_norm/max": 6.029293060302734, "per_sentence_gradient_norm/median": 1.893987774848938, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.4433865547180176, "per_sentence_gradient_norm/p85": 4.014084815979004, "per_sentence_gradient_norm/p90": 4.393274307250977, "per_sentence_gradient_norm/p95": 4.785514831542969, "per_sentence_gradient_norm/p99": 5.742807388305664, "per_sentence_gradient_norm/var": 3.043470621109009, "per_token_feature_norm": 186.953369140625, "per_token_feature_norm/max": 249.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 97.5, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 227.83773803710938, "per_token_full_gradient_variance/max_squared_error": 0.6630939245223999, "per_token_full_gradient_variance/variance": 0.00299888476729393, "per_token_gradient_norm": 1.8055778741836548, "per_token_gradient_norm/max": 274.21875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 244.56649780273438, "per_token_policy_error_norm": 0.013190199621021748, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.012640350498259068, "policy_entropy": 0.024226592853665352, "policy_entropy/max": 2.609375, "policy_entropy/median": 3.166496753692627e-08, "policy_entropy/min": 8.74138001566438e-19, "policy_entropy/p25": 1.7007550923153758e-10, "policy_entropy/p75": 8.285045623779297e-06, "policy_entropy/var": 0.01324536930769682, "policy_error_vector_variance/max_squared_error": 1.9869335889816284, "policy_error_vector_variance/metric": 0.01317568775266409, "policy_loss": -0.6770833730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.22094300389289856, "policy_sharpness": 9.319890022277832, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.1200666427612305, "reward": 0.6770833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.22094300389289856, "rewards/accuracy_reward": 0.6770833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.22094300389289856, "sentence_full_gradient_variance/max_squared_error": 3109.41259765625, "sentence_full_gradient_variance/metric": 1129.5048828125, "sentence_full_gradient_variance/p75": 1502.0255126953125, "sentence_full_gradient_variance/p90": 1888.282958984375, "sentence_full_gradient_variance/p95": 2386.00927734375, "sentence_full_gradient_variance/p99": 2829.268310546875, "state_level_variance/metric": 1.4558396339416504, "state_level_variance_full_gradient/metric": 773.3426513671875, "step": 58 }, { "accuracy_reward": 0.6354166865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.23410087823867798, "action_level_variance/metric": 1.229945421218872, "action_level_variance_full_gradient/metric": 107.29537200927734, "adam_stats/lr_effective_max": 3.862054290948436e-05, "adam_stats/lr_effective_mean": 4.724504815145636e-11, "adam_stats/lr_effective_min": -3.8461392250610515e-05, "adam_stats/m_t_max": 0.00663268705829978, "adam_stats/m_t_mean": 3.8155319181942815e-11, "adam_stats/m_t_min": -0.014540910720825195, "adam_stats/v_t_max": 0.00030501355649903417, "adam_stats/v_t_mean": 2.1384564258264405e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6354166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.23410087823867798, "all_logprobs": -0.018048394471406937, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.15625, "all_logprobs/p1": -0.474609375, "all_logprobs/p10": -0.00022029876708984375, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.009765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.026626817882061005, "clip_ratio": 0.0, "completion_length": 547.65625, "completion_length/correct": 489.86883544921875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 413.0, "completion_length/correct/min": 130.0, "completion_length/correct/p25": 303.0, "completion_length/correct/p75": 677.0, "completion_length/correct/var": 45923.015625, "completion_length/incorrect": 648.3714599609375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 605.0, "completion_length/incorrect/min": 376.0, "completion_length/incorrect/p25": 491.0, "completion_length/incorrect/p75": 741.5, "completion_length/incorrect/var": 47374.29296875, "completion_length/max": 1024.0, "completion_length/median": 516.0, "completion_length/min": 130.0, "completion_length/p25": 377.0, "completion_length/p75": 686.75, "completion_length/var": 51840.36328125, "epoch": 0.0944, "feature_vector_variance/max_squared_error": 98326.7421875, "feature_vector_variance/metric": 27275.654296875, "generated_tokens/total": 3107174.0, "grad_norm": 0.5325861573219299, "learning_rate": 6.456201742799511e-06, "loss": -0.6354, "mean_logprobs": -0.017578125, "mean_logprobs/var": 8.96453857421875e-05, "num_completions/total": 5664, "per_sentence_gradient_norm": 1.5931028127670288, "per_sentence_gradient_norm/max": 9.102599143981934, "per_sentence_gradient_norm/median": 1.495774269104004, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.515897274017334, "per_sentence_gradient_norm/p85": 2.9419822692871094, "per_sentence_gradient_norm/p90": 3.3920795917510986, "per_sentence_gradient_norm/p95": 4.264031410217285, "per_sentence_gradient_norm/p99": 5.564269542694092, "per_sentence_gradient_norm/var": 2.605781078338623, "per_token_feature_norm": 187.32054138183594, "per_token_feature_norm/max": 260.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 95.0, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 221.0816192626953, "per_token_full_gradient_variance/max_squared_error": 0.5330458283424377, "per_token_full_gradient_variance/variance": 0.0025767716579139233, "per_token_gradient_norm": 1.4217623472213745, "per_token_gradient_norm/max": 309.0546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 186.3926239013672, "per_token_policy_error_norm": 0.010458793491125107, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010169991292059422, "policy_entropy": 0.019508028402924538, "policy_entropy/max": 1.828125, "policy_entropy/median": 3.3527612686157227e-08, "policy_entropy/min": 2.456395547037471e-19, "policy_entropy/p25": 1.4279066817834973e-10, "policy_entropy/p75": 5.0067901611328125e-06, "policy_entropy/var": 0.010048452764749527, "policy_error_vector_variance/max_squared_error": 1.9896758794784546, "policy_error_vector_variance/metric": 0.010456804186105728, "policy_loss": -0.6354166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.23410087823867798, "policy_sharpness": 9.418543815612793, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.4901576042175293, "reward": 0.6354166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.23410087823867798, "rewards/accuracy_reward": 0.6354166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.23410087823867798, "sentence_full_gradient_variance/max_squared_error": 3900.52001953125, "sentence_full_gradient_variance/metric": 1197.8206787109375, "sentence_full_gradient_variance/p75": 1242.6494140625, "sentence_full_gradient_variance/p90": 2526.767578125, "sentence_full_gradient_variance/p95": 2700.9384765625, "sentence_full_gradient_variance/p99": 3002.991455078125, "state_level_variance/metric": 1.6390200853347778, "state_level_variance_full_gradient/metric": 1090.525390625, "step": 59 }, { "accuracy_reward": 0.7395833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19462719559669495, "action_level_variance/metric": 1.5733520984649658, "action_level_variance_full_gradient/metric": 330.21917724609375, "adam_stats/lr_effective_max": 3.778645987040363e-05, "adam_stats/lr_effective_mean": 4.6790477742364445e-11, "adam_stats/lr_effective_min": -3.618535629357211e-05, "adam_stats/m_t_max": 0.005652598571032286, "adam_stats/m_t_mean": 5.059308874622026e-11, "adam_stats/m_t_min": -0.010975002311170101, "adam_stats/v_t_max": 0.00030558844446204603, "adam_stats/v_t_mean": 2.1474377831509628e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.7395833730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.19462719559669495, "all_logprobs": -0.018002528697252274, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.5, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.00020313262939453125, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.00933837890625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.024119574576616287, "clip_ratio": 0.0, "completion_length": 514.0104370117188, "completion_length/correct": 476.81689453125, "completion_length/correct/max": 1023.0, "completion_length/correct/median": 396.0, "completion_length/correct/min": 226.0, "completion_length/correct/p25": 305.0, "completion_length/correct/p75": 615.0, "completion_length/correct/var": 47180.296875, "completion_length/incorrect": 619.6400146484375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 538.0, "completion_length/incorrect/min": 328.0, "completion_length/incorrect/p25": 426.0, "completion_length/incorrect/p75": 856.0, "completion_length/incorrect/var": 58678.40625, "completion_length/max": 1024.0, "completion_length/median": 450.0, "completion_length/min": 226.0, "completion_length/p25": 333.75, "completion_length/p75": 628.0, "completion_length/var": 53558.53125, "epoch": 0.096, "feature_vector_variance/max_squared_error": 82735.21875, "feature_vector_variance/metric": 27555.982421875, "generated_tokens/total": 3156519.0, "grad_norm": 0.6246539354324341, "learning_rate": 6.197638667498023e-06, "loss": -0.7396, "mean_logprobs": -0.0179443359375, "mean_logprobs/var": 0.0001010894775390625, "num_completions/total": 5760, "per_sentence_gradient_norm": 1.9158185720443726, "per_sentence_gradient_norm/max": 7.0289387702941895, "per_sentence_gradient_norm/median": 1.6046087741851807, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.0483789443969727, "per_sentence_gradient_norm/p85": 3.6530089378356934, "per_sentence_gradient_norm/p90": 4.259465217590332, "per_sentence_gradient_norm/p95": 5.147843360900879, "per_sentence_gradient_norm/p99": 5.434247970581055, "per_sentence_gradient_norm/var": 2.703674077987671, "per_token_feature_norm": 186.08053588867188, "per_token_feature_norm/max": 246.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 95.0, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 211.54896545410156, "per_token_full_gradient_variance/max_squared_error": 0.5693877935409546, "per_token_full_gradient_variance/variance": 0.0032454824540764093, "per_token_gradient_norm": 1.7449713945388794, "per_token_gradient_norm/max": 289.0078125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 232.09373474121094, "per_token_policy_error_norm": 0.010676977224647999, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.00983455777168274, "policy_entropy": 0.02017154172062874, "policy_entropy/max": 2.03125, "policy_entropy/median": 2.852175384759903e-08, "policy_entropy/min": 7.535205098774256e-18, "policy_entropy/p25": 1.77351466845721e-10, "policy_entropy/p75": 5.3942203521728516e-06, "policy_entropy/var": 0.010949034243822098, "policy_error_vector_variance/max_squared_error": 2.0012335777282715, "policy_error_vector_variance/metric": 0.01066293753683567, "policy_loss": -0.7395833730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.19462719559669495, "policy_sharpness": 9.424521446228027, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.4939444065093994, "reward": 0.7395833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19462719559669495, "rewards/accuracy_reward": 0.7395833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19462719559669495, "sentence_full_gradient_variance/max_squared_error": 2712.031494140625, "sentence_full_gradient_variance/metric": 1347.3089599609375, "sentence_full_gradient_variance/p75": 1811.88330078125, "sentence_full_gradient_variance/p90": 2291.31689453125, "sentence_full_gradient_variance/p95": 2517.964599609375, "sentence_full_gradient_variance/p99": 2596.00537109375, "state_level_variance/metric": 1.416903018951416, "state_level_variance_full_gradient/metric": 1017.0897827148438, "step": 60 }, { "accuracy_reward": 0.8020833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16041667759418488, "action_level_variance/metric": 1.4693588018417358, "action_level_variance_full_gradient/metric": 184.1725311279297, "adam_stats/lr_effective_max": 3.4322663850616664e-05, "adam_stats/lr_effective_mean": 1.096301695258095e-11, "adam_stats/lr_effective_min": -3.3860116673167795e-05, "adam_stats/m_t_max": 0.004931497387588024, "adam_stats/m_t_mean": 7.836876286804184e-11, "adam_stats/m_t_min": -0.007045470178127289, "adam_stats/v_t_max": 0.00030531681841239333, "adam_stats/v_t_mean": 2.1521217100084478e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.8020833730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.16041667759418488, "all_logprobs": -0.020466987043619156, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.875, "all_logprobs/p1": -0.5814454555511475, "all_logprobs/p10": -0.0002613067626953125, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.0141754150390625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.028181448578834534, "clip_ratio": 0.0, "completion_length": 544.9583740234375, "completion_length/correct": 498.2727355957031, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 430.0, "completion_length/correct/min": 172.0, "completion_length/correct/p25": 367.0, "completion_length/correct/p75": 623.0, "completion_length/correct/var": 49436.7265625, "completion_length/incorrect": 734.1578979492188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 774.0, "completion_length/incorrect/min": 391.0, "completion_length/incorrect/p25": 436.5, "completion_length/incorrect/p75": 996.5, "completion_length/incorrect/var": 68743.25, "completion_length/max": 1024.0, "completion_length/median": 451.0, "completion_length/min": 172.0, "completion_length/p25": 390.75, "completion_length/p75": 746.75, "completion_length/var": 61500.29296875, "epoch": 0.0976, "feature_vector_variance/max_squared_error": 81474.703125, "feature_vector_variance/metric": 27760.748046875, "generated_tokens/total": 3208835.0, "grad_norm": 0.6155907511711121, "learning_rate": 5.9406623188668065e-06, "loss": -0.8021, "mean_logprobs": -0.0218505859375, "mean_logprobs/var": 0.00017261505126953125, "num_completions/total": 5856, "per_sentence_gradient_norm": 2.571720600128174, "per_sentence_gradient_norm/max": 7.7490129470825195, "per_sentence_gradient_norm/median": 2.3592944145202637, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 4.228668689727783, "per_sentence_gradient_norm/p85": 4.7890167236328125, "per_sentence_gradient_norm/p90": 5.277907371520996, "per_sentence_gradient_norm/p95": 6.121644496917725, "per_sentence_gradient_norm/p99": 7.35227108001709, "per_sentence_gradient_norm/var": 4.394619464874268, "per_token_feature_norm": 186.95677185058594, "per_token_feature_norm/max": 268.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 92.0, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 229.46212768554688, "per_token_full_gradient_variance/max_squared_error": 0.46005865931510925, "per_token_full_gradient_variance/variance": 0.003398180240765214, "per_token_gradient_norm": 2.130890130996704, "per_token_gradient_norm/max": 300.9375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 280.065673828125, "per_token_policy_error_norm": 0.012097794562578201, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011369018815457821, "policy_entropy": 0.022304227575659752, "policy_entropy/max": 2.046875, "policy_entropy/median": 2.8870999813079834e-08, "policy_entropy/min": 1.3145951341386741e-18, "policy_entropy/p25": 1.546140993013978e-10, "policy_entropy/p75": 5.125999450683594e-06, "policy_entropy/var": 0.012245780788362026, "policy_error_vector_variance/max_squared_error": 1.9880242347717285, "policy_error_vector_variance/metric": 0.012079074047505856, "policy_loss": -0.8020833730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.16041667759418488, "policy_sharpness": 9.390865325927734, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.7416832447052, "reward": 0.8020833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16041667759418488, "rewards/accuracy_reward": 0.8020833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16041667759418488, "sentence_full_gradient_variance/max_squared_error": 1943.1470947265625, "sentence_full_gradient_variance/metric": 931.7880859375, "sentence_full_gradient_variance/p75": 1850.3529052734375, "sentence_full_gradient_variance/p90": 1850.3538818359375, "sentence_full_gradient_variance/p95": 1850.3577880859375, "sentence_full_gradient_variance/p99": 1924.5516357421875, "state_level_variance/metric": 3.3416216373443604, "state_level_variance_full_gradient/metric": 747.6156005859375, "step": 61 }, { "accuracy_reward": 0.625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2368421107530594, "action_level_variance/metric": 2.4449853897094727, "action_level_variance_full_gradient/metric": 390.0960388183594, "adam_stats/lr_effective_max": 3.3016520319506526e-05, "adam_stats/lr_effective_mean": 7.372815552519896e-11, "adam_stats/lr_effective_min": -3.415480750845745e-05, "adam_stats/m_t_max": 0.0068060364574193954, "adam_stats/m_t_mean": 8.235993137484243e-11, "adam_stats/m_t_min": -0.0071771047078073025, "adam_stats/v_t_max": 0.0003050190571229905, "adam_stats/v_t_mean": 2.17067527147341e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.625, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.2368421107530594, "all_logprobs": -0.021266086027026176, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.75, "all_logprobs/p1": -0.5859375, "all_logprobs/p10": -0.00032806396484375, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.0181884765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.030521580949425697, "clip_ratio": 0.0, "completion_length": 567.3021240234375, "completion_length/correct": 499.5333557128906, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 467.0, "completion_length/correct/min": 259.0, "completion_length/correct/p25": 328.0, "completion_length/correct/p75": 630.0, "completion_length/correct/var": 37843.98046875, "completion_length/incorrect": 680.25, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 649.0, "completion_length/incorrect/min": 287.0, "completion_length/incorrect/p25": 401.5, "completion_length/incorrect/p75": 997.75, "completion_length/incorrect/var": 79735.8515625, "completion_length/max": 1024.0, "completion_length/median": 516.0, "completion_length/min": 259.0, "completion_length/p25": 329.5, "completion_length/p75": 704.25, "completion_length/var": 60614.37890625, "epoch": 0.0992, "feature_vector_variance/max_squared_error": 77244.1796875, "feature_vector_variance/metric": 27997.71484375, "generated_tokens/total": 3263296.0, "grad_norm": 0.7381013631820679, "learning_rate": 5.685585783002493e-06, "loss": -0.625, "mean_logprobs": -0.02099609375, "mean_logprobs/var": 8.821487426757812e-05, "num_completions/total": 5952, "per_sentence_gradient_norm": 1.9150581359863281, "per_sentence_gradient_norm/max": 7.022394180297852, "per_sentence_gradient_norm/median": 1.9022067785263062, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.159576416015625, "per_sentence_gradient_norm/p85": 3.7801895141601562, "per_sentence_gradient_norm/p90": 4.485874176025391, "per_sentence_gradient_norm/p95": 4.756157875061035, "per_sentence_gradient_norm/p99": 5.322238922119141, "per_sentence_gradient_norm/var": 3.1078271865844727, "per_token_feature_norm": 188.80320739746094, "per_token_feature_norm/max": 250.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 92.5, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 211.59117126464844, "per_token_full_gradient_variance/max_squared_error": 0.6434217095375061, "per_token_full_gradient_variance/variance": 0.002713280264288187, "per_token_gradient_norm": 1.696297287940979, "per_token_gradient_norm/max": 277.9453125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 228.4420623779297, "per_token_policy_error_norm": 0.012496685609221458, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011934464797377586, "policy_entropy": 0.022779621183872223, "policy_entropy/max": 1.796875, "policy_entropy/median": 2.0954757928848267e-08, "policy_entropy/min": 6.640738306473715e-19, "policy_entropy/p25": 1.1323209037072957e-10, "policy_entropy/p75": 3.933906555175781e-06, "policy_entropy/var": 0.011884603649377823, "policy_error_vector_variance/max_squared_error": 1.999090313911438, "policy_error_vector_variance/metric": 0.012494444847106934, "policy_loss": -0.625, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.2368421107530594, "policy_sharpness": 9.368481636047363, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.8756673336029053, "reward": 0.625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2368421107530594, "rewards/accuracy_reward": 0.625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2368421107530594, "sentence_full_gradient_variance/max_squared_error": 2667.52880859375, "sentence_full_gradient_variance/metric": 950.5506591796875, "sentence_full_gradient_variance/p75": 1162.697998046875, "sentence_full_gradient_variance/p90": 1502.2349853515625, "sentence_full_gradient_variance/p95": 1989.00390625, "sentence_full_gradient_variance/p99": 2470.919189453125, "state_level_variance/metric": 1.0211913585662842, "state_level_variance_full_gradient/metric": 560.45458984375, "step": 62 }, { "accuracy_reward": 0.8854166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.10252192616462708, "action_level_variance/metric": 0.8398014903068542, "action_level_variance_full_gradient/metric": 118.79776000976562, "adam_stats/lr_effective_max": 3.1568102713208646e-05, "adam_stats/lr_effective_mean": -3.521744007528582e-11, "adam_stats/lr_effective_min": -3.272192770964466e-05, "adam_stats/m_t_max": 0.010385449975728989, "adam_stats/m_t_mean": 7.243159544367828e-11, "adam_stats/m_t_min": -0.01596473902463913, "adam_stats/v_t_max": 0.0003051196981687099, "adam_stats/v_t_mean": 2.2173091487887042e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.8854166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.10252192616462708, "all_logprobs": -0.016481712460517883, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.5, "all_logprobs/p1": -0.474609375, "all_logprobs/p10": -0.00019359588623046875, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.0067138671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.022463694214820862, "clip_ratio": 0.0, "completion_length": 454.03125, "completion_length/correct": 401.1647033691406, "completion_length/correct/max": 896.0, "completion_length/correct/median": 344.0, "completion_length/correct/min": 192.0, "completion_length/correct/p25": 281.0, "completion_length/correct/p75": 482.0, "completion_length/correct/var": 25134.470703125, "completion_length/incorrect": 862.5454711914062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 393.0, "completion_length/incorrect/p25": 749.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 76794.265625, "completion_length/max": 1024.0, "completion_length/median": 404.0, "completion_length/min": 192.0, "completion_length/p25": 287.75, "completion_length/p75": 502.75, "completion_length/var": 52131.84375, "epoch": 0.1008, "feature_vector_variance/max_squared_error": 80931.5703125, "feature_vector_variance/metric": 27887.927734375, "generated_tokens/total": 3306883.0, "grad_norm": 1.6849066019058228, "learning_rate": 5.432719831372507e-06, "loss": -0.8854, "mean_logprobs": -0.01611328125, "mean_logprobs/var": 9.679794311523438e-05, "num_completions/total": 6048, "per_sentence_gradient_norm": 2.031611680984497, "per_sentence_gradient_norm/max": 6.252163887023926, "per_sentence_gradient_norm/median": 1.8856334686279297, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.052924394607544, "per_sentence_gradient_norm/p75": 2.948113203048706, "per_sentence_gradient_norm/p85": 3.3387203216552734, "per_sentence_gradient_norm/p90": 3.6374993324279785, "per_sentence_gradient_norm/p95": 4.74458646774292, "per_sentence_gradient_norm/p99": 5.509091854095459, "per_sentence_gradient_norm/var": 1.998164415359497, "per_token_feature_norm": 187.51583862304688, "per_token_feature_norm/max": 258.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 98.0, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 253.53366088867188, "per_token_full_gradient_variance/max_squared_error": 0.6486019492149353, "per_token_full_gradient_variance/variance": 0.0032685829792171717, "per_token_gradient_norm": 1.7170902490615845, "per_token_gradient_norm/max": 297.46875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 232.4082794189453, "per_token_policy_error_norm": 0.009790673851966858, "per_token_policy_error_norm/max": 1.9765625, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009409895166754723, "policy_entropy": 0.017904294654726982, "policy_entropy/max": 1.640625, "policy_entropy/median": 9.953510016202927e-09, "policy_entropy/min": 5.55653613398821e-19, "policy_entropy/p25": 7.185008144006133e-11, "policy_entropy/p75": 1.8402934074401855e-06, "policy_entropy/var": 0.008779551833868027, "policy_error_vector_variance/max_squared_error": 1.9772125482559204, "policy_error_vector_variance/metric": 0.009791201911866665, "policy_loss": -0.8854166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.10252192616462708, "policy_sharpness": 9.446868896484375, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.28462290763855, "reward": 0.8854166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.10252192616462708, "rewards/accuracy_reward": 0.8854166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.10252192616462708, "sentence_full_gradient_variance/max_squared_error": 2165.572509765625, "sentence_full_gradient_variance/metric": 862.793212890625, "sentence_full_gradient_variance/p75": 1436.88232421875, "sentence_full_gradient_variance/p90": 1903.6988525390625, "sentence_full_gradient_variance/p95": 1903.6988525390625, "sentence_full_gradient_variance/p99": 1916.84423828125, "state_level_variance/metric": 1.355480432510376, "state_level_variance_full_gradient/metric": 743.9954833984375, "step": 63 }, { "accuracy_reward": 0.8541666865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.12587720155715942, "action_level_variance/metric": 0.9019883871078491, "action_level_variance_full_gradient/metric": 101.62456512451172, "adam_stats/lr_effective_max": 2.9365506634349003e-05, "adam_stats/lr_effective_mean": 5.574105413352903e-12, "adam_stats/lr_effective_min": -2.8845668566646054e-05, "adam_stats/m_t_max": 0.0067587606608867645, "adam_stats/m_t_mean": 6.868334373466567e-11, "adam_stats/m_t_min": -0.015844428911805153, "adam_stats/v_t_max": 0.0003056743007618934, "adam_stats/v_t_mean": 2.2590606468808616e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.8541666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.12587720155715942, "all_logprobs": -0.018488625064492226, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.125, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.0001468658447265625, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.00860595703125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.025873927399516106, "clip_ratio": 0.0, "completion_length": 579.6771240234375, "completion_length/correct": 510.5731506347656, "completion_length/correct/max": 969.0, "completion_length/correct/median": 484.0, "completion_length/correct/min": 234.0, "completion_length/correct/p25": 428.0, "completion_length/correct/p75": 586.25, "completion_length/correct/var": 26955.30859375, "completion_length/incorrect": 984.4285888671875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 619.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 11812.8798828125, "completion_length/max": 1024.0, "completion_length/median": 489.0, "completion_length/min": 234.0, "completion_length/p25": 439.0, "completion_length/p75": 644.5, "completion_length/var": 52863.7890625, "epoch": 0.1024, "feature_vector_variance/max_squared_error": 76623.5390625, "feature_vector_variance/metric": 27129.103515625, "generated_tokens/total": 3362532.0, "grad_norm": 1.207336187362671, "learning_rate": 5.182372542187895e-06, "loss": -0.8542, "mean_logprobs": -0.0191650390625, "mean_logprobs/var": 0.00010156631469726562, "num_completions/total": 6144, "per_sentence_gradient_norm": 2.4759249687194824, "per_sentence_gradient_norm/max": 8.332606315612793, "per_sentence_gradient_norm/median": 2.3324036598205566, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.7080376148223877, "per_sentence_gradient_norm/p75": 3.0304393768310547, "per_sentence_gradient_norm/p85": 3.9940757751464844, "per_sentence_gradient_norm/p90": 4.517845153808594, "per_sentence_gradient_norm/p95": 5.29376220703125, "per_sentence_gradient_norm/p99": 7.863757610321045, "per_sentence_gradient_norm/var": 2.7502758502960205, "per_token_feature_norm": 185.3780059814453, "per_token_feature_norm/max": 256.0, "per_token_feature_norm/median": 186.0, "per_token_feature_norm/min": 94.0, "per_token_feature_norm/p25": 178.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 195.90890502929688, "per_token_full_gradient_variance/max_squared_error": 0.598414957523346, "per_token_full_gradient_variance/variance": 0.0034809247590601444, "per_token_gradient_norm": 2.0792696475982666, "per_token_gradient_norm/max": 303.140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 275.7426452636719, "per_token_policy_error_norm": 0.010964194312691689, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010466353967785835, "policy_entropy": 0.019800664857029915, "policy_entropy/max": 1.5390625, "policy_entropy/median": 1.4901161193847656e-08, "policy_entropy/min": 5.454892180317694e-19, "policy_entropy/p25": 1.064108801074326e-10, "policy_entropy/p75": 2.0712614059448242e-06, "policy_entropy/var": 0.010213756933808327, "policy_error_vector_variance/max_squared_error": 1.9902526140213013, "policy_error_vector_variance/metric": 0.010956985875964165, "policy_loss": -0.8541666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.12587720155715942, "policy_sharpness": 9.44046688079834, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.4134395122528076, "reward": 0.8541666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.12587720155715942, "rewards/accuracy_reward": 0.8541666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.12587720155715942, "sentence_full_gradient_variance/max_squared_error": 2593.26416015625, "sentence_full_gradient_variance/metric": 722.9041748046875, "sentence_full_gradient_variance/p75": 1266.2783203125, "sentence_full_gradient_variance/p90": 1274.49267578125, "sentence_full_gradient_variance/p95": 1883.8428955078125, "sentence_full_gradient_variance/p99": 2377.9609375, "state_level_variance/metric": 2.1080586910247803, "state_level_variance_full_gradient/metric": 621.2796630859375, "step": 64 }, { "accuracy_reward": 0.9479166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.04989034682512283, "action_level_variance/metric": 1.5372064113616943, "action_level_variance_full_gradient/metric": 194.77145385742188, "adam_stats/lr_effective_max": 2.7793654226115905e-05, "adam_stats/lr_effective_mean": -1.0071023008595414e-11, "adam_stats/lr_effective_min": -2.741692151175812e-05, "adam_stats/m_t_max": 0.00627239141613245, "adam_stats/m_t_mean": 7.663907702903927e-11, "adam_stats/m_t_min": -0.013069800101220608, "adam_stats/v_t_max": 0.00030669188708998263, "adam_stats/v_t_mean": 2.277634851555188e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.9479166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.04989034682512283, "all_logprobs": -0.017927948385477066, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.03125, "all_logprobs/p1": -0.482421875, "all_logprobs/p10": -0.00016736984252929688, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.00860595703125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.025479499250650406, "clip_ratio": 0.0, "completion_length": 458.29168701171875, "completion_length/correct": 456.74725341796875, "completion_length/correct/max": 896.0, "completion_length/correct/median": 442.0, "completion_length/correct/min": 254.0, "completion_length/correct/p25": 356.5, "completion_length/correct/p75": 534.0, "completion_length/correct/var": 16473.322265625, "completion_length/incorrect": 486.3999938964844, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 363.0, "completion_length/incorrect/min": 313.0, "completion_length/incorrect/p25": 355.0, "completion_length/incorrect/p75": 377.0, "completion_length/incorrect/var": 90885.796875, "completion_length/max": 1024.0, "completion_length/median": 442.0, "completion_length/min": 254.0, "completion_length/p25": 354.75, "completion_length/p75": 534.0, "completion_length/var": 19476.9453125, "epoch": 0.104, "feature_vector_variance/max_squared_error": 73846.390625, "feature_vector_variance/metric": 26953.146484375, "generated_tokens/total": 3406528.0, "grad_norm": 0.7803559899330139, "learning_rate": 4.934848925057485e-06, "loss": -0.9479, "mean_logprobs": -0.0179443359375, "mean_logprobs/var": 0.00013828277587890625, "num_completions/total": 6240, "per_sentence_gradient_norm": 2.4120378494262695, "per_sentence_gradient_norm/max": 6.941617965698242, "per_sentence_gradient_norm/median": 2.3227665424346924, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.3697938919067383, "per_sentence_gradient_norm/p75": 3.2400546073913574, "per_sentence_gradient_norm/p85": 3.920563220977783, "per_sentence_gradient_norm/p90": 4.703217506408691, "per_sentence_gradient_norm/p95": 5.28184700012207, "per_sentence_gradient_norm/p99": 6.334230899810791, "per_sentence_gradient_norm/var": 2.406714916229248, "per_token_feature_norm": 186.48886108398438, "per_token_feature_norm/max": 268.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 95.0, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 224.2503204345703, "per_token_full_gradient_variance/max_squared_error": 0.6597545742988586, "per_token_full_gradient_variance/variance": 0.004362183157354593, "per_token_gradient_norm": 2.3927133083343506, "per_token_gradient_norm/max": 310.78125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 317.0140380859375, "per_token_policy_error_norm": 0.010554137639701366, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010379314422607422, "policy_entropy": 0.0191663708537817, "policy_entropy/max": 2.359375, "policy_entropy/median": 1.2514647096395493e-08, "policy_entropy/min": 5.21772295508649e-19, "policy_entropy/p25": 1.0049916454590857e-10, "policy_entropy/p75": 2.253800630569458e-06, "policy_entropy/var": 0.009830156341195107, "policy_error_vector_variance/max_squared_error": 1.973224401473999, "policy_error_vector_variance/metric": 0.010550973936915398, "policy_loss": -0.9479166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.04989034682512283, "policy_sharpness": 9.436819076538086, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.3956222534179688, "reward": 0.9479166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.04989034682512283, "rewards/accuracy_reward": 0.9479166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.04989034682512283, "sentence_full_gradient_variance/max_squared_error": 1396.9644775390625, "sentence_full_gradient_variance/metric": 480.0782165527344, "sentence_full_gradient_variance/p75": 888.1062622070312, "sentence_full_gradient_variance/p90": 1173.54150390625, "sentence_full_gradient_variance/p95": 1176.4349365234375, "sentence_full_gradient_variance/p99": 1278.4046630859375, "state_level_variance/metric": 1.1308246850967407, "state_level_variance_full_gradient/metric": 285.3067626953125, "step": 65 }, { "accuracy_reward": 0.7291666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19956138730049133, "action_level_variance/metric": 1.0315783023834229, "action_level_variance_full_gradient/metric": 169.2181854248047, "adam_stats/lr_effective_max": 2.6086509024025872e-05, "adam_stats/lr_effective_mean": -9.344108820030783e-12, "adam_stats/lr_effective_min": -2.664176827238407e-05, "adam_stats/m_t_max": 0.00742737902328372, "adam_stats/m_t_mean": 9.701019848540327e-11, "adam_stats/m_t_min": -0.012165652588009834, "adam_stats/v_t_max": 0.0003078356967307627, "adam_stats/v_t_mean": 2.2957391196398724e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.7291666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.19956138730049133, "all_logprobs": -0.01522637065500021, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.75, "all_logprobs/p1": -0.40625, "all_logprobs/p10": -4.75406413897872e-05, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.004187010228633881, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.021291812881827354, "clip_ratio": 0.0, "completion_length": 590.1771240234375, "completion_length/correct": 484.9571533203125, "completion_length/correct/max": 888.0, "completion_length/correct/median": 457.0, "completion_length/correct/min": 168.0, "completion_length/correct/p25": 352.25, "completion_length/correct/p75": 595.25, "completion_length/correct/var": 28208.068359375, "completion_length/incorrect": 873.4615478515625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 897.0, "completion_length/incorrect/min": 407.0, "completion_length/incorrect/p25": 759.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 31635.859375, "completion_length/max": 1024.0, "completion_length/median": 542.0, "completion_length/min": 168.0, "completion_length/p25": 387.5, "completion_length/p75": 749.0, "completion_length/var": 58934.125, "epoch": 0.1056, "feature_vector_variance/max_squared_error": 75096.890625, "feature_vector_variance/metric": 27207.109375, "generated_tokens/total": 3463185.0, "grad_norm": 0.7076598405838013, "learning_rate": 4.6904505493806595e-06, "loss": -0.7292, "mean_logprobs": -0.01513671875, "mean_logprobs/var": 6.628036499023438e-05, "num_completions/total": 6336, "per_sentence_gradient_norm": 1.5253932476043701, "per_sentence_gradient_norm/max": 5.5835065841674805, "per_sentence_gradient_norm/median": 1.508421540260315, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.338315486907959, "per_sentence_gradient_norm/p85": 2.9354095458984375, "per_sentence_gradient_norm/p90": 3.4068002700805664, "per_sentence_gradient_norm/p95": 3.85910964012146, "per_sentence_gradient_norm/p99": 4.223379135131836, "per_sentence_gradient_norm/var": 1.6838065385818481, "per_token_feature_norm": 186.44717407226562, "per_token_feature_norm/max": 256.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 94.5, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 220.43746948242188, "per_token_full_gradient_variance/max_squared_error": 0.5035250186920166, "per_token_full_gradient_variance/variance": 0.002155490219593048, "per_token_gradient_norm": 1.2233480215072632, "per_token_gradient_norm/max": 286.828125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 160.4053955078125, "per_token_policy_error_norm": 0.00894262082874775, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.00842623133212328, "policy_entropy": 0.016774777323007584, "policy_entropy/max": 1.7578125, "policy_entropy/median": 1.0069925338029861e-08, "policy_entropy/min": 1.0299920638612292e-18, "policy_entropy/p25": 6.957634468562901e-11, "policy_entropy/p75": 1.3932585716247559e-06, "policy_entropy/var": 0.008629701100289822, "policy_error_vector_variance/max_squared_error": 1.9762605428695679, "policy_error_vector_variance/metric": 0.008936124853789806, "policy_loss": -0.7291666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.19956138730049133, "policy_sharpness": 9.52267074584961, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 2.9486148357391357, "reward": 0.7291666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19956138730049133, "rewards/accuracy_reward": 0.7291666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19956138730049133, "sentence_full_gradient_variance/max_squared_error": 2947.48779296875, "sentence_full_gradient_variance/metric": 570.7567138671875, "sentence_full_gradient_variance/p75": 805.3468627929688, "sentence_full_gradient_variance/p90": 1101.340087890625, "sentence_full_gradient_variance/p95": 1423.3604736328125, "sentence_full_gradient_variance/p99": 2557.958984375, "state_level_variance/metric": 0.8330574631690979, "state_level_variance_full_gradient/metric": 401.53851318359375, "step": 66 }, { "accuracy_reward": 0.7604166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18410088121891022, "action_level_variance/metric": 0.9555425643920898, "action_level_variance_full_gradient/metric": 129.15591430664062, "adam_stats/lr_effective_max": 2.4893033696571365e-05, "adam_stats/lr_effective_mean": 2.3235939214383272e-11, "adam_stats/lr_effective_min": -2.480810508131981e-05, "adam_stats/m_t_max": 0.007358948700129986, "adam_stats/m_t_mean": 6.832895360631142e-11, "adam_stats/m_t_min": -0.006446249317377806, "adam_stats/v_t_max": 0.0003080684400629252, "adam_stats/v_t_mean": 2.344014739252831e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.7604166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.18410088121891022, "all_logprobs": -0.020139357075095177, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.875, "all_logprobs/p1": -0.6015625, "all_logprobs/p10": -0.00026226043701171875, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.0181884765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.02742341347038746, "clip_ratio": 0.0, "completion_length": 474.125, "completion_length/correct": 403.58905029296875, "completion_length/correct/max": 873.0, "completion_length/correct/median": 390.0, "completion_length/correct/min": 230.0, "completion_length/correct/p25": 312.0, "completion_length/correct/p75": 466.0, "completion_length/correct/var": 15447.1611328125, "completion_length/incorrect": 698.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 614.0, "completion_length/incorrect/min": 430.0, "completion_length/incorrect/p25": 522.5, "completion_length/incorrect/p75": 927.5, "completion_length/incorrect/var": 49117.7265625, "completion_length/max": 1024.0, "completion_length/median": 438.0, "completion_length/min": 230.0, "completion_length/p25": 326.75, "completion_length/p75": 544.5, "completion_length/var": 39039.41796875, "epoch": 0.1072, "feature_vector_variance/max_squared_error": 73438.5390625, "feature_vector_variance/metric": 28044.671875, "generated_tokens/total": 3508701.0, "grad_norm": 2.320584774017334, "learning_rate": 4.4494751769315e-06, "loss": -0.7604, "mean_logprobs": -0.0194091796875, "mean_logprobs/var": 7.534027099609375e-05, "num_completions/total": 6432, "per_sentence_gradient_norm": 2.008065700531006, "per_sentence_gradient_norm/max": 6.108010768890381, "per_sentence_gradient_norm/median": 2.12666654586792, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.2551910877227783, "per_sentence_gradient_norm/p75": 2.7710044384002686, "per_sentence_gradient_norm/p85": 3.250119686126709, "per_sentence_gradient_norm/p90": 4.053276062011719, "per_sentence_gradient_norm/p95": 4.499205589294434, "per_sentence_gradient_norm/p99": 5.136592388153076, "per_sentence_gradient_norm/var": 2.1237313747406006, "per_token_feature_norm": 188.1980438232422, "per_token_feature_norm/max": 274.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 83.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 239.10545349121094, "per_token_full_gradient_variance/max_squared_error": 0.567018985748291, "per_token_full_gradient_variance/variance": 0.0031480100005865097, "per_token_gradient_norm": 1.7178294658660889, "per_token_gradient_norm/max": 289.234375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 223.90310668945312, "per_token_policy_error_norm": 0.011832806281745434, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011190300807356834, "policy_entropy": 0.0221811942756176, "policy_entropy/max": 2.734375, "policy_entropy/median": 1.57160684466362e-08, "policy_entropy/min": 9.571472303973594e-20, "policy_entropy/p25": 1.0720668797148392e-10, "policy_entropy/p75": 3.874301910400391e-06, "policy_entropy/var": 0.011744600720703602, "policy_error_vector_variance/max_squared_error": 1.98635995388031, "policy_error_vector_variance/metric": 0.011823631823062897, "policy_loss": -0.7604166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.18410088121891022, "policy_sharpness": 9.384379386901855, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.734435796737671, "reward": 0.7604166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18410088121891022, "rewards/accuracy_reward": 0.7604166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18410088121891022, "sentence_full_gradient_variance/max_squared_error": 2498.43896484375, "sentence_full_gradient_variance/metric": 911.8056640625, "sentence_full_gradient_variance/p75": 1187.67138671875, "sentence_full_gradient_variance/p90": 2167.3544921875, "sentence_full_gradient_variance/p95": 2459.626953125, "sentence_full_gradient_variance/p99": 2498.43896484375, "state_level_variance/metric": 1.3805556297302246, "state_level_variance_full_gradient/metric": 782.6497802734375, "step": 67 }, { "accuracy_reward": 0.9270833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.068311408162117, "action_level_variance/metric": 0.754987895488739, "action_level_variance_full_gradient/metric": 109.12200927734375, "adam_stats/lr_effective_max": 2.29864217544673e-05, "adam_stats/lr_effective_mean": 5.929829977741052e-12, "adam_stats/lr_effective_min": -2.292541648785118e-05, "adam_stats/m_t_max": 0.008268174715340137, "adam_stats/m_t_mean": 7.563370763019606e-11, "adam_stats/m_t_min": -0.005671950988471508, "adam_stats/v_t_max": 0.00030778488144278526, "adam_stats/v_t_mean": 2.3941014101747093e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.9270833730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.068311408162117, "all_logprobs": -0.016244636848568916, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.53125, "all_logprobs/p1": -0.474609375, "all_logprobs/p10": -7.390975952148438e-05, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.0067138671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.021114684641361237, "clip_ratio": 0.0, "completion_length": 480.16668701171875, "completion_length/correct": 477.8988952636719, "completion_length/correct/max": 1020.0, "completion_length/correct/median": 393.0, "completion_length/correct/min": 185.0, "completion_length/correct/p25": 293.0, "completion_length/correct/p75": 657.0, "completion_length/correct/var": 49045.81640625, "completion_length/incorrect": 509.0000305175781, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 265.0, "completion_length/incorrect/min": 216.0, "completion_length/incorrect/p25": 225.0, "completion_length/incorrect/p75": 804.0, "completion_length/incorrect/var": 131656.984375, "completion_length/max": 1024.0, "completion_length/median": 392.0, "completion_length/min": 185.0, "completion_length/p25": 292.0, "completion_length/p75": 659.5, "completion_length/var": 53813.16796875, "epoch": 0.1088, "feature_vector_variance/max_squared_error": 81563.421875, "feature_vector_variance/metric": 27503.43359375, "generated_tokens/total": 3554797.0, "grad_norm": 12.326937675476074, "learning_rate": 4.212216399081919e-06, "loss": -0.9271, "mean_logprobs": -0.0177001953125, "mean_logprobs/var": 0.00011444091796875, "num_completions/total": 6528, "per_sentence_gradient_norm": 2.583320379257202, "per_sentence_gradient_norm/max": 7.64432430267334, "per_sentence_gradient_norm/median": 2.2981467247009277, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.5527112483978271, "per_sentence_gradient_norm/p75": 3.2883167266845703, "per_sentence_gradient_norm/p85": 4.299737930297852, "per_sentence_gradient_norm/p90": 5.089081287384033, "per_sentence_gradient_norm/p95": 6.074134826660156, "per_sentence_gradient_norm/p99": 7.278357982635498, "per_sentence_gradient_norm/var": 2.800992250442505, "per_token_feature_norm": 188.43798828125, "per_token_feature_norm/max": 272.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 95.5, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 230.38958740234375, "per_token_full_gradient_variance/max_squared_error": 0.561600923538208, "per_token_full_gradient_variance/variance": 0.004047390539199114, "per_token_gradient_norm": 2.371943950653076, "per_token_gradient_norm/max": 303.140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 316.3211669921875, "per_token_policy_error_norm": 0.00986773893237114, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009104996919631958, "policy_entropy": 0.01764008402824402, "policy_entropy/max": 1.265625, "policy_entropy/median": 3.3614924177527428e-09, "policy_entropy/min": 2.524158182817815e-19, "policy_entropy/p25": 2.205524651799351e-11, "policy_entropy/p75": 8.530914783477783e-07, "policy_entropy/var": 0.008746313862502575, "policy_error_vector_variance/max_squared_error": 1.986688256263733, "policy_error_vector_variance/metric": 0.009863421320915222, "policy_loss": -0.9270833730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.068311408162117, "policy_sharpness": 9.498908042907715, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.0690557956695557, "reward": 0.9270833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.068311408162117, "rewards/accuracy_reward": 0.9270833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.068311408162117, "sentence_full_gradient_variance/max_squared_error": 2593.91357421875, "sentence_full_gradient_variance/metric": 928.1484375, "sentence_full_gradient_variance/p75": 1860.3392333984375, "sentence_full_gradient_variance/p90": 2168.3359375, "sentence_full_gradient_variance/p95": 2181.78076171875, "sentence_full_gradient_variance/p99": 2462.371337890625, "state_level_variance/metric": 2.303128242492676, "state_level_variance_full_gradient/metric": 819.0264892578125, "step": 68 }, { "accuracy_reward": 0.8854166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.10252191871404648, "action_level_variance/metric": 1.6757782697677612, "action_level_variance_full_gradient/metric": 222.68414306640625, "adam_stats/lr_effective_max": 2.0493522242759354e-05, "adam_stats/lr_effective_mean": 1.3246335452132829e-11, "adam_stats/lr_effective_min": -2.247215343231801e-05, "adam_stats/m_t_max": 0.009274723939597607, "adam_stats/m_t_mean": 4.57513957274891e-11, "adam_stats/m_t_min": -0.008493110537528992, "adam_stats/v_t_max": 0.00030755894840694964, "adam_stats/v_t_mean": 2.4437892681694606e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.8854166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.10252191871404648, "all_logprobs": -0.02173655666410923, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.25, "all_logprobs/p1": -0.69140625, "all_logprobs/p10": -0.0002307891845703125, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.016845703125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03142968565225601, "clip_ratio": 0.0, "completion_length": 445.21875, "completion_length/correct": 385.6117858886719, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 327.0, "completion_length/correct/min": 181.0, "completion_length/correct/p25": 236.0, "completion_length/correct/p75": 534.0, "completion_length/correct/var": 33856.66796875, "completion_length/incorrect": 905.8182373046875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 256.0, "completion_length/incorrect/p25": 980.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 64106.76953125, "completion_length/max": 1024.0, "completion_length/median": 346.0, "completion_length/min": 181.0, "completion_length/p25": 253.5, "completion_length/p75": 558.25, "completion_length/var": 64428.44921875, "epoch": 0.1104, "feature_vector_variance/max_squared_error": 118946.0078125, "feature_vector_variance/metric": 27860.8359375, "generated_tokens/total": 3597538.0, "grad_norm": 7.01695442199707, "learning_rate": 3.978963279105821e-06, "loss": -0.8854, "mean_logprobs": -0.023193359375, "mean_logprobs/var": 0.00014781951904296875, "num_completions/total": 6624, "per_sentence_gradient_norm": 3.171992778778076, "per_sentence_gradient_norm/max": 9.456338882446289, "per_sentence_gradient_norm/median": 3.010917901992798, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.970742106437683, "per_sentence_gradient_norm/p75": 4.213410377502441, "per_sentence_gradient_norm/p85": 5.0841193199157715, "per_sentence_gradient_norm/p90": 5.772245407104492, "per_sentence_gradient_norm/p95": 6.367264747619629, "per_sentence_gradient_norm/p99": 7.642735004425049, "per_sentence_gradient_norm/var": 3.5510947704315186, "per_token_feature_norm": 187.8314666748047, "per_token_feature_norm/max": 298.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 95.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 212.84878540039062, "per_token_full_gradient_variance/max_squared_error": 1.0518670082092285, "per_token_full_gradient_variance/variance": 0.003799012629315257, "per_token_gradient_norm": 2.673603057861328, "per_token_gradient_norm/max": 342.890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 361.5047607421875, "per_token_policy_error_norm": 0.012811953201889992, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011946161277592182, "policy_entropy": 0.023273345082998276, "policy_entropy/max": 3.203125, "policy_entropy/median": 1.0884832590818405e-08, "policy_entropy/min": 4.489274620447792e-20, "policy_entropy/p25": 1.1823431123048067e-10, "policy_entropy/p75": 1.9222497940063477e-06, "policy_entropy/var": 0.012972161173820496, "policy_error_vector_variance/max_squared_error": 2.0003466606140137, "policy_error_vector_variance/metric": 0.012794791720807552, "policy_loss": -0.8854166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.10252191871404648, "policy_sharpness": 9.381648063659668, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.856231689453125, "reward": 0.8854166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.10252191871404648, "rewards/accuracy_reward": 0.8854166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.10252191871404648, "sentence_full_gradient_variance/max_squared_error": 3425.81396484375, "sentence_full_gradient_variance/metric": 924.3966674804688, "sentence_full_gradient_variance/p75": 979.2232666015625, "sentence_full_gradient_variance/p90": 3425.74609375, "sentence_full_gradient_variance/p95": 3425.74609375, "sentence_full_gradient_variance/p99": 3425.7939453125, "state_level_variance/metric": 2.233961582183838, "state_level_variance_full_gradient/metric": 701.71240234375, "step": 69 }, { "accuracy_reward": 0.6041666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24166668951511383, "action_level_variance/metric": 2.6731762886047363, "action_level_variance_full_gradient/metric": 217.68118286132812, "adam_stats/lr_effective_max": 1.9431096006883308e-05, "adam_stats/lr_effective_mean": 2.6732494256415107e-12, "adam_stats/lr_effective_min": -2.033812415902503e-05, "adam_stats/m_t_max": 0.010666588321328163, "adam_stats/m_t_mean": 1.9672680151572308e-11, "adam_stats/m_t_min": -0.009938721545040607, "adam_stats/v_t_max": 0.0003081678587477654, "adam_stats/v_t_mean": 2.4537533463431238e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6041666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.24166668951511383, "all_logprobs": -0.01973063498735428, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.375, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.000263214111328125, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.0162353515625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.026268620043992996, "clip_ratio": 0.0, "completion_length": 449.65625, "completion_length/correct": 440.89654541015625, "completion_length/correct/max": 873.0, "completion_length/correct/median": 384.0, "completion_length/correct/min": 94.0, "completion_length/correct/p25": 296.75, "completion_length/correct/p75": 610.25, "completion_length/correct/var": 39792.6171875, "completion_length/incorrect": 463.02630615234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 441.0, "completion_length/incorrect/min": 291.0, "completion_length/incorrect/p25": 373.25, "completion_length/incorrect/p75": 495.0, "completion_length/incorrect/var": 19088.728515625, "completion_length/max": 1024.0, "completion_length/median": 409.0, "completion_length/min": 94.0, "completion_length/p25": 342.75, "completion_length/p75": 562.5, "completion_length/var": 31428.478515625, "epoch": 0.112, "feature_vector_variance/max_squared_error": 87734.5546875, "feature_vector_variance/metric": 28061.8046875, "generated_tokens/total": 3640705.0, "grad_norm": 0.668353796005249, "learning_rate": 3.750000000000002e-06, "loss": -0.6042, "mean_logprobs": -0.0213623046875, "mean_logprobs/var": 0.00022029876708984375, "num_completions/total": 6720, "per_sentence_gradient_norm": 1.9458049535751343, "per_sentence_gradient_norm/max": 13.186508178710938, "per_sentence_gradient_norm/median": 1.2399791479110718, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.0653579235076904, "per_sentence_gradient_norm/p85": 3.747938871383667, "per_sentence_gradient_norm/p90": 4.551438331604004, "per_sentence_gradient_norm/p95": 7.105353832244873, "per_sentence_gradient_norm/p99": 10.676446914672852, "per_sentence_gradient_norm/var": 6.056300163269043, "per_token_feature_norm": 188.79808044433594, "per_token_feature_norm/max": 250.0, "per_token_feature_norm/median": 189.0, "per_token_feature_norm/min": 90.5, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 220.0090789794922, "per_token_full_gradient_variance/max_squared_error": 0.7158152461051941, "per_token_full_gradient_variance/variance": 0.002864655340090394, "per_token_gradient_norm": 1.679286003112793, "per_token_gradient_norm/max": 285.640625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 228.52105712890625, "per_token_policy_error_norm": 0.011564688757061958, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010802987031638622, "policy_entropy": 0.022270487621426582, "policy_entropy/max": 1.4296875, "policy_entropy/median": 1.0593794286251068e-08, "policy_entropy/min": 1.1265538198482195e-19, "policy_entropy/p25": 6.866684998385608e-11, "policy_entropy/p75": 2.473592758178711e-06, "policy_entropy/var": 0.011368093080818653, "policy_error_vector_variance/max_squared_error": 2.0003178119659424, "policy_error_vector_variance/metric": 0.011558258906006813, "policy_loss": -0.6041666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.24166668951511383, "policy_sharpness": 9.378918647766113, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.789362668991089, "reward": 0.6041666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24166668951511383, "rewards/accuracy_reward": 0.6041666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24166668951511383, "sentence_full_gradient_variance/max_squared_error": 5033.06103515625, "sentence_full_gradient_variance/metric": 1337.6322021484375, "sentence_full_gradient_variance/p75": 1717.8243408203125, "sentence_full_gradient_variance/p90": 2991.841064453125, "sentence_full_gradient_variance/p95": 3892.420166015625, "sentence_full_gradient_variance/p99": 4927.91845703125, "state_level_variance/metric": 3.986384391784668, "state_level_variance_full_gradient/metric": 1119.9510498046875, "step": 70 }, { "accuracy_reward": 0.5729166865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24725878238677979, "action_level_variance/metric": 1.4355520009994507, "action_level_variance_full_gradient/metric": 183.97467041015625, "adam_stats/lr_effective_max": 1.90485043276567e-05, "adam_stats/lr_effective_mean": 1.6445766623518487e-11, "adam_stats/lr_effective_min": -1.9049644834012724e-05, "adam_stats/m_t_max": 0.011500196531414986, "adam_stats/m_t_mean": 2.8388897135855906e-11, "adam_stats/m_t_min": -0.011422877199947834, "adam_stats/v_t_max": 0.0003081559552811086, "adam_stats/v_t_mean": 2.468809705280517e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.5729166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.24725878238677979, "all_logprobs": -0.017755581066012383, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.5, "all_logprobs/p1": -0.490293025970459, "all_logprobs/p10": -0.00010600080713629723, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.009896844625473022, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.02376549504697323, "clip_ratio": 0.0, "completion_length": 662.4791870117188, "completion_length/correct": 505.0727233886719, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 433.0, "completion_length/correct/min": 249.0, "completion_length/correct/p25": 323.5, "completion_length/correct/p75": 702.5, "completion_length/correct/var": 44191.91796875, "completion_length/incorrect": 873.6340942382812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 403.0, "completion_length/incorrect/p25": 731.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 37825.5390625, "completion_length/max": 1024.0, "completion_length/median": 675.0, "completion_length/min": 249.0, "completion_length/p25": 410.0, "completion_length/p75": 942.75, "completion_length/var": 74633.1875, "epoch": 0.1136, "feature_vector_variance/max_squared_error": 78542.8984375, "feature_vector_variance/metric": 27299.046875, "generated_tokens/total": 3704303.0, "grad_norm": 0.6778813004493713, "learning_rate": 3.525605518250964e-06, "loss": -0.5729, "mean_logprobs": -0.0189208984375, "mean_logprobs/var": 0.00012159347534179688, "num_completions/total": 6816, "per_sentence_gradient_norm": 1.705474853515625, "per_sentence_gradient_norm/max": 7.086727142333984, "per_sentence_gradient_norm/median": 0.578998863697052, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.08261775970459, "per_sentence_gradient_norm/p85": 4.187560081481934, "per_sentence_gradient_norm/p90": 4.498288154602051, "per_sentence_gradient_norm/p95": 5.419313907623291, "per_sentence_gradient_norm/p99": 6.532061576843262, "per_sentence_gradient_norm/var": 3.8109025955200195, "per_token_feature_norm": 188.30018615722656, "per_token_feature_norm/max": 247.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 95.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 191.8558807373047, "per_token_full_gradient_variance/max_squared_error": 0.5572631359100342, "per_token_full_gradient_variance/variance": 0.0020275688730180264, "per_token_gradient_norm": 1.1589409112930298, "per_token_gradient_norm/max": 308.125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 155.65603637695312, "per_token_policy_error_norm": 0.01039549708366394, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009729180485010147, "policy_entropy": 0.020112037658691406, "policy_entropy/max": 1.7578125, "policy_entropy/median": 1.0884832590818405e-08, "policy_entropy/min": 7.589415207398531e-19, "policy_entropy/p25": 8.86757334228605e-11, "policy_entropy/p75": 1.691281795501709e-06, "policy_entropy/var": 0.010898842476308346, "policy_error_vector_variance/max_squared_error": 1.9871907234191895, "policy_error_vector_variance/metric": 0.010389741510152817, "policy_loss": -0.5729166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.24725878238677979, "policy_sharpness": 9.449819564819336, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.410630226135254, "reward": 0.5729166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24725878238677979, "rewards/accuracy_reward": 0.5729166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24725878238677979, "sentence_full_gradient_variance/max_squared_error": 3010.75537109375, "sentence_full_gradient_variance/metric": 1197.2874755859375, "sentence_full_gradient_variance/p75": 1323.2860107421875, "sentence_full_gradient_variance/p90": 2214.73193359375, "sentence_full_gradient_variance/p95": 2530.55615234375, "sentence_full_gradient_variance/p99": 2892.276611328125, "state_level_variance/metric": 2.7437427043914795, "state_level_variance_full_gradient/metric": 1013.3128662109375, "step": 71 }, { "accuracy_reward": 0.625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.23684212565422058, "action_level_variance/metric": 1.1564254760742188, "action_level_variance_full_gradient/metric": 135.6236114501953, "adam_stats/lr_effective_max": 1.9172202883055434e-05, "adam_stats/lr_effective_mean": -3.3604917459850725e-11, "adam_stats/lr_effective_min": -1.915383109007962e-05, "adam_stats/m_t_max": 0.010991046205163002, "adam_stats/m_t_mean": -4.164914073345738e-12, "adam_stats/m_t_min": -0.011452464386820793, "adam_stats/v_t_max": 0.0003078478039242327, "adam_stats/v_t_mean": 2.473370640243555e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.625, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.23684212565422058, "all_logprobs": -0.015812154859304428, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.5, "all_logprobs/p1": -0.4300394058227539, "all_logprobs/p10": -5.53131103515625e-05, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.00579833984375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.022400353103876114, "clip_ratio": 0.0, "completion_length": 539.5833740234375, "completion_length/correct": 420.66668701171875, "completion_length/correct/max": 996.0, "completion_length/correct/median": 298.0, "completion_length/correct/min": 154.0, "completion_length/correct/p25": 291.0, "completion_length/correct/p75": 526.5, "completion_length/correct/var": 46711.3828125, "completion_length/incorrect": 737.7777709960938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 825.0, "completion_length/incorrect/min": 232.0, "completion_length/incorrect/p25": 478.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 81654.234375, "completion_length/max": 1024.0, "completion_length/median": 484.0, "completion_length/min": 154.0, "completion_length/p25": 293.75, "completion_length/p75": 806.25, "completion_length/var": 82910.0859375, "epoch": 0.1152, "feature_vector_variance/max_squared_error": 78022.4453125, "feature_vector_variance/metric": 27457.5234375, "generated_tokens/total": 3756103.0, "grad_norm": 0.5298827886581421, "learning_rate": 3.3060532239694e-06, "loss": -0.625, "mean_logprobs": -0.017822265625, "mean_logprobs/var": 0.000278472900390625, "num_completions/total": 6912, "per_sentence_gradient_norm": 1.292069435119629, "per_sentence_gradient_norm/max": 5.607793807983398, "per_sentence_gradient_norm/median": 0.8854415416717529, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.184410810470581, "per_sentence_gradient_norm/p85": 2.8661506175994873, "per_sentence_gradient_norm/p90": 3.123981475830078, "per_sentence_gradient_norm/p95": 3.855605363845825, "per_sentence_gradient_norm/p99": 4.599618434906006, "per_sentence_gradient_norm/var": 1.8301244974136353, "per_token_feature_norm": 188.8817901611328, "per_token_feature_norm/max": 258.0, "per_token_feature_norm/median": 189.0, "per_token_feature_norm/min": 95.5, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 204.34671020507812, "per_token_full_gradient_variance/max_squared_error": 0.481241375207901, "per_token_full_gradient_variance/variance": 0.0019217360531911254, "per_token_gradient_norm": 0.9690006971359253, "per_token_gradient_norm/max": 323.0390625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 122.82633972167969, "per_token_policy_error_norm": 0.009281906299293041, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.008766071870923042, "policy_entropy": 0.017344925552606583, "policy_entropy/max": 1.578125, "policy_entropy/median": 4.48198989033699e-09, "policy_entropy/min": 2.859583229930518e-18, "policy_entropy/p25": 3.319655661471188e-11, "policy_entropy/p75": 8.195638656616211e-07, "policy_entropy/var": 0.008727922104299068, "policy_error_vector_variance/max_squared_error": 1.9989126920700073, "policy_error_vector_variance/metric": 0.009274106472730637, "policy_loss": -0.625, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.23684212565422058, "policy_sharpness": 9.506885528564453, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.0580878257751465, "reward": 0.625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.23684212565422058, "rewards/accuracy_reward": 0.625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.23684212565422058, "sentence_full_gradient_variance/max_squared_error": 3774.175048828125, "sentence_full_gradient_variance/metric": 1325.8089599609375, "sentence_full_gradient_variance/p75": 1444.3548583984375, "sentence_full_gradient_variance/p90": 2680.79638671875, "sentence_full_gradient_variance/p95": 3092.096435546875, "sentence_full_gradient_variance/p99": 3397.761474609375, "state_level_variance/metric": 0.8718418478965759, "state_level_variance_full_gradient/metric": 1190.1854248046875, "step": 72 }, { "accuracy_reward": 0.7083333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20877191424369812, "action_level_variance/metric": 1.6205453872680664, "action_level_variance_full_gradient/metric": 95.03337097167969, "adam_stats/lr_effective_max": 1.8685541363083757e-05, "adam_stats/lr_effective_mean": -1.3800854903323057e-10, "adam_stats/lr_effective_min": -1.8853361325454898e-05, "adam_stats/m_t_max": 0.008680089376866817, "adam_stats/m_t_mean": -4.223623187304959e-11, "adam_stats/m_t_min": -0.008947369642555714, "adam_stats/v_t_max": 0.0003116184379905462, "adam_stats/v_t_mean": 2.5132205344613467e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.7083333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.20877191424369812, "all_logprobs": -0.022341474890708923, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.53125, "all_logprobs/p1": -0.69140625, "all_logprobs/p10": -0.000335693359375, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.021240234375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03184647113084793, "clip_ratio": 0.0, "completion_length": 564.375, "completion_length/correct": 447.9705810546875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 417.0, "completion_length/correct/min": 150.0, "completion_length/correct/p25": 286.75, "completion_length/correct/p75": 487.0, "completion_length/correct/var": 45136.38671875, "completion_length/incorrect": 847.0714721679688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 869.0, "completion_length/incorrect/min": 417.0, "completion_length/incorrect/p25": 695.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 41049.40234375, "completion_length/max": 1024.0, "completion_length/median": 461.0, "completion_length/min": 150.0, "completion_length/p25": 384.5, "completion_length/p75": 800.25, "completion_length/var": 76753.2109375, "epoch": 0.1168, "feature_vector_variance/max_squared_error": 85123.265625, "feature_vector_variance/metric": 27761.1796875, "generated_tokens/total": 3810283.0, "grad_norm": 1.1086291074752808, "learning_rate": 3.0916106078064522e-06, "loss": -0.7083, "mean_logprobs": -0.02294921875, "mean_logprobs/var": 0.0001354217529296875, "num_completions/total": 7008, "per_sentence_gradient_norm": 2.255258560180664, "per_sentence_gradient_norm/max": 7.805285453796387, "per_sentence_gradient_norm/median": 2.075464963912964, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.613900661468506, "per_sentence_gradient_norm/p85": 4.065030097961426, "per_sentence_gradient_norm/p90": 4.599481582641602, "per_sentence_gradient_norm/p95": 5.772516250610352, "per_sentence_gradient_norm/p99": 7.062323093414307, "per_sentence_gradient_norm/var": 3.7537693977355957, "per_token_feature_norm": 189.38470458984375, "per_token_feature_norm/max": 253.0, "per_token_feature_norm/median": 189.0, "per_token_feature_norm/min": 95.5, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 215.30889892578125, "per_token_full_gradient_variance/max_squared_error": 0.7995632290840149, "per_token_full_gradient_variance/variance": 0.0027199790347367525, "per_token_gradient_norm": 1.7294731140136719, "per_token_gradient_norm/max": 280.546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 238.00486755371094, "per_token_policy_error_norm": 0.013006340712308884, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.012312524020671844, "policy_entropy": 0.024093903601169586, "policy_entropy/max": 2.09375, "policy_entropy/median": 1.0186340659856796e-08, "policy_entropy/min": 1.8566962203814263e-18, "policy_entropy/p25": 7.275957614183426e-11, "policy_entropy/p75": 2.473592758178711e-06, "policy_entropy/var": 0.013042992912232876, "policy_error_vector_variance/max_squared_error": 1.971328616142273, "policy_error_vector_variance/metric": 0.012994332239031792, "policy_loss": -0.7083333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.20877191424369812, "policy_sharpness": 9.359042167663574, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.9438765048980713, "reward": 0.7083333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20877191424369812, "rewards/accuracy_reward": 0.7083333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20877191424369812, "sentence_full_gradient_variance/max_squared_error": 3108.7548828125, "sentence_full_gradient_variance/metric": 1302.0958251953125, "sentence_full_gradient_variance/p75": 2006.9134521484375, "sentence_full_gradient_variance/p90": 2008.1673583984375, "sentence_full_gradient_variance/p95": 2355.854248046875, "sentence_full_gradient_variance/p99": 2574.73193359375, "state_level_variance/metric": 2.5054805278778076, "state_level_variance_full_gradient/metric": 1207.0623779296875, "step": 73 }, { "accuracy_reward": 0.6145833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.23936404287815094, "action_level_variance/metric": 1.054898977279663, "action_level_variance_full_gradient/metric": 85.26223754882812, "adam_stats/lr_effective_max": 1.603229611646384e-05, "adam_stats/lr_effective_mean": -1.1790329823568868e-10, "adam_stats/lr_effective_min": -1.6103496818686835e-05, "adam_stats/m_t_max": 0.011491029523313046, "adam_stats/m_t_mean": -1.7287428433210295e-11, "adam_stats/m_t_min": -0.012964298948645592, "adam_stats/v_t_max": 0.0003122085763607174, "adam_stats/v_t_mean": 2.5415348645085878e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6145833730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.23936404287815094, "all_logprobs": -0.019333064556121826, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -4.84375, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.0001583099365234375, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.012921124696731567, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.0253834780305624, "clip_ratio": 0.0, "completion_length": 667.53125, "completion_length/correct": 522.2711791992188, "completion_length/correct/max": 1018.0, "completion_length/correct/median": 489.0, "completion_length/correct/min": 201.0, "completion_length/correct/p25": 408.0, "completion_length/correct/p75": 587.5, "completion_length/correct/var": 50180.2734375, "completion_length/incorrect": 899.1621704101562, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 930.0, "completion_length/incorrect/min": 600.0, "completion_length/incorrect/p25": 782.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 20781.02734375, "completion_length/max": 1024.0, "completion_length/median": 618.0, "completion_length/min": 201.0, "completion_length/p25": 468.5, "completion_length/p75": 910.0, "completion_length/var": 72512.1953125, "epoch": 0.1184, "feature_vector_variance/max_squared_error": 75617.1640625, "feature_vector_variance/metric": 27765.47265625, "generated_tokens/total": 3874366.0, "grad_norm": 0.8472903370857239, "learning_rate": 2.882538935057563e-06, "loss": -0.6146, "mean_logprobs": -0.0198974609375, "mean_logprobs/var": 9.584426879882812e-05, "num_completions/total": 7104, "per_sentence_gradient_norm": 1.853097677230835, "per_sentence_gradient_norm/max": 5.983743190765381, "per_sentence_gradient_norm/median": 2.0007660388946533, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.3535547256469727, "per_sentence_gradient_norm/p85": 3.746495246887207, "per_sentence_gradient_norm/p90": 3.94711971282959, "per_sentence_gradient_norm/p95": 4.715245246887207, "per_sentence_gradient_norm/p99": 5.718842506408691, "per_sentence_gradient_norm/var": 2.991708755493164, "per_token_feature_norm": 188.6252899169922, "per_token_feature_norm/max": 258.0, "per_token_feature_norm/median": 189.0, "per_token_feature_norm/min": 88.5, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 229.75987243652344, "per_token_full_gradient_variance/max_squared_error": 0.4992577135562897, "per_token_full_gradient_variance/variance": 0.0023392706643790007, "per_token_gradient_norm": 1.3872126340866089, "per_token_gradient_norm/max": 289.0078125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 182.15731811523438, "per_token_policy_error_norm": 0.011377822607755661, "per_token_policy_error_norm/max": 1.9375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010496127419173717, "policy_entropy": 0.021507503464818, "policy_entropy/max": 2.375, "policy_entropy/median": 6.111804395914078e-09, "policy_entropy/min": 1.3010426069826053e-18, "policy_entropy/p25": 5.775291356258094e-11, "policy_entropy/p75": 1.259148120880127e-06, "policy_entropy/var": 0.012067624367773533, "policy_error_vector_variance/max_squared_error": 1.943827509880066, "policy_error_vector_variance/metric": 0.011374689638614655, "policy_loss": -0.6145833730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.23936404287815094, "policy_sharpness": 9.419840812683105, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.57244873046875, "reward": 0.6145833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.23936404287815094, "rewards/accuracy_reward": 0.6145833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.23936404287815094, "sentence_full_gradient_variance/max_squared_error": 3424.92138671875, "sentence_full_gradient_variance/metric": 1163.0927734375, "sentence_full_gradient_variance/p75": 1229.515380859375, "sentence_full_gradient_variance/p90": 2661.177001953125, "sentence_full_gradient_variance/p95": 2978.771728515625, "sentence_full_gradient_variance/p99": 3412.291748046875, "state_level_variance/metric": 2.2227368354797363, "state_level_variance_full_gradient/metric": 1077.8306884765625, "step": 74 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1894736886024475, "action_level_variance/metric": 1.0820105075836182, "action_level_variance_full_gradient/metric": 189.1183319091797, "adam_stats/lr_effective_max": 1.5260542568285018e-05, "adam_stats/lr_effective_mean": -8.460860484449384e-11, "adam_stats/lr_effective_min": -1.5306335626519285e-05, "adam_stats/m_t_max": 0.011342903599143028, "adam_stats/m_t_mean": -1.7927640275083867e-11, "adam_stats/m_t_min": -0.012168357148766518, "adam_stats/v_t_max": 0.0003121292102150619, "adam_stats/v_t_mean": 2.561328926731221e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.75, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.75, "advantages/p75": 1.0, "advantages/var": 0.1894736886024475, "all_logprobs": -0.017206216230988503, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.8125, "all_logprobs/p1": -0.498046875, "all_logprobs/p10": -4.1484832763671875e-05, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.005218505859375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.02362688072025776, "clip_ratio": 0.0, "completion_length": 543.75, "completion_length/correct": 453.8055725097656, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 424.0, "completion_length/correct/min": 194.0, "completion_length/correct/p25": 369.0, "completion_length/correct/p75": 522.5, "completion_length/correct/var": 17338.2734375, "completion_length/incorrect": 813.5833740234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 375.0, "completion_length/incorrect/p25": 570.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 67125.125, "completion_length/max": 1024.0, "completion_length/median": 482.0, "completion_length/min": 194.0, "completion_length/p25": 377.0, "completion_length/p75": 592.0, "completion_length/var": 53734.90625, "epoch": 0.12, "feature_vector_variance/max_squared_error": 121782.9609375, "feature_vector_variance/metric": 27447.125, "generated_tokens/total": 3926566.0, "grad_norm": 0.7722269296646118, "learning_rate": 2.6790929273509547e-06, "loss": -0.75, "mean_logprobs": -0.019775390625, "mean_logprobs/var": 0.0002040863037109375, "num_completions/total": 7200, "per_sentence_gradient_norm": 2.2123823165893555, "per_sentence_gradient_norm/max": 7.704073429107666, "per_sentence_gradient_norm/median": 1.99302077293396, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.820023775100708, "per_sentence_gradient_norm/p75": 3.358795404434204, "per_sentence_gradient_norm/p85": 4.20991325378418, "per_sentence_gradient_norm/p90": 4.6454596519470215, "per_sentence_gradient_norm/p95": 5.105810642242432, "per_sentence_gradient_norm/p99": 7.009108066558838, "per_sentence_gradient_norm/var": 3.234004259109497, "per_token_feature_norm": 188.53158569335938, "per_token_feature_norm/max": 290.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 91.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 229.96018981933594, "per_token_full_gradient_variance/max_squared_error": 0.9891095161437988, "per_token_full_gradient_variance/variance": 0.0026189114432781935, "per_token_gradient_norm": 1.7312605381011963, "per_token_gradient_norm/max": 286.46875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 234.41488647460938, "per_token_policy_error_norm": 0.010173424147069454, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009424232877790928, "policy_entropy": 0.01867000013589859, "policy_entropy/max": 2.265625, "policy_entropy/median": 3.128661774098873e-09, "policy_entropy/min": 1.4433441421213278e-18, "policy_entropy/p25": 2.830802259268239e-11, "policy_entropy/p75": 4.991888999938965e-07, "policy_entropy/var": 0.011064959689974785, "policy_error_vector_variance/max_squared_error": 1.9850893020629883, "policy_error_vector_variance/metric": 0.010168010368943214, "policy_loss": -0.75, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -0.75, "policy_loss/var": 0.1894736886024475, "policy_sharpness": 9.515962600708008, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.0298757553100586, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.1894736886024475, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1894736886024475, "sentence_full_gradient_variance/max_squared_error": 2468.509033203125, "sentence_full_gradient_variance/metric": 920.2794799804688, "sentence_full_gradient_variance/p75": 1891.74658203125, "sentence_full_gradient_variance/p90": 1891.74658203125, "sentence_full_gradient_variance/p95": 1891.74658203125, "sentence_full_gradient_variance/p99": 1920.5975341796875, "state_level_variance/metric": 2.458425760269165, "state_level_variance_full_gradient/metric": 731.1610717773438, "step": 75 }, { "accuracy_reward": 0.7604166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.18410088121891022, "action_level_variance/metric": 0.8348366618156433, "action_level_variance_full_gradient/metric": 168.20091247558594, "adam_stats/lr_effective_max": 1.3708862752537243e-05, "adam_stats/lr_effective_mean": -8.279002483568831e-11, "adam_stats/lr_effective_min": -1.3944909369456582e-05, "adam_stats/m_t_max": 0.010911605320870876, "adam_stats/m_t_mean": -3.4852978142430047e-11, "adam_stats/m_t_min": -0.011767447926104069, "adam_stats/v_t_max": 0.00031557216425426304, "adam_stats/v_t_mean": 2.5687736660007232e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.7604166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.18410088121891022, "all_logprobs": -0.01868000626564026, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.25, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.000179290771484375, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.01123046875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.02798256278038025, "clip_ratio": 0.0, "completion_length": 611.3646240234375, "completion_length/correct": 523.4520263671875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 519.0, "completion_length/correct/min": 293.0, "completion_length/correct/p25": 369.0, "completion_length/correct/p75": 587.0, "completion_length/correct/var": 31996.8046875, "completion_length/incorrect": 890.3912963867188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 952.0, "completion_length/incorrect/min": 556.0, "completion_length/incorrect/p25": 834.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 27610.705078125, "completion_length/max": 1024.0, "completion_length/median": 565.0, "completion_length/min": 293.0, "completion_length/p25": 403.75, "completion_length/p75": 813.75, "completion_length/var": 55432.421875, "epoch": 0.1216, "feature_vector_variance/max_squared_error": 93306.3828125, "feature_vector_variance/metric": 27585.244140625, "generated_tokens/total": 3985257.0, "grad_norm": 0.5576895475387573, "learning_rate": 2.4815204523085656e-06, "loss": -0.7604, "mean_logprobs": -0.017333984375, "mean_logprobs/var": 8.344650268554688e-05, "num_completions/total": 7296, "per_sentence_gradient_norm": 1.7573941946029663, "per_sentence_gradient_norm/max": 5.560802459716797, "per_sentence_gradient_norm/median": 1.8205680847167969, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.8547129034996033, "per_sentence_gradient_norm/p75": 2.483048677444458, "per_sentence_gradient_norm/p85": 2.9460344314575195, "per_sentence_gradient_norm/p90": 3.222630739212036, "per_sentence_gradient_norm/p95": 4.019159317016602, "per_sentence_gradient_norm/p99": 4.826723098754883, "per_sentence_gradient_norm/var": 1.6947530508041382, "per_token_feature_norm": 187.25941467285156, "per_token_feature_norm/max": 256.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 91.5, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 210.4175567626953, "per_token_full_gradient_variance/max_squared_error": 0.62384033203125, "per_token_full_gradient_variance/variance": 0.0027266968972980976, "per_token_gradient_norm": 1.5693124532699585, "per_token_gradient_norm/max": 329.4765625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 204.43544006347656, "per_token_policy_error_norm": 0.010721571743488312, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009991489350795746, "policy_entropy": 0.020524419844150543, "policy_entropy/max": 2.265625, "policy_entropy/median": 1.2165401130914688e-08, "policy_entropy/min": 5.183841637196318e-19, "policy_entropy/p25": 8.503775461576879e-11, "policy_entropy/p75": 2.25752592086792e-06, "policy_entropy/var": 0.010911804623901844, "policy_error_vector_variance/max_squared_error": 2.001960515975952, "policy_error_vector_variance/metric": 0.010717596858739853, "policy_loss": -0.7604166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.18410088121891022, "policy_sharpness": 9.424840927124023, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.5041584968566895, "reward": 0.7604166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.18410088121891022, "rewards/accuracy_reward": 0.7604166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.18410088121891022, "sentence_full_gradient_variance/max_squared_error": 2504.819091796875, "sentence_full_gradient_variance/metric": 836.8974609375, "sentence_full_gradient_variance/p75": 1167.9368896484375, "sentence_full_gradient_variance/p90": 1744.2796630859375, "sentence_full_gradient_variance/p95": 1906.1131591796875, "sentence_full_gradient_variance/p99": 2229.58984375, "state_level_variance/metric": 1.0326733589172363, "state_level_variance_full_gradient/metric": 668.696533203125, "step": 76 }, { "accuracy_reward": 1.0, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 1.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": 0.7888388633728027, "action_level_variance_full_gradient/metric": 100.59500122070312, "adam_stats/lr_effective_max": 1.2162226084910799e-05, "adam_stats/lr_effective_mean": -1.2012510430814416e-10, "adam_stats/lr_effective_min": -1.2307313227211125e-05, "adam_stats/m_t_max": 0.014528129249811172, "adam_stats/m_t_mean": -4.428822933388865e-11, "adam_stats/m_t_min": -0.014033086597919464, "adam_stats/v_t_max": 0.00031947391107678413, "adam_stats/v_t_mean": 2.5855085433734715e-11, "adam_stats/v_t_min": 0.0, "advantages": 1.0, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 1.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.0, "all_logprobs": -0.01955009065568447, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.5, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.000244140625, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.01416015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.030361171811819077, "clip_ratio": 0.0, "completion_length": 463.78125, "completion_length/correct": 463.78125, "completion_length/correct/max": 967.0, "completion_length/correct/median": 447.0, "completion_length/correct/min": 179.0, "completion_length/correct/p25": 300.75, "completion_length/correct/p75": 561.25, "completion_length/correct/var": 36407.52734375, "completion_length/max": 967.0, "completion_length/median": 447.0, "completion_length/min": 179.0, "completion_length/p25": 300.75, "completion_length/p75": 561.25, "completion_length/var": 36407.52734375, "epoch": 0.1232, "feature_vector_variance/max_squared_error": 75836.453125, "feature_vector_variance/metric": 28348.26953125, "generated_tokens/total": 4029780.0, "grad_norm": 0.7726114392280579, "learning_rate": 2.29006222155752e-06, "loss": -1.0, "mean_logprobs": -0.01904296875, "mean_logprobs/var": 0.00010204315185546875, "num_completions/total": 7392, "per_sentence_gradient_norm": 2.9058403968811035, "per_sentence_gradient_norm/max": 6.263327121734619, "per_sentence_gradient_norm/median": 2.669165849685669, "per_sentence_gradient_norm/min": 0.6205864548683167, "per_sentence_gradient_norm/p25": 1.9731494188308716, "per_sentence_gradient_norm/p75": 3.85410737991333, "per_sentence_gradient_norm/p85": 4.321132659912109, "per_sentence_gradient_norm/p90": 4.808998107910156, "per_sentence_gradient_norm/p95": 5.507746696472168, "per_sentence_gradient_norm/p99": 6.175830841064453, "per_sentence_gradient_norm/var": 1.8446892499923706, "per_token_feature_norm": 188.24130249023438, "per_token_feature_norm/max": 266.0, "per_token_feature_norm/median": 189.0, "per_token_feature_norm/min": 90.5, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 246.20095825195312, "per_token_full_gradient_variance/max_squared_error": 0.6928913593292236, "per_token_full_gradient_variance/variance": 0.005146566778421402, "per_token_gradient_norm": 2.9668850898742676, "per_token_gradient_norm/max": 319.4296875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 391.5148010253906, "per_token_policy_error_norm": 0.011364122852683067, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01069233100861311, "policy_entropy": 0.021028459072113037, "policy_entropy/max": 1.7109375, "policy_entropy/median": 1.2165401130914688e-08, "policy_entropy/min": 3.7269449679189215e-19, "policy_entropy/p25": 6.866684998385608e-11, "policy_entropy/p75": 3.11434268951416e-06, "policy_entropy/var": 0.01064252108335495, "policy_error_vector_variance/max_squared_error": 2.0006422996520996, "policy_error_vector_variance/metric": 0.011352444998919964, "policy_loss": -1.0, "policy_loss/max": -1.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.0, "policy_sharpness": 9.392030715942383, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.705303430557251, "reward": 1.0, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 1.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.0, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 1.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 2983.922607421875, "sentence_full_gradient_variance/metric": 630.457275390625, "sentence_full_gradient_variance/p75": 907.6519775390625, "sentence_full_gradient_variance/p90": 1609.901123046875, "sentence_full_gradient_variance/p95": 1872.24462890625, "sentence_full_gradient_variance/p99": 2983.907958984375, "state_level_variance/metric": 1.238443374633789, "state_level_variance_full_gradient/metric": 529.8622436523438, "step": 77 }, { "accuracy_reward": 0.5833333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24561403691768646, "action_level_variance/metric": 1.2868165969848633, "action_level_variance_full_gradient/metric": 358.0644226074219, "adam_stats/lr_effective_max": 1.1248913324379828e-05, "adam_stats/lr_effective_mean": -1.218724843266017e-10, "adam_stats/lr_effective_min": -1.0941943401121534e-05, "adam_stats/m_t_max": 0.014430297538638115, "adam_stats/m_t_mean": -4.7538493974652596e-11, "adam_stats/m_t_min": -0.011799698695540428, "adam_stats/v_t_max": 0.00031933802529238164, "adam_stats/v_t_mean": 2.60642150223811e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.5833333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.24561403691768646, "all_logprobs": -0.019836468622088432, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.875, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.00019226083531975746, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.014404296875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.02752280794084072, "clip_ratio": 0.0, "completion_length": 573.6979370117188, "completion_length/correct": 552.4285888671875, "completion_length/correct/max": 1021.0, "completion_length/correct/median": 499.0, "completion_length/correct/min": 266.0, "completion_length/correct/p25": 380.75, "completion_length/correct/p75": 716.25, "completion_length/correct/var": 43824.90625, "completion_length/incorrect": 603.4750366210938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 540.0, "completion_length/incorrect/min": 200.0, "completion_length/incorrect/p25": 354.75, "completion_length/incorrect/p75": 906.0, "completion_length/incorrect/var": 97693.1796875, "completion_length/max": 1024.0, "completion_length/median": 502.0, "completion_length/min": 200.0, "completion_length/p25": 368.5, "completion_length/p75": 781.0, "completion_length/var": 66117.9375, "epoch": 0.1248, "feature_vector_variance/max_squared_error": 82540.234375, "feature_vector_variance/metric": 28061.724609375, "generated_tokens/total": 4084855.0, "grad_norm": 0.7532463669776917, "learning_rate": 2.104951497460118e-06, "loss": -0.5833, "mean_logprobs": -0.0203857421875, "mean_logprobs/var": 9.72747802734375e-05, "num_completions/total": 7488, "per_sentence_gradient_norm": 1.649151086807251, "per_sentence_gradient_norm/max": 5.351056098937988, "per_sentence_gradient_norm/median": 1.7035678625106812, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.8514204025268555, "per_sentence_gradient_norm/p85": 3.529205083847046, "per_sentence_gradient_norm/p90": 3.6849617958068848, "per_sentence_gradient_norm/p95": 3.8926305770874023, "per_sentence_gradient_norm/p99": 5.174239158630371, "per_sentence_gradient_norm/var": 2.463167190551758, "per_token_feature_norm": 189.72308349609375, "per_token_feature_norm/max": 253.0, "per_token_feature_norm/median": 189.0, "per_token_feature_norm/min": 93.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 198.0, "per_token_feature_norm/var": 252.51153564453125, "per_token_full_gradient_variance/max_squared_error": 0.5140189528465271, "per_token_full_gradient_variance/variance": 0.002431171480566263, "per_token_gradient_norm": 1.5325316190719604, "per_token_gradient_norm/max": 296.203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 207.2189483642578, "per_token_policy_error_norm": 0.01160399243235588, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010742351412773132, "policy_entropy": 0.02194739505648613, "policy_entropy/max": 1.734375, "policy_entropy/median": 7.130438461899757e-09, "policy_entropy/min": 6.572975670693371e-19, "policy_entropy/p25": 6.366462912410498e-11, "policy_entropy/p75": 1.2740492820739746e-06, "policy_entropy/var": 0.01176635455340147, "policy_error_vector_variance/max_squared_error": 2.0012450218200684, "policy_error_vector_variance/metric": 0.011599184013903141, "policy_loss": -0.5833333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.24561403691768646, "policy_sharpness": 9.408943176269531, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.6535720825195312, "reward": 0.5833333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24561403691768646, "rewards/accuracy_reward": 0.5833333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24561403691768646, "sentence_full_gradient_variance/max_squared_error": 3422.505859375, "sentence_full_gradient_variance/metric": 681.9596557617188, "sentence_full_gradient_variance/p75": 566.2617797851562, "sentence_full_gradient_variance/p90": 1869.353271484375, "sentence_full_gradient_variance/p95": 2332.5693359375, "sentence_full_gradient_variance/p99": 2903.984130859375, "state_level_variance/metric": 1.430775761604309, "state_level_variance_full_gradient/metric": 323.895263671875, "step": 78 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1666666567325592, "action_level_variance/metric": 0.7555645704269409, "action_level_variance_full_gradient/metric": 128.86029052734375, "adam_stats/lr_effective_max": 1.0217102499154862e-05, "adam_stats/lr_effective_mean": -9.745118600967828e-11, "adam_stats/lr_effective_min": -1.0187106454395689e-05, "adam_stats/m_t_max": 0.013261925429105759, "adam_stats/m_t_mean": -2.606912949398854e-12, "adam_stats/m_t_min": -0.010924904607236385, "adam_stats/v_t_max": 0.00031902623595669866, "adam_stats/v_t_mean": 2.6112410844714162e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.7916666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.1666666567325592, "all_logprobs": -0.018456557765603065, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.25, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.00015926361083984375, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.01104736328125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.025440972298383713, "clip_ratio": 0.0, "completion_length": 622.1666870117188, "completion_length/correct": 555.3947143554688, "completion_length/correct/max": 875.0, "completion_length/correct/median": 488.0, "completion_length/correct/min": 400.0, "completion_length/correct/p25": 458.75, "completion_length/correct/p75": 653.0, "completion_length/correct/var": 14442.5615234375, "completion_length/incorrect": 875.9000244140625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 926.0, "completion_length/incorrect/min": 421.0, "completion_length/incorrect/p25": 784.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 30660.302734375, "completion_length/max": 1024.0, "completion_length/median": 557.0, "completion_length/min": 400.0, "completion_length/p25": 460.75, "completion_length/p75": 735.25, "completion_length/var": 34654.69140625, "epoch": 0.1264, "feature_vector_variance/max_squared_error": 81931.390625, "feature_vector_variance/metric": 27605.052734375, "generated_tokens/total": 4144583.0, "grad_norm": 0.5295687317848206, "learning_rate": 1.9264138089195424e-06, "loss": -0.7917, "mean_logprobs": -0.0177001953125, "mean_logprobs/var": 7.581710815429688e-05, "num_completions/total": 7584, "per_sentence_gradient_norm": 1.942366123199463, "per_sentence_gradient_norm/max": 5.230381965637207, "per_sentence_gradient_norm/median": 2.0337460041046143, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.9356712102890015, "per_sentence_gradient_norm/p75": 2.8213565349578857, "per_sentence_gradient_norm/p85": 3.379967212677002, "per_sentence_gradient_norm/p90": 3.574538469314575, "per_sentence_gradient_norm/p95": 4.529453277587891, "per_sentence_gradient_norm/p99": 4.989414691925049, "per_sentence_gradient_norm/var": 1.9295308589935303, "per_token_feature_norm": 188.15570068359375, "per_token_feature_norm/max": 252.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 90.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 218.2750701904297, "per_token_full_gradient_variance/max_squared_error": 0.5360637307167053, "per_token_full_gradient_variance/variance": 0.0031542181968688965, "per_token_gradient_norm": 1.7443664073944092, "per_token_gradient_norm/max": 276.59375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 228.85549926757812, "per_token_policy_error_norm": 0.010961083695292473, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010255170054733753, "policy_entropy": 0.02001030184328556, "policy_entropy/max": 1.6171875, "policy_entropy/median": 4.918547347187996e-09, "policy_entropy/min": 4.607859233063394e-19, "policy_entropy/p25": 3.0468072509393096e-11, "policy_entropy/p75": 1.1399388313293457e-06, "policy_entropy/var": 0.01028822548687458, "policy_error_vector_variance/max_squared_error": 2.0011825561523438, "policy_error_vector_variance/metric": 0.010956339538097382, "policy_loss": -0.7916666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.1666666567325592, "policy_sharpness": 9.432150840759277, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.4629061222076416, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1666666567325592, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1666666567325592, "sentence_full_gradient_variance/max_squared_error": 1519.404052734375, "sentence_full_gradient_variance/metric": 515.544677734375, "sentence_full_gradient_variance/p75": 833.7294921875, "sentence_full_gradient_variance/p90": 1266.0203857421875, "sentence_full_gradient_variance/p95": 1394.1468505859375, "sentence_full_gradient_variance/p99": 1459.2620849609375, "state_level_variance/metric": 1.3617959022521973, "state_level_variance_full_gradient/metric": 386.684326171875, "step": 79 }, { "accuracy_reward": 0.7291666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19956141710281372, "action_level_variance/metric": 1.4164750576019287, "action_level_variance_full_gradient/metric": 188.19125366210938, "adam_stats/lr_effective_max": 9.393988875672221e-06, "adam_stats/lr_effective_mean": -8.985219512869236e-11, "adam_stats/lr_effective_min": -9.471453267906327e-06, "adam_stats/m_t_max": 0.015426944009959698, "adam_stats/m_t_mean": 2.767096664649249e-12, "adam_stats/m_t_min": -0.012456926517188549, "adam_stats/v_t_max": 0.0003199260390829295, "adam_stats/v_t_mean": 2.618620424665874e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.7291666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.19956141710281372, "all_logprobs": -0.017013853415846825, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.8125, "all_logprobs/p1": -0.474609375, "all_logprobs/p10": -9.059906005859375e-05, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.009344473481178284, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.023676835000514984, "clip_ratio": 0.0, "completion_length": 644.15625, "completion_length/correct": 546.1428833007812, "completion_length/correct/max": 994.0, "completion_length/correct/median": 531.0, "completion_length/correct/min": 269.0, "completion_length/correct/p25": 402.5, "completion_length/correct/p75": 647.25, "completion_length/correct/var": 27423.16796875, "completion_length/incorrect": 908.0385131835938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 383.0, "completion_length/incorrect/p25": 801.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 35257.796875, "completion_length/max": 1024.0, "completion_length/median": 589.0, "completion_length/min": 269.0, "completion_length/p25": 411.5, "completion_length/p75": 774.5, "completion_length/var": 55332.48828125, "epoch": 0.128, "feature_vector_variance/max_squared_error": 74929.875, "feature_vector_variance/metric": 27418.556640625, "generated_tokens/total": 4206422.0, "grad_norm": 0.564148485660553, "learning_rate": 1.7546666766076658e-06, "loss": -0.7292, "mean_logprobs": -0.017333984375, "mean_logprobs/var": 8.20159912109375e-05, "num_completions/total": 7680, "per_sentence_gradient_norm": 1.903967261314392, "per_sentence_gradient_norm/max": 5.445863246917725, "per_sentence_gradient_norm/median": 2.0620012283325195, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.0562524795532227, "per_sentence_gradient_norm/p85": 3.792930841445923, "per_sentence_gradient_norm/p90": 4.219054222106934, "per_sentence_gradient_norm/p95": 4.593979358673096, "per_sentence_gradient_norm/p99": 5.400341033935547, "per_sentence_gradient_norm/var": 2.6619393825531006, "per_token_feature_norm": 188.6678009033203, "per_token_feature_norm/max": 260.0, "per_token_feature_norm/median": 189.0, "per_token_feature_norm/min": 88.5, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 223.45733642578125, "per_token_full_gradient_variance/max_squared_error": 0.6497176885604858, "per_token_full_gradient_variance/variance": 0.0027729456778615713, "per_token_gradient_norm": 1.5961886644363403, "per_token_gradient_norm/max": 295.9921875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 211.5324249267578, "per_token_policy_error_norm": 0.009838458150625229, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009409550577402115, "policy_entropy": 0.0190824493765831, "policy_entropy/max": 1.6875, "policy_entropy/median": 4.6566128730773926e-09, "policy_entropy/min": 2.303929616531697e-18, "policy_entropy/p25": 3.524291969370097e-11, "policy_entropy/p75": 7.338821887969971e-07, "policy_entropy/var": 0.009524478577077389, "policy_error_vector_variance/max_squared_error": 1.9896173477172852, "policy_error_vector_variance/metric": 0.00983491726219654, "policy_loss": -0.7291666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.19956141710281372, "policy_sharpness": 9.456624984741211, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.364561080932617, "reward": 0.7291666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19956141710281372, "rewards/accuracy_reward": 0.7291666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19956141710281372, "sentence_full_gradient_variance/max_squared_error": 1934.3658447265625, "sentence_full_gradient_variance/metric": 846.8353881835938, "sentence_full_gradient_variance/p75": 1265.271240234375, "sentence_full_gradient_variance/p90": 1378.13232421875, "sentence_full_gradient_variance/p95": 1562.17578125, "sentence_full_gradient_variance/p99": 1828.660400390625, "state_level_variance/metric": 1.5215950012207031, "state_level_variance_full_gradient/metric": 658.6441650390625, "step": 80 }, { "accuracy_reward": 0.875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.11052632331848145, "action_level_variance/metric": 1.2064764499664307, "action_level_variance_full_gradient/metric": 215.21762084960938, "adam_stats/lr_effective_max": 8.679996426508296e-06, "adam_stats/lr_effective_mean": -6.816255199160182e-11, "adam_stats/lr_effective_min": -8.826972589304205e-06, "adam_stats/m_t_max": 0.015125725418329239, "adam_stats/m_t_mean": 1.882851860535162e-11, "adam_stats/m_t_min": -0.011500499211251736, "adam_stats/v_t_max": 0.00031976026366464794, "adam_stats/v_t_mean": 2.667753518093008e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.875, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.11052632331848145, "all_logprobs": -0.023324184119701385, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.5, "all_logprobs/p1": -0.69140625, "all_logprobs/p10": -0.000568389892578125, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.0233154296875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03527349233627319, "clip_ratio": 0.0, "completion_length": 436.0625, "completion_length/correct": 415.0238037109375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 395.0, "completion_length/correct/min": 192.0, "completion_length/correct/p25": 273.25, "completion_length/correct/p75": 470.5, "completion_length/correct/var": 33451.1796875, "completion_length/incorrect": 583.3333740234375, "completion_length/incorrect/max": 764.0, "completion_length/incorrect/median": 581.0, "completion_length/incorrect/min": 348.0, "completion_length/incorrect/p25": 505.0, "completion_length/incorrect/p75": 652.5, "completion_length/incorrect/var": 14760.9697265625, "completion_length/max": 1024.0, "completion_length/median": 413.0, "completion_length/min": 192.0, "completion_length/p25": 278.0, "completion_length/p75": 531.75, "completion_length/var": 34065.9375, "epoch": 0.1296, "feature_vector_variance/max_squared_error": 69553.828125, "feature_vector_variance/metric": 28330.978515625, "generated_tokens/total": 4248284.0, "grad_norm": 4.306710243225098, "learning_rate": 1.5899193479495858e-06, "loss": -0.875, "mean_logprobs": -0.02392578125, "mean_logprobs/var": 9.393692016601562e-05, "num_completions/total": 7776, "per_sentence_gradient_norm": 3.025651454925537, "per_sentence_gradient_norm/max": 7.824512958526611, "per_sentence_gradient_norm/median": 3.147327423095703, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 2.5005695819854736, "per_sentence_gradient_norm/p75": 3.8552513122558594, "per_sentence_gradient_norm/p85": 4.269383907318115, "per_sentence_gradient_norm/p90": 4.52714729309082, "per_sentence_gradient_norm/p95": 4.95468282699585, "per_sentence_gradient_norm/p99": 6.883917808532715, "per_sentence_gradient_norm/var": 2.2428150177001953, "per_token_feature_norm": 187.88699340820312, "per_token_feature_norm/max": 260.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 88.5, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 226.32144165039062, "per_token_full_gradient_variance/max_squared_error": 0.588090181350708, "per_token_full_gradient_variance/variance": 0.004586779046803713, "per_token_gradient_norm": 2.770519733428955, "per_token_gradient_norm/max": 323.578125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 377.533203125, "per_token_policy_error_norm": 0.013615514151751995, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.013178190216422081, "policy_entropy": 0.024394292384386063, "policy_entropy/max": 1.5703125, "policy_entropy/median": 1.0710209608078003e-08, "policy_entropy/min": 4.607859233063394e-19, "policy_entropy/p25": 7.594280759803951e-11, "policy_entropy/p75": 3.200024366378784e-06, "policy_entropy/var": 0.012574817053973675, "policy_error_vector_variance/max_squared_error": 1.9909086227416992, "policy_error_vector_variance/metric": 0.013612793758511543, "policy_loss": -0.875, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.11052632331848145, "policy_sharpness": 9.318913459777832, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.120203495025635, "reward": 0.875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.11052632331848145, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.11052632331848145, "sentence_full_gradient_variance/max_squared_error": 2813.9248046875, "sentence_full_gradient_variance/metric": 890.1470947265625, "sentence_full_gradient_variance/p75": 1545.240234375, "sentence_full_gradient_variance/p90": 1887.911865234375, "sentence_full_gradient_variance/p95": 2021.7166748046875, "sentence_full_gradient_variance/p99": 2441.03759765625, "state_level_variance/metric": 1.2695841789245605, "state_level_variance_full_gradient/metric": 674.929443359375, "step": 81 }, { "accuracy_reward": 0.6354166865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.23410087823867798, "action_level_variance/metric": 2.714836835861206, "action_level_variance_full_gradient/metric": 358.643798828125, "adam_stats/lr_effective_max": 7.247710527735762e-06, "adam_stats/lr_effective_mean": -6.120118994923374e-11, "adam_stats/lr_effective_min": -7.237271347548813e-06, "adam_stats/m_t_max": 0.016860224306583405, "adam_stats/m_t_mean": 1.2518439410080795e-11, "adam_stats/m_t_min": -0.01337779313325882, "adam_stats/v_t_max": 0.0003204948443453759, "adam_stats/v_t_mean": 2.683833537353575e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6354166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.23410087823867798, "all_logprobs": -0.021199462935328484, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.8125, "all_logprobs/p1": -0.609375, "all_logprobs/p10": -0.000431060791015625, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.0235595703125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.028693001717329025, "clip_ratio": 0.0, "completion_length": 515.78125, "completion_length/correct": 414.901611328125, "completion_length/correct/max": 993.0, "completion_length/correct/median": 424.0, "completion_length/correct/min": 224.0, "completion_length/correct/p25": 336.0, "completion_length/correct/p75": 493.0, "completion_length/correct/var": 17686.45703125, "completion_length/incorrect": 691.5999755859375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 666.0, "completion_length/incorrect/min": 297.0, "completion_length/incorrect/p25": 459.5, "completion_length/incorrect/p75": 991.0, "completion_length/incorrect/var": 67971.4765625, "completion_length/max": 1024.0, "completion_length/median": 481.0, "completion_length/min": 224.0, "completion_length/p25": 343.5, "completion_length/p75": 571.0, "completion_length/var": 53420.26171875, "epoch": 0.1312, "feature_vector_variance/max_squared_error": 74852.0703125, "feature_vector_variance/metric": 28173.935546875, "generated_tokens/total": 4297799.0, "grad_norm": 0.7468286752700806, "learning_rate": 1.432372542187895e-06, "loss": -0.6354, "mean_logprobs": -0.021240234375, "mean_logprobs/var": 0.00016689300537109375, "num_completions/total": 7872, "per_sentence_gradient_norm": 1.939449667930603, "per_sentence_gradient_norm/max": 9.179046630859375, "per_sentence_gradient_norm/median": 1.8117945194244385, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.814908742904663, "per_sentence_gradient_norm/p85": 3.8920841217041016, "per_sentence_gradient_norm/p90": 4.625324249267578, "per_sentence_gradient_norm/p95": 5.963352680206299, "per_sentence_gradient_norm/p99": 8.886085510253906, "per_sentence_gradient_norm/var": 4.531623363494873, "per_token_feature_norm": 188.51356506347656, "per_token_feature_norm/max": 258.0, "per_token_feature_norm/median": 189.0, "per_token_feature_norm/min": 90.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 232.43051147460938, "per_token_full_gradient_variance/max_squared_error": 0.5188747644424438, "per_token_full_gradient_variance/variance": 0.002719525946304202, "per_token_gradient_norm": 1.582015872001648, "per_token_gradient_norm/max": 300.9375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 212.1377716064453, "per_token_policy_error_norm": 0.012349911965429783, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011268623173236847, "policy_entropy": 0.024331098422408104, "policy_entropy/max": 2.640625, "policy_entropy/median": 9.89530235528946e-09, "policy_entropy/min": 7.995991022080595e-19, "policy_entropy/p25": 7.639755494892597e-11, "policy_entropy/p75": 2.3990869522094727e-06, "policy_entropy/var": 0.012913579121232033, "policy_error_vector_variance/max_squared_error": 1.9916598796844482, "policy_error_vector_variance/metric": 0.012347276322543621, "policy_loss": -0.6354166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.23410087823867798, "policy_sharpness": 9.341621398925781, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.00831937789917, "reward": 0.6354166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.23410087823867798, "rewards/accuracy_reward": 0.6354166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.23410087823867798, "sentence_full_gradient_variance/max_squared_error": 3168.16650390625, "sentence_full_gradient_variance/metric": 1199.193115234375, "sentence_full_gradient_variance/p75": 1235.9691162109375, "sentence_full_gradient_variance/p90": 2383.5673828125, "sentence_full_gradient_variance/p95": 3040.983642578125, "sentence_full_gradient_variance/p99": 3131.5908203125, "state_level_variance/metric": 2.3006582260131836, "state_level_variance_full_gradient/metric": 840.5491943359375, "step": 82 }, { "accuracy_reward": 0.8333333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14035087823867798, "action_level_variance/metric": 1.034381628036499, "action_level_variance_full_gradient/metric": 167.6366729736328, "adam_stats/lr_effective_max": 6.329888492473401e-06, "adam_stats/lr_effective_mean": -5.793586096980441e-11, "adam_stats/lr_effective_min": -6.457842573581729e-06, "adam_stats/m_t_max": 0.012308482080698013, "adam_stats/m_t_mean": 4.352205054680702e-11, "adam_stats/m_t_min": -0.010165367275476456, "adam_stats/v_t_max": 0.00032127677695825696, "adam_stats/v_t_mean": 2.6956773618858065e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.8333333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.14035087823867798, "all_logprobs": -0.018147684633731842, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -11.5, "all_logprobs/p1": -0.5360157489776611, "all_logprobs/p10": -9.584426879882812e-05, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.00860595703125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.028312118723988533, "clip_ratio": 0.0, "completion_length": 512.9166870117188, "completion_length/correct": 484.5625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 457.0, "completion_length/correct/min": 174.0, "completion_length/correct/p25": 346.75, "completion_length/correct/p75": 607.25, "completion_length/correct/var": 39781.9921875, "completion_length/incorrect": 654.6875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 613.0, "completion_length/incorrect/min": 280.0, "completion_length/incorrect/p25": 493.75, "completion_length/incorrect/p75": 895.0, "completion_length/incorrect/var": 71373.03125, "completion_length/max": 1024.0, "completion_length/median": 468.0, "completion_length/min": 174.0, "completion_length/p25": 353.0, "completion_length/p75": 626.0, "completion_length/var": 48413.3984375, "epoch": 0.1328, "feature_vector_variance/max_squared_error": 74264.8984375, "feature_vector_variance/metric": 27974.84375, "generated_tokens/total": 4347039.0, "grad_norm": 0.7423139810562134, "learning_rate": 1.282218205837188e-06, "loss": -0.8333, "mean_logprobs": -0.01904296875, "mean_logprobs/var": 0.00010776519775390625, "num_completions/total": 7968, "per_sentence_gradient_norm": 2.229802131652832, "per_sentence_gradient_norm/max": 5.422088146209717, "per_sentence_gradient_norm/median": 2.1191279888153076, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.3592615127563477, "per_sentence_gradient_norm/p75": 3.2197139263153076, "per_sentence_gradient_norm/p85": 3.812938690185547, "per_sentence_gradient_norm/p90": 4.073474884033203, "per_sentence_gradient_norm/p95": 4.703067779541016, "per_sentence_gradient_norm/p99": 5.108338832855225, "per_sentence_gradient_norm/var": 2.0354983806610107, "per_token_feature_norm": 188.28379821777344, "per_token_feature_norm/max": 255.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 90.5, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 225.89837646484375, "per_token_full_gradient_variance/max_squared_error": 0.6367136240005493, "per_token_full_gradient_variance/variance": 0.0036008127499371767, "per_token_gradient_norm": 2.0325520038604736, "per_token_gradient_norm/max": 313.25, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 277.880859375, "per_token_policy_error_norm": 0.010579400695860386, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010404758155345917, "policy_entropy": 0.019004927948117256, "policy_entropy/max": 1.5703125, "policy_entropy/median": 3.841705620288849e-09, "policy_entropy/min": 2.1599340154984659e-19, "policy_entropy/p25": 2.955857780762017e-11, "policy_entropy/p75": 7.748603820800781e-07, "policy_entropy/var": 0.010136627592146397, "policy_error_vector_variance/max_squared_error": 1.9723577499389648, "policy_error_vector_variance/metric": 0.010578778572380543, "policy_loss": -0.8333333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.14035087823867798, "policy_sharpness": 9.462847709655762, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.312230348587036, "reward": 0.8333333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14035087823867798, "rewards/accuracy_reward": 0.8333333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14035087823867798, "sentence_full_gradient_variance/max_squared_error": 2052.70458984375, "sentence_full_gradient_variance/metric": 759.3802490234375, "sentence_full_gradient_variance/p75": 1608.3856201171875, "sentence_full_gradient_variance/p90": 1608.400146484375, "sentence_full_gradient_variance/p95": 1655.337646484375, "sentence_full_gradient_variance/p99": 2052.70458984375, "state_level_variance/metric": 1.2100486755371094, "state_level_variance_full_gradient/metric": 591.7435302734375, "step": 83 }, { "accuracy_reward": 0.6354166865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.23410086333751678, "action_level_variance/metric": 2.2588307857513428, "action_level_variance_full_gradient/metric": 306.60809326171875, "adam_stats/lr_effective_max": 5.824145318911178e-06, "adam_stats/lr_effective_mean": -3.3070546301416925e-11, "adam_stats/lr_effective_min": -6.291009867709363e-06, "adam_stats/m_t_max": 0.013836422935128212, "adam_stats/m_t_mean": 2.8249876396491125e-11, "adam_stats/m_t_min": -0.01144375279545784, "adam_stats/v_t_max": 0.0003217784105800092, "adam_stats/v_t_mean": 2.7109692962712373e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6354166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.23410086333751678, "all_logprobs": -0.019964803010225296, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.75, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.00012302398681640625, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.01123046875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03012557327747345, "clip_ratio": 0.0, "completion_length": 577.6041870117188, "completion_length/correct": 475.93438720703125, "completion_length/correct/max": 998.0, "completion_length/correct/median": 452.0, "completion_length/correct/min": 142.0, "completion_length/correct/p25": 210.0, "completion_length/correct/p75": 654.0, "completion_length/correct/var": 64801.828125, "completion_length/incorrect": 754.7999877929688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 844.0, "completion_length/incorrect/min": 233.0, "completion_length/incorrect/p25": 564.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 88045.2265625, "completion_length/max": 1024.0, "completion_length/median": 609.0, "completion_length/min": 142.0, "completion_length/p25": 268.5, "completion_length/p75": 820.0, "completion_length/var": 90643.4921875, "epoch": 0.1344, "feature_vector_variance/max_squared_error": 78793.3359375, "feature_vector_variance/metric": 27354.294921875, "generated_tokens/total": 4402489.0, "grad_norm": 0.7295522689819336, "learning_rate": 1.1396392788268054e-06, "loss": -0.6354, "mean_logprobs": -0.021728515625, "mean_logprobs/var": 0.00011396408081054688, "num_completions/total": 8064, "per_sentence_gradient_norm": 2.1395530700683594, "per_sentence_gradient_norm/max": 6.990239143371582, "per_sentence_gradient_norm/median": 2.3670859336853027, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.526583671569824, "per_sentence_gradient_norm/p85": 4.137302398681641, "per_sentence_gradient_norm/p90": 4.446580410003662, "per_sentence_gradient_norm/p95": 5.438882827758789, "per_sentence_gradient_norm/p99": 6.910185813903809, "per_sentence_gradient_norm/var": 3.7926180362701416, "per_token_feature_norm": 188.20481872558594, "per_token_feature_norm/max": 268.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 90.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 229.3682098388672, "per_token_full_gradient_variance/max_squared_error": 0.6164525747299194, "per_token_full_gradient_variance/variance": 0.002377548720687628, "per_token_gradient_norm": 1.6404128074645996, "per_token_gradient_norm/max": 302.609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 226.5906524658203, "per_token_policy_error_norm": 0.01174608338624239, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011334642767906189, "policy_entropy": 0.020569585263729095, "policy_entropy/max": 3.078125, "policy_entropy/median": 4.831235855817795e-09, "policy_entropy/min": 1.362028979184915e-18, "policy_entropy/p25": 4.320099833421409e-11, "policy_entropy/p75": 9.164214134216309e-07, "policy_entropy/var": 0.01075755525380373, "policy_error_vector_variance/max_squared_error": 2.000237226486206, "policy_error_vector_variance/metric": 0.011737310327589512, "policy_loss": -0.6354166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.23410086333751678, "policy_sharpness": 9.443634033203125, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.4601502418518066, "reward": 0.6354166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.23410086333751678, "rewards/accuracy_reward": 0.6354166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.23410086333751678, "sentence_full_gradient_variance/max_squared_error": 3899.61328125, "sentence_full_gradient_variance/metric": 1401.311279296875, "sentence_full_gradient_variance/p75": 1739.929443359375, "sentence_full_gradient_variance/p90": 3142.4287109375, "sentence_full_gradient_variance/p95": 3702.07763671875, "sentence_full_gradient_variance/p99": 3899.605224609375, "state_level_variance/metric": 1.9381465911865234, "state_level_variance_full_gradient/metric": 1094.703369140625, "step": 84 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1894736886024475, "action_level_variance/metric": 1.4617336988449097, "action_level_variance_full_gradient/metric": 114.22518920898438, "adam_stats/lr_effective_max": 5.289815362630179e-06, "adam_stats/lr_effective_mean": -1.6521608733888193e-11, "adam_stats/lr_effective_min": -5.333911303750938e-06, "adam_stats/m_t_max": 0.020583851262927055, "adam_stats/m_t_mean": -5.233826826794852e-12, "adam_stats/m_t_min": -0.014277083799242973, "adam_stats/v_t_max": 0.0003285132406745106, "adam_stats/v_t_mean": 2.752722876031566e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.75, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.75, "advantages/p75": 1.0, "advantages/var": 0.1894736886024475, "all_logprobs": -0.020048577338457108, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.75, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.0002268790267407894, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.01416015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.02778511308133602, "clip_ratio": 0.0, "completion_length": 481.6875, "completion_length/correct": 419.8055725097656, "completion_length/correct/max": 825.0, "completion_length/correct/median": 452.0, "completion_length/correct/min": 113.0, "completion_length/correct/p25": 224.75, "completion_length/correct/p75": 556.75, "completion_length/correct/var": 39251.65234375, "completion_length/incorrect": 667.3333740234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 584.0, "completion_length/incorrect/min": 239.0, "completion_length/incorrect/p25": 424.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 85868.234375, "completion_length/max": 1024.0, "completion_length/median": 471.0, "completion_length/min": 113.0, "completion_length/p25": 254.0, "completion_length/p75": 617.0, "completion_length/var": 61733.65234375, "epoch": 0.136, "feature_vector_variance/max_squared_error": 95590.2109375, "feature_vector_variance/metric": 28016.427734375, "generated_tokens/total": 4448731.0, "grad_norm": 1.1625163555145264, "learning_rate": 1.0048094716167097e-06, "loss": -0.75, "mean_logprobs": -0.0216064453125, "mean_logprobs/var": 0.0002117156982421875, "num_completions/total": 8160, "per_sentence_gradient_norm": 2.2501845359802246, "per_sentence_gradient_norm/max": 10.28700065612793, "per_sentence_gradient_norm/median": 2.0948619842529297, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.6983903646469116, "per_sentence_gradient_norm/p75": 3.2828528881073, "per_sentence_gradient_norm/p85": 4.133854866027832, "per_sentence_gradient_norm/p90": 4.508086681365967, "per_sentence_gradient_norm/p95": 5.301643371582031, "per_sentence_gradient_norm/p99": 8.945613861083984, "per_sentence_gradient_norm/var": 3.9035050868988037, "per_token_feature_norm": 187.15574645996094, "per_token_feature_norm/max": 262.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 92.5, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 220.42694091796875, "per_token_full_gradient_variance/max_squared_error": 0.6662302613258362, "per_token_full_gradient_variance/variance": 0.0033330502919852734, "per_token_gradient_norm": 1.8882256746292114, "per_token_gradient_norm/max": 324.9609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 253.2636260986328, "per_token_policy_error_norm": 0.011862398125231266, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01130241621285677, "policy_entropy": 0.021609866991639137, "policy_entropy/max": 2.078125, "policy_entropy/median": 1.1408701539039612e-08, "policy_entropy/min": 8.334804200982315e-19, "policy_entropy/p25": 7.09405867382884e-11, "policy_entropy/p75": 2.0563602447509766e-06, "policy_entropy/var": 0.01134046446532011, "policy_error_vector_variance/max_squared_error": 1.987967848777771, "policy_error_vector_variance/metric": 0.01185233797878027, "policy_loss": -0.75, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -0.75, "policy_loss/var": 0.1894736886024475, "policy_sharpness": 9.40107536315918, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.651219606399536, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.1894736886024475, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1894736886024475, "sentence_full_gradient_variance/max_squared_error": 3301.95751953125, "sentence_full_gradient_variance/metric": 1327.48779296875, "sentence_full_gradient_variance/p75": 1960.781494140625, "sentence_full_gradient_variance/p90": 2772.5029296875, "sentence_full_gradient_variance/p95": 3006.362548828125, "sentence_full_gradient_variance/p99": 3081.05615234375, "state_level_variance/metric": 2.8187198638916016, "state_level_variance_full_gradient/metric": 1213.2625732421875, "step": 85 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1894736886024475, "action_level_variance/metric": 1.0694801807403564, "action_level_variance_full_gradient/metric": 286.32867431640625, "adam_stats/lr_effective_max": 4.465942311071558e-06, "adam_stats/lr_effective_mean": -1.6069336833401948e-11, "adam_stats/lr_effective_min": -4.443173111212673e-06, "adam_stats/m_t_max": 0.016547927632927895, "adam_stats/m_t_mean": 1.7322301579247856e-11, "adam_stats/m_t_min": -0.01263080071657896, "adam_stats/v_t_max": 0.0003285758139099926, "adam_stats/v_t_mean": 2.7547757477930368e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.75, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.75, "advantages/p75": 1.0, "advantages/var": 0.1894736886024475, "all_logprobs": -0.015445906668901443, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -4.75, "all_logprobs/p1": -0.38671875, "all_logprobs/p10": -6.580352783203125e-05, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.005950927734375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.021346408873796463, "clip_ratio": 0.0, "completion_length": 566.28125, "completion_length/correct": 505.7638854980469, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 456.0, "completion_length/correct/min": 187.0, "completion_length/correct/p25": 292.5, "completion_length/correct/p75": 596.5, "completion_length/correct/var": 56337.703125, "completion_length/incorrect": 747.8333740234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 791.0, "completion_length/incorrect/min": 353.0, "completion_length/incorrect/p25": 577.25, "completion_length/incorrect/p75": 1015.75, "completion_length/incorrect/var": 56711.1015625, "completion_length/max": 1024.0, "completion_length/median": 511.0, "completion_length/min": 187.0, "completion_length/p25": 371.0, "completion_length/p75": 757.75, "completion_length/var": 66937.78125, "epoch": 0.1376, "feature_vector_variance/max_squared_error": 71259.296875, "feature_vector_variance/metric": 28259.005859375, "generated_tokens/total": 4503094.0, "grad_norm": 0.5178928971290588, "learning_rate": 8.778930535580476e-07, "loss": -0.75, "mean_logprobs": -0.0159912109375, "mean_logprobs/var": 4.792213439941406e-05, "num_completions/total": 8256, "per_sentence_gradient_norm": 1.831459403038025, "per_sentence_gradient_norm/max": 5.019161701202393, "per_sentence_gradient_norm/median": 1.8606503009796143, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.5859261751174927, "per_sentence_gradient_norm/p75": 2.7862977981567383, "per_sentence_gradient_norm/p85": 3.3218894004821777, "per_sentence_gradient_norm/p90": 3.5119147300720215, "per_sentence_gradient_norm/p95": 3.8949310779571533, "per_sentence_gradient_norm/p99": 4.267214298248291, "per_sentence_gradient_norm/var": 1.766932487487793, "per_token_feature_norm": 187.57373046875, "per_token_feature_norm/max": 266.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 90.5, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 205.54637145996094, "per_token_full_gradient_variance/max_squared_error": 0.422172874212265, "per_token_full_gradient_variance/variance": 0.002812921768054366, "per_token_gradient_norm": 1.582655906677246, "per_token_gradient_norm/max": 276.171875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 212.385009765625, "per_token_policy_error_norm": 0.009051566943526268, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.00879001710563898, "policy_entropy": 0.016983872279524803, "policy_entropy/max": 2.09375, "policy_entropy/median": 7.508788257837296e-09, "policy_entropy/min": 1.734723475976807e-18, "policy_entropy/p25": 7.09405867382884e-11, "policy_entropy/p75": 1.2516975402832031e-06, "policy_entropy/var": 0.00876601692289114, "policy_error_vector_variance/max_squared_error": 1.9684996604919434, "policy_error_vector_variance/metric": 0.0090479189530015, "policy_loss": -0.75, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -0.75, "policy_loss/var": 0.1894736886024475, "policy_sharpness": 9.50288200378418, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.043710947036743, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.1894736886024475, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1894736886024475, "sentence_full_gradient_variance/max_squared_error": 2144.2607421875, "sentence_full_gradient_variance/metric": 976.1549682617188, "sentence_full_gradient_variance/p75": 1389.039306640625, "sentence_full_gradient_variance/p90": 1561.89501953125, "sentence_full_gradient_variance/p95": 1762.169921875, "sentence_full_gradient_variance/p99": 2134.179443359375, "state_level_variance/metric": 0.8866163492202759, "state_level_variance_full_gradient/metric": 689.8263549804688, "step": 86 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1666666567325592, "action_level_variance/metric": 1.2357738018035889, "action_level_variance_full_gradient/metric": 222.2022705078125, "adam_stats/lr_effective_max": 3.8535549720108975e-06, "adam_stats/lr_effective_mean": -1.574875104458362e-12, "adam_stats/lr_effective_min": -3.999253749498166e-06, "adam_stats/m_t_max": 0.015289862640202045, "adam_stats/m_t_mean": 5.311005107921929e-11, "adam_stats/m_t_min": -0.014334029518067837, "adam_stats/v_t_max": 0.00032826297683641315, "adam_stats/v_t_mean": 2.7762457263658114e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.7916666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.1666666567325592, "all_logprobs": -0.018062768504023552, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.75, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.00010395050048828125, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.00860595703125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.024114811792969704, "clip_ratio": 0.0, "completion_length": 497.97918701171875, "completion_length/correct": 434.0789489746094, "completion_length/correct/max": 890.0, "completion_length/correct/median": 392.0, "completion_length/correct/min": 163.0, "completion_length/correct/p25": 289.25, "completion_length/correct/p75": 565.5, "completion_length/correct/var": 32935.703125, "completion_length/incorrect": 740.7999877929688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 612.0, "completion_length/incorrect/min": 375.0, "completion_length/incorrect/p25": 557.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 62095.42578125, "completion_length/max": 1024.0, "completion_length/median": 516.0, "completion_length/min": 163.0, "completion_length/p25": 304.0, "completion_length/p75": 603.75, "completion_length/var": 54100.6015625, "epoch": 0.1392, "feature_vector_variance/max_squared_error": 70962.109375, "feature_vector_variance/metric": 27727.0390625, "generated_tokens/total": 4550900.0, "grad_norm": 0.8145607709884644, "learning_rate": 7.59044652756249e-07, "loss": -0.7917, "mean_logprobs": -0.0174560546875, "mean_logprobs/var": 6.532669067382812e-05, "num_completions/total": 8352, "per_sentence_gradient_norm": 2.1047158241271973, "per_sentence_gradient_norm/max": 5.862218856811523, "per_sentence_gradient_norm/median": 2.2148637771606445, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.3705726861953735, "per_sentence_gradient_norm/p75": 3.2272348403930664, "per_sentence_gradient_norm/p85": 3.556488275527954, "per_sentence_gradient_norm/p90": 3.8315882682800293, "per_sentence_gradient_norm/p95": 4.067923545837402, "per_sentence_gradient_norm/p99": 5.518431186676025, "per_sentence_gradient_norm/var": 2.0837597846984863, "per_token_feature_norm": 187.25267028808594, "per_token_feature_norm/max": 274.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 88.5, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 227.51268005371094, "per_token_full_gradient_variance/max_squared_error": 0.5515983700752258, "per_token_full_gradient_variance/variance": 0.0028908003587275743, "per_token_gradient_norm": 1.888013482093811, "per_token_gradient_norm/max": 269.78125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 252.43687438964844, "per_token_policy_error_norm": 0.010877581313252449, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010023223236203194, "policy_entropy": 0.019439885392785072, "policy_entropy/max": 1.96875, "policy_entropy/median": 4.743924364447594e-09, "policy_entropy/min": 6.030874584450618e-19, "policy_entropy/p25": 2.6147972675971687e-11, "policy_entropy/p75": 1.3187527656555176e-06, "policy_entropy/var": 0.010093471966683865, "policy_error_vector_variance/max_squared_error": 1.9866408109664917, "policy_error_vector_variance/metric": 0.010867811739444733, "policy_loss": -0.7916666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.1666666567325592, "policy_sharpness": 9.459301948547363, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.357164144515991, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1666666567325592, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1666666567325592, "sentence_full_gradient_variance/max_squared_error": 2474.248291015625, "sentence_full_gradient_variance/metric": 1151.8431396484375, "sentence_full_gradient_variance/p75": 2115.921142578125, "sentence_full_gradient_variance/p90": 2115.93603515625, "sentence_full_gradient_variance/p95": 2474.2412109375, "sentence_full_gradient_variance/p99": 2474.248291015625, "state_level_variance/metric": 1.0699113607406616, "state_level_variance_full_gradient/metric": 929.6408081054688, "step": 87 }, { "accuracy_reward": 0.6979166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2130482792854309, "action_level_variance/metric": 0.7917773127555847, "action_level_variance_full_gradient/metric": 137.5386199951172, "adam_stats/lr_effective_max": 3.3230360259040026e-06, "adam_stats/lr_effective_mean": -7.867250254034452e-12, "adam_stats/lr_effective_min": -3.258883680246072e-06, "adam_stats/m_t_max": 0.015268445014953613, "adam_stats/m_t_mean": 7.30109375735033e-11, "adam_stats/m_t_min": -0.01201199647039175, "adam_stats/v_t_max": 0.0003281619865447283, "adam_stats/v_t_mean": 2.7913248101807397e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6979166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.2130482792854309, "all_logprobs": -0.019991394132375717, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.75, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.00018405914306640625, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.01165771484375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03162829950451851, "clip_ratio": 0.0, "completion_length": 471.57293701171875, "completion_length/correct": 397.223876953125, "completion_length/correct/max": 1014.0, "completion_length/correct/median": 375.0, "completion_length/correct/min": 192.0, "completion_length/correct/p25": 306.0, "completion_length/correct/p75": 456.5, "completion_length/correct/var": 14845.1162109375, "completion_length/incorrect": 643.3448486328125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 527.0, "completion_length/incorrect/min": 244.0, "completion_length/incorrect/p25": 438.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 68941.8046875, "completion_length/max": 1024.0, "completion_length/median": 424.0, "completion_length/min": 192.0, "completion_length/p25": 316.5, "completion_length/p75": 509.25, "completion_length/var": 43538.65234375, "epoch": 0.1408, "feature_vector_variance/max_squared_error": 70702.53125, "feature_vector_variance/metric": 27795.314453125, "generated_tokens/total": 4596171.0, "grad_norm": 0.6699965000152588, "learning_rate": 6.484090676804927e-07, "loss": -0.6979, "mean_logprobs": -0.0198974609375, "mean_logprobs/var": 0.0001430511474609375, "num_completions/total": 8448, "per_sentence_gradient_norm": 1.6495931148529053, "per_sentence_gradient_norm/max": 4.555275917053223, "per_sentence_gradient_norm/median": 1.5877214670181274, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.676107168197632, "per_sentence_gradient_norm/p85": 3.2180604934692383, "per_sentence_gradient_norm/p90": 3.329346179962158, "per_sentence_gradient_norm/p95": 3.9664509296417236, "per_sentence_gradient_norm/p99": 4.28080415725708, "per_sentence_gradient_norm/var": 1.8246368169784546, "per_token_feature_norm": 188.00961303710938, "per_token_feature_norm/max": 258.0, "per_token_feature_norm/median": 189.0, "per_token_feature_norm/min": 94.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 277.0307312011719, "per_token_full_gradient_variance/max_squared_error": 0.6252016425132751, "per_token_full_gradient_variance/variance": 0.002462102798745036, "per_token_gradient_norm": 1.3958863019943237, "per_token_gradient_norm/max": 294.8125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 186.5809783935547, "per_token_policy_error_norm": 0.011633801274001598, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011243424378335476, "policy_entropy": 0.02094959281384945, "policy_entropy/max": 1.5703125, "policy_entropy/median": 7.8580342233181e-09, "policy_entropy/min": 1.1519648082658485e-18, "policy_entropy/p25": 5.4569682106375694e-11, "policy_entropy/p75": 1.7210841178894043e-06, "policy_entropy/var": 0.010964603163301945, "policy_error_vector_variance/max_squared_error": 2.000678300857544, "policy_error_vector_variance/metric": 0.011629220098257065, "policy_loss": -0.6979166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.2130482792854309, "policy_sharpness": 9.416807174682617, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.581277847290039, "reward": 0.6979166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2130482792854309, "rewards/accuracy_reward": 0.6979166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2130482792854309, "sentence_full_gradient_variance/max_squared_error": 2144.570556640625, "sentence_full_gradient_variance/metric": 1081.677490234375, "sentence_full_gradient_variance/p75": 1809.788330078125, "sentence_full_gradient_variance/p90": 1809.788330078125, "sentence_full_gradient_variance/p95": 1809.788330078125, "sentence_full_gradient_variance/p99": 1826.5284423828125, "state_level_variance/metric": 1.2139910459518433, "state_level_variance_full_gradient/metric": 944.138916015625, "step": 88 }, { "accuracy_reward": 0.6979166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21304824948310852, "action_level_variance/metric": 1.0863131284713745, "action_level_variance_full_gradient/metric": 128.0721435546875, "adam_stats/lr_effective_max": 3.0050246095925104e-06, "adam_stats/lr_effective_mean": -6.442010987151026e-12, "adam_stats/lr_effective_min": -2.963984115922358e-06, "adam_stats/m_t_max": 0.016964256763458252, "adam_stats/m_t_mean": 6.425349141636616e-11, "adam_stats/m_t_min": -0.011527505703270435, "adam_stats/v_t_max": 0.0003288723819423467, "adam_stats/v_t_mean": 2.7990075535111458e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6979166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.21304824948310852, "all_logprobs": -0.018194139003753662, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.25, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -9.584426879882812e-05, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.010162353515625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.026115991175174713, "clip_ratio": 0.0, "completion_length": 581.9896240234375, "completion_length/correct": 504.7760925292969, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 421.0, "completion_length/correct/min": 176.0, "completion_length/correct/p25": 362.0, "completion_length/correct/p75": 680.0, "completion_length/correct/var": 41899.90625, "completion_length/incorrect": 760.3793334960938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 733.0, "completion_length/incorrect/min": 486.0, "completion_length/incorrect/p25": 659.0, "completion_length/incorrect/p75": 832.0, "completion_length/incorrect/var": 31513.884765625, "completion_length/max": 1024.0, "completion_length/median": 582.0, "completion_length/min": 176.0, "completion_length/p25": 410.5, "completion_length/p75": 731.5, "completion_length/var": 52316.78515625, "epoch": 0.1424, "feature_vector_variance/max_squared_error": 72865.21875, "feature_vector_variance/metric": 27931.41796875, "generated_tokens/total": 4652042.0, "grad_norm": 0.6363190412521362, "learning_rate": 5.461210907490952e-07, "loss": -0.6979, "mean_logprobs": -0.019287109375, "mean_logprobs/var": 0.00015735626220703125, "num_completions/total": 8544, "per_sentence_gradient_norm": 2.1264591217041016, "per_sentence_gradient_norm/max": 7.980734348297119, "per_sentence_gradient_norm/median": 1.7746375799179077, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.2991559505462646, "per_sentence_gradient_norm/p85": 4.012787342071533, "per_sentence_gradient_norm/p90": 4.697487831115723, "per_sentence_gradient_norm/p95": 6.294036865234375, "per_sentence_gradient_norm/p99": 7.491936683654785, "per_sentence_gradient_norm/var": 4.000926971435547, "per_token_feature_norm": 187.35247802734375, "per_token_feature_norm/max": 258.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 91.0, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 210.8848114013672, "per_token_full_gradient_variance/max_squared_error": 0.45929551124572754, "per_token_full_gradient_variance/variance": 0.0025844769552350044, "per_token_gradient_norm": 1.6743183135986328, "per_token_gradient_norm/max": 308.265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 227.5352020263672, "per_token_policy_error_norm": 0.010700605809688568, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010128861293196678, "policy_entropy": 0.019705545157194138, "policy_entropy/max": 1.625, "policy_entropy/median": 6.83940015733242e-09, "policy_entropy/min": 1.6855955650360577e-19, "policy_entropy/p25": 4.956746124662459e-11, "policy_entropy/p75": 1.125037670135498e-06, "policy_entropy/var": 0.010299956426024437, "policy_error_vector_variance/max_squared_error": 2.001370668411255, "policy_error_vector_variance/metric": 0.01069706305861473, "policy_loss": -0.6979166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.21304824948310852, "policy_sharpness": 9.463035583496094, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.324582815170288, "reward": 0.6979166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21304824948310852, "rewards/accuracy_reward": 0.6979166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21304824948310852, "sentence_full_gradient_variance/max_squared_error": 3294.6875, "sentence_full_gradient_variance/metric": 1077.1328125, "sentence_full_gradient_variance/p75": 1574.83154296875, "sentence_full_gradient_variance/p90": 1681.51953125, "sentence_full_gradient_variance/p95": 2802.373046875, "sentence_full_gradient_variance/p99": 3158.82421875, "state_level_variance/metric": 3.2822470664978027, "state_level_variance_full_gradient/metric": 949.0606689453125, "step": 89 }, { "accuracy_reward": 0.6875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21710526943206787, "action_level_variance/metric": 1.3274884223937988, "action_level_variance_full_gradient/metric": 125.04060363769531, "adam_stats/lr_effective_max": 2.579022520876606e-06, "adam_stats/lr_effective_mean": -7.083593694251489e-13, "adam_stats/lr_effective_min": -2.52367544817389e-06, "adam_stats/m_t_max": 0.015298347920179367, "adam_stats/m_t_mean": 9.0389827567261e-11, "adam_stats/m_t_min": -0.01129709929227829, "adam_stats/v_t_max": 0.00032854359596967697, "adam_stats/v_t_mean": 2.8066344387456255e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6875, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.21710526943206787, "all_logprobs": -0.017817959189414978, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.5, "all_logprobs/p1": -0.474609375, "all_logprobs/p10": -0.00012302398681640625, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.009735107421875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.025661032646894455, "clip_ratio": 0.0, "completion_length": 502.5625, "completion_length/correct": 357.3333435058594, "completion_length/correct/max": 737.0, "completion_length/correct/median": 306.0, "completion_length/correct/min": 194.0, "completion_length/correct/p25": 254.0, "completion_length/correct/p75": 402.5, "completion_length/correct/var": 20402.1015625, "completion_length/incorrect": 822.0667114257812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 826.0, "completion_length/incorrect/min": 435.0, "completion_length/incorrect/p25": 669.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 35495.3046875, "completion_length/max": 1024.0, "completion_length/median": 397.0, "completion_length/min": 194.0, "completion_length/p25": 278.0, "completion_length/p75": 684.5, "completion_length/var": 71684.4921875, "epoch": 0.144, "feature_vector_variance/max_squared_error": 70925.2578125, "feature_vector_variance/metric": 27619.86328125, "generated_tokens/total": 4700288.0, "grad_norm": 0.6660287380218506, "learning_rate": 4.5230534410568764e-07, "loss": -0.6875, "mean_logprobs": -0.017578125, "mean_logprobs/var": 0.00010776519775390625, "num_completions/total": 8640, "per_sentence_gradient_norm": 1.8611297607421875, "per_sentence_gradient_norm/max": 7.733754634857178, "per_sentence_gradient_norm/median": 1.916717529296875, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.037954807281494, "per_sentence_gradient_norm/p85": 3.701336622238159, "per_sentence_gradient_norm/p90": 4.057961940765381, "per_sentence_gradient_norm/p95": 5.124927520751953, "per_sentence_gradient_norm/p99": 6.0114216804504395, "per_sentence_gradient_norm/var": 3.1693904399871826, "per_token_feature_norm": 186.17303466796875, "per_token_feature_norm/max": 260.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 96.5, "per_token_feature_norm/p25": 179.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 222.5536346435547, "per_token_full_gradient_variance/max_squared_error": 0.5551852583885193, "per_token_full_gradient_variance/variance": 0.0023402548395097256, "per_token_gradient_norm": 1.3774490356445312, "per_token_gradient_norm/max": 284.2265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 180.78074645996094, "per_token_policy_error_norm": 0.010258784517645836, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009573213756084442, "policy_entropy": 0.019870897755026817, "policy_entropy/max": 1.71875, "policy_entropy/median": 1.594889909029007e-08, "policy_entropy/min": 2.1141942363467336e-18, "policy_entropy/p25": 1.3278622645884752e-10, "policy_entropy/p75": 2.0712614059448242e-06, "policy_entropy/var": 0.01057005301117897, "policy_error_vector_variance/max_squared_error": 2.0012106895446777, "policy_error_vector_variance/metric": 0.010256470181047916, "policy_loss": -0.6875, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.21710526943206787, "policy_sharpness": 9.453737258911133, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.3450818061828613, "reward": 0.6875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21710526943206787, "rewards/accuracy_reward": 0.6875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21710526943206787, "sentence_full_gradient_variance/max_squared_error": 2330.0595703125, "sentence_full_gradient_variance/metric": 1369.843505859375, "sentence_full_gradient_variance/p75": 2330.03125, "sentence_full_gradient_variance/p90": 2330.03125, "sentence_full_gradient_variance/p95": 2330.03125, "sentence_full_gradient_variance/p99": 2330.03271484375, "state_level_variance/metric": 2.154353141784668, "state_level_variance_full_gradient/metric": 1244.8028564453125, "step": 90 }, { "accuracy_reward": 0.6458333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2311403751373291, "action_level_variance/metric": 1.5000966787338257, "action_level_variance_full_gradient/metric": 285.0157470703125, "adam_stats/lr_effective_max": 1.8521453739595017e-06, "adam_stats/lr_effective_mean": 1.2951569070690505e-12, "adam_stats/lr_effective_min": -1.811467200241168e-06, "adam_stats/m_t_max": 0.01037495769560337, "adam_stats/m_t_mean": 9.133843681397025e-11, "adam_stats/m_t_min": -0.008556060492992401, "adam_stats/v_t_max": 0.0003293666522949934, "adam_stats/v_t_mean": 2.821453487511505e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6458333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.2311403751373291, "all_logprobs": -0.021256783977150917, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.375, "all_logprobs/p1": -0.578125, "all_logprobs/p10": -0.000335693359375, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.0181884765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03317691385746002, "clip_ratio": 0.0, "completion_length": 556.21875, "completion_length/correct": 437.6773986816406, "completion_length/correct/max": 955.0, "completion_length/correct/median": 414.0, "completion_length/correct/min": 211.0, "completion_length/correct/p25": 328.5, "completion_length/correct/p75": 486.25, "completion_length/correct/var": 20347.732421875, "completion_length/incorrect": 772.3823852539062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 797.0, "completion_length/incorrect/min": 329.0, "completion_length/incorrect/p25": 553.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 52479.7578125, "completion_length/max": 1024.0, "completion_length/median": 480.0, "completion_length/min": 211.0, "completion_length/p25": 384.0, "completion_length/p75": 699.5, "completion_length/var": 57189.25390625, "epoch": 0.1456, "feature_vector_variance/max_squared_error": 73920.046875, "feature_vector_variance/metric": 27784.275390625, "generated_tokens/total": 4753685.0, "grad_norm": 0.7396317720413208, "learning_rate": 3.6707612778634855e-07, "loss": -0.6458, "mean_logprobs": -0.021484375, "mean_logprobs/var": 0.00011682510375976562, "num_completions/total": 8736, "per_sentence_gradient_norm": 2.0204715728759766, "per_sentence_gradient_norm/max": 6.989201545715332, "per_sentence_gradient_norm/median": 2.1395905017852783, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.3264834880828857, "per_sentence_gradient_norm/p85": 4.171425819396973, "per_sentence_gradient_norm/p90": 4.288690567016602, "per_sentence_gradient_norm/p95": 4.555151462554932, "per_sentence_gradient_norm/p99": 6.138337135314941, "per_sentence_gradient_norm/var": 3.2339284420013428, "per_token_feature_norm": 187.68716430664062, "per_token_feature_norm/max": 258.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 91.5, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 201.40252685546875, "per_token_full_gradient_variance/max_squared_error": 0.49428316950798035, "per_token_full_gradient_variance/variance": 0.0026851417496800423, "per_token_gradient_norm": 1.5743443965911865, "per_token_gradient_norm/max": 274.8046875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 209.74697875976562, "per_token_policy_error_norm": 0.012324179522693157, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.012239489704370499, "policy_entropy": 0.022134296596050262, "policy_entropy/max": 1.890625, "policy_entropy/median": 9.837094694375992e-09, "policy_entropy/min": 1.6601845766184287e-18, "policy_entropy/p25": 8.276401786133647e-11, "policy_entropy/p75": 2.115964889526367e-06, "policy_entropy/var": 0.01119981613010168, "policy_error_vector_variance/max_squared_error": 1.9848285913467407, "policy_error_vector_variance/metric": 0.01232366356998682, "policy_loss": -0.6458333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.2311403751373291, "policy_sharpness": 9.37031078338623, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.825657606124878, "reward": 0.6458333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2311403751373291, "rewards/accuracy_reward": 0.6458333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2311403751373291, "sentence_full_gradient_variance/max_squared_error": 1978.719482421875, "sentence_full_gradient_variance/metric": 975.1507568359375, "sentence_full_gradient_variance/p75": 1491.1837158203125, "sentence_full_gradient_variance/p90": 1511.851806640625, "sentence_full_gradient_variance/p95": 1584.86962890625, "sentence_full_gradient_variance/p99": 1888.755615234375, "state_level_variance/metric": 2.05926251411438, "state_level_variance_full_gradient/metric": 690.1349487304688, "step": 91 }, { "accuracy_reward": 0.65625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2279605269432068, "action_level_variance/metric": 1.4902253150939941, "action_level_variance_full_gradient/metric": 170.31121826171875, "adam_stats/lr_effective_max": 1.4709725064676604e-06, "adam_stats/lr_effective_mean": -3.834379862233117e-12, "adam_stats/lr_effective_min": -1.4664005902886856e-06, "adam_stats/m_t_max": 0.008287657052278519, "adam_stats/m_t_mean": 6.64203553268905e-11, "adam_stats/m_t_min": -0.006571303587406874, "adam_stats/v_t_max": 0.0003291475004516542, "adam_stats/v_t_mean": 2.8331420542926367e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.65625, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.2279605269432068, "all_logprobs": -0.018005134537816048, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.875, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.00010452279821038246, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.0086669921875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.025705533102154732, "clip_ratio": 0.0, "completion_length": 494.1145935058594, "completion_length/correct": 410.19049072265625, "completion_length/correct/max": 1000.0, "completion_length/correct/median": 346.0, "completion_length/correct/min": 164.0, "completion_length/correct/p25": 266.0, "completion_length/correct/p75": 490.0, "completion_length/correct/var": 37357.08984375, "completion_length/incorrect": 654.3333740234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 776.0, "completion_length/incorrect/min": 164.0, "completion_length/incorrect/p25": 337.0, "completion_length/incorrect/p75": 894.0, "completion_length/incorrect/var": 96715.0390625, "completion_length/max": 1024.0, "completion_length/median": 396.0, "completion_length/min": 164.0, "completion_length/p25": 295.25, "completion_length/p75": 718.5, "completion_length/var": 70545.859375, "epoch": 0.1472, "feature_vector_variance/max_squared_error": 72336.734375, "feature_vector_variance/metric": 27588.03515625, "generated_tokens/total": 4801120.0, "grad_norm": 0.6791370511054993, "learning_rate": 2.905372804626083e-07, "loss": -0.6562, "mean_logprobs": -0.01708984375, "mean_logprobs/var": 0.00019168853759765625, "num_completions/total": 8832, "per_sentence_gradient_norm": 1.3160090446472168, "per_sentence_gradient_norm/max": 7.46687126159668, "per_sentence_gradient_norm/median": 1.0941425561904907, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 1.9700541496276855, "per_sentence_gradient_norm/p85": 2.5325560569763184, "per_sentence_gradient_norm/p90": 2.9700775146484375, "per_sentence_gradient_norm/p95": 3.7929649353027344, "per_sentence_gradient_norm/p99": 4.503139019012451, "per_sentence_gradient_norm/var": 1.868369460105896, "per_token_feature_norm": 187.74232482910156, "per_token_feature_norm/max": 254.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 89.5, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 257.9377746582031, "per_token_full_gradient_variance/max_squared_error": 0.5516806244850159, "per_token_full_gradient_variance/variance": 0.0022179954685270786, "per_token_gradient_norm": 1.1662213802337646, "per_token_gradient_norm/max": 291.09375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 158.93002319335938, "per_token_policy_error_norm": 0.010685345157980919, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009991578757762909, "policy_entropy": 0.01933475211262703, "policy_entropy/max": 1.6328125, "policy_entropy/median": 5.878973752260208e-09, "policy_entropy/min": 8.097634975751111e-19, "policy_entropy/p25": 5.184119800105691e-11, "policy_entropy/p75": 1.259148120880127e-06, "policy_entropy/var": 0.009751961566507816, "policy_error_vector_variance/max_squared_error": 1.9842573404312134, "policy_error_vector_variance/metric": 0.010679453611373901, "policy_loss": -0.65625, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.2279605269432068, "policy_sharpness": 9.456433296203613, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.3406779766082764, "reward": 0.65625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2279605269432068, "rewards/accuracy_reward": 0.65625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2279605269432068, "sentence_full_gradient_variance/max_squared_error": 2614.04150390625, "sentence_full_gradient_variance/metric": 1045.9744873046875, "sentence_full_gradient_variance/p75": 1408.2972412109375, "sentence_full_gradient_variance/p90": 1523.459228515625, "sentence_full_gradient_variance/p95": 2043.8502197265625, "sentence_full_gradient_variance/p99": 2452.4677734375, "state_level_variance/metric": 0.594501793384552, "state_level_variance_full_gradient/metric": 875.663330078125, "step": 92 }, { "accuracy_reward": 0.7291666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19956141710281372, "action_level_variance/metric": 2.090050220489502, "action_level_variance_full_gradient/metric": 232.02639770507812, "adam_stats/lr_effective_max": 1.0801420557982055e-06, "adam_stats/lr_effective_mean": 6.097321540894651e-13, "adam_stats/lr_effective_min": -1.1473498489067424e-06, "adam_stats/m_t_max": 0.009409375488758087, "adam_stats/m_t_mean": 5.537442726577524e-11, "adam_stats/m_t_min": -0.010913556441664696, "adam_stats/v_t_max": 0.0003288934822194278, "adam_stats/v_t_mean": 2.866628288911155e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.7291666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.19956141710281372, "all_logprobs": -0.016496917232871056, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.25, "all_logprobs/p1": -0.404296875, "all_logprobs/p10": -7.486343383789062e-05, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.0052490234375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.025122711434960365, "clip_ratio": 0.0, "completion_length": 414.07293701171875, "completion_length/correct": 348.6571350097656, "completion_length/correct/max": 780.0, "completion_length/correct/median": 299.0, "completion_length/correct/min": 206.0, "completion_length/correct/p25": 252.0, "completion_length/correct/p75": 409.0, "completion_length/correct/var": 15658.89453125, "completion_length/incorrect": 590.1923217773438, "completion_length/incorrect/max": 1016.0, "completion_length/incorrect/median": 507.0, "completion_length/incorrect/min": 193.0, "completion_length/incorrect/p25": 415.0, "completion_length/incorrect/p75": 872.0, "completion_length/incorrect/var": 76709.6015625, "completion_length/max": 1016.0, "completion_length/median": 368.0, "completion_length/min": 193.0, "completion_length/p25": 252.0, "completion_length/p75": 505.0, "completion_length/var": 43202.3046875, "epoch": 0.1488, "feature_vector_variance/max_squared_error": 75936.84375, "feature_vector_variance/metric": 28008.19140625, "generated_tokens/total": 4840871.0, "grad_norm": 0.9413614273071289, "learning_rate": 2.2278205293002645e-07, "loss": -0.7292, "mean_logprobs": -0.017333984375, "mean_logprobs/var": 0.000179290771484375, "num_completions/total": 8928, "per_sentence_gradient_norm": 1.763742208480835, "per_sentence_gradient_norm/max": 9.231871604919434, "per_sentence_gradient_norm/median": 1.4753180742263794, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.571850299835205, "per_sentence_gradient_norm/p85": 3.1721456050872803, "per_sentence_gradient_norm/p90": 3.87579345703125, "per_sentence_gradient_norm/p95": 5.0336198806762695, "per_sentence_gradient_norm/p99": 6.531631946563721, "per_sentence_gradient_norm/var": 2.9324898719787598, "per_token_feature_norm": 188.20278930664062, "per_token_feature_norm/max": 262.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 91.0, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 268.3639831542969, "per_token_full_gradient_variance/max_squared_error": 0.5766193270683289, "per_token_full_gradient_variance/variance": 0.0026545131113380194, "per_token_gradient_norm": 1.4738435745239258, "per_token_gradient_norm/max": 329.0625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 199.52906799316406, "per_token_policy_error_norm": 0.009396431967616081, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009226136840879917, "policy_entropy": 0.0177925955504179, "policy_entropy/max": 2.03125, "policy_entropy/median": 7.363269105553627e-09, "policy_entropy/min": 1.8973538018496328e-18, "policy_entropy/p25": 5.1159076974727213e-11, "policy_entropy/p75": 1.4379620552062988e-06, "policy_entropy/var": 0.009679222479462624, "policy_error_vector_variance/max_squared_error": 1.9910223484039307, "policy_error_vector_variance/metric": 0.00939267035573721, "policy_loss": -0.7291666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.19956141710281372, "policy_sharpness": 9.498686790466309, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.0942933559417725, "reward": 0.7291666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19956141710281372, "rewards/accuracy_reward": 0.7291666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19956141710281372, "sentence_full_gradient_variance/max_squared_error": 2436.159912109375, "sentence_full_gradient_variance/metric": 1184.953857421875, "sentence_full_gradient_variance/p75": 1436.7891845703125, "sentence_full_gradient_variance/p90": 1942.6107177734375, "sentence_full_gradient_variance/p95": 2135.86865234375, "sentence_full_gradient_variance/p99": 2393.681884765625, "state_level_variance/metric": 1.1707082986831665, "state_level_variance_full_gradient/metric": 952.927490234375, "step": 93 }, { "accuracy_reward": 0.8020833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.16041667759418488, "action_level_variance/metric": 1.9918493032455444, "action_level_variance_full_gradient/metric": 212.26181030273438, "adam_stats/lr_effective_max": 8.214337867684662e-07, "adam_stats/lr_effective_mean": -5.428706963128693e-12, "adam_stats/lr_effective_min": -8.40885888919729e-07, "adam_stats/m_t_max": 0.011023076251149178, "adam_stats/m_t_mean": 4.247640086885163e-11, "adam_stats/m_t_min": -0.011201595887541771, "adam_stats/v_t_max": 0.00032865407411009073, "adam_stats/v_t_mean": 2.8905301763249014e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.8020833730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.16041667759418488, "all_logprobs": -0.019462507218122482, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.125, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.00020313262939453125, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.01416015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.027328217402100563, "clip_ratio": 0.0, "completion_length": 466.8125, "completion_length/correct": 437.8311767578125, "completion_length/correct/max": 947.0, "completion_length/correct/median": 393.0, "completion_length/correct/min": 141.0, "completion_length/correct/p25": 243.0, "completion_length/correct/p75": 663.0, "completion_length/correct/var": 49279.61328125, "completion_length/incorrect": 584.26318359375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 398.0, "completion_length/incorrect/min": 218.0, "completion_length/incorrect/p25": 334.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 101987.3203125, "completion_length/max": 1024.0, "completion_length/median": 393.0, "completion_length/min": 141.0, "completion_length/p25": 270.0, "completion_length/p75": 669.5, "completion_length/var": 62187.30859375, "epoch": 0.1504, "feature_vector_variance/max_squared_error": 111522.8359375, "feature_vector_variance/metric": 27310.876953125, "generated_tokens/total": 4885685.0, "grad_norm": 0.8567059636116028, "learning_rate": 1.6389299449645734e-07, "loss": -0.8021, "mean_logprobs": -0.0201416015625, "mean_logprobs/var": 0.000125885009765625, "num_completions/total": 9024, "per_sentence_gradient_norm": 2.3020107746124268, "per_sentence_gradient_norm/max": 7.8773393630981445, "per_sentence_gradient_norm/median": 2.3603625297546387, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.2232940196990967, "per_sentence_gradient_norm/p75": 3.2917075157165527, "per_sentence_gradient_norm/p85": 4.128909587860107, "per_sentence_gradient_norm/p90": 4.466078758239746, "per_sentence_gradient_norm/p95": 5.450569152832031, "per_sentence_gradient_norm/p99": 6.476341724395752, "per_sentence_gradient_norm/var": 2.9670956134796143, "per_token_feature_norm": 187.3700714111328, "per_token_feature_norm/max": 278.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 87.0, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 259.8382873535156, "per_token_full_gradient_variance/max_squared_error": 1.273752212524414, "per_token_full_gradient_variance/variance": 0.0033715013414621353, "per_token_gradient_norm": 2.050440549850464, "per_token_gradient_norm/max": 294.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 266.73297119140625, "per_token_policy_error_norm": 0.011232670396566391, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010275696404278278, "policy_entropy": 0.021795036271214485, "policy_entropy/max": 2.96875, "policy_entropy/median": 5.675246939063072e-09, "policy_entropy/min": 1.9922214919421144e-18, "policy_entropy/p25": 4.092726157978177e-11, "policy_entropy/p75": 1.341104507446289e-06, "policy_entropy/var": 0.011722192168235779, "policy_error_vector_variance/max_squared_error": 2.0033905506134033, "policy_error_vector_variance/metric": 0.011227913200855255, "policy_loss": -0.8020833730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.16041667759418488, "policy_sharpness": 9.40844440460205, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.6690120697021484, "reward": 0.8020833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.16041667759418488, "rewards/accuracy_reward": 0.8020833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.16041667759418488, "sentence_full_gradient_variance/max_squared_error": 2383.67724609375, "sentence_full_gradient_variance/metric": 1267.4520263671875, "sentence_full_gradient_variance/p75": 2155.896484375, "sentence_full_gradient_variance/p90": 2155.896484375, "sentence_full_gradient_variance/p95": 2155.910888671875, "sentence_full_gradient_variance/p99": 2383.67724609375, "state_level_variance/metric": 1.3018035888671875, "state_level_variance_full_gradient/metric": 1055.190185546875, "step": 94 }, { "accuracy_reward": 0.71875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20427630841732025, "action_level_variance/metric": 0.7539593577384949, "action_level_variance_full_gradient/metric": 131.3836212158203, "adam_stats/lr_effective_max": 5.780482865702652e-07, "adam_stats/lr_effective_mean": -4.214106060634881e-12, "adam_stats/lr_effective_min": -5.751504659201601e-07, "adam_stats/m_t_max": 0.008893417194485664, "adam_stats/m_t_mean": 4.27055335538995e-11, "adam_stats/m_t_min": -0.011327750980854034, "adam_stats/v_t_max": 0.0003288633597549051, "adam_stats/v_t_mean": 2.899754221463713e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.71875, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.20427630841732025, "all_logprobs": -0.01932472363114357, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.75, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.000152587890625, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.011929318308830261, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.02675020694732666, "clip_ratio": 0.0, "completion_length": 533.6666870117188, "completion_length/correct": 386.39129638671875, "completion_length/correct/max": 823.0, "completion_length/correct/median": 381.0, "completion_length/correct/min": 102.0, "completion_length/correct/p25": 286.0, "completion_length/correct/p75": 458.0, "completion_length/correct/var": 28481.419921875, "completion_length/incorrect": 910.0370483398438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 398.0, "completion_length/incorrect/p25": 859.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 36604.1171875, "completion_length/max": 1024.0, "completion_length/median": 405.0, "completion_length/min": 102.0, "completion_length/p25": 320.75, "completion_length/p75": 779.75, "completion_length/var": 86418.2265625, "epoch": 0.152, "feature_vector_variance/max_squared_error": 73424.109375, "feature_vector_variance/metric": 27900.93359375, "generated_tokens/total": 4936917.0, "grad_norm": 0.7058826088905334, "learning_rate": 1.1394185240843985e-07, "loss": -0.7188, "mean_logprobs": -0.020751953125, "mean_logprobs/var": 9.918212890625e-05, "num_completions/total": 9120, "per_sentence_gradient_norm": 2.3632564544677734, "per_sentence_gradient_norm/max": 7.452812671661377, "per_sentence_gradient_norm/median": 2.5663259029388428, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.649341106414795, "per_sentence_gradient_norm/p85": 3.947697162628174, "per_sentence_gradient_norm/p90": 4.410885810852051, "per_sentence_gradient_norm/p95": 5.478458881378174, "per_sentence_gradient_norm/p99": 6.226627349853516, "per_sentence_gradient_norm/var": 3.30177640914917, "per_token_feature_norm": 188.36810302734375, "per_token_feature_norm/max": 262.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 91.5, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 208.0432891845703, "per_token_full_gradient_variance/max_squared_error": 0.514174222946167, "per_token_full_gradient_variance/variance": 0.0026077600196003914, "per_token_gradient_norm": 1.64805269241333, "per_token_gradient_norm/max": 287.0546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 228.3180694580078, "per_token_policy_error_norm": 0.011355360969901085, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010665075853466988, "policy_entropy": 0.021161498501896858, "policy_entropy/max": 2.1875, "policy_entropy/median": 9.89530235528946e-09, "policy_entropy/min": 5.624298769768554e-19, "policy_entropy/p25": 9.595169103704393e-11, "policy_entropy/p75": 1.55717134475708e-06, "policy_entropy/var": 0.011442175135016441, "policy_error_vector_variance/max_squared_error": 1.999465823173523, "policy_error_vector_variance/metric": 0.011349792592227459, "policy_loss": -0.71875, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.20427630841732025, "policy_sharpness": 9.430720329284668, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.5263195037841797, "reward": 0.71875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20427630841732025, "rewards/accuracy_reward": 0.71875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20427630841732025, "sentence_full_gradient_variance/max_squared_error": 2791.0078125, "sentence_full_gradient_variance/metric": 1328.497802734375, "sentence_full_gradient_variance/p75": 2725.87890625, "sentence_full_gradient_variance/p90": 2725.87890625, "sentence_full_gradient_variance/p95": 2725.87890625, "sentence_full_gradient_variance/p99": 2791.0078125, "state_level_variance/metric": 2.844728708267212, "state_level_variance_full_gradient/metric": 1197.1141357421875, "step": 95 }, { "accuracy_reward": 0.6875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21710528433322906, "action_level_variance/metric": 0.8917192220687866, "action_level_variance_full_gradient/metric": 124.78731536865234, "adam_stats/lr_effective_max": 3.7595401636281167e-07, "adam_stats/lr_effective_mean": -3.350622730657893e-12, "adam_stats/lr_effective_min": -3.7614123016282974e-07, "adam_stats/m_t_max": 0.01191394217312336, "adam_stats/m_t_mean": 1.6431904448221957e-11, "adam_stats/m_t_min": -0.01903286948800087, "adam_stats/v_t_max": 0.00032900131191127, "adam_stats/v_t_mean": 2.92274260349501e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6875, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.21710528433322906, "all_logprobs": -0.019111206755042076, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.59375, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.0001239776611328125, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.01416015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.02591361664235592, "clip_ratio": 0.0, "completion_length": 604.71875, "completion_length/correct": 471.5303039550781, "completion_length/correct/max": 903.0, "completion_length/correct/median": 440.0, "completion_length/correct/min": 178.0, "completion_length/correct/p25": 341.75, "completion_length/correct/p75": 574.0, "completion_length/correct/var": 29312.900390625, "completion_length/incorrect": 897.7333984375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 584.0, "completion_length/incorrect/p25": 769.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 25585.306640625, "completion_length/max": 1024.0, "completion_length/median": 543.0, "completion_length/min": 178.0, "completion_length/p25": 388.25, "completion_length/p75": 817.0, "completion_length/var": 67303.40625, "epoch": 0.1536, "feature_vector_variance/max_squared_error": 74348.234375, "feature_vector_variance/metric": 28048.884765625, "generated_tokens/total": 4994970.0, "grad_norm": 0.831609308719635, "learning_rate": 7.298948443822229e-08, "loss": -0.6875, "mean_logprobs": -0.0194091796875, "mean_logprobs/var": 6.079673767089844e-05, "num_completions/total": 9216, "per_sentence_gradient_norm": 2.019984006881714, "per_sentence_gradient_norm/max": 4.877974987030029, "per_sentence_gradient_norm/median": 2.0757882595062256, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.3493871688842773, "per_sentence_gradient_norm/p85": 3.8558249473571777, "per_sentence_gradient_norm/p90": 4.144574165344238, "per_sentence_gradient_norm/p95": 4.460883617401123, "per_sentence_gradient_norm/p99": 4.877257347106934, "per_sentence_gradient_norm/var": 2.5685219764709473, "per_token_feature_norm": 187.50917053222656, "per_token_feature_norm/max": 262.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 91.5, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 209.87844848632812, "per_token_full_gradient_variance/max_squared_error": 0.5130887031555176, "per_token_full_gradient_variance/variance": 0.002368873916566372, "per_token_gradient_norm": 1.5222575664520264, "per_token_gradient_norm/max": 298.125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 202.08026123046875, "per_token_policy_error_norm": 0.011334201321005821, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010667246766388416, "policy_entropy": 0.020888278260827065, "policy_entropy/max": 1.953125, "policy_entropy/median": 1.2980308383703232e-08, "policy_entropy/min": 7.148958074826295e-19, "policy_entropy/p25": 1.0277290130034089e-10, "policy_entropy/p75": 2.16066837310791e-06, "policy_entropy/var": 0.010768177919089794, "policy_error_vector_variance/max_squared_error": 1.9728775024414062, "policy_error_vector_variance/metric": 0.011326906271278858, "policy_loss": -0.6875, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.21710528433322906, "policy_sharpness": 9.433130264282227, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.4991087913513184, "reward": 0.6875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21710528433322906, "rewards/accuracy_reward": 0.6875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21710528433322906, "sentence_full_gradient_variance/max_squared_error": 2850.619140625, "sentence_full_gradient_variance/metric": 1120.4910888671875, "sentence_full_gradient_variance/p75": 1892.3330078125, "sentence_full_gradient_variance/p90": 1892.3330078125, "sentence_full_gradient_variance/p95": 1892.3330078125, "sentence_full_gradient_variance/p99": 2815.89453125, "state_level_variance/metric": 1.9216498136520386, "state_level_variance_full_gradient/metric": 995.703857421875, "step": 96 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1894736886024475, "action_level_variance/metric": 1.5701775550842285, "action_level_variance_full_gradient/metric": 382.9104919433594, "adam_stats/lr_effective_max": 2.071250833068916e-07, "adam_stats/lr_effective_mean": -1.5410987373384866e-12, "adam_stats/lr_effective_min": -2.0913270759592706e-07, "adam_stats/m_t_max": 0.009135633707046509, "adam_stats/m_t_mean": -4.172615725162032e-11, "adam_stats/m_t_min": -0.012832704931497574, "adam_stats/v_t_max": 0.00032906339038163424, "adam_stats/v_t_mean": 2.94588242688576e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.75, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.75, "advantages/p75": 1.0, "advantages/var": 0.1894736886024475, "all_logprobs": -0.016701633110642433, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.0, "all_logprobs/p1": -0.474609375, "all_logprobs/p10": -5.8650970458984375e-05, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.00592041015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.023999467492103577, "clip_ratio": 0.0, "completion_length": 548.5625, "completion_length/correct": 461.1388854980469, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 396.0, "completion_length/correct/min": 217.0, "completion_length/correct/p25": 264.5, "completion_length/correct/p75": 540.0, "completion_length/correct/var": 53234.4296875, "completion_length/incorrect": 810.8333740234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 228.0, "completion_length/incorrect/p25": 520.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 94442.5, "completion_length/max": 1024.0, "completion_length/median": 479.0, "completion_length/min": 217.0, "completion_length/p25": 294.5, "completion_length/p75": 741.75, "completion_length/var": 85820.78125, "epoch": 0.1552, "feature_vector_variance/max_squared_error": 98772.6640625, "feature_vector_variance/metric": 27821.09375, "generated_tokens/total": 5047632.0, "grad_norm": 0.8587484359741211, "learning_rate": 4.108578473795033e-08, "loss": -0.75, "mean_logprobs": -0.0184326171875, "mean_logprobs/var": 0.00015735626220703125, "num_completions/total": 9312, "per_sentence_gradient_norm": 1.9078515768051147, "per_sentence_gradient_norm/max": 6.413704872131348, "per_sentence_gradient_norm/median": 1.8624088764190674, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.6964734792709351, "per_sentence_gradient_norm/p75": 2.729219436645508, "per_sentence_gradient_norm/p85": 3.1139280796051025, "per_sentence_gradient_norm/p90": 4.041272163391113, "per_sentence_gradient_norm/p95": 4.7657012939453125, "per_sentence_gradient_norm/p99": 6.190260410308838, "per_sentence_gradient_norm/var": 2.4144957065582275, "per_token_feature_norm": 187.49095153808594, "per_token_feature_norm/max": 268.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 88.5, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 231.74273681640625, "per_token_full_gradient_variance/max_squared_error": 0.4845105707645416, "per_token_full_gradient_variance/variance": 0.002334314864128828, "per_token_gradient_norm": 1.4330974817276, "per_token_gradient_norm/max": 288.46875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 185.057861328125, "per_token_policy_error_norm": 0.009751087054610252, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009051015600562096, "policy_entropy": 0.01833127811551094, "policy_entropy/max": 1.921875, "policy_entropy/median": 8.09086486697197e-09, "policy_entropy/min": 2.642742795433417e-18, "policy_entropy/p25": 5.866240826435387e-11, "policy_entropy/p75": 1.1324882507324219e-06, "policy_entropy/var": 0.00971481204032898, "policy_error_vector_variance/max_squared_error": 2.000450849533081, "policy_error_vector_variance/metric": 0.009730655699968338, "policy_loss": -0.75, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -0.75, "policy_loss/var": 0.1894736886024475, "policy_sharpness": 9.491801261901855, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.147737741470337, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.1894736886024475, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1894736886024475, "sentence_full_gradient_variance/max_squared_error": 2233.41650390625, "sentence_full_gradient_variance/metric": 1223.992431640625, "sentence_full_gradient_variance/p75": 2042.095947265625, "sentence_full_gradient_variance/p90": 2042.095947265625, "sentence_full_gradient_variance/p95": 2073.973388671875, "sentence_full_gradient_variance/p99": 2121.772705078125, "state_level_variance/metric": 1.1077520847320557, "state_level_variance_full_gradient/metric": 841.0819091796875, "step": 97 }, { "accuracy_reward": 0.6979166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21304824948310852, "action_level_variance/metric": 0.9374365210533142, "action_level_variance_full_gradient/metric": 182.6464080810547, "adam_stats/lr_effective_max": 9.340155315840093e-08, "adam_stats/lr_effective_mean": -6.205682127023715e-13, "adam_stats/lr_effective_min": -8.946757645844627e-08, "adam_stats/m_t_max": 0.011286036111414433, "adam_stats/m_t_mean": -6.885209069551479e-11, "adam_stats/m_t_min": -0.009320336394011974, "adam_stats/v_t_max": 0.00032883454696275294, "adam_stats/v_t_mean": 2.9590573047411084e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.6979166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.21304824948310852, "all_logprobs": -0.0191629808396101, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.5, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -7.667532190680504e-05, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.00860595703125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.028811343014240265, "clip_ratio": 0.0, "completion_length": 520.7604370117188, "completion_length/correct": 377.0149230957031, "completion_length/correct/max": 961.0, "completion_length/correct/median": 284.0, "completion_length/correct/min": 172.0, "completion_length/correct/p25": 235.5, "completion_length/correct/p75": 344.0, "completion_length/correct/var": 58127.046875, "completion_length/incorrect": 852.862060546875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 968.0, "completion_length/incorrect/min": 240.0, "completion_length/incorrect/p25": 763.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 45271.6953125, "completion_length/max": 1024.0, "completion_length/median": 315.0, "completion_length/min": 172.0, "completion_length/p25": 241.0, "completion_length/p75": 835.5, "completion_length/var": 101966.859375, "epoch": 0.1568, "feature_vector_variance/max_squared_error": 86862.6875, "feature_vector_variance/metric": 28141.0546875, "generated_tokens/total": 5097625.0, "grad_norm": 0.7405887246131897, "learning_rate": 1.8269623051318517e-08, "loss": -0.6979, "mean_logprobs": -0.0196533203125, "mean_logprobs/var": 0.00010013580322265625, "num_completions/total": 9408, "per_sentence_gradient_norm": 2.095142126083374, "per_sentence_gradient_norm/max": 6.303183078765869, "per_sentence_gradient_norm/median": 2.1783506870269775, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.341097593307495, "per_sentence_gradient_norm/p85": 3.7573649883270264, "per_sentence_gradient_norm/p90": 4.300251483917236, "per_sentence_gradient_norm/p95": 4.625199317932129, "per_sentence_gradient_norm/p99": 6.199306964874268, "per_sentence_gradient_norm/var": 2.8823180198669434, "per_token_feature_norm": 187.43190002441406, "per_token_feature_norm/max": 270.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 91.0, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 194.90850830078125, "per_token_full_gradient_variance/max_squared_error": 0.8780617713928223, "per_token_full_gradient_variance/variance": 0.0022459900937974453, "per_token_gradient_norm": 1.4108575582504272, "per_token_gradient_norm/max": 279.6875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 194.3161163330078, "per_token_policy_error_norm": 0.011231689713895321, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010941124521195889, "policy_entropy": 0.019932739436626434, "policy_entropy/max": 2.171875, "policy_entropy/median": 5.908077582716942e-09, "policy_entropy/min": 1.9735867671025198e-19, "policy_entropy/p25": 5.820766091346741e-11, "policy_entropy/p75": 8.23289155960083e-07, "policy_entropy/var": 0.011081568896770477, "policy_error_vector_variance/max_squared_error": 1.997650146484375, "policy_error_vector_variance/metric": 0.011227168142795563, "policy_loss": -0.6979166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.21304824948310852, "policy_sharpness": 9.473703384399414, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.2754435539245605, "reward": 0.6979166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21304824948310852, "rewards/accuracy_reward": 0.6979166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21304824948310852, "sentence_full_gradient_variance/max_squared_error": 2775.6162109375, "sentence_full_gradient_variance/metric": 1308.990478515625, "sentence_full_gradient_variance/p75": 1583.2216796875, "sentence_full_gradient_variance/p90": 2114.888671875, "sentence_full_gradient_variance/p95": 2284.956787109375, "sentence_full_gradient_variance/p99": 2573.1298828125, "state_level_variance/metric": 2.2167680263519287, "state_level_variance_full_gradient/metric": 1126.343994140625, "step": 98 }, { "accuracy_reward": 0.9166666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.07719297707080841, "action_level_variance/metric": 0.956527590751648, "action_level_variance_full_gradient/metric": 129.2242889404297, "adam_stats/lr_effective_max": 2.194452797255053e-08, "adam_stats/lr_effective_mean": -2.0898831710330912e-13, "adam_stats/lr_effective_min": -2.2892518103390103e-08, "adam_stats/m_t_max": 0.006983603350818157, "adam_stats/m_t_mean": -4.295199959591933e-11, "adam_stats/m_t_min": -0.006054002791643143, "adam_stats/v_t_max": 0.00032861848012544215, "adam_stats/v_t_mean": 2.964602521804416e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.9166666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.07719297707080841, "all_logprobs": -0.017069531604647636, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.28125, "all_logprobs/p1": -0.474609375, "all_logprobs/p10": -7.991795428097248e-05, "all_logprobs/p25": 0.0, "all_logprobs/p5": -0.008056640625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.022891659289598465, "clip_ratio": 0.0, "completion_length": 430.15625, "completion_length/correct": 378.3863830566406, "completion_length/correct/max": 900.0, "completion_length/correct/median": 338.0, "completion_length/correct/min": 125.0, "completion_length/correct/p25": 251.75, "completion_length/correct/p75": 473.5, "completion_length/correct/var": 33670.1484375, "completion_length/incorrect": 999.625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 829.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 4753.125, "completion_length/max": 1024.0, "completion_length/median": 353.0, "completion_length/min": 125.0, "completion_length/p25": 263.5, "completion_length/p75": 543.25, "completion_length/var": 60976.66796875, "epoch": 0.1584, "feature_vector_variance/max_squared_error": 73581.6484375, "feature_vector_variance/metric": 27432.8828125, "generated_tokens/total": 5138920.0, "grad_norm": 0.594388484954834, "learning_rate": 4.568797356781784e-09, "loss": -0.9167, "mean_logprobs": -0.0162353515625, "mean_logprobs/var": 9.679794311523438e-05, "num_completions/total": 9504, "per_sentence_gradient_norm": 2.262226104736328, "per_sentence_gradient_norm/max": 7.235019683837891, "per_sentence_gradient_norm/median": 2.119342565536499, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.2763850688934326, "per_sentence_gradient_norm/p75": 3.1258726119995117, "per_sentence_gradient_norm/p85": 3.7521119117736816, "per_sentence_gradient_norm/p90": 4.138649940490723, "per_sentence_gradient_norm/p95": 4.7039079666137695, "per_sentence_gradient_norm/p99": 5.658716678619385, "per_sentence_gradient_norm/var": 2.050589084625244, "per_token_feature_norm": 188.76649475097656, "per_token_feature_norm/max": 254.0, "per_token_feature_norm/median": 189.0, "per_token_feature_norm/min": 94.5, "per_token_feature_norm/p25": 181.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 234.63265991210938, "per_token_full_gradient_variance/max_squared_error": 0.38255611062049866, "per_token_full_gradient_variance/variance": 0.0034835755359381437, "per_token_gradient_norm": 1.995485544204712, "per_token_gradient_norm/max": 295.921875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 259.9835510253906, "per_token_policy_error_norm": 0.010019042529165745, "per_token_policy_error_norm/max": 1.921875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009387392550706863, "policy_entropy": 0.019080236554145813, "policy_entropy/max": 1.875, "policy_entropy/median": 6.111804395914078e-09, "policy_entropy/min": 4.0488174878755556e-19, "policy_entropy/p25": 5.297806637827307e-11, "policy_entropy/p75": 1.1175870895385742e-06, "policy_entropy/var": 0.010214302688837051, "policy_error_vector_variance/max_squared_error": 1.9281562566757202, "policy_error_vector_variance/metric": 0.010015360079705715, "policy_loss": -0.9166666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.07719297707080841, "policy_sharpness": 9.482438087463379, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.189425230026245, "reward": 0.9166666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.07719297707080841, "rewards/accuracy_reward": 0.9166666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.07719297707080841, "sentence_full_gradient_variance/max_squared_error": 3149.19873046875, "sentence_full_gradient_variance/metric": 1061.7275390625, "sentence_full_gradient_variance/p75": 1618.298828125, "sentence_full_gradient_variance/p90": 3149.17333984375, "sentence_full_gradient_variance/p95": 3149.1875, "sentence_full_gradient_variance/p99": 3149.192138671875, "state_level_variance/metric": 1.3006548881530762, "state_level_variance_full_gradient/metric": 932.5031127929688, "step": 99 }, { "accuracy_reward": 0.8125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1539473831653595, "action_level_variance/metric": 0.9387954473495483, "action_level_variance_full_gradient/metric": 82.81768798828125, "adam_stats/lr_effective_max": 0.0, "adam_stats/lr_effective_mean": 0.0, "adam_stats/lr_effective_min": 0.0, "adam_stats/m_t_max": 0.011558681726455688, "adam_stats/m_t_mean": -9.25537840812396e-11, "adam_stats/m_t_min": -0.008298983797430992, "adam_stats/v_t_max": 0.00032983487471938133, "adam_stats/v_t_mean": 2.974033172509216e-11, "adam_stats/v_t_min": 0.0, "advantages": 0.8125, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.1539473831653595, "all_logprobs": -0.018476132303476334, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.0, "all_logprobs/p1": -0.5268359184265137, "all_logprobs/p10": -0.0001583099365234375, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.01104736328125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.02612353302538395, "clip_ratio": 0.0, "completion_length": 392.85418701171875, "completion_length/correct": 384.20513916015625, "completion_length/correct/max": 803.0, "completion_length/correct/median": 388.0, "completion_length/correct/min": 173.0, "completion_length/correct/p25": 313.0, "completion_length/correct/p75": 480.75, "completion_length/correct/var": 13010.7890625, "completion_length/incorrect": 430.3333435058594, "completion_length/incorrect/max": 653.0, "completion_length/incorrect/median": 417.0, "completion_length/incorrect/min": 233.0, "completion_length/incorrect/p25": 402.5, "completion_length/incorrect/p75": 456.0, "completion_length/incorrect/var": 6240.82373046875, "completion_length/max": 803.0, "completion_length/median": 405.0, "completion_length/min": 173.0, "completion_length/p25": 316.5, "completion_length/p75": 478.5, "completion_length/var": 11989.9384765625, "epoch": 0.16, "feature_vector_variance/max_squared_error": 69786.9375, "feature_vector_variance/metric": 28484.375, "generated_tokens/total": 5176634.0, "grad_norm": 0.7253928184509277, "learning_rate": 0.0, "loss": -0.8125, "mean_logprobs": -0.0186767578125, "mean_logprobs/var": 0.00018787384033203125, "num_completions/total": 9600, "per_sentence_gradient_norm": 2.0886855125427246, "per_sentence_gradient_norm/max": 7.5883097648620605, "per_sentence_gradient_norm/median": 1.875691533088684, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.883804202079773, "per_sentence_gradient_norm/p75": 2.890573024749756, "per_sentence_gradient_norm/p85": 3.6051323413848877, "per_sentence_gradient_norm/p90": 4.514799118041992, "per_sentence_gradient_norm/p95": 5.810911655426025, "per_sentence_gradient_norm/p99": 6.78096866607666, "per_sentence_gradient_norm/var": 3.067631483078003, "per_token_feature_norm": 187.79144287109375, "per_token_feature_norm/max": 272.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 95.0, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 242.2535400390625, "per_token_full_gradient_variance/max_squared_error": 0.457150399684906, "per_token_full_gradient_variance/variance": 0.0032604460138827562, "per_token_gradient_norm": 1.9651683568954468, "per_token_gradient_norm/max": 278.4375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 259.0397033691406, "per_token_policy_error_norm": 0.010820334777235985, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01023039035499096, "policy_entropy": 0.020213402807712555, "policy_entropy/max": 1.546875, "policy_entropy/median": 1.0128132998943329e-08, "policy_entropy/min": 2.558039500707987e-19, "policy_entropy/p25": 5.1386450650170445e-11, "policy_entropy/p75": 2.4139881134033203e-06, "policy_entropy/var": 0.010341455228626728, "policy_error_vector_variance/max_squared_error": 1.9902328252792358, "policy_error_vector_variance/metric": 0.010813764296472073, "policy_loss": -0.8125, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.1539473831653595, "policy_sharpness": 9.424545288085938, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.5375492572784424, "reward": 0.8125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1539473831653595, "rewards/accuracy_reward": 0.8125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1539473831653595, "sentence_full_gradient_variance/max_squared_error": 3190.674560546875, "sentence_full_gradient_variance/metric": 702.0245971679688, "sentence_full_gradient_variance/p75": 1219.708984375, "sentence_full_gradient_variance/p90": 1219.72021484375, "sentence_full_gradient_variance/p95": 2757.094970703125, "sentence_full_gradient_variance/p99": 3128.68310546875, "state_level_variance/metric": 2.41552472114563, "state_level_variance_full_gradient/metric": 619.2069091796875, "step": 100 }, { "adam_stats/lr_effective_max": 0.0, "adam_stats/lr_effective_mean": 0.0, "adam_stats/lr_effective_min": 0.0, "adam_stats/m_t_max": 0.011558681726455688, "adam_stats/m_t_mean": -9.25537840812396e-11, "adam_stats/m_t_min": -0.008298983797430992, "adam_stats/v_t_max": 0.00032983487471938133, "adam_stats/v_t_mean": 2.974033172509216e-11, "adam_stats/v_t_min": 0.0, "epoch": 0.16, "step": 100, "total_flos": 0.0, "train_loss": -0.7403125202655793, "train_runtime": 17408.0197, "train_samples_per_second": 0.551, "train_steps_per_second": 0.006 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 24, "trial_name": null, "trial_params": null }