{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.2784810126582278, "eval_steps": 10, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "accuracy_reward": 0.5833333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24561403691768646, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 4.743426416098373e-06, "adam_stats/lr_effective_mean": 4.37458359467402e-12, "adam_stats/lr_effective_min": -4.743419140140759e-06, "adam_stats/m_t_max": 0.006835939362645149, "adam_stats/m_t_mean": -3.13253104011757e-11, "adam_stats/m_t_min": -0.005126954521983862, "adam_stats/v_t_max": 4.672943759942427e-06, "adam_stats/v_t_mean": 3.8160186816016406e-13, "adam_stats/v_t_min": 0.0, "advantages": 0.5833333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.24561403691768646, "all_logprobs": -0.14940854907035828, "all_logprobs/max": 0.0, "all_logprobs/median": -1.621246337890625e-05, "all_logprobs/min": -11.125, "all_logprobs/p1": -2.59375, "all_logprobs/p10": -0.38671875, "all_logprobs/p25": -0.0111083984375, "all_logprobs/p5": -0.97265625, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.25658857822418213, "clip_ratio": 0.0, "completion_length": 677.1979370117188, "completion_length/correct": 574.357177734375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 516.0, "completion_length/correct/min": 108.0, "completion_length/correct/p25": 373.5, "completion_length/correct/p75": 756.5, "completion_length/correct/var": 65520.41796875, "completion_length/incorrect": 821.1749877929688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 2.0, "completion_length/incorrect/p25": 495.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 119410.5625, "completion_length/max": 1024.0, "completion_length/median": 718.0, "completion_length/min": 2.0, "completion_length/p25": 401.5, "completion_length/p75": 1024.0, "completion_length/var": 101916.6171875, "epoch": 0.012658227848101266, "feature_vector_variance/max_squared_error": 94589.0234375, "feature_vector_variance/metric": 25078.30078125, "generated_tokens/total": 65011.0, "grad_norm": 0.913669228553772, "learning_rate": 1.5e-06, "loss": -0.5833, "mean_logprobs": -0.2236328125, "mean_logprobs/var": 0.1962890625, "num_completions/total": 96, "per_sentence_gradient_norm": 12.392972946166992, "per_sentence_gradient_norm/max": 62.68790054321289, "per_sentence_gradient_norm/median": 9.565930366516113, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 20.20440101623535, "per_sentence_gradient_norm/p85": 24.240942001342773, "per_sentence_gradient_norm/p90": 31.816009521484375, "per_sentence_gradient_norm/p95": 40.558021545410156, "per_sentence_gradient_norm/p99": 43.86935043334961, "per_sentence_gradient_norm/var": 188.11375427246094, "per_token_feature_norm": 161.2136993408203, "per_token_feature_norm/max": 324.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 63.75, "per_token_feature_norm/p25": 123.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 2314.141357421875, "per_token_full_gradient_variance/max_squared_error": 2.374922513961792, "per_token_full_gradient_variance/variance": 0.010695422068238258, "per_token_gradient_norm": 9.477930068969727, "per_token_gradient_norm/max": 401.953125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1476.886962890625, "per_token_policy_error_norm": 0.07703681290149689, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06520389020442963, "policy_entropy": 0.1652372032403946, "policy_entropy/max": 3.734375, "policy_entropy/median": 0.00020503997802734375, "policy_entropy/min": 4.111294638065033e-16, "policy_entropy/p25": 6.705522537231445e-06, "policy_entropy/p75": 0.06689453125, "policy_entropy/var": 0.14286750555038452, "policy_error_vector_variance/max_squared_error": 2.0128326416015625, "policy_error_vector_variance/metric": 0.07663638889789581, "policy_loss": -0.5833333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.24561403691768646, "policy_sharpness": 7.284275054931641, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.319812059402466, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.84730052947998, "reward": 0.5833333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24561403691768646, "rewards/accuracy_reward": 0.5833333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24561403691768646, "sentence_full_gradient_variance/max_squared_error": 5887.55078125, "sentence_full_gradient_variance/metric": 1194.2177734375, "sentence_full_gradient_variance/p75": 972.727294921875, "sentence_full_gradient_variance/p90": 2533.55078125, "sentence_full_gradient_variance/p95": 3328.236328125, "sentence_full_gradient_variance/p99": 5785.25830078125, "state_level_variance/metric": 188.11375427246094, "state_level_variance_full_gradient/metric": 1194.2177734375, "step": 1 }, { "accuracy_reward": 0.6458333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2311403751373291, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 1.276604689337546e-05, "adam_stats/lr_effective_mean": -1.3572311330367626e-10, "adam_stats/lr_effective_min": -1.2766065992764197e-05, "adam_stats/m_t_max": 0.015845611691474915, "adam_stats/m_t_mean": -1.7168916288667901e-10, "adam_stats/m_t_min": -0.011469323188066483, "adam_stats/v_t_max": 1.4642871065007057e-05, "adam_stats/v_t_mean": 8.841951151285221e-13, "adam_stats/v_t_min": 0.0, "advantages": 0.6458333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.2311403751373291, "all_logprobs": -0.1540435403585434, "all_logprobs/max": 0.0, "all_logprobs/median": -1.7523765563964844e-05, "all_logprobs/min": -10.3125, "all_logprobs/p1": -2.59375, "all_logprobs/p10": -0.4228515625, "all_logprobs/p25": -0.0147705078125, "all_logprobs/p5": -0.984375, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.2622447907924652, "clip_ratio": 0.0, "completion_length": 592.1458740234375, "completion_length/correct": 528.0, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 442.0, "completion_length/correct/min": 196.0, "completion_length/correct/p25": 336.75, "completion_length/correct/p75": 715.25, "completion_length/correct/var": 62807.8046875, "completion_length/incorrect": 709.11767578125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 865.0, "completion_length/incorrect/min": 28.0, "completion_length/incorrect/p25": 410.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 129085.4375, "completion_length/max": 1024.0, "completion_length/median": 540.0, "completion_length/min": 28.0, "completion_length/p25": 349.5, "completion_length/p75": 890.0, "completion_length/var": 92751.6640625, "epoch": 0.02531645569620253, "feature_vector_variance/max_squared_error": 93685.7421875, "feature_vector_variance/metric": 24948.830078125, "generated_tokens/total": 121857.0, "grad_norm": 1.5270112752914429, "learning_rate": 3e-06, "loss": -0.6458, "mean_logprobs": -0.1728515625, "mean_logprobs/var": 0.0113525390625, "num_completions/total": 192, "per_sentence_gradient_norm": 13.363329887390137, "per_sentence_gradient_norm/max": 55.00668716430664, "per_sentence_gradient_norm/median": 14.23275375366211, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 20.210350036621094, "per_sentence_gradient_norm/p85": 24.2604923248291, "per_sentence_gradient_norm/p90": 29.494443893432617, "per_sentence_gradient_norm/p95": 34.221710205078125, "per_sentence_gradient_norm/p99": 48.816505432128906, "per_sentence_gradient_norm/var": 160.69923400878906, "per_token_feature_norm": 161.47509765625, "per_token_feature_norm/max": 326.0, "per_token_feature_norm/median": 151.0, "per_token_feature_norm/min": 63.75, "per_token_feature_norm/p25": 123.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 2375.8447265625, "per_token_full_gradient_variance/max_squared_error": 1.8752639293670654, "per_token_full_gradient_variance/variance": 0.012821534648537636, "per_token_gradient_norm": 11.275867462158203, "per_token_gradient_norm/max": 405.8125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1665.565673828125, "per_token_policy_error_norm": 0.07987688481807709, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06659556180238724, "policy_entropy": 0.17114023864269257, "policy_entropy/max": 3.703125, "policy_entropy/median": 0.00022125244140625, "policy_entropy/min": 2.0317081350640365e-14, "policy_entropy/p25": 7.092952728271484e-06, "policy_entropy/p75": 0.0830078125, "policy_entropy/var": 0.1439649760723114, "policy_error_vector_variance/max_squared_error": 2.0166103839874268, "policy_error_vector_variance/metric": 0.07947193831205368, "policy_loss": -0.6458333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.2311403751373291, "policy_sharpness": 7.245578765869141, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.241455078125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.979913711547852, "reward": 0.6458333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2311403751373291, "rewards/accuracy_reward": 0.6458333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2311403751373291, "sentence_full_gradient_variance/max_squared_error": 3550.67626953125, "sentence_full_gradient_variance/metric": 1325.1473388671875, "sentence_full_gradient_variance/p75": 1413.8726806640625, "sentence_full_gradient_variance/p90": 2421.11181640625, "sentence_full_gradient_variance/p95": 2773.408203125, "sentence_full_gradient_variance/p99": 3484.568603515625, "state_level_variance/metric": 160.69923400878906, "state_level_variance_full_gradient/metric": 1325.1473388671875, "step": 2 }, { "accuracy_reward": 0.5833333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24561406672000885, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 2.2355628971126862e-05, "adam_stats/lr_effective_mean": -1.8848103633395397e-10, "adam_stats/lr_effective_min": -2.2355005057761446e-05, "adam_stats/m_t_max": 0.014203066937625408, "adam_stats/m_t_mean": -1.4408385595743312e-10, "adam_stats/m_t_min": -0.01036206353455782, "adam_stats/v_t_max": 1.4628564713348169e-05, "adam_stats/v_t_mean": 9.466392513618471e-13, "adam_stats/v_t_min": 0.0, "advantages": 0.5833333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.24561406672000885, "all_logprobs": -0.14457347989082336, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1801719665527344e-05, "all_logprobs/min": -9.9375, "all_logprobs/p1": -2.515625, "all_logprobs/p10": -0.38671875, "all_logprobs/p25": -0.009765625, "all_logprobs/p5": -0.9300775527954102, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.2415204644203186, "clip_ratio": 0.0, "completion_length": 604.9896240234375, "completion_length/correct": 542.857177734375, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 498.0, "completion_length/correct/min": 122.0, "completion_length/correct/p25": 349.0, "completion_length/correct/p75": 669.0, "completion_length/correct/var": 59726.2421875, "completion_length/incorrect": 691.9750366210938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 841.0, "completion_length/incorrect/min": 42.0, "completion_length/incorrect/p25": 389.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 135692.4375, "completion_length/max": 1024.0, "completion_length/median": 552.0, "completion_length/min": 42.0, "completion_length/p25": 357.5, "completion_length/p75": 926.5, "completion_length/var": 95745.171875, "epoch": 0.0379746835443038, "feature_vector_variance/max_squared_error": 87067.03125, "feature_vector_variance/metric": 24525.6875, "generated_tokens/total": 179936.0, "grad_norm": 0.4568434953689575, "learning_rate": 4.5e-06, "loss": -0.5833, "mean_logprobs": -0.185546875, "mean_logprobs/var": 0.04931640625, "num_completions/total": 288, "per_sentence_gradient_norm": 11.067224502563477, "per_sentence_gradient_norm/max": 48.71635818481445, "per_sentence_gradient_norm/median": 9.91235637664795, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 19.47209930419922, "per_sentence_gradient_norm/p85": 23.072307586669922, "per_sentence_gradient_norm/p90": 24.981000900268555, "per_sentence_gradient_norm/p95": 30.892881393432617, "per_sentence_gradient_norm/p99": 43.258907318115234, "per_sentence_gradient_norm/var": 131.37527465820312, "per_token_feature_norm": 159.1809539794922, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 149.0, "per_token_feature_norm/min": 62.5, "per_token_feature_norm/p25": 122.5, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 2199.015380859375, "per_token_full_gradient_variance/max_squared_error": 2.5947277545928955, "per_token_full_gradient_variance/variance": 0.01065096352249384, "per_token_gradient_norm": 9.279675483703613, "per_token_gradient_norm/max": 396.09375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1351.7083740234375, "per_token_policy_error_norm": 0.07524459809064865, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06291178613901138, "policy_entropy": 0.16111360490322113, "policy_entropy/max": 3.65625, "policy_entropy/median": 0.00015354156494140625, "policy_entropy/min": 7.744915819785092e-13, "policy_entropy/p25": 5.632638931274414e-06, "policy_entropy/p75": 0.0601806640625, "policy_entropy/var": 0.1372612714767456, "policy_error_vector_variance/max_squared_error": 2.0125551223754883, "policy_error_vector_variance/metric": 0.07480636239051819, "policy_loss": -0.5833333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.24561406672000885, "policy_sharpness": 7.3879547119140625, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.683562994003296, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.515349388122559, "reward": 0.5833333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24561406672000885, "rewards/accuracy_reward": 0.5833333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24561406672000885, "sentence_full_gradient_variance/max_squared_error": 5496.21240234375, "sentence_full_gradient_variance/metric": 1288.21875, "sentence_full_gradient_variance/p75": 1141.1793212890625, "sentence_full_gradient_variance/p90": 2439.18994140625, "sentence_full_gradient_variance/p95": 3235.97314453125, "sentence_full_gradient_variance/p99": 4767.13671875, "state_level_variance/metric": 131.37527465820312, "state_level_variance_full_gradient/metric": 1288.21875, "step": 3 }, { "accuracy_reward": 0.5520833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24989035725593567, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 3.283397381892428e-05, "adam_stats/lr_effective_mean": -6.109292932654498e-11, "adam_stats/lr_effective_min": -3.280747478129342e-05, "adam_stats/m_t_max": 0.006224364973604679, "adam_stats/m_t_mean": -3.869844028558944e-11, "adam_stats/m_t_min": -0.005567188840359449, "adam_stats/v_t_max": 1.9494678781484254e-05, "adam_stats/v_t_mean": 1.441125213955119e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.5520833730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.24989035725593567, "all_logprobs": -0.15050065517425537, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1444091796875e-05, "all_logprobs/min": -12.375, "all_logprobs/p1": -2.578125, "all_logprobs/p10": -0.392578125, "all_logprobs/p25": -0.01275634765625, "all_logprobs/p5": -0.96875, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.2613513767719269, "clip_ratio": 0.0, "completion_length": 651.0625, "completion_length/correct": 534.8490600585938, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 434.0, "completion_length/correct/min": 202.0, "completion_length/correct/p25": 358.0, "completion_length/correct/p75": 699.0, "completion_length/correct/var": 62905.40234375, "completion_length/incorrect": 794.3023071289062, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 35.0, "completion_length/incorrect/p25": 577.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 107448.3125, "completion_length/max": 1024.0, "completion_length/median": 582.0, "completion_length/min": 35.0, "completion_length/p25": 368.5, "completion_length/p75": 1024.0, "completion_length/var": 98757.515625, "epoch": 0.05063291139240506, "feature_vector_variance/max_squared_error": 93465.390625, "feature_vector_variance/metric": 25409.162109375, "generated_tokens/total": 242438.0, "grad_norm": 1.1113535165786743, "learning_rate": 6e-06, "loss": -0.5521, "mean_logprobs": -0.1630859375, "mean_logprobs/var": 0.01422119140625, "num_completions/total": 384, "per_sentence_gradient_norm": 9.971661567687988, "per_sentence_gradient_norm/max": 55.16937255859375, "per_sentence_gradient_norm/median": 8.45531177520752, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 16.83880043029785, "per_sentence_gradient_norm/p85": 21.176626205444336, "per_sentence_gradient_norm/p90": 23.115062713623047, "per_sentence_gradient_norm/p95": 27.563467025756836, "per_sentence_gradient_norm/p99": 40.3923454284668, "per_sentence_gradient_norm/var": 122.34656524658203, "per_token_feature_norm": 161.7351837158203, "per_token_feature_norm/max": 328.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 61.0, "per_token_feature_norm/p25": 124.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 2292.133056640625, "per_token_full_gradient_variance/max_squared_error": 2.8860602378845215, "per_token_full_gradient_variance/variance": 0.00936584360897541, "per_token_gradient_norm": 8.285364151000977, "per_token_gradient_norm/max": 411.140625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1234.8192138671875, "per_token_policy_error_norm": 0.07717403769493103, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.06463631987571716, "policy_entropy": 0.16760577261447906, "policy_entropy/max": 3.75, "policy_entropy/median": 0.00014972686767578125, "policy_entropy/min": 2.6645352591003757e-14, "policy_entropy/p25": 5.125999450683594e-06, "policy_entropy/p75": 0.07470703125, "policy_entropy/var": 0.1435241550207138, "policy_error_vector_variance/max_squared_error": 2.0205647945404053, "policy_error_vector_variance/metric": 0.07677412778139114, "policy_loss": -0.5520833730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.24989035725593567, "policy_sharpness": 7.308066368103027, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.425717353820801, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.804560661315918, "reward": 0.5520833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24989035725593567, "rewards/accuracy_reward": 0.5520833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24989035725593567, "sentence_full_gradient_variance/max_squared_error": 4184.80224609375, "sentence_full_gradient_variance/metric": 1280.8516845703125, "sentence_full_gradient_variance/p75": 1689.5166015625, "sentence_full_gradient_variance/p90": 2325.9130859375, "sentence_full_gradient_variance/p95": 3030.611572265625, "sentence_full_gradient_variance/p99": 3484.142822265625, "state_level_variance/metric": 122.34656524658203, "state_level_variance_full_gradient/metric": 1280.8516845703125, "step": 4 }, { "accuracy_reward": 0.7083333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20877191424369812, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 4.388876186567359e-05, "adam_stats/lr_effective_mean": -5.359925003833865e-11, "adam_stats/lr_effective_min": -4.386117507237941e-05, "adam_stats/m_t_max": 0.007589779794216156, "adam_stats/m_t_mean": -7.319130024274756e-11, "adam_stats/m_t_min": -0.007293200120329857, "adam_stats/v_t_max": 1.9509516278048977e-05, "adam_stats/v_t_mean": 1.60243487316164e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.7083333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.20877191424369812, "all_logprobs": -0.13276144862174988, "all_logprobs/max": 0.0, "all_logprobs/median": -7.748603820800781e-06, "all_logprobs/min": -9.6875, "all_logprobs/p1": -2.359375, "all_logprobs/p10": -0.333984375, "all_logprobs/p25": -0.0078125, "all_logprobs/p5": -0.8359375, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.21286781132221222, "clip_ratio": 0.0, "completion_length": 610.5, "completion_length/correct": 525.6323852539062, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 457.0, "completion_length/correct/min": 142.0, "completion_length/correct/p25": 328.5, "completion_length/correct/p75": 658.5, "completion_length/correct/var": 66494.4140625, "completion_length/incorrect": 816.607177734375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 148.0, "completion_length/incorrect/p25": 586.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 111961.359375, "completion_length/max": 1024.0, "completion_length/median": 503.0, "completion_length/min": 142.0, "completion_length/p25": 340.75, "completion_length/p75": 1024.0, "completion_length/var": 96392.609375, "epoch": 0.06329113924050633, "feature_vector_variance/max_squared_error": 97422.1875, "feature_vector_variance/metric": 24583.884765625, "generated_tokens/total": 301046.0, "grad_norm": 0.6433133482933044, "learning_rate": 7.5e-06, "loss": -0.7083, "mean_logprobs": -0.1455078125, "mean_logprobs/var": 0.007354736328125, "num_completions/total": 480, "per_sentence_gradient_norm": 12.566581726074219, "per_sentence_gradient_norm/max": 50.03265380859375, "per_sentence_gradient_norm/median": 13.572606086730957, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 18.70357894897461, "per_sentence_gradient_norm/p85": 22.543807983398438, "per_sentence_gradient_norm/p90": 25.17254638671875, "per_sentence_gradient_norm/p95": 28.57822608947754, "per_sentence_gradient_norm/p99": 45.157676696777344, "per_sentence_gradient_norm/var": 110.09226989746094, "per_token_feature_norm": 158.82791137695312, "per_token_feature_norm/max": 332.0, "per_token_feature_norm/median": 150.0, "per_token_feature_norm/min": 66.0, "per_token_feature_norm/p25": 122.5, "per_token_feature_norm/p75": 188.0, "per_token_feature_norm/var": 2096.82373046875, "per_token_full_gradient_variance/max_squared_error": 39140.1640625, "per_token_full_gradient_variance/variance": 1.1692590713500977, "per_token_gradient_norm": 10.235332489013672, "per_token_gradient_norm/max": 416.5625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1451.700439453125, "per_token_policy_error_norm": 0.07058073580265045, "per_token_policy_error_norm/max": 1.9921875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.059430383145809174, "policy_entropy": 0.14817525446414948, "policy_entropy/max": 3.765625, "policy_entropy/median": 0.00010251998901367188, "policy_entropy/min": 2.862293735361732e-17, "policy_entropy/p25": 4.231929779052734e-06, "policy_entropy/p75": 0.050048828125, "policy_entropy/var": 0.11673962324857712, "policy_error_vector_variance/max_squared_error": 2.004941463470459, "policy_error_vector_variance/metric": 0.07031179964542389, "policy_loss": -0.7083333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.20877191424369812, "policy_sharpness": 7.483151912689209, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 3.885589361190796, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 13.034492492675781, "reward": 0.7083333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20877191424369812, "rewards/accuracy_reward": 0.7083333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20877191424369812, "sentence_full_gradient_variance/max_squared_error": 3910.665283203125, "sentence_full_gradient_variance/metric": 1365.7376708984375, "sentence_full_gradient_variance/p75": 1741.6129150390625, "sentence_full_gradient_variance/p90": 2258.47802734375, "sentence_full_gradient_variance/p95": 2648.494140625, "sentence_full_gradient_variance/p99": 3488.101318359375, "state_level_variance/metric": 110.09226989746094, "state_level_variance_full_gradient/metric": 1365.7376708984375, "step": 5 }, { "accuracy_reward": 0.6666666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.224561408162117, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 5.507300375029445e-05, "adam_stats/lr_effective_mean": -2.5885948939929904e-10, "adam_stats/lr_effective_min": -5.506190427695401e-05, "adam_stats/m_t_max": 0.00865072663873434, "adam_stats/m_t_mean": -7.566183096718859e-11, "adam_stats/m_t_min": -0.007240136153995991, "adam_stats/v_t_max": 1.987147334148176e-05, "adam_stats/v_t_mean": 1.6415316371823363e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.6666666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.224561408162117, "all_logprobs": -0.1240803450345993, "all_logprobs/max": 0.0, "all_logprobs/median": -3.2186508178710938e-06, "all_logprobs/min": -8.875, "all_logprobs/p1": -2.265625, "all_logprobs/p10": -0.291015625, "all_logprobs/p25": -0.0049591064453125, "all_logprobs/p5": -0.80859375, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.19589243829250336, "clip_ratio": 0.0, "completion_length": 618.15625, "completion_length/correct": 535.3125, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 469.0, "completion_length/correct/min": 153.0, "completion_length/correct/p25": 384.0, "completion_length/correct/p75": 655.0, "completion_length/correct/var": 54423.6484375, "completion_length/incorrect": 783.84375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 936.0, "completion_length/incorrect/min": 107.0, "completion_length/incorrect/p25": 484.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 85986.78125, "completion_length/max": 1024.0, "completion_length/median": 509.0, "completion_length/min": 107.0, "completion_length/p25": 399.25, "completion_length/p75": 921.0, "completion_length/var": 78020.9765625, "epoch": 0.0759493670886076, "feature_vector_variance/max_squared_error": 94249.7890625, "feature_vector_variance/metric": 25251.2265625, "generated_tokens/total": 360389.0, "grad_norm": 0.3747636675834656, "learning_rate": 9e-06, "loss": -0.6667, "mean_logprobs": -0.1259765625, "mean_logprobs/var": 0.0052490234375, "num_completions/total": 576, "per_sentence_gradient_norm": 10.188563346862793, "per_sentence_gradient_norm/max": 37.084228515625, "per_sentence_gradient_norm/median": 9.889364242553711, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 16.0366153717041, "per_sentence_gradient_norm/p85": 19.660629272460938, "per_sentence_gradient_norm/p90": 21.439125061035156, "per_sentence_gradient_norm/p95": 25.888935089111328, "per_sentence_gradient_norm/p99": 31.952434539794922, "per_sentence_gradient_norm/var": 80.95215606689453, "per_token_feature_norm": 159.57652282714844, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 152.0, "per_token_feature_norm/min": 62.5, "per_token_feature_norm/p25": 124.5, "per_token_feature_norm/p75": 189.0, "per_token_feature_norm/var": 1965.6478271484375, "per_token_full_gradient_variance/max_squared_error": 2.640145778656006, "per_token_full_gradient_variance/variance": 0.009968742728233337, "per_token_gradient_norm": 8.872312545776367, "per_token_gradient_norm/max": 409.671875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1233.5631103515625, "per_token_policy_error_norm": 0.0663188025355339, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.055989544838666916, "policy_entropy": 0.13830378651618958, "policy_entropy/max": 3.75, "policy_entropy/median": 4.6253204345703125e-05, "policy_entropy/min": 1.9761969838327786e-14, "policy_entropy/p25": 1.9371509552001953e-06, "policy_entropy/p75": 0.032958984375, "policy_entropy/var": 0.10809075832366943, "policy_error_vector_variance/max_squared_error": 2.002169132232666, "policy_error_vector_variance/metric": 0.06618396937847137, "policy_loss": -0.6666666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.224561408162117, "policy_sharpness": 7.657914638519287, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 4.249999523162842, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 12.472756385803223, "reward": 0.6666666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.224561408162117, "rewards/accuracy_reward": 0.6666666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.224561408162117, "sentence_full_gradient_variance/max_squared_error": 4190.31103515625, "sentence_full_gradient_variance/metric": 1251.5927734375, "sentence_full_gradient_variance/p75": 1496.2607421875, "sentence_full_gradient_variance/p90": 1820.2762451171875, "sentence_full_gradient_variance/p95": 2673.48681640625, "sentence_full_gradient_variance/p99": 4002.691162109375, "state_level_variance/metric": 80.95215606689453, "state_level_variance_full_gradient/metric": 1251.5927734375, "step": 6 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1666666716337204, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 6.631619180552661e-05, "adam_stats/lr_effective_mean": -1.2351995815063077e-10, "adam_stats/lr_effective_min": -6.641467189183459e-05, "adam_stats/m_t_max": 0.011105966754257679, "adam_stats/m_t_mean": -1.0082377294162725e-10, "adam_stats/m_t_min": -0.008151864632964134, "adam_stats/v_t_max": 2.204884003731422e-05, "adam_stats/v_t_mean": 1.7215558405919706e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.7916666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.1666666716337204, "all_logprobs": -0.10298579931259155, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-06, "all_logprobs/min": -8.25, "all_logprobs/p1": -2.0625, "all_logprobs/p10": -0.201171875, "all_logprobs/p25": -0.00171661376953125, "all_logprobs/p5": -0.6484375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.15692144632339478, "clip_ratio": 0.0, "completion_length": 606.3541870117188, "completion_length/correct": 538.6447143554688, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 460.0, "completion_length/correct/min": 150.0, "completion_length/correct/p25": 351.75, "completion_length/correct/p75": 690.25, "completion_length/correct/var": 60664.1015625, "completion_length/incorrect": 863.6500244140625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 225.0, "completion_length/incorrect/p25": 869.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 81792.453125, "completion_length/max": 1024.0, "completion_length/median": 486.0, "completion_length/min": 150.0, "completion_length/p25": 387.25, "completion_length/p75": 940.0, "completion_length/var": 81855.9453125, "epoch": 0.08860759493670886, "feature_vector_variance/max_squared_error": 102746.953125, "feature_vector_variance/metric": 25200.646484375, "generated_tokens/total": 418599.0, "grad_norm": 0.48198527097702026, "learning_rate": 1.05e-05, "loss": -0.7917, "mean_logprobs": -0.10546875, "mean_logprobs/var": 0.0026702880859375, "num_completions/total": 672, "per_sentence_gradient_norm": 10.262126922607422, "per_sentence_gradient_norm/max": 54.19874572753906, "per_sentence_gradient_norm/median": 10.69676685333252, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 7.156065940856934, "per_sentence_gradient_norm/p75": 14.040862083435059, "per_sentence_gradient_norm/p85": 15.842475891113281, "per_sentence_gradient_norm/p90": 18.038803100585938, "per_sentence_gradient_norm/p95": 20.086505889892578, "per_sentence_gradient_norm/p99": 25.289342880249023, "per_sentence_gradient_norm/var": 59.134273529052734, "per_token_feature_norm": 159.41700744628906, "per_token_feature_norm/max": 318.0, "per_token_feature_norm/median": 154.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 127.0, "per_token_feature_norm/p75": 188.0, "per_token_feature_norm/var": 1666.07275390625, "per_token_full_gradient_variance/max_squared_error": 1.784854531288147, "per_token_full_gradient_variance/variance": 0.00987046118825674, "per_token_gradient_norm": 8.841997146606445, "per_token_gradient_norm/max": 394.875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1156.353515625, "per_token_policy_error_norm": 0.05599019676446915, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04781246557831764, "policy_entropy": 0.1149752214550972, "policy_entropy/max": 3.734375, "policy_entropy/median": 1.823902130126953e-05, "policy_entropy/min": 1.3100631690576847e-14, "policy_entropy/p25": 7.450580596923828e-07, "policy_entropy/p75": 0.013427734375, "policy_entropy/var": 0.08543799817562103, "policy_error_vector_variance/max_squared_error": 1.9980255365371704, "policy_error_vector_variance/metric": 0.055928487330675125, "policy_loss": -0.7916666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.1666666716337204, "policy_sharpness": 7.929690837860107, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 5.62109375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 11.362161636352539, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1666666716337204, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1666666716337204, "sentence_full_gradient_variance/max_squared_error": 3195.522216796875, "sentence_full_gradient_variance/metric": 1148.9847412109375, "sentence_full_gradient_variance/p75": 1966.886474609375, "sentence_full_gradient_variance/p90": 1966.886474609375, "sentence_full_gradient_variance/p95": 2060.5361328125, "sentence_full_gradient_variance/p99": 2508.120361328125, "state_level_variance/metric": 59.134273529052734, "state_level_variance_full_gradient/metric": 1148.9847412109375, "step": 7 }, { "accuracy_reward": 0.6875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21710528433322906, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 7.778461440466344e-05, "adam_stats/lr_effective_mean": -2.7503557764596565e-10, "adam_stats/lr_effective_min": -7.80166665208526e-05, "adam_stats/m_t_max": 0.008176522329449654, "adam_stats/m_t_mean": -9.584560228814709e-11, "adam_stats/m_t_min": -0.0054934159852564335, "adam_stats/v_t_max": 2.2122158043202944e-05, "adam_stats/v_t_mean": 1.7574288378729985e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.6875, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.21710528433322906, "all_logprobs": -0.09549915045499802, "all_logprobs/max": 0.0, "all_logprobs/median": -5.960464477539062e-07, "all_logprobs/min": -8.9375, "all_logprobs/p1": -1.9453125, "all_logprobs/p10": -0.1708984375, "all_logprobs/p25": -0.00124359130859375, "all_logprobs/p5": -0.578125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.1461678296327591, "clip_ratio": 0.0, "completion_length": 614.125, "completion_length/correct": 494.6969909667969, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 443.0, "completion_length/correct/min": 224.0, "completion_length/correct/p25": 358.25, "completion_length/correct/p75": 569.0, "completion_length/correct/var": 42479.69140625, "completion_length/incorrect": 876.86669921875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 127.0, "completion_length/incorrect/p25": 795.25, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 56036.0546875, "completion_length/max": 1024.0, "completion_length/median": 526.0, "completion_length/min": 127.0, "completion_length/p25": 384.25, "completion_length/p75": 888.75, "completion_length/var": 77879.8125, "epoch": 0.10126582278481013, "feature_vector_variance/max_squared_error": 109595.3046875, "feature_vector_variance/metric": 25040.3125, "generated_tokens/total": 477555.0, "grad_norm": 0.3520490527153015, "learning_rate": 1.2e-05, "loss": -0.6875, "mean_logprobs": -0.09619140625, "mean_logprobs/var": 0.002532958984375, "num_completions/total": 768, "per_sentence_gradient_norm": 6.989754676818848, "per_sentence_gradient_norm/max": 19.683652877807617, "per_sentence_gradient_norm/median": 7.580759048461914, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 10.752836227416992, "per_sentence_gradient_norm/p85": 12.004343032836914, "per_sentence_gradient_norm/p90": 14.315092086791992, "per_sentence_gradient_norm/p95": 15.25223159790039, "per_sentence_gradient_norm/p99": 19.386459350585938, "per_sentence_gradient_norm/var": 30.172212600708008, "per_token_feature_norm": 160.10499572753906, "per_token_feature_norm/max": 316.0, "per_token_feature_norm/median": 156.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 129.0, "per_token_feature_norm/p75": 188.0, "per_token_feature_norm/var": 1544.4713134765625, "per_token_full_gradient_variance/max_squared_error": 2.2018394470214844, "per_token_full_gradient_variance/variance": 0.006494813598692417, "per_token_gradient_norm": 5.628243446350098, "per_token_gradient_norm/max": 379.6875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 704.30126953125, "per_token_policy_error_norm": 0.052765581756830215, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.046524208039045334, "policy_entropy": 0.10364647954702377, "policy_entropy/max": 3.734375, "policy_entropy/median": 1.0251998901367188e-05, "policy_entropy/min": 2.5847379792054426e-16, "policy_entropy/p25": 3.8929283618927e-07, "policy_entropy/p75": 0.0101318359375, "policy_entropy/var": 0.0697413831949234, "policy_error_vector_variance/max_squared_error": 1.9963799715042114, "policy_error_vector_variance/metric": 0.05274662747979164, "policy_loss": -0.6875, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.21710528433322906, "policy_sharpness": 8.015316009521484, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 6.24609375, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 10.855183601379395, "reward": 0.6875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21710528433322906, "rewards/accuracy_reward": 0.6875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21710528433322906, "sentence_full_gradient_variance/max_squared_error": 2484.00390625, "sentence_full_gradient_variance/metric": 1161.819580078125, "sentence_full_gradient_variance/p75": 1729.556396484375, "sentence_full_gradient_variance/p90": 1819.0252685546875, "sentence_full_gradient_variance/p95": 2130.4990234375, "sentence_full_gradient_variance/p99": 2440.889892578125, "state_level_variance/metric": 30.172212600708008, "state_level_variance_full_gradient/metric": 1161.819580078125, "step": 8 }, { "accuracy_reward": 0.8333333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14035087823867798, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 8.885658462531865e-05, "adam_stats/lr_effective_mean": -2.4905633111416137e-10, "adam_stats/lr_effective_min": -8.822716335998848e-05, "adam_stats/m_t_max": 0.0063390168361365795, "adam_stats/m_t_mean": -6.74979724890612e-11, "adam_stats/m_t_min": -0.004912409000098705, "adam_stats/v_t_max": 2.3046306523610838e-05, "adam_stats/v_t_mean": 1.8111460681896618e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.8333333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.14035087823867798, "all_logprobs": -0.068961963057518, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -10.875, "all_logprobs/p1": -1.578125, "all_logprobs/p10": -0.0888671875, "all_logprobs/p25": -0.00017547607421875, "all_logprobs/p5": -0.38671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.0964595377445221, "clip_ratio": 0.0, "completion_length": 576.3125, "completion_length/correct": 520.6375122070312, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 469.0, "completion_length/correct/min": 183.0, "completion_length/correct/p25": 322.75, "completion_length/correct/p75": 677.75, "completion_length/correct/var": 51996.96484375, "completion_length/incorrect": 854.6875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 936.0, "completion_length/incorrect/min": 372.0, "completion_length/incorrect/p25": 711.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 40310.62890625, "completion_length/max": 1024.0, "completion_length/median": 520.0, "completion_length/min": 183.0, "completion_length/p25": 362.0, "completion_length/p75": 770.5, "completion_length/var": 65266.08984375, "epoch": 0.11392405063291139, "feature_vector_variance/max_squared_error": 113688.7421875, "feature_vector_variance/metric": 26378.841796875, "generated_tokens/total": 532881.0, "grad_norm": 0.4214758276939392, "learning_rate": 1.3500000000000001e-05, "loss": -0.8333, "mean_logprobs": -0.068359375, "mean_logprobs/var": 0.00066375732421875, "num_completions/total": 864, "per_sentence_gradient_norm": 7.14459228515625, "per_sentence_gradient_norm/max": 21.407365798950195, "per_sentence_gradient_norm/median": 6.857326030731201, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 5.347108364105225, "per_sentence_gradient_norm/p75": 10.00040054321289, "per_sentence_gradient_norm/p85": 11.56184196472168, "per_sentence_gradient_norm/p90": 12.131593704223633, "per_sentence_gradient_norm/p95": 14.148147583007812, "per_sentence_gradient_norm/p99": 17.384185791015625, "per_sentence_gradient_norm/var": 19.301490783691406, "per_token_feature_norm": 167.8490753173828, "per_token_feature_norm/max": 310.0, "per_token_feature_norm/median": 167.0, "per_token_feature_norm/min": 69.0, "per_token_feature_norm/p25": 139.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 1421.0152587890625, "per_token_full_gradient_variance/max_squared_error": 1.7483248710632324, "per_token_full_gradient_variance/variance": 0.007807530928403139, "per_token_gradient_norm": 6.4791717529296875, "per_token_gradient_norm/max": 374.125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 797.3472290039062, "per_token_policy_error_norm": 0.03898797929286957, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.03397907316684723, "policy_entropy": 0.07820017635822296, "policy_entropy/max": 3.03125, "policy_entropy/median": 2.339482307434082e-06, "policy_entropy/min": 4.884981308350689e-15, "policy_entropy/p25": 7.497146725654602e-08, "policy_entropy/p75": 0.00173187255859375, "policy_entropy/var": 0.0484619103372097, "policy_error_vector_variance/max_squared_error": 2.007173776626587, "policy_error_vector_variance/metric": 0.03896579146385193, "policy_loss": -0.8333333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.14035087823867798, "policy_sharpness": 8.366372108459473, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 8.75, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.260960578918457, "reward": 0.8333333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14035087823867798, "rewards/accuracy_reward": 0.8333333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14035087823867798, "sentence_full_gradient_variance/max_squared_error": 2378.39453125, "sentence_full_gradient_variance/metric": 1043.760986328125, "sentence_full_gradient_variance/p75": 1703.875732421875, "sentence_full_gradient_variance/p90": 2355.142578125, "sentence_full_gradient_variance/p95": 2355.142578125, "sentence_full_gradient_variance/p99": 2356.396240234375, "state_level_variance/metric": 19.301490783691406, "state_level_variance_full_gradient/metric": 1043.760986328125, "step": 9 }, { "accuracy_reward": 0.7708333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17850877344608307, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 0.00010032861609943211, "adam_stats/lr_effective_mean": -2.0459799132677148e-10, "adam_stats/lr_effective_min": -9.943997429218143e-05, "adam_stats/m_t_max": 0.004596615210175514, "adam_stats/m_t_mean": -7.174798949405314e-11, "adam_stats/m_t_min": -0.004458552226424217, "adam_stats/v_t_max": 2.3248696379596367e-05, "adam_stats/v_t_mean": 1.845021314547668e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.7708333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.17850877344608307, "all_logprobs": -0.054297901690006256, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.28125, "all_logprobs/p1": -1.3701562881469727, "all_logprobs/p10": -0.0380859375, "all_logprobs/p25": -2.6881694793701172e-05, "all_logprobs/p5": -0.251953125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.07692544162273407, "clip_ratio": 0.0, "completion_length": 580.8646240234375, "completion_length/correct": 498.89190673828125, "completion_length/correct/max": 990.0, "completion_length/correct/median": 494.0, "completion_length/correct/min": 154.0, "completion_length/correct/p25": 325.25, "completion_length/correct/p75": 637.0, "completion_length/correct/var": 35951.38671875, "completion_length/incorrect": 856.5909423828125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 984.0, "completion_length/incorrect/min": 347.0, "completion_length/incorrect/p25": 752.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 43684.82421875, "completion_length/max": 1024.0, "completion_length/median": 550.0, "completion_length/min": 154.0, "completion_length/p25": 403.5, "completion_length/p75": 729.75, "completion_length/var": 60122.390625, "epoch": 0.12658227848101267, "feature_vector_variance/max_squared_error": 123618.8515625, "feature_vector_variance/metric": 28542.515625, "generated_tokens/total": 588644.0, "grad_norm": 0.379601389169693, "learning_rate": 1.5e-05, "loss": -0.7708, "mean_logprobs": -0.053466796875, "mean_logprobs/var": 0.0009307861328125, "num_completions/total": 960, "per_sentence_gradient_norm": 4.954092979431152, "per_sentence_gradient_norm/max": 14.660189628601074, "per_sentence_gradient_norm/median": 4.642063140869141, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 2.5004191398620605, "per_sentence_gradient_norm/p75": 7.2362260818481445, "per_sentence_gradient_norm/p85": 8.82817268371582, "per_sentence_gradient_norm/p90": 10.043956756591797, "per_sentence_gradient_norm/p95": 11.538249015808105, "per_sentence_gradient_norm/p99": 13.969881057739258, "per_sentence_gradient_norm/var": 14.115762710571289, "per_token_feature_norm": 178.7992706298828, "per_token_feature_norm/max": 312.0, "per_token_feature_norm/median": 182.0, "per_token_feature_norm/min": 73.0, "per_token_feature_norm/p25": 152.0, "per_token_feature_norm/p75": 205.0, "per_token_feature_norm/var": 1327.9427490234375, "per_token_full_gradient_variance/max_squared_error": 0.7912642359733582, "per_token_full_gradient_variance/variance": 0.005805783439427614, "per_token_gradient_norm": 4.171056270599365, "per_token_gradient_norm/max": 352.265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 526.5712280273438, "per_token_policy_error_norm": 0.030794179067015648, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.027508771046996117, "policy_entropy": 0.06100211292505264, "policy_entropy/max": 3.390625, "policy_entropy/median": 7.711350917816162e-07, "policy_entropy/min": 2.896988204881268e-16, "policy_entropy/p25": 1.5366822481155396e-08, "policy_entropy/p75": 0.000316619873046875, "policy_entropy/var": 0.03784661367535591, "policy_error_vector_variance/max_squared_error": 1.995065689086914, "policy_error_vector_variance/metric": 0.03077748976647854, "policy_loss": -0.7708333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.17850877344608307, "policy_sharpness": 8.641058921813965, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 7.89131498336792, "reward": 0.7708333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17850877344608307, "rewards/accuracy_reward": 0.7708333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17850877344608307, "sentence_full_gradient_variance/max_squared_error": 2850.792724609375, "sentence_full_gradient_variance/metric": 1005.1348876953125, "sentence_full_gradient_variance/p75": 2137.742919921875, "sentence_full_gradient_variance/p90": 2137.742919921875, "sentence_full_gradient_variance/p95": 2137.742919921875, "sentence_full_gradient_variance/p99": 2477.737548828125, "state_level_variance/metric": 14.115762710571289, "state_level_variance_full_gradient/metric": 1005.1348876953125, "step": 10 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1666666865348816, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 9.985362703446299e-05, "adam_stats/lr_effective_mean": -9.438145820439559e-10, "adam_stats/lr_effective_min": -0.00010036140156444162, "adam_stats/m_t_max": 0.003996435087174177, "adam_stats/m_t_mean": -8.880107066344678e-11, "adam_stats/m_t_min": -0.004347505047917366, "adam_stats/v_t_max": 2.323532862646971e-05, "adam_stats/v_t_mean": 1.9861714269792108e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.7916666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.1666666865348816, "all_logprobs": -0.03699127212166786, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.75, "all_logprobs/p1": -1.015625, "all_logprobs/p10": -0.00860595703125, "all_logprobs/p25": -2.2649765014648438e-06, "all_logprobs/p5": -0.11328125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.0500246100127697, "clip_ratio": 0.0, "completion_length": 553.7604370117188, "completion_length/correct": 459.7631530761719, "completion_length/correct/max": 896.0, "completion_length/correct/median": 424.0, "completion_length/correct/min": 177.0, "completion_length/correct/p25": 305.25, "completion_length/correct/p75": 605.5, "completion_length/correct/var": 37013.67578125, "completion_length/incorrect": 910.9500122070312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 449.0, "completion_length/incorrect/p25": 849.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 32934.26171875, "completion_length/max": 1024.0, "completion_length/median": 485.0, "completion_length/min": 177.0, "completion_length/p25": 339.75, "completion_length/p75": 728.5, "completion_length/var": 69736.4375, "epoch": 0.13924050632911392, "feature_vector_variance/max_squared_error": 136761.59375, "feature_vector_variance/metric": 29971.39453125, "generated_tokens/total": 641805.0, "grad_norm": 0.6338839530944824, "learning_rate": 1.4995431202643219e-05, "loss": -0.7917, "mean_logprobs": -0.036865234375, "mean_logprobs/var": 0.0004177093505859375, "num_completions/total": 1056, "per_sentence_gradient_norm": 3.9646100997924805, "per_sentence_gradient_norm/max": 11.192739486694336, "per_sentence_gradient_norm/median": 3.6756033897399902, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.6912193298339844, "per_sentence_gradient_norm/p75": 5.774381160736084, "per_sentence_gradient_norm/p85": 6.83161735534668, "per_sentence_gradient_norm/p90": 7.709174156188965, "per_sentence_gradient_norm/p95": 9.269051551818848, "per_sentence_gradient_norm/p99": 10.184507369995117, "per_sentence_gradient_norm/var": 8.545470237731934, "per_token_feature_norm": 190.42526245117188, "per_token_feature_norm/max": 304.0, "per_token_feature_norm/median": 193.0, "per_token_feature_norm/min": 79.0, "per_token_feature_norm/p25": 174.0, "per_token_feature_norm/p75": 211.0, "per_token_feature_norm/var": 973.43408203125, "per_token_full_gradient_variance/max_squared_error": 0.9432310461997986, "per_token_full_gradient_variance/variance": 0.00504636112600565, "per_token_gradient_norm": 3.2228400707244873, "per_token_gradient_norm/max": 335.359375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 431.10772705078125, "per_token_policy_error_norm": 0.021290315315127373, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.019254615530371666, "policy_entropy": 0.04213464632630348, "policy_entropy/max": 2.703125, "policy_entropy/median": 1.471489667892456e-07, "policy_entropy/min": 7.26415455565288e-18, "policy_entropy/p25": 2.5756889954209328e-09, "policy_entropy/p75": 3.1948089599609375e-05, "policy_entropy/var": 0.024501515552401543, "policy_error_vector_variance/max_squared_error": 1.988728642463684, "policy_error_vector_variance/metric": 0.021277541294693947, "policy_loss": -0.7916666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.1666666865348816, "policy_sharpness": 8.97387409210205, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.132969379425049, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1666666865348816, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1666666865348816, "sentence_full_gradient_variance/max_squared_error": 2690.386474609375, "sentence_full_gradient_variance/metric": 1097.894287109375, "sentence_full_gradient_variance/p75": 1864.513671875, "sentence_full_gradient_variance/p90": 2690.386474609375, "sentence_full_gradient_variance/p95": 2690.386474609375, "sentence_full_gradient_variance/p99": 2690.386474609375, "state_level_variance/metric": 8.545470237731934, "state_level_variance_full_gradient/metric": 1097.894287109375, "step": 11 }, { "accuracy_reward": 0.71875, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.20427630841732025, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 9.937180584529415e-05, "adam_stats/lr_effective_mean": -1.046243314206663e-09, "adam_stats/lr_effective_min": -0.00010081756045110524, "adam_stats/m_t_max": 0.00631715077906847, "adam_stats/m_t_mean": -5.730622573696387e-11, "adam_stats/m_t_min": -0.00478196470066905, "adam_stats/v_t_max": 2.4126527932821773e-05, "adam_stats/v_t_mean": 2.0467971935389517e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.71875, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.20427630841732025, "all_logprobs": -0.030509136617183685, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.375, "all_logprobs/p1": -0.8815624713897705, "all_logprobs/p10": -0.002471923828125, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.043740153312683105, "clip_ratio": 0.0, "completion_length": 581.59375, "completion_length/correct": 493.7391357421875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 464.0, "completion_length/correct/min": 193.0, "completion_length/correct/p25": 352.0, "completion_length/correct/p75": 609.0, "completion_length/correct/var": 36959.9609375, "completion_length/incorrect": 806.1111450195312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 185.0, "completion_length/incorrect/p25": 598.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 79414.1796875, "completion_length/max": 1024.0, "completion_length/median": 520.0, "completion_length/min": 185.0, "completion_length/p25": 368.75, "completion_length/p75": 729.75, "completion_length/var": 68122.4765625, "epoch": 0.1518987341772152, "feature_vector_variance/max_squared_error": 150523.96875, "feature_vector_variance/metric": 30551.5078125, "generated_tokens/total": 697638.0, "grad_norm": 0.5143095850944519, "learning_rate": 1.4981730376948682e-05, "loss": -0.7188, "mean_logprobs": -0.0296630859375, "mean_logprobs/var": 0.0003833770751953125, "num_completions/total": 1152, "per_sentence_gradient_norm": 2.9571502208709717, "per_sentence_gradient_norm/max": 10.839898109436035, "per_sentence_gradient_norm/median": 3.030142307281494, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 4.723983287811279, "per_sentence_gradient_norm/p85": 5.1289143562316895, "per_sentence_gradient_norm/p90": 5.505683422088623, "per_sentence_gradient_norm/p95": 6.859077453613281, "per_sentence_gradient_norm/p99": 8.40793228149414, "per_sentence_gradient_norm/var": 5.957475185394287, "per_token_feature_norm": 197.85655212402344, "per_token_feature_norm/max": 314.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 72.5, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 214.0, "per_token_feature_norm/var": 675.1654052734375, "per_token_full_gradient_variance/max_squared_error": 0.9457817077636719, "per_token_full_gradient_variance/variance": 0.004125404637306929, "per_token_gradient_norm": 2.49526047706604, "per_token_gradient_norm/max": 368.5, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 335.260986328125, "per_token_policy_error_norm": 0.017413116991519928, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.015832040458917618, "policy_entropy": 0.03454947471618652, "policy_entropy/max": 3.6875, "policy_entropy/median": 3.958120942115784e-08, "policy_entropy/min": 8.847089727481716e-17, "policy_entropy/p25": 7.203198038041592e-10, "policy_entropy/p75": 9.059906005859375e-06, "policy_entropy/var": 0.022028319537639618, "policy_error_vector_variance/max_squared_error": 2.004551410675049, "policy_error_vector_variance/metric": 0.017389824613928795, "policy_loss": -0.71875, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.20427630841732025, "policy_sharpness": 9.158726692199707, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.150332927703857, "reward": 0.71875, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.20427630841732025, "rewards/accuracy_reward": 0.71875, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.20427630841732025, "sentence_full_gradient_variance/max_squared_error": 2650.006591796875, "sentence_full_gradient_variance/metric": 1087.024169921875, "sentence_full_gradient_variance/p75": 1901.9296875, "sentence_full_gradient_variance/p90": 1901.9296875, "sentence_full_gradient_variance/p95": 1987.8194580078125, "sentence_full_gradient_variance/p99": 2506.596923828125, "state_level_variance/metric": 5.957475185394287, "state_level_variance_full_gradient/metric": 1087.024169921875, "step": 12 }, { "accuracy_reward": 0.8229166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.14725878834724426, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 0.00010049149568658322, "adam_stats/lr_effective_mean": -1.3094303419336484e-09, "adam_stats/lr_effective_min": -0.00010178948286920786, "adam_stats/m_t_max": 0.011354900896549225, "adam_stats/m_t_mean": -1.3503702323003353e-10, "adam_stats/m_t_min": -0.010187558829784393, "adam_stats/v_t_max": 2.747860344243236e-05, "adam_stats/v_t_mean": 2.1817168297660894e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.8229166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.14725878834724426, "all_logprobs": -0.02385159581899643, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -4.5, "all_logprobs/p1": -0.7578125, "all_logprobs/p10": -0.00061798095703125, "all_logprobs/p25": -1.1920928955078125e-07, "all_logprobs/p5": -0.02978515625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.030771352350711823, "clip_ratio": 0.0, "completion_length": 596.0416870117188, "completion_length/correct": 524.8101196289062, "completion_length/correct/max": 992.0, "completion_length/correct/median": 500.0, "completion_length/correct/min": 174.0, "completion_length/correct/p25": 363.0, "completion_length/correct/p75": 655.5, "completion_length/correct/var": 41653.796875, "completion_length/incorrect": 927.058837890625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 632.0, "completion_length/incorrect/p25": 787.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 20630.18359375, "completion_length/max": 1024.0, "completion_length/median": 551.0, "completion_length/min": 174.0, "completion_length/p25": 422.75, "completion_length/p75": 790.75, "completion_length/var": 61501.57421875, "epoch": 0.16455696202531644, "feature_vector_variance/max_squared_error": 131270.453125, "feature_vector_variance/metric": 29918.556640625, "generated_tokens/total": 754858.0, "grad_norm": 0.7412737607955933, "learning_rate": 1.495891421526205e-05, "loss": -0.8229, "mean_logprobs": -0.0242919921875, "mean_logprobs/var": 0.00017070770263671875, "num_completions/total": 1248, "per_sentence_gradient_norm": 3.1076817512512207, "per_sentence_gradient_norm/max": 9.30029582977295, "per_sentence_gradient_norm/median": 2.863419532775879, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.9522181749343872, "per_sentence_gradient_norm/p75": 4.374250411987305, "per_sentence_gradient_norm/p85": 5.276529788970947, "per_sentence_gradient_norm/p90": 6.12038516998291, "per_sentence_gradient_norm/p95": 7.334644317626953, "per_sentence_gradient_norm/p99": 8.668599128723145, "per_sentence_gradient_norm/var": 4.868418216705322, "per_token_feature_norm": 199.23928833007812, "per_token_feature_norm/max": 294.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 84.5, "per_token_feature_norm/p25": 186.0, "per_token_feature_norm/p75": 213.0, "per_token_feature_norm/var": 462.26153564453125, "per_token_full_gradient_variance/max_squared_error": 0.7903134822845459, "per_token_full_gradient_variance/variance": 0.0044243731535971165, "per_token_gradient_norm": 2.70082950592041, "per_token_gradient_norm/max": 319.375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 380.4443054199219, "per_token_policy_error_norm": 0.014269178733229637, "per_token_policy_error_norm/max": 1.953125, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.013188373297452927, "policy_entropy": 0.026353055611252785, "policy_entropy/max": 2.265625, "policy_entropy/median": 1.641456037759781e-08, "policy_entropy/min": 6.722053469410127e-17, "policy_entropy/p25": 2.582964953035116e-10, "policy_entropy/p75": 3.129243850708008e-06, "policy_entropy/var": 0.014476193115115166, "policy_error_vector_variance/max_squared_error": 1.9549970626831055, "policy_error_vector_variance/metric": 0.014263397082686424, "policy_loss": -0.8229166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.14725878834724426, "policy_sharpness": 9.306662559509277, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.245369911193848, "reward": 0.8229166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.14725878834724426, "rewards/accuracy_reward": 0.8229166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.14725878834724426, "sentence_full_gradient_variance/max_squared_error": 2588.884033203125, "sentence_full_gradient_variance/metric": 941.54833984375, "sentence_full_gradient_variance/p75": 1831.07421875, "sentence_full_gradient_variance/p90": 2264.708984375, "sentence_full_gradient_variance/p95": 2264.708984375, "sentence_full_gradient_variance/p99": 2500.072998046875, "state_level_variance/metric": 4.868418216705322, "state_level_variance_full_gradient/metric": 941.54833984375, "step": 13 }, { "accuracy_reward": 0.7395833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19462719559669495, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 0.00010106991248903796, "adam_stats/lr_effective_mean": -6.665259455473915e-10, "adam_stats/lr_effective_min": -0.00010027398093370721, "adam_stats/m_t_max": 0.014906912110745907, "adam_stats/m_t_mean": -1.4697743022651366e-10, "adam_stats/m_t_min": -0.011732280254364014, "adam_stats/v_t_max": 2.964836312457919e-05, "adam_stats/v_t_mean": 2.3967273478364692e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.7395833730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.19462719559669495, "all_logprobs": -0.023980258032679558, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.75, "all_logprobs/p1": -0.69140625, "all_logprobs/p10": -0.00048828125, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.02978515625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03765695542097092, "clip_ratio": 0.0, "completion_length": 561.1875, "completion_length/correct": 456.9013977050781, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 388.0, "completion_length/correct/min": 130.0, "completion_length/correct/p25": 282.0, "completion_length/correct/p75": 579.0, "completion_length/correct/var": 51076.375, "completion_length/incorrect": 857.3599853515625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 203.0, "completion_length/incorrect/p25": 703.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 70838.4921875, "completion_length/max": 1024.0, "completion_length/median": 478.0, "completion_length/min": 130.0, "completion_length/p25": 312.5, "completion_length/p75": 801.75, "completion_length/var": 86743.0625, "epoch": 0.17721518987341772, "feature_vector_variance/max_squared_error": 137319.375, "feature_vector_variance/metric": 29899.123046875, "generated_tokens/total": 808732.0, "grad_norm": 0.8271145820617676, "learning_rate": 1.4927010515561777e-05, "loss": -0.7396, "mean_logprobs": -0.0234375, "mean_logprobs/var": 0.000232696533203125, "num_completions/total": 1344, "per_sentence_gradient_norm": 2.4733572006225586, "per_sentence_gradient_norm/max": 8.672226905822754, "per_sentence_gradient_norm/median": 2.321056604385376, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.8999862670898438, "per_sentence_gradient_norm/p85": 4.7038116455078125, "per_sentence_gradient_norm/p90": 5.333268165588379, "per_sentence_gradient_norm/p95": 6.710261344909668, "per_sentence_gradient_norm/p99": 7.4721574783325195, "per_sentence_gradient_norm/var": 4.532730579376221, "per_token_feature_norm": 196.6710968017578, "per_token_feature_norm/max": 300.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 75.0, "per_token_feature_norm/p25": 185.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 416.6190185546875, "per_token_full_gradient_variance/max_squared_error": 1.3459985256195068, "per_token_full_gradient_variance/variance": 0.003248048946261406, "per_token_gradient_norm": 1.9503755569458008, "per_token_gradient_norm/max": 337.40625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 261.5531311035156, "per_token_policy_error_norm": 0.013583235442638397, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.012413560412824154, "policy_entropy": 0.026552679017186165, "policy_entropy/max": 3.40625, "policy_entropy/median": 1.7578713595867157e-08, "policy_entropy/min": 1.0137290312739466e-17, "policy_entropy/p25": 2.2464519133791327e-10, "policy_entropy/p75": 3.6656856536865234e-06, "policy_entropy/var": 0.015804985538125038, "policy_error_vector_variance/max_squared_error": 2.003892421722412, "policy_error_vector_variance/metric": 0.013566062785685062, "policy_loss": -0.7395833730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.19462719559669495, "policy_sharpness": 9.321471214294434, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.193946838378906, "reward": 0.7395833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19462719559669495, "rewards/accuracy_reward": 0.7395833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19462719559669495, "sentence_full_gradient_variance/max_squared_error": 2928.87890625, "sentence_full_gradient_variance/metric": 1289.77685546875, "sentence_full_gradient_variance/p75": 2287.500732421875, "sentence_full_gradient_variance/p90": 2287.500732421875, "sentence_full_gradient_variance/p95": 2287.519287109375, "sentence_full_gradient_variance/p99": 2688.0712890625, "state_level_variance/metric": 4.532730579376221, "state_level_variance_full_gradient/metric": 1289.77685546875, "step": 14 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1894736886024475, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 0.00010071657743537799, "adam_stats/lr_effective_mean": -6.846527234039002e-10, "adam_stats/lr_effective_min": -0.0001000705742626451, "adam_stats/m_t_max": 0.017504464834928513, "adam_stats/m_t_mean": -1.7611415104035189e-10, "adam_stats/m_t_min": -0.013177738524973392, "adam_stats/v_t_max": 3.1290066544897854e-05, "adam_stats/v_t_mean": 2.7834342903459985e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.75, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.75, "advantages/p75": 1.0, "advantages/var": 0.1894736886024475, "all_logprobs": -0.02283434383571148, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.25, "all_logprobs/p1": -0.69140625, "all_logprobs/p10": -0.0004482269287109375, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.023193359375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03344111889600754, "clip_ratio": 0.0, "completion_length": 561.9583740234375, "completion_length/correct": 501.0, "completion_length/correct/max": 981.0, "completion_length/correct/median": 488.0, "completion_length/correct/min": 184.0, "completion_length/correct/p25": 309.75, "completion_length/correct/p75": 650.75, "completion_length/correct/var": 41294.78515625, "completion_length/incorrect": 744.8333740234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 695.0, "completion_length/incorrect/min": 246.0, "completion_length/incorrect/p25": 512.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 71589.453125, "completion_length/max": 1024.0, "completion_length/median": 513.0, "completion_length/min": 184.0, "completion_length/p25": 327.0, "completion_length/p75": 721.5, "completion_length/var": 59459.70703125, "epoch": 0.189873417721519, "feature_vector_variance/max_squared_error": 130878.1875, "feature_vector_variance/metric": 29899.705078125, "generated_tokens/total": 862680.0, "grad_norm": 1.1047769784927368, "learning_rate": 1.488605814759156e-05, "loss": -0.75, "mean_logprobs": -0.0234375, "mean_logprobs/var": 0.0004749298095703125, "num_completions/total": 1440, "per_sentence_gradient_norm": 2.2919511795043945, "per_sentence_gradient_norm/max": 7.511486053466797, "per_sentence_gradient_norm/median": 2.2772881984710693, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.48326992988586426, "per_sentence_gradient_norm/p75": 3.349100112915039, "per_sentence_gradient_norm/p85": 4.391894340515137, "per_sentence_gradient_norm/p90": 4.750701427459717, "per_sentence_gradient_norm/p95": 5.386600971221924, "per_sentence_gradient_norm/p99": 6.331819534301758, "per_sentence_gradient_norm/var": 3.26106333732605, "per_token_feature_norm": 193.63790893554688, "per_token_feature_norm/max": 298.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 85.0, "per_token_feature_norm/p25": 183.0, "per_token_feature_norm/p75": 205.0, "per_token_feature_norm/var": 407.25567626953125, "per_token_full_gradient_variance/max_squared_error": 0.48824775218963623, "per_token_full_gradient_variance/variance": 0.0031927882228046656, "per_token_gradient_norm": 1.977919340133667, "per_token_gradient_norm/max": 286.53125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 265.8451843261719, "per_token_policy_error_norm": 0.013049421831965446, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.012044509872794151, "policy_entropy": 0.025424981489777565, "policy_entropy/max": 2.796875, "policy_entropy/median": 2.421438694000244e-08, "policy_entropy/min": 2.3445871979999033e-18, "policy_entropy/p25": 2.673914423212409e-10, "policy_entropy/p75": 4.708766937255859e-06, "policy_entropy/var": 0.015595939941704273, "policy_error_vector_variance/max_squared_error": 1.989957332611084, "policy_error_vector_variance/metric": 0.01304170023649931, "policy_loss": -0.75, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -0.75, "policy_loss/var": 0.1894736886024475, "policy_sharpness": 9.33561897277832, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.084044456481934, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.1894736886024475, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1894736886024475, "sentence_full_gradient_variance/max_squared_error": 2787.625732421875, "sentence_full_gradient_variance/metric": 1006.2298583984375, "sentence_full_gradient_variance/p75": 1837.40087890625, "sentence_full_gradient_variance/p90": 1837.40087890625, "sentence_full_gradient_variance/p95": 2207.2119140625, "sentence_full_gradient_variance/p99": 2730.8427734375, "state_level_variance/metric": 3.26106333732605, "state_level_variance_full_gradient/metric": 1006.2298583984375, "step": 15 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1666666567325592, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 0.00010158387158298865, "adam_stats/lr_effective_mean": -1.2239224078669508e-09, "adam_stats/lr_effective_min": -9.902405145112425e-05, "adam_stats/m_t_max": 0.016840098425745964, "adam_stats/m_t_mean": -2.3588944686459e-10, "adam_stats/m_t_min": -0.013866179622709751, "adam_stats/v_t_max": 3.137673047604039e-05, "adam_stats/v_t_mean": 3.126799817329551e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.7916666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.1666666567325592, "all_logprobs": -0.021431708708405495, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -8.3125, "all_logprobs/p1": -0.6328125, "all_logprobs/p10": -0.0001927376724779606, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.01416015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.033156462013721466, "clip_ratio": 0.0, "completion_length": 493.85418701171875, "completion_length/correct": 439.1447448730469, "completion_length/correct/max": 904.0, "completion_length/correct/median": 413.0, "completion_length/correct/min": 176.0, "completion_length/correct/p25": 296.75, "completion_length/correct/p75": 535.25, "completion_length/correct/var": 31772.283203125, "completion_length/incorrect": 701.75, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 600.0, "completion_length/incorrect/min": 299.0, "completion_length/incorrect/p25": 469.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 74586.9375, "completion_length/max": 1024.0, "completion_length/median": 438.0, "completion_length/min": 176.0, "completion_length/p25": 315.75, "completion_length/p75": 594.0, "completion_length/var": 51494.36328125, "epoch": 0.20253164556962025, "feature_vector_variance/max_squared_error": 139408.40625, "feature_vector_variance/metric": 29251.005859375, "generated_tokens/total": 910090.0, "grad_norm": 1.0958527326583862, "learning_rate": 1.4836107005503543e-05, "loss": -0.7917, "mean_logprobs": -0.02099609375, "mean_logprobs/var": 0.00019931793212890625, "num_completions/total": 1536, "per_sentence_gradient_norm": 2.1650376319885254, "per_sentence_gradient_norm/max": 7.723714351654053, "per_sentence_gradient_norm/median": 2.2034881114959717, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 1.0549285411834717, "per_sentence_gradient_norm/p75": 3.1252923011779785, "per_sentence_gradient_norm/p85": 3.6020305156707764, "per_sentence_gradient_norm/p90": 3.984048843383789, "per_sentence_gradient_norm/p95": 5.2352423667907715, "per_sentence_gradient_norm/p99": 6.493836402893066, "per_sentence_gradient_norm/var": 2.764345407485962, "per_token_feature_norm": 193.83033752441406, "per_token_feature_norm/max": 308.0, "per_token_feature_norm/median": 192.0, "per_token_feature_norm/min": 85.0, "per_token_feature_norm/p25": 182.0, "per_token_feature_norm/p75": 205.0, "per_token_feature_norm/var": 400.77935791015625, "per_token_full_gradient_variance/max_squared_error": 0.4969846308231354, "per_token_full_gradient_variance/variance": 0.003013244131579995, "per_token_gradient_norm": 1.8497878313064575, "per_token_gradient_norm/max": 292.2734375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 248.31126403808594, "per_token_policy_error_norm": 0.012289280071854591, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011839056387543678, "policy_entropy": 0.022617217153310776, "policy_entropy/max": 3.546875, "policy_entropy/median": 1.7811544239521027e-08, "policy_entropy/min": 3.469446951953614e-18, "policy_entropy/p25": 1.6916601452976465e-10, "policy_entropy/p75": 3.471970558166504e-06, "policy_entropy/var": 0.013956364244222641, "policy_error_vector_variance/max_squared_error": 1.9693306684494019, "policy_error_vector_variance/metric": 0.012279346585273743, "policy_loss": -0.7916666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.1666666567325592, "policy_sharpness": 9.410703659057617, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.6493799686431885, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1666666567325592, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1666666567325592, "sentence_full_gradient_variance/max_squared_error": 2740.2919921875, "sentence_full_gradient_variance/metric": 1038.420166015625, "sentence_full_gradient_variance/p75": 1408.8895263671875, "sentence_full_gradient_variance/p90": 2740.2919921875, "sentence_full_gradient_variance/p95": 2740.2919921875, "sentence_full_gradient_variance/p99": 2740.2919921875, "state_level_variance/metric": 2.764345407485962, "state_level_variance_full_gradient/metric": 1038.420166015625, "step": 16 }, { "accuracy_reward": 0.75, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.75, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1894736886024475, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 0.00010041147470474243, "adam_stats/lr_effective_mean": -9.877081375009311e-10, "adam_stats/lr_effective_min": -9.88922402029857e-05, "adam_stats/m_t_max": 0.014895280823111534, "adam_stats/m_t_mean": -2.288967765329275e-10, "adam_stats/m_t_min": -0.012380601838231087, "adam_stats/v_t_max": 3.1446692446479574e-05, "adam_stats/v_t_mean": 3.642466103043196e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.75, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.75, "advantages/p75": 1.0, "advantages/var": 0.1894736886024475, "all_logprobs": -0.027050374075770378, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.75, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.0003376007080078125, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.02099609375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.05001872032880783, "clip_ratio": 0.0, "completion_length": 518.03125, "completion_length/correct": 462.9583435058594, "completion_length/correct/max": 955.0, "completion_length/correct/median": 431.0, "completion_length/correct/min": 157.0, "completion_length/correct/p25": 319.5, "completion_length/correct/p75": 586.75, "completion_length/correct/var": 38246.4921875, "completion_length/incorrect": 683.25, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 637.0, "completion_length/incorrect/min": 294.0, "completion_length/incorrect/p25": 427.25, "completion_length/incorrect/p75": 931.75, "completion_length/incorrect/var": 71766.6328125, "completion_length/max": 1024.0, "completion_length/median": 451.0, "completion_length/min": 157.0, "completion_length/p25": 340.0, "completion_length/p75": 664.5, "completion_length/var": 55154.15625, "epoch": 0.21518987341772153, "feature_vector_variance/max_squared_error": 126422.9140625, "feature_vector_variance/metric": 28554.560546875, "generated_tokens/total": 959821.0, "grad_norm": 6.839051246643066, "learning_rate": 1.4777217947069972e-05, "loss": -0.75, "mean_logprobs": -0.026611328125, "mean_logprobs/var": 0.0004100799560546875, "num_completions/total": 1632, "per_sentence_gradient_norm": 2.4755759239196777, "per_sentence_gradient_norm/max": 11.271888732910156, "per_sentence_gradient_norm/median": 2.483537197113037, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.5200055241584778, "per_sentence_gradient_norm/p75": 3.5507025718688965, "per_sentence_gradient_norm/p85": 4.273622035980225, "per_sentence_gradient_norm/p90": 4.646618843078613, "per_sentence_gradient_norm/p95": 5.987268924713135, "per_sentence_gradient_norm/p99": 8.560635566711426, "per_sentence_gradient_norm/var": 4.616905212402344, "per_token_feature_norm": 187.7405242919922, "per_token_feature_norm/max": 290.0, "per_token_feature_norm/median": 189.0, "per_token_feature_norm/min": 77.5, "per_token_feature_norm/p25": 178.0, "per_token_feature_norm/p75": 201.0, "per_token_feature_norm/var": 575.7652587890625, "per_token_full_gradient_variance/max_squared_error": 1.6858662366867065, "per_token_full_gradient_variance/variance": 0.003198383143171668, "per_token_gradient_norm": 2.242567539215088, "per_token_gradient_norm/max": 330.890625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 316.33184814453125, "per_token_policy_error_norm": 0.014845965430140495, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0140212532132864, "policy_entropy": 0.028096482157707214, "policy_entropy/max": 3.3125, "policy_entropy/median": 3.818422555923462e-08, "policy_entropy/min": 5.5294310796760726e-18, "policy_entropy/p25": 3.183231456205249e-10, "policy_entropy/p75": 8.106231689453125e-06, "policy_entropy/var": 0.022720692679286003, "policy_error_vector_variance/max_squared_error": 2.0003151893615723, "policy_error_vector_variance/metric": 0.014769269153475761, "policy_loss": -0.75, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -0.75, "policy_loss/var": 0.1894736886024475, "policy_sharpness": 9.345069885253906, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.135436058044434, "reward": 0.75, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.75, "reward/p75": 1.0, "reward/var": 0.1894736886024475, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.75, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1894736886024475, "sentence_full_gradient_variance/max_squared_error": 2810.41259765625, "sentence_full_gradient_variance/metric": 1099.00634765625, "sentence_full_gradient_variance/p75": 2061.18115234375, "sentence_full_gradient_variance/p90": 2061.18115234375, "sentence_full_gradient_variance/p95": 2116.30810546875, "sentence_full_gradient_variance/p99": 2686.0849609375, "state_level_variance/metric": 4.616905212402344, "state_level_variance_full_gradient/metric": 1099.00634765625, "step": 17 }, { "accuracy_reward": 0.7708333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17850877344608307, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 9.931051317835227e-05, "adam_stats/lr_effective_mean": -1.1631122731614596e-09, "adam_stats/lr_effective_min": -9.850463538896292e-05, "adam_stats/m_t_max": 0.017436498776078224, "adam_stats/m_t_mean": -1.706026431236296e-10, "adam_stats/m_t_min": -0.01339802984148264, "adam_stats/v_t_max": 3.331494008307345e-05, "adam_stats/v_t_mean": 4.07984973929687e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.7708333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.17850877344608307, "all_logprobs": -0.023931214585900307, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -9.6875, "all_logprobs/p1": -0.69140625, "all_logprobs/p10": -0.000263214111328125, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.0157470703125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.042326994240283966, "clip_ratio": 0.0, "completion_length": 511.96875, "completion_length/correct": 432.7432556152344, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 418.0, "completion_length/correct/min": 78.0, "completion_length/correct/p25": 310.25, "completion_length/correct/p75": 509.25, "completion_length/correct/var": 29700.548828125, "completion_length/incorrect": 778.45458984375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 861.0, "completion_length/incorrect/min": 180.0, "completion_length/incorrect/p25": 585.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 73334.1640625, "completion_length/max": 1024.0, "completion_length/median": 435.0, "completion_length/min": 78.0, "completion_length/p25": 336.75, "completion_length/p75": 654.25, "completion_length/var": 60367.94140625, "epoch": 0.22784810126582278, "feature_vector_variance/max_squared_error": 115231.984375, "feature_vector_variance/metric": 28231.4609375, "generated_tokens/total": 1008970.0, "grad_norm": 1.6019948720932007, "learning_rate": 1.4709462719537392e-05, "loss": -0.7708, "mean_logprobs": -0.0281982421875, "mean_logprobs/var": 0.001373291015625, "num_completions/total": 1728, "per_sentence_gradient_norm": 2.663158893585205, "per_sentence_gradient_norm/max": 29.229291915893555, "per_sentence_gradient_norm/median": 1.9425523281097412, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.9455133080482483, "per_sentence_gradient_norm/p75": 3.506056547164917, "per_sentence_gradient_norm/p85": 4.1427412033081055, "per_sentence_gradient_norm/p90": 4.8424458503723145, "per_sentence_gradient_norm/p95": 7.714643478393555, "per_sentence_gradient_norm/p99": 13.70495319366455, "per_sentence_gradient_norm/var": 13.009987831115723, "per_token_feature_norm": 185.21604919433594, "per_token_feature_norm/max": 284.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 80.0, "per_token_feature_norm/p25": 176.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 635.6983032226562, "per_token_full_gradient_variance/max_squared_error": 1.6971814632415771, "per_token_full_gradient_variance/variance": 0.00304713798686862, "per_token_gradient_norm": 1.9939017295837402, "per_token_gradient_norm/max": 348.5625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 267.61322021484375, "per_token_policy_error_norm": 0.012979543767869473, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011874645948410034, "policy_entropy": 0.02624647691845894, "policy_entropy/max": 3.390625, "policy_entropy/median": 4.959292709827423e-08, "policy_entropy/min": 4.716279450311944e-18, "policy_entropy/p25": 3.2741809263825417e-10, "policy_entropy/p75": 9.5367431640625e-06, "policy_entropy/var": 0.02194817177951336, "policy_error_vector_variance/max_squared_error": 1.9838727712631226, "policy_error_vector_variance/metric": 0.01286572590470314, "policy_loss": -0.7708333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.17850877344608307, "policy_sharpness": 9.377187728881836, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.9725379943847656, "reward": 0.7708333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17850877344608307, "rewards/accuracy_reward": 0.7708333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17850877344608307, "sentence_full_gradient_variance/max_squared_error": 3131.138916015625, "sentence_full_gradient_variance/metric": 1098.7679443359375, "sentence_full_gradient_variance/p75": 2476.814697265625, "sentence_full_gradient_variance/p90": 2706.139892578125, "sentence_full_gradient_variance/p95": 2706.139892578125, "sentence_full_gradient_variance/p99": 2727.39111328125, "state_level_variance/metric": 13.009987831115723, "state_level_variance_full_gradient/metric": 1098.7679443359375, "step": 18 }, { "accuracy_reward": 0.6979166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21304824948310852, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 9.723872790345922e-05, "adam_stats/lr_effective_mean": -7.139626112540043e-10, "adam_stats/lr_effective_min": -9.854811651166528e-05, "adam_stats/m_t_max": 0.02222430519759655, "adam_stats/m_t_mean": -2.0758589291958174e-10, "adam_stats/m_t_min": -0.015871595591306686, "adam_stats/v_t_max": 3.796090095420368e-05, "adam_stats/v_t_mean": 4.4902280678382755e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.6979166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.21304824948310852, "all_logprobs": -0.02030022442340851, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -13.0, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.00013065338134765625, "all_logprobs/p25": -2.384185791015625e-07, "all_logprobs/p5": -0.0101318359375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03686029836535454, "clip_ratio": 0.0, "completion_length": 537.9583740234375, "completion_length/correct": 442.92535400390625, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 400.0, "completion_length/correct/min": 120.0, "completion_length/correct/p25": 265.5, "completion_length/correct/p75": 543.5, "completion_length/correct/var": 49043.16015625, "completion_length/incorrect": 757.5172119140625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 856.0, "completion_length/incorrect/min": 304.0, "completion_length/incorrect/p25": 504.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 72681.3359375, "completion_length/max": 1024.0, "completion_length/median": 466.0, "completion_length/min": 120.0, "completion_length/p25": 313.0, "completion_length/p75": 778.0, "completion_length/var": 76578.9296875, "epoch": 0.24050632911392406, "feature_vector_variance/max_squared_error": 126290.6328125, "feature_vector_variance/metric": 28170.595703125, "generated_tokens/total": 1060614.0, "grad_norm": 1.3847757577896118, "learning_rate": 1.4632923872213653e-05, "loss": -0.6979, "mean_logprobs": -0.0203857421875, "mean_logprobs/var": 0.000232696533203125, "num_completions/total": 1824, "per_sentence_gradient_norm": 1.834631323814392, "per_sentence_gradient_norm/max": 11.472039222717285, "per_sentence_gradient_norm/median": 1.5108933448791504, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.517479419708252, "per_sentence_gradient_norm/p85": 3.1933841705322266, "per_sentence_gradient_norm/p90": 3.915536642074585, "per_sentence_gradient_norm/p95": 5.530068874359131, "per_sentence_gradient_norm/p99": 8.611030578613281, "per_sentence_gradient_norm/var": 3.8736748695373535, "per_token_feature_norm": 186.59718322753906, "per_token_feature_norm/max": 294.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 82.0, "per_token_feature_norm/p25": 177.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 402.840576171875, "per_token_full_gradient_variance/max_squared_error": 0.39593228697776794, "per_token_full_gradient_variance/variance": 0.002362743951380253, "per_token_gradient_norm": 1.4744906425476074, "per_token_gradient_norm/max": 319.4296875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 193.22401428222656, "per_token_policy_error_norm": 0.011360111646354198, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010824581608176231, "policy_entropy": 0.02155405655503273, "policy_entropy/max": 3.21875, "policy_entropy/median": 2.1187588572502136e-08, "policy_entropy/min": 7.101524229780054e-18, "policy_entropy/p25": 1.7007550923153758e-10, "policy_entropy/p75": 3.993511199951172e-06, "policy_entropy/var": 0.014358719810843468, "policy_error_vector_variance/max_squared_error": 2.002788782119751, "policy_error_vector_variance/metric": 0.011307678185403347, "policy_loss": -0.6979166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.21304824948310852, "policy_sharpness": 9.437773704528809, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.5296900272369385, "reward": 0.6979166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21304824948310852, "rewards/accuracy_reward": 0.6979166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21304824948310852, "sentence_full_gradient_variance/max_squared_error": 3112.0986328125, "sentence_full_gradient_variance/metric": 1313.96044921875, "sentence_full_gradient_variance/p75": 2059.978759765625, "sentence_full_gradient_variance/p90": 2059.978759765625, "sentence_full_gradient_variance/p95": 2214.34814453125, "sentence_full_gradient_variance/p99": 2527.4384765625, "state_level_variance/metric": 3.8736748695373535, "state_level_variance_full_gradient/metric": 1313.96044921875, "step": 19 }, { "accuracy_reward": 0.7291666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19956141710281372, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 9.770819451659918e-05, "adam_stats/lr_effective_mean": -8.944089358031704e-10, "adam_stats/lr_effective_min": -9.68088279478252e-05, "adam_stats/m_t_max": 0.014416511170566082, "adam_stats/m_t_mean": -1.5365407557421662e-10, "adam_stats/m_t_min": -0.01043626293540001, "adam_stats/v_t_max": 4.243443618179299e-05, "adam_stats/v_t_mean": 4.8151361370374346e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.7291666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.19956141710281372, "all_logprobs": -0.01892138458788395, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.75, "all_logprobs/p1": -0.5625, "all_logprobs/p10": -0.00015354156494140625, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.0108642578125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.027432553470134735, "clip_ratio": 0.0, "completion_length": 501.57293701171875, "completion_length/correct": 460.5428466796875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 397.0, "completion_length/correct/min": 173.0, "completion_length/correct/p25": 300.0, "completion_length/correct/p75": 571.0, "completion_length/correct/var": 51108.42578125, "completion_length/incorrect": 612.0385131835938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 515.0, "completion_length/incorrect/min": 195.0, "completion_length/incorrect/p25": 378.25, "completion_length/incorrect/p75": 956.75, "completion_length/incorrect/var": 87068.6796875, "completion_length/max": 1024.0, "completion_length/median": 423.0, "completion_length/min": 173.0, "completion_length/p25": 319.75, "completion_length/p75": 594.5, "completion_length/var": 64613.78515625, "epoch": 0.25316455696202533, "feature_vector_variance/max_squared_error": 99778.0546875, "feature_vector_variance/metric": 28159.14453125, "generated_tokens/total": 1108765.0, "grad_norm": 1.0468131303787231, "learning_rate": 1.4547694655894313e-05, "loss": -0.7292, "mean_logprobs": -0.0189208984375, "mean_logprobs/var": 9.632110595703125e-05, "num_completions/total": 1920, "per_sentence_gradient_norm": 1.9287904500961304, "per_sentence_gradient_norm/max": 5.5484185218811035, "per_sentence_gradient_norm/median": 1.9387609958648682, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.121969699859619, "per_sentence_gradient_norm/p85": 3.5652899742126465, "per_sentence_gradient_norm/p90": 4.01164436340332, "per_sentence_gradient_norm/p95": 4.21264123916626, "per_sentence_gradient_norm/p99": 4.546103477478027, "per_sentence_gradient_norm/var": 2.20957088470459, "per_token_feature_norm": 185.492919921875, "per_token_feature_norm/max": 264.0, "per_token_feature_norm/median": 186.0, "per_token_feature_norm/min": 83.0, "per_token_feature_norm/p25": 176.0, "per_token_feature_norm/p75": 196.0, "per_token_feature_norm/var": 358.58819580078125, "per_token_full_gradient_variance/max_squared_error": 0.4936833679676056, "per_token_full_gradient_variance/variance": 0.0028010327368974686, "per_token_gradient_norm": 1.7480597496032715, "per_token_gradient_norm/max": 300.9375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 229.00442504882812, "per_token_policy_error_norm": 0.011030120775103569, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01061251014471054, "policy_entropy": 0.0205779317766428, "policy_entropy/max": 2.71875, "policy_entropy/median": 3.282912075519562e-08, "policy_entropy/min": 6.884683795282953e-18, "policy_entropy/p25": 2.510205376893282e-10, "policy_entropy/p75": 5.334615707397461e-06, "policy_entropy/var": 0.011948022991418839, "policy_error_vector_variance/max_squared_error": 1.9878031015396118, "policy_error_vector_variance/metric": 0.010999280028045177, "policy_loss": -0.7291666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.19956141710281372, "policy_sharpness": 9.435338020324707, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.4989304542541504, "reward": 0.7291666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19956141710281372, "rewards/accuracy_reward": 0.7291666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19956141710281372, "sentence_full_gradient_variance/max_squared_error": 2422.809326171875, "sentence_full_gradient_variance/metric": 1243.5032958984375, "sentence_full_gradient_variance/p75": 2111.05126953125, "sentence_full_gradient_variance/p90": 2111.05126953125, "sentence_full_gradient_variance/p95": 2111.05126953125, "sentence_full_gradient_variance/p99": 2319.41259765625, "state_level_variance/metric": 2.20957088470459, "state_level_variance_full_gradient/metric": 1243.5032958984375, "step": 20 }, { "accuracy_reward": 0.78125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17269736528396606, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 9.513712575426325e-05, "adam_stats/lr_effective_mean": -4.358259031445044e-10, "adam_stats/lr_effective_min": -9.588409739080817e-05, "adam_stats/m_t_max": 0.01662178710103035, "adam_stats/m_t_mean": -1.5905181338649044e-10, "adam_stats/m_t_min": -0.013543028384447098, "adam_stats/v_t_max": 4.411455665831454e-05, "adam_stats/v_t_mean": 5.0111880790792984e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.78125, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.17269736528396606, "all_logprobs": -0.01863103173673153, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -4.78125, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.00020313262939453125, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.01416015625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.024486206471920013, "clip_ratio": 0.0, "completion_length": 490.9895935058594, "completion_length/correct": 447.6266784667969, "completion_length/correct/max": 946.0, "completion_length/correct/median": 443.0, "completion_length/correct/min": 128.0, "completion_length/correct/p25": 279.5, "completion_length/correct/p75": 559.0, "completion_length/correct/var": 35507.64453125, "completion_length/incorrect": 645.857177734375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 593.0, "completion_length/incorrect/min": 200.0, "completion_length/incorrect/p25": 396.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 85729.734375, "completion_length/max": 1024.0, "completion_length/median": 452.0, "completion_length/min": 128.0, "completion_length/p25": 304.0, "completion_length/p75": 630.5, "completion_length/var": 52493.140625, "epoch": 0.26582278481012656, "feature_vector_variance/max_squared_error": 78950.7421875, "feature_vector_variance/metric": 28003.0078125, "generated_tokens/total": 1155900.0, "grad_norm": 0.8841150999069214, "learning_rate": 1.4453878909250906e-05, "loss": -0.7812, "mean_logprobs": -0.018310546875, "mean_logprobs/var": 8.916854858398438e-05, "num_completions/total": 2016, "per_sentence_gradient_norm": 2.0428309440612793, "per_sentence_gradient_norm/max": 7.433304786682129, "per_sentence_gradient_norm/median": 1.768969178199768, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.6950828433036804, "per_sentence_gradient_norm/p75": 2.96852970123291, "per_sentence_gradient_norm/p85": 3.6794049739837646, "per_sentence_gradient_norm/p90": 4.179260730743408, "per_sentence_gradient_norm/p95": 4.907917022705078, "per_sentence_gradient_norm/p99": 6.89977502822876, "per_sentence_gradient_norm/var": 2.7471601963043213, "per_token_feature_norm": 186.0790557861328, "per_token_feature_norm/max": 264.0, "per_token_feature_norm/median": 186.0, "per_token_feature_norm/min": 87.5, "per_token_feature_norm/p25": 177.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 283.5805358886719, "per_token_full_gradient_variance/max_squared_error": 0.5689811706542969, "per_token_full_gradient_variance/variance": 0.0033159698359668255, "per_token_gradient_norm": 1.8884490728378296, "per_token_gradient_norm/max": 298.7734375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 240.59864807128906, "per_token_policy_error_norm": 0.010863535106182098, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009954212233424187, "policy_entropy": 0.021448925137519836, "policy_entropy/max": 3.265625, "policy_entropy/median": 4.6333298087120056e-08, "policy_entropy/min": 6.207057437479513e-18, "policy_entropy/p25": 3.5288394428789616e-10, "policy_entropy/p75": 7.12275505065918e-06, "policy_entropy/var": 0.011879613623023033, "policy_error_vector_variance/max_squared_error": 1.9714280366897583, "policy_error_vector_variance/metric": 0.010853092186152935, "policy_loss": -0.78125, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.17269736528396606, "policy_sharpness": 9.407623291015625, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.657789468765259, "reward": 0.78125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17269736528396606, "rewards/accuracy_reward": 0.78125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17269736528396606, "sentence_full_gradient_variance/max_squared_error": 2583.185791015625, "sentence_full_gradient_variance/metric": 1093.4990234375, "sentence_full_gradient_variance/p75": 1959.0989990234375, "sentence_full_gradient_variance/p90": 2583.179931640625, "sentence_full_gradient_variance/p95": 2583.179931640625, "sentence_full_gradient_variance/p99": 2583.18017578125, "state_level_variance/metric": 2.7471601963043213, "state_level_variance_full_gradient/metric": 1093.4990234375, "step": 21 }, { "accuracy_reward": 0.6979166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21304824948310852, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 9.480871813138947e-05, "adam_stats/lr_effective_mean": -7.715346139747226e-10, "adam_stats/lr_effective_min": -9.426493488717824e-05, "adam_stats/m_t_max": 0.012262818403542042, "adam_stats/m_t_mean": -6.367947835705934e-11, "adam_stats/m_t_min": -0.008110047318041325, "adam_stats/v_t_max": 5.345499448594637e-05, "adam_stats/v_t_mean": 5.373799929347944e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.6979166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.21304824948310852, "all_logprobs": -0.020284781232476234, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.4375, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.000171661376953125, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.01165771484375, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03413049131631851, "clip_ratio": 0.0, "completion_length": 504.65625, "completion_length/correct": 421.80596923828125, "completion_length/correct/max": 933.0, "completion_length/correct/median": 383.0, "completion_length/correct/min": 179.0, "completion_length/correct/p25": 311.5, "completion_length/correct/p75": 469.5, "completion_length/correct/var": 26611.37109375, "completion_length/incorrect": 696.0689697265625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 642.0, "completion_length/incorrect/min": 265.0, "completion_length/incorrect/p25": 424.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 84442.4296875, "completion_length/max": 1024.0, "completion_length/median": 418.0, "completion_length/min": 179.0, "completion_length/p25": 322.5, "completion_length/p75": 639.75, "completion_length/var": 59401.72265625, "epoch": 0.27848101265822783, "feature_vector_variance/max_squared_error": 80687.84375, "feature_vector_variance/metric": 28707.556640625, "generated_tokens/total": 1204347.0, "grad_norm": 1.4616997241973877, "learning_rate": 1.4351590932319506e-05, "loss": -0.6979, "mean_logprobs": -0.0203857421875, "mean_logprobs/var": 0.000225067138671875, "num_completions/total": 2112, "per_sentence_gradient_norm": 1.8981447219848633, "per_sentence_gradient_norm/max": 7.764307498931885, "per_sentence_gradient_norm/median": 1.765334129333496, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.0728683471679688, "per_sentence_gradient_norm/p85": 3.600848913192749, "per_sentence_gradient_norm/p90": 4.152196884155273, "per_sentence_gradient_norm/p95": 4.985745429992676, "per_sentence_gradient_norm/p99": 6.549784183502197, "per_sentence_gradient_norm/var": 3.1321115493774414, "per_token_feature_norm": 185.4528045654297, "per_token_feature_norm/max": 268.0, "per_token_feature_norm/median": 185.0, "per_token_feature_norm/min": 84.0, "per_token_feature_norm/p25": 177.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 278.39373779296875, "per_token_full_gradient_variance/max_squared_error": 0.5406863689422607, "per_token_full_gradient_variance/variance": 0.00268158339895308, "per_token_gradient_norm": 1.5708822011947632, "per_token_gradient_norm/max": 286.2421875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 208.33023071289062, "per_token_policy_error_norm": 0.011321892030537128, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010836781933903694, "policy_entropy": 0.021756192669272423, "policy_entropy/max": 3.515625, "policy_entropy/median": 4.0512531995773315e-08, "policy_entropy/min": 1.1587410718438829e-18, "policy_entropy/p25": 2.8830982046201825e-10, "policy_entropy/p75": 6.22868537902832e-06, "policy_entropy/var": 0.013378177769482136, "policy_error_vector_variance/max_squared_error": 2.0007548332214355, "policy_error_vector_variance/metric": 0.011191715486347675, "policy_loss": -0.6979166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.21304824948310852, "policy_sharpness": 9.41402816772461, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.65566086769104, "reward": 0.6979166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21304824948310852, "rewards/accuracy_reward": 0.6979166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21304824948310852, "sentence_full_gradient_variance/max_squared_error": 2321.098388671875, "sentence_full_gradient_variance/metric": 1219.181396484375, "sentence_full_gradient_variance/p75": 2272.84375, "sentence_full_gradient_variance/p90": 2272.84375, "sentence_full_gradient_variance/p95": 2272.84375, "sentence_full_gradient_variance/p99": 2275.256591796875, "state_level_variance/metric": 3.1321115493774414, "state_level_variance_full_gradient/metric": 1219.181396484375, "step": 22 }, { "accuracy_reward": 0.6979166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21304823458194733, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 9.453034726902843e-05, "adam_stats/lr_effective_mean": -9.481837537350657e-10, "adam_stats/lr_effective_min": -9.404521551914513e-05, "adam_stats/m_t_max": 0.00815291702747345, "adam_stats/m_t_mean": -3.7018903567553885e-12, "adam_stats/m_t_min": -0.008707334287464619, "adam_stats/v_t_max": 6.0662267060251907e-05, "adam_stats/v_t_mean": 5.6357297301135034e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.6979166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.21304823458194733, "all_logprobs": -0.019814174622297287, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.375, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.0001455307938158512, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.009765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.030260784551501274, "clip_ratio": 0.0, "completion_length": 477.5520935058594, "completion_length/correct": 431.6865539550781, "completion_length/correct/max": 941.0, "completion_length/correct/median": 390.0, "completion_length/correct/min": 174.0, "completion_length/correct/p25": 297.0, "completion_length/correct/p75": 539.0, "completion_length/correct/var": 33427.8203125, "completion_length/incorrect": 583.5172119140625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 479.0, "completion_length/incorrect/min": 191.0, "completion_length/incorrect/p25": 370.0, "completion_length/incorrect/p75": 859.0, "completion_length/incorrect/var": 80369.1171875, "completion_length/max": 1024.0, "completion_length/median": 412.0, "completion_length/min": 174.0, "completion_length/p25": 322.5, "completion_length/p75": 595.5, "completion_length/var": 51822.5859375, "epoch": 0.2911392405063291, "feature_vector_variance/max_squared_error": 78207.0546875, "feature_vector_variance/metric": 29191.080078125, "generated_tokens/total": 1250192.0, "grad_norm": 1.1231374740600586, "learning_rate": 1.4240955347243754e-05, "loss": -0.6979, "mean_logprobs": -0.02001953125, "mean_logprobs/var": 0.00011491775512695312, "num_completions/total": 2208, "per_sentence_gradient_norm": 1.9237302541732788, "per_sentence_gradient_norm/max": 8.656959533691406, "per_sentence_gradient_norm/median": 1.7684801816940308, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.162503480911255, "per_sentence_gradient_norm/p85": 3.7261741161346436, "per_sentence_gradient_norm/p90": 3.8972997665405273, "per_sentence_gradient_norm/p95": 4.4652276039123535, "per_sentence_gradient_norm/p99": 5.236514568328857, "per_sentence_gradient_norm/var": 2.8594703674316406, "per_token_feature_norm": 185.96583557128906, "per_token_feature_norm/max": 264.0, "per_token_feature_norm/median": 186.0, "per_token_feature_norm/min": 86.5, "per_token_feature_norm/p25": 177.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 278.45001220703125, "per_token_full_gradient_variance/max_squared_error": 0.560485303401947, "per_token_full_gradient_variance/variance": 0.002974853152409196, "per_token_gradient_norm": 1.7363725900650024, "per_token_gradient_norm/max": 294.8125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 237.5956573486328, "per_token_policy_error_norm": 0.011450565420091152, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010985122993588448, "policy_entropy": 0.020972633734345436, "policy_entropy/max": 2.9375, "policy_entropy/median": 3.67872416973114e-08, "policy_entropy/min": 3.9302328752599536e-18, "policy_entropy/p25": 2.382876118645072e-10, "policy_entropy/p75": 5.8710575103759766e-06, "policy_entropy/var": 0.01249085832387209, "policy_error_vector_variance/max_squared_error": 1.96356201171875, "policy_error_vector_variance/metric": 0.011436644941568375, "policy_loss": -0.6979166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.21304823458194733, "policy_sharpness": 9.429099082946777, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.552183151245117, "reward": 0.6979166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21304823458194733, "rewards/accuracy_reward": 0.6979166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21304823458194733, "sentence_full_gradient_variance/max_squared_error": 2483.3388671875, "sentence_full_gradient_variance/metric": 1154.9134521484375, "sentence_full_gradient_variance/p75": 1916.1759033203125, "sentence_full_gradient_variance/p90": 1916.1759033203125, "sentence_full_gradient_variance/p95": 2141.16943359375, "sentence_full_gradient_variance/p99": 2464.93603515625, "state_level_variance/metric": 2.8594703674316406, "state_level_variance_full_gradient/metric": 1154.9134521484375, "step": 23 }, { "accuracy_reward": 0.78125, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.17269736528396606, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 9.383865835843608e-05, "adam_stats/lr_effective_mean": -7.248863176378961e-10, "adam_stats/lr_effective_min": -9.321069228462875e-05, "adam_stats/m_t_max": 0.007174780126661062, "adam_stats/m_t_mean": -7.000713203586528e-11, "adam_stats/m_t_min": -0.006585892755538225, "adam_stats/v_t_max": 6.243168900255114e-05, "adam_stats/v_t_mean": 6.001411607253759e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.78125, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.17269736528396606, "all_logprobs": -0.019546540454030037, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.375, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.0001239776611328125, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -0.0093994140625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03034260682761669, "clip_ratio": 0.0, "completion_length": 468.5833435058594, "completion_length/correct": 424.0933532714844, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 396.0, "completion_length/correct/min": 99.0, "completion_length/correct/p25": 256.0, "completion_length/correct/p75": 549.0, "completion_length/correct/var": 37276.19140625, "completion_length/incorrect": 627.4761962890625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 533.0, "completion_length/incorrect/min": 248.0, "completion_length/incorrect/p25": 425.0, "completion_length/incorrect/p75": 865.0, "completion_length/incorrect/var": 73310.765625, "completion_length/max": 1024.0, "completion_length/median": 425.0, "completion_length/min": 99.0, "completion_length/p25": 297.25, "completion_length/p75": 584.0, "completion_length/var": 51613.59375, "epoch": 0.3037974683544304, "feature_vector_variance/max_squared_error": 117469.8828125, "feature_vector_variance/metric": 29312.796875, "generated_tokens/total": 1295176.0, "grad_norm": 1.3867801427841187, "learning_rate": 1.4122106946441953e-05, "loss": -0.7812, "mean_logprobs": -0.018798828125, "mean_logprobs/var": 0.0001392364501953125, "num_completions/total": 2304, "per_sentence_gradient_norm": 1.9093658924102783, "per_sentence_gradient_norm/max": 6.912288188934326, "per_sentence_gradient_norm/median": 1.7032678127288818, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.43697261810302734, "per_sentence_gradient_norm/p75": 3.0837302207946777, "per_sentence_gradient_norm/p85": 3.3960235118865967, "per_sentence_gradient_norm/p90": 3.6178574562072754, "per_sentence_gradient_norm/p95": 4.207862854003906, "per_sentence_gradient_norm/p99": 6.756346702575684, "per_sentence_gradient_norm/var": 2.573411226272583, "per_token_feature_norm": 186.02195739746094, "per_token_feature_norm/max": 296.0, "per_token_feature_norm/median": 185.0, "per_token_feature_norm/min": 89.0, "per_token_feature_norm/p25": 177.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 258.4765625, "per_token_full_gradient_variance/max_squared_error": 0.8827510476112366, "per_token_full_gradient_variance/variance": 0.0028698286041617393, "per_token_gradient_norm": 1.7298226356506348, "per_token_gradient_norm/max": 267.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 225.6817626953125, "per_token_policy_error_norm": 0.01127445138990879, "per_token_policy_error_norm/max": 1.9921875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0109739163890481, "policy_entropy": 0.02064623311161995, "policy_entropy/max": 2.71875, "policy_entropy/median": 3.282912075519562e-08, "policy_entropy/min": 5.21772295508649e-19, "policy_entropy/p25": 2.4147084332071245e-10, "policy_entropy/p75": 4.976987838745117e-06, "policy_entropy/var": 0.012147974222898483, "policy_error_vector_variance/max_squared_error": 1.9913665056228638, "policy_error_vector_variance/metric": 0.011244230903685093, "policy_loss": -0.78125, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.17269736528396606, "policy_sharpness": 9.444087028503418, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.462646961212158, "reward": 0.78125, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.17269736528396606, "rewards/accuracy_reward": 0.78125, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.17269736528396606, "sentence_full_gradient_variance/max_squared_error": 2837.787353515625, "sentence_full_gradient_variance/metric": 1165.7890625, "sentence_full_gradient_variance/p75": 2221.437255859375, "sentence_full_gradient_variance/p90": 2837.7783203125, "sentence_full_gradient_variance/p95": 2837.7783203125, "sentence_full_gradient_variance/p99": 2837.77880859375, "state_level_variance/metric": 2.573411226272583, "state_level_variance_full_gradient/metric": 1165.7890625, "step": 24 }, { "accuracy_reward": 0.7916666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 1.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.1666666865348816, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 9.286202111979946e-05, "adam_stats/lr_effective_mean": -7.123108769491182e-10, "adam_stats/lr_effective_min": -9.26140564843081e-05, "adam_stats/m_t_max": 0.006686425302177668, "adam_stats/m_t_mean": -7.329074847017836e-11, "adam_stats/m_t_min": -0.008108693175017834, "adam_stats/v_t_max": 6.241368828341365e-05, "adam_stats/v_t_mean": 6.1156917204041594e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.7916666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 1.0, "advantages/p75": 1.0, "advantages/var": 0.1666666865348816, "all_logprobs": -0.019550243392586708, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.5, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.0001583099365234375, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.01104736328125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.028620921075344086, "clip_ratio": 0.0, "completion_length": 435.625, "completion_length/correct": 393.4868469238281, "completion_length/correct/max": 1022.0, "completion_length/correct/median": 343.0, "completion_length/correct/min": 162.0, "completion_length/correct/p25": 273.0, "completion_length/correct/p75": 436.0, "completion_length/correct/var": 33168.78515625, "completion_length/incorrect": 595.75, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 515.0, "completion_length/incorrect/min": 219.0, "completion_length/incorrect/p25": 431.5, "completion_length/incorrect/p75": 827.75, "completion_length/incorrect/var": 65733.8828125, "completion_length/max": 1024.0, "completion_length/median": 371.0, "completion_length/min": 162.0, "completion_length/p25": 283.0, "completion_length/p75": 513.5, "completion_length/var": 46151.05859375, "epoch": 0.31645569620253167, "feature_vector_variance/max_squared_error": 98137.5703125, "feature_vector_variance/metric": 29577.20703125, "generated_tokens/total": 1336996.0, "grad_norm": 1.0255836248397827, "learning_rate": 1.3995190528383292e-05, "loss": -0.7917, "mean_logprobs": -0.01953125, "mean_logprobs/var": 0.00015354156494140625, "num_completions/total": 2400, "per_sentence_gradient_norm": 2.1228318214416504, "per_sentence_gradient_norm/max": 6.57059907913208, "per_sentence_gradient_norm/median": 1.9052172899246216, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.8168656229972839, "per_sentence_gradient_norm/p75": 3.027306079864502, "per_sentence_gradient_norm/p85": 4.01100492477417, "per_sentence_gradient_norm/p90": 4.481265068054199, "per_sentence_gradient_norm/p95": 5.0972514152526855, "per_sentence_gradient_norm/p99": 5.728845119476318, "per_sentence_gradient_norm/var": 2.6974129676818848, "per_token_feature_norm": 185.1632080078125, "per_token_feature_norm/max": 272.0, "per_token_feature_norm/median": 185.0, "per_token_feature_norm/min": 90.0, "per_token_feature_norm/p25": 177.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 238.36349487304688, "per_token_full_gradient_variance/max_squared_error": 0.5666000247001648, "per_token_full_gradient_variance/variance": 0.0031381247099488974, "per_token_gradient_norm": 1.9162403345108032, "per_token_gradient_norm/max": 280.5078125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 257.2720031738281, "per_token_policy_error_norm": 0.011365721002221107, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010885871946811676, "policy_entropy": 0.02135431207716465, "policy_entropy/max": 2.796875, "policy_entropy/median": 4.936009645462036e-08, "policy_entropy/min": 2.998496633280223e-19, "policy_entropy/p25": 3.1104718800634146e-10, "policy_entropy/p75": 7.808208465576172e-06, "policy_entropy/var": 0.012256575748324394, "policy_error_vector_variance/max_squared_error": 1.9859527349472046, "policy_error_vector_variance/metric": 0.011352448724210262, "policy_loss": -0.7916666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": -1.0, "policy_loss/var": 0.1666666865348816, "policy_sharpness": 9.428431510925293, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.5583651065826416, "reward": 0.7916666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 1.0, "reward/p75": 1.0, "reward/var": 0.1666666865348816, "rewards/accuracy_reward": 0.7916666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 1.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.1666666865348816, "sentence_full_gradient_variance/max_squared_error": 2904.842041015625, "sentence_full_gradient_variance/metric": 1156.583740234375, "sentence_full_gradient_variance/p75": 1891.39404296875, "sentence_full_gradient_variance/p90": 2904.83642578125, "sentence_full_gradient_variance/p95": 2904.83642578125, "sentence_full_gradient_variance/p99": 2904.836669921875, "state_level_variance/metric": 2.6974129676818848, "state_level_variance_full_gradient/metric": 1156.583740234375, "step": 25 }, { "accuracy_reward": 0.6666666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.224561408162117, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 9.198190673487261e-05, "adam_stats/lr_effective_mean": -9.474124817998586e-10, "adam_stats/lr_effective_min": -9.309301822213456e-05, "adam_stats/m_t_max": 0.006170173175632954, "adam_stats/m_t_mean": -7.985515720898562e-11, "adam_stats/m_t_min": -0.006571505218744278, "adam_stats/v_t_max": 6.24201784376055e-05, "adam_stats/v_t_mean": 6.198134887280826e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.6666666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.224561408162117, "all_logprobs": -0.019043484702706337, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.875, "all_logprobs/p1": -0.546875, "all_logprobs/p10": -0.00014019012451171875, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.007611081004142761, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03133941441774368, "clip_ratio": 0.0, "completion_length": 498.26043701171875, "completion_length/correct": 420.515625, "completion_length/correct/max": 824.0, "completion_length/correct/median": 379.0, "completion_length/correct/min": 107.0, "completion_length/correct/p25": 290.5, "completion_length/correct/p75": 528.5, "completion_length/correct/var": 31418.634765625, "completion_length/incorrect": 653.75, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 494.0, "completion_length/incorrect/min": 180.0, "completion_length/incorrect/p25": 405.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 98827.546875, "completion_length/max": 1024.0, "completion_length/median": 421.0, "completion_length/min": 107.0, "completion_length/p25": 305.5, "completion_length/p75": 649.5, "completion_length/var": 65300.2578125, "epoch": 0.3291139240506329, "feature_vector_variance/max_squared_error": 77858.5234375, "feature_vector_variance/metric": 28693.0234375, "generated_tokens/total": 1384829.0, "grad_norm": 0.8378214836120605, "learning_rate": 1.3860360721173195e-05, "loss": -0.6667, "mean_logprobs": -0.018798828125, "mean_logprobs/var": 0.00011539459228515625, "num_completions/total": 2496, "per_sentence_gradient_norm": 1.6889846324920654, "per_sentence_gradient_norm/max": 5.345212936401367, "per_sentence_gradient_norm/median": 1.7096984386444092, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.7074167728424072, "per_sentence_gradient_norm/p85": 3.3487119674682617, "per_sentence_gradient_norm/p90": 3.6735877990722656, "per_sentence_gradient_norm/p95": 4.128076076507568, "per_sentence_gradient_norm/p99": 5.236213207244873, "per_sentence_gradient_norm/var": 2.234959840774536, "per_token_feature_norm": 184.57594299316406, "per_token_feature_norm/max": 268.0, "per_token_feature_norm/median": 185.0, "per_token_feature_norm/min": 89.0, "per_token_feature_norm/p25": 176.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 251.89134216308594, "per_token_full_gradient_variance/max_squared_error": 0.5092841982841492, "per_token_full_gradient_variance/variance": 0.0023186122998595238, "per_token_gradient_norm": 1.408516526222229, "per_token_gradient_norm/max": 262.203125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 182.07606506347656, "per_token_policy_error_norm": 0.010836162604391575, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010643490590155125, "policy_entropy": 0.019972555339336395, "policy_entropy/max": 2.328125, "policy_entropy/median": 5.2619725465774536e-08, "policy_entropy/min": 1.3349239248727773e-18, "policy_entropy/p25": 3.292370820418e-10, "policy_entropy/p75": 7.092952728271484e-06, "policy_entropy/var": 0.011581692844629288, "policy_error_vector_variance/max_squared_error": 1.9995821714401245, "policy_error_vector_variance/metric": 0.010786324739456177, "policy_loss": -0.6666666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.224561408162117, "policy_sharpness": 9.442598342895508, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.4617955684661865, "reward": 0.6666666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.224561408162117, "rewards/accuracy_reward": 0.6666666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.224561408162117, "sentence_full_gradient_variance/max_squared_error": 3449.43505859375, "sentence_full_gradient_variance/metric": 1322.381591796875, "sentence_full_gradient_variance/p75": 2107.773193359375, "sentence_full_gradient_variance/p90": 2107.773193359375, "sentence_full_gradient_variance/p95": 2161.725341796875, "sentence_full_gradient_variance/p99": 2509.899169921875, "state_level_variance/metric": 2.234959840774536, "state_level_variance_full_gradient/metric": 1322.381591796875, "step": 26 }, { "accuracy_reward": 0.6979166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.21304823458194733, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 9.101264731725678e-05, "adam_stats/lr_effective_mean": -1.0756091572972082e-09, "adam_stats/lr_effective_min": -9.060948650585487e-05, "adam_stats/m_t_max": 0.00833635963499546, "adam_stats/m_t_mean": -1.1694531454331525e-10, "adam_stats/m_t_min": -0.006052066572010517, "adam_stats/v_t_max": 6.347646558424458e-05, "adam_stats/v_t_mean": 6.402390336640584e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.6979166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.21304823458194733, "all_logprobs": -0.019449817016720772, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.3125, "all_logprobs/p1": -0.57421875, "all_logprobs/p10": -0.00020313262939453125, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.01104736328125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03009922429919243, "clip_ratio": 0.0, "completion_length": 448.9375, "completion_length/correct": 420.34326171875, "completion_length/correct/max": 980.0, "completion_length/correct/median": 386.0, "completion_length/correct/min": 143.0, "completion_length/correct/p25": 296.0, "completion_length/correct/p75": 554.0, "completion_length/correct/var": 30231.201171875, "completion_length/incorrect": 515.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 434.0, "completion_length/incorrect/min": 205.0, "completion_length/incorrect/p25": 343.0, "completion_length/incorrect/p75": 585.0, "completion_length/incorrect/var": 64890.0, "completion_length/max": 1024.0, "completion_length/median": 386.0, "completion_length/min": 143.0, "completion_length/p25": 308.0, "completion_length/p75": 561.75, "completion_length/var": 42037.08984375, "epoch": 0.34177215189873417, "feature_vector_variance/max_squared_error": 75453.375, "feature_vector_variance/metric": 28974.751953125, "generated_tokens/total": 1427927.0, "grad_norm": 0.9425917863845825, "learning_rate": 1.3717781794162813e-05, "loss": -0.6979, "mean_logprobs": -0.0189208984375, "mean_logprobs/var": 0.0001373291015625, "num_completions/total": 2592, "per_sentence_gradient_norm": 1.8007973432540894, "per_sentence_gradient_norm/max": 6.6922736167907715, "per_sentence_gradient_norm/median": 1.7521156072616577, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.7373790740966797, "per_sentence_gradient_norm/p85": 3.327936887741089, "per_sentence_gradient_norm/p90": 4.446925163269043, "per_sentence_gradient_norm/p95": 4.98205041885376, "per_sentence_gradient_norm/p99": 5.97923469543457, "per_sentence_gradient_norm/var": 2.777311086654663, "per_token_feature_norm": 184.6727294921875, "per_token_feature_norm/max": 264.0, "per_token_feature_norm/median": 185.0, "per_token_feature_norm/min": 91.5, "per_token_feature_norm/p25": 177.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 244.46026611328125, "per_token_full_gradient_variance/max_squared_error": 0.5041218996047974, "per_token_full_gradient_variance/variance": 0.003007266204804182, "per_token_gradient_norm": 1.680618166923523, "per_token_gradient_norm/max": 288.28125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 220.3388671875, "per_token_policy_error_norm": 0.011026151478290558, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010565916076302528, "policy_entropy": 0.02139371633529663, "policy_entropy/max": 2.796875, "policy_entropy/median": 6.239861249923706e-08, "policy_entropy/min": 6.532318089225164e-18, "policy_entropy/p25": 3.2014213502407074e-10, "policy_entropy/p75": 9.953975677490234e-06, "policy_entropy/var": 0.012886966578662395, "policy_error_vector_variance/max_squared_error": 1.9995838403701782, "policy_error_vector_variance/metric": 0.01095129456371069, "policy_loss": -0.6979166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.21304823458194733, "policy_sharpness": 9.407807350158691, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.712496280670166, "reward": 0.6979166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.21304823458194733, "rewards/accuracy_reward": 0.6979166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.21304823458194733, "sentence_full_gradient_variance/max_squared_error": 2518.8505859375, "sentence_full_gradient_variance/metric": 1184.245849609375, "sentence_full_gradient_variance/p75": 2017.424072265625, "sentence_full_gradient_variance/p90": 2017.424072265625, "sentence_full_gradient_variance/p95": 2017.424072265625, "sentence_full_gradient_variance/p99": 2236.570556640625, "state_level_variance/metric": 2.777311086654663, "state_level_variance_full_gradient/metric": 1184.245849609375, "step": 27 }, { "accuracy_reward": 0.6666666865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.224561408162117, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 9.115119610214606e-05, "adam_stats/lr_effective_mean": -7.016747738397555e-10, "adam_stats/lr_effective_min": -9.029074863065034e-05, "adam_stats/m_t_max": 0.0076354751363396645, "adam_stats/m_t_mean": -9.479916157628665e-11, "adam_stats/m_t_min": -0.007729575037956238, "adam_stats/v_t_max": 6.372628558892757e-05, "adam_stats/v_t_mean": 6.579703362014078e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.6666666865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.224561408162117, "all_logprobs": -0.02025773748755455, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.875, "all_logprobs/p1": -0.578125, "all_logprobs/p10": -0.00018215179443359375, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -0.010034173727035522, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.032272063195705414, "clip_ratio": 0.0, "completion_length": 478.0520935058594, "completion_length/correct": 430.515625, "completion_length/correct/max": 912.0, "completion_length/correct/median": 364.0, "completion_length/correct/min": 164.0, "completion_length/correct/p25": 292.5, "completion_length/correct/p75": 563.75, "completion_length/correct/var": 37463.01953125, "completion_length/incorrect": 573.125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 487.0, "completion_length/incorrect/min": 220.0, "completion_length/incorrect/p25": 351.75, "completion_length/incorrect/p75": 711.0, "completion_length/incorrect/var": 71013.46875, "completion_length/max": 1024.0, "completion_length/median": 407.0, "completion_length/min": 164.0, "completion_length/p25": 306.25, "completion_length/p75": 625.5, "completion_length/var": 52583.71484375, "epoch": 0.35443037974683544, "feature_vector_variance/max_squared_error": 74961.921875, "feature_vector_variance/metric": 28597.5546875, "generated_tokens/total": 1473820.0, "grad_norm": 0.9804356098175049, "learning_rate": 1.3567627457812107e-05, "loss": -0.6667, "mean_logprobs": -0.0206298828125, "mean_logprobs/var": 0.00013637542724609375, "num_completions/total": 2688, "per_sentence_gradient_norm": 1.867476224899292, "per_sentence_gradient_norm/max": 10.1003999710083, "per_sentence_gradient_norm/median": 2.0220673084259033, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.9249942302703857, "per_sentence_gradient_norm/p85": 3.5329794883728027, "per_sentence_gradient_norm/p90": 4.12325382232666, "per_sentence_gradient_norm/p95": 4.644386291503906, "per_sentence_gradient_norm/p99": 6.396029949188232, "per_sentence_gradient_norm/var": 3.308152437210083, "per_token_feature_norm": 184.2350311279297, "per_token_feature_norm/max": 264.0, "per_token_feature_norm/median": 185.0, "per_token_feature_norm/min": 100.0, "per_token_feature_norm/p25": 176.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 238.9141845703125, "per_token_full_gradient_variance/max_squared_error": 0.6121354103088379, "per_token_full_gradient_variance/variance": 0.002648990135639906, "per_token_gradient_norm": 1.6496238708496094, "per_token_gradient_norm/max": 282.296875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 225.50494384765625, "per_token_policy_error_norm": 0.011731007136404514, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011419542133808136, "policy_entropy": 0.021074457094073296, "policy_entropy/max": 2.15625, "policy_entropy/median": 8.288770914077759e-08, "policy_entropy/min": 7.806255641895632e-18, "policy_entropy/p25": 5.493347998708487e-10, "policy_entropy/p75": 9.894371032714844e-06, "policy_entropy/var": 0.012238645926117897, "policy_error_vector_variance/max_squared_error": 1.9862298965454102, "policy_error_vector_variance/metric": 0.011699887923896313, "policy_loss": -0.6666666865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.224561408162117, "policy_sharpness": 9.414947509765625, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.6639811992645264, "reward": 0.6666666865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.224561408162117, "rewards/accuracy_reward": 0.6666666865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.224561408162117, "sentence_full_gradient_variance/max_squared_error": 2822.42578125, "sentence_full_gradient_variance/metric": 1263.3682861328125, "sentence_full_gradient_variance/p75": 1857.51513671875, "sentence_full_gradient_variance/p90": 1915.578125, "sentence_full_gradient_variance/p95": 2189.97509765625, "sentence_full_gradient_variance/p99": 2739.286865234375, "state_level_variance/metric": 3.308152437210083, "state_level_variance_full_gradient/metric": 1263.3682861328125, "step": 28 }, { "accuracy_reward": 0.625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.236842080950737, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 8.973020158009604e-05, "adam_stats/lr_effective_mean": -8.155947028853916e-10, "adam_stats/lr_effective_min": -9.069943189388141e-05, "adam_stats/m_t_max": 0.006926859263330698, "adam_stats/m_t_mean": -7.043575445120354e-11, "adam_stats/m_t_min": -0.005708141718059778, "adam_stats/v_t_max": 6.367878813762218e-05, "adam_stats/v_t_mean": 6.704211838459706e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.625, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.236842080950737, "all_logprobs": -0.02121609076857567, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.625, "all_logprobs/p1": -0.6555078029632568, "all_logprobs/p10": -0.000308990478515625, "all_logprobs/p25": -8.344650268554688e-07, "all_logprobs/p5": -0.014163210988044739, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03160945326089859, "clip_ratio": 0.0, "completion_length": 475.2083435058594, "completion_length/correct": 379.86669921875, "completion_length/correct/max": 928.0, "completion_length/correct/median": 313.0, "completion_length/correct/min": 120.0, "completion_length/correct/p25": 272.0, "completion_length/correct/p75": 472.5, "completion_length/correct/var": 29354.388671875, "completion_length/incorrect": 634.1111450195312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 547.0, "completion_length/incorrect/min": 183.0, "completion_length/incorrect/p25": 394.5, "completion_length/incorrect/p75": 981.25, "completion_length/incorrect/var": 84993.9375, "completion_length/max": 1024.0, "completion_length/median": 407.0, "completion_length/min": 120.0, "completion_length/p25": 280.75, "completion_length/p75": 610.25, "completion_length/var": 64853.70703125, "epoch": 0.3670886075949367, "feature_vector_variance/max_squared_error": 73338.546875, "feature_vector_variance/metric": 28674.171875, "generated_tokens/total": 1519440.0, "grad_norm": 0.9237640500068665, "learning_rate": 1.3410080652050414e-05, "loss": -0.625, "mean_logprobs": -0.0206298828125, "mean_logprobs/var": 0.00012874603271484375, "num_completions/total": 2784, "per_sentence_gradient_norm": 1.6669467687606812, "per_sentence_gradient_norm/max": 5.708336353302002, "per_sentence_gradient_norm/median": 1.519641637802124, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.8209474086761475, "per_sentence_gradient_norm/p85": 3.392465591430664, "per_sentence_gradient_norm/p90": 3.807621955871582, "per_sentence_gradient_norm/p95": 4.615067481994629, "per_sentence_gradient_norm/p99": 5.458099365234375, "per_sentence_gradient_norm/var": 2.549623489379883, "per_token_feature_norm": 183.83419799804688, "per_token_feature_norm/max": 258.0, "per_token_feature_norm/median": 185.0, "per_token_feature_norm/min": 92.0, "per_token_feature_norm/p25": 176.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 227.20872497558594, "per_token_full_gradient_variance/max_squared_error": 0.5810432434082031, "per_token_full_gradient_variance/variance": 0.0022599941585212946, "per_token_gradient_norm": 1.3462579250335693, "per_token_gradient_norm/max": 272.5625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 171.91346740722656, "per_token_policy_error_norm": 0.012065106071531773, "per_token_policy_error_norm/max": 1.9375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01141396351158619, "policy_entropy": 0.023649025708436966, "policy_entropy/max": 2.9375, "policy_entropy/median": 1.1641532182693481e-07, "policy_entropy/min": 6.505213034913027e-19, "policy_entropy/p25": 7.34871719032526e-10, "policy_entropy/p75": 1.3530254364013672e-05, "policy_entropy/var": 0.014751529321074486, "policy_error_vector_variance/max_squared_error": 1.9428107738494873, "policy_error_vector_variance/metric": 0.011978130787611008, "policy_loss": -0.625, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.236842080950737, "policy_sharpness": 9.365095138549805, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.948235034942627, "reward": 0.625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.236842080950737, "rewards/accuracy_reward": 0.625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.236842080950737, "sentence_full_gradient_variance/max_squared_error": 3120.7705078125, "sentence_full_gradient_variance/metric": 1430.982177734375, "sentence_full_gradient_variance/p75": 2011.752685546875, "sentence_full_gradient_variance/p90": 2011.752685546875, "sentence_full_gradient_variance/p95": 2090.92578125, "sentence_full_gradient_variance/p99": 2677.463623046875, "state_level_variance/metric": 2.549623489379883, "state_level_variance_full_gradient/metric": 1430.982177734375, "step": 29 }, { "accuracy_reward": 0.6458333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2311403602361679, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 9.003189916256815e-05, "adam_stats/lr_effective_mean": -7.102565202643518e-10, "adam_stats/lr_effective_min": -9.07941212062724e-05, "adam_stats/m_t_max": 0.008748821914196014, "adam_stats/m_t_mean": -6.65711374914224e-11, "adam_stats/m_t_min": -0.0056885057128965855, "adam_stats/v_t_max": 6.383689469657838e-05, "adam_stats/v_t_mean": 6.835557727485897e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.6458333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.2311403602361679, "all_logprobs": -0.02288755401968956, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.65625, "all_logprobs/p1": -0.6675000190734863, "all_logprobs/p10": -0.00040206871926784515, "all_logprobs/p25": -1.0728836059570312e-06, "all_logprobs/p5": -0.0181884765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.038920287042856216, "clip_ratio": 0.0, "completion_length": 452.21875, "completion_length/correct": 378.3870849609375, "completion_length/correct/max": 956.0, "completion_length/correct/median": 315.0, "completion_length/correct/min": 125.0, "completion_length/correct/p25": 245.5, "completion_length/correct/p75": 491.0, "completion_length/correct/var": 34977.98046875, "completion_length/incorrect": 586.8529663085938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 533.0, "completion_length/incorrect/min": 200.0, "completion_length/incorrect/p25": 371.5, "completion_length/incorrect/p75": 808.5, "completion_length/incorrect/var": 75635.9453125, "completion_length/max": 1024.0, "completion_length/median": 402.0, "completion_length/min": 125.0, "completion_length/p25": 260.5, "completion_length/p75": 583.0, "completion_length/var": 58777.984375, "epoch": 0.379746835443038, "feature_vector_variance/max_squared_error": 74344.7734375, "feature_vector_variance/metric": 28439.9453125, "generated_tokens/total": 1562853.0, "grad_norm": 0.928460419178009, "learning_rate": 1.3245333323392335e-05, "loss": -0.6458, "mean_logprobs": -0.022705078125, "mean_logprobs/var": 0.00021076202392578125, "num_completions/total": 2880, "per_sentence_gradient_norm": 1.8092663288116455, "per_sentence_gradient_norm/max": 7.546934604644775, "per_sentence_gradient_norm/median": 1.7125593423843384, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.908487319946289, "per_sentence_gradient_norm/p85": 3.659186601638794, "per_sentence_gradient_norm/p90": 4.230315208435059, "per_sentence_gradient_norm/p95": 4.888103485107422, "per_sentence_gradient_norm/p99": 6.928684234619141, "per_sentence_gradient_norm/var": 3.3455610275268555, "per_token_feature_norm": 184.10276794433594, "per_token_feature_norm/max": 274.0, "per_token_feature_norm/median": 185.0, "per_token_feature_norm/min": 89.5, "per_token_feature_norm/p25": 176.0, "per_token_feature_norm/p75": 192.0, "per_token_feature_norm/var": 248.00497436523438, "per_token_full_gradient_variance/max_squared_error": 0.7861523032188416, "per_token_full_gradient_variance/variance": 0.0026482499670237303, "per_token_gradient_norm": 1.5028637647628784, "per_token_gradient_norm/max": 292.2734375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 191.92941284179688, "per_token_policy_error_norm": 0.01277833990752697, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.012335725128650665, "policy_entropy": 0.02444789931178093, "policy_entropy/max": 2.734375, "policy_entropy/median": 1.3783574104309082e-07, "policy_entropy/min": 1.212951180468158e-18, "policy_entropy/p25": 8.330971468240023e-10, "policy_entropy/p75": 1.6570091247558594e-05, "policy_entropy/var": 0.015235936269164085, "policy_error_vector_variance/max_squared_error": 1.9894169569015503, "policy_error_vector_variance/metric": 0.012685072608292103, "policy_loss": -0.6458333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.2311403602361679, "policy_sharpness": 9.340760231018066, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.100860595703125, "reward": 0.6458333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2311403602361679, "rewards/accuracy_reward": 0.6458333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2311403602361679, "sentence_full_gradient_variance/max_squared_error": 3349.91357421875, "sentence_full_gradient_variance/metric": 1505.743896484375, "sentence_full_gradient_variance/p75": 1931.394287109375, "sentence_full_gradient_variance/p90": 2279.2998046875, "sentence_full_gradient_variance/p95": 2735.61962890625, "sentence_full_gradient_variance/p99": 3202.2294921875, "state_level_variance/metric": 3.3455610275268555, "state_level_variance_full_gradient/metric": 1505.743896484375, "step": 30 }, { "accuracy_reward": 0.65625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2279605120420456, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 8.79335857462138e-05, "adam_stats/lr_effective_mean": -7.287939141065181e-10, "adam_stats/lr_effective_min": -8.827856800053269e-05, "adam_stats/m_t_max": 0.011188051663339138, "adam_stats/m_t_mean": -7.507982430210447e-11, "adam_stats/m_t_min": -0.010036309249699116, "adam_stats/v_t_max": 6.691313319606707e-05, "adam_stats/v_t_mean": 7.092568286154455e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.65625, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.2279605120420456, "all_logprobs": -0.021930908784270287, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -7.0625, "all_logprobs/p1": -0.626953125, "all_logprobs/p10": -0.00046539306640625, "all_logprobs/p25": -1.0728836059570312e-06, "all_logprobs/p5": -0.018798828125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.034471914172172546, "clip_ratio": 0.0, "completion_length": 415.1145935058594, "completion_length/correct": 358.5238342285156, "completion_length/correct/max": 798.0, "completion_length/correct/median": 333.0, "completion_length/correct/min": 128.0, "completion_length/correct/p25": 232.5, "completion_length/correct/p75": 446.5, "completion_length/correct/var": 27933.802734375, "completion_length/incorrect": 523.1515502929688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 467.0, "completion_length/incorrect/min": 274.0, "completion_length/incorrect/p25": 354.0, "completion_length/incorrect/p75": 651.0, "completion_length/incorrect/var": 39332.0703125, "completion_length/max": 1024.0, "completion_length/median": 367.0, "completion_length/min": 128.0, "completion_length/p25": 282.25, "completion_length/p75": 524.25, "completion_length/var": 37657.4296875, "epoch": 0.3924050632911392, "feature_vector_variance/max_squared_error": 74107.4375, "feature_vector_variance/metric": 28456.669921875, "generated_tokens/total": 1602704.0, "grad_norm": 1.028203010559082, "learning_rate": 1.3073586191080456e-05, "loss": -0.6562, "mean_logprobs": -0.0203857421875, "mean_logprobs/var": 0.0001506805419921875, "num_completions/total": 2976, "per_sentence_gradient_norm": 1.7130522727966309, "per_sentence_gradient_norm/max": 7.3722639083862305, "per_sentence_gradient_norm/median": 1.6247198581695557, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.8049135208129883, "per_sentence_gradient_norm/p85": 3.364967107772827, "per_sentence_gradient_norm/p90": 3.8638572692871094, "per_sentence_gradient_norm/p95": 4.350708484649658, "per_sentence_gradient_norm/p99": 7.308729648590088, "per_sentence_gradient_norm/var": 2.737516164779663, "per_token_feature_norm": 184.84080505371094, "per_token_feature_norm/max": 266.0, "per_token_feature_norm/median": 185.0, "per_token_feature_norm/min": 86.5, "per_token_feature_norm/p25": 177.0, "per_token_feature_norm/p75": 193.0, "per_token_feature_norm/var": 269.3193054199219, "per_token_full_gradient_variance/max_squared_error": 0.5448144674301147, "per_token_full_gradient_variance/variance": 0.0026355069130659103, "per_token_gradient_norm": 1.569840908050537, "per_token_gradient_norm/max": 287.109375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 194.0325927734375, "per_token_policy_error_norm": 0.012248898856341839, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011350652202963829, "policy_entropy": 0.024818550795316696, "policy_entropy/max": 2.125, "policy_entropy/median": 1.1920928955078125e-07, "policy_entropy/min": 6.8575787409708155e-18, "policy_entropy/p25": 7.275957614183426e-10, "policy_entropy/p75": 1.609325408935547e-05, "policy_entropy/var": 0.014983142726123333, "policy_error_vector_variance/max_squared_error": 2.0042805671691895, "policy_error_vector_variance/metric": 0.01208465825766325, "policy_loss": -0.65625, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.2279605120420456, "policy_sharpness": 9.32748031616211, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.157986164093018, "reward": 0.65625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2279605120420456, "rewards/accuracy_reward": 0.65625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2279605120420456, "sentence_full_gradient_variance/max_squared_error": 3168.24169921875, "sentence_full_gradient_variance/metric": 1341.056396484375, "sentence_full_gradient_variance/p75": 1832.27294921875, "sentence_full_gradient_variance/p90": 2066.24609375, "sentence_full_gradient_variance/p95": 2529.8798828125, "sentence_full_gradient_variance/p99": 2990.298828125, "state_level_variance/metric": 2.737516164779663, "state_level_variance_full_gradient/metric": 1341.056396484375, "step": 31 }, { "accuracy_reward": 0.59375, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24375000596046448, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 8.494139183312654e-05, "adam_stats/lr_effective_mean": -6.249199491215052e-10, "adam_stats/lr_effective_min": -8.525248995283619e-05, "adam_stats/m_t_max": 0.005104937124997377, "adam_stats/m_t_mean": -4.1312321619191295e-11, "adam_stats/m_t_min": -0.008543865755200386, "adam_stats/v_t_max": 7.358934090007097e-05, "adam_stats/v_t_mean": 7.519514524934046e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.59375, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.24375000596046448, "all_logprobs": -0.023759081959724426, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.25, "all_logprobs/p1": -0.69140625, "all_logprobs/p10": -0.000431060791015625, "all_logprobs/p25": -7.152557373046875e-07, "all_logprobs/p5": -0.0181884765625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03770098090171814, "clip_ratio": 0.0, "completion_length": 442.8125, "completion_length/correct": 418.5789489746094, "completion_length/correct/max": 956.0, "completion_length/correct/median": 364.0, "completion_length/correct/min": 114.0, "completion_length/correct/p25": 281.0, "completion_length/correct/p75": 526.0, "completion_length/correct/var": 44895.9296875, "completion_length/incorrect": 478.23077392578125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 428.0, "completion_length/incorrect/min": 198.0, "completion_length/incorrect/p25": 334.5, "completion_length/incorrect/p75": 547.5, "completion_length/incorrect/var": 51913.4453125, "completion_length/max": 1024.0, "completion_length/median": 367.0, "completion_length/min": 114.0, "completion_length/p25": 297.75, "completion_length/p75": 539.25, "completion_length/var": 48097.6953125, "epoch": 0.4050632911392405, "feature_vector_variance/max_squared_error": 104129.703125, "feature_vector_variance/metric": 28501.884765625, "generated_tokens/total": 1645214.0, "grad_norm": 1.6887093782424927, "learning_rate": 1.2895048502539883e-05, "loss": -0.5938, "mean_logprobs": -0.0235595703125, "mean_logprobs/var": 0.00017833709716796875, "num_completions/total": 3072, "per_sentence_gradient_norm": 1.902230978012085, "per_sentence_gradient_norm/max": 6.75360631942749, "per_sentence_gradient_norm/median": 1.8294830322265625, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.123035192489624, "per_sentence_gradient_norm/p85": 4.369470119476318, "per_sentence_gradient_norm/p90": 4.9472198486328125, "per_sentence_gradient_norm/p95": 5.93612813949585, "per_sentence_gradient_norm/p99": 6.545368671417236, "per_sentence_gradient_norm/var": 4.0881500244140625, "per_token_feature_norm": 185.4264373779297, "per_token_feature_norm/max": 276.0, "per_token_feature_norm/median": 186.0, "per_token_feature_norm/min": 89.0, "per_token_feature_norm/p25": 177.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 253.68475341796875, "per_token_full_gradient_variance/max_squared_error": 0.5393266677856445, "per_token_full_gradient_variance/variance": 0.0027347509749233723, "per_token_gradient_norm": 1.7631492614746094, "per_token_gradient_norm/max": 273.796875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 234.08132934570312, "per_token_policy_error_norm": 0.013314560055732727, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01257573626935482, "policy_entropy": 0.02581821009516716, "policy_entropy/max": 2.921875, "policy_entropy/median": 8.754432201385498e-08, "policy_entropy/min": 2.100641709190665e-18, "policy_entropy/p25": 5.711626727133989e-10, "policy_entropy/p75": 1.245737075805664e-05, "policy_entropy/var": 0.01659955084323883, "policy_error_vector_variance/max_squared_error": 1.999435544013977, "policy_error_vector_variance/metric": 0.013248348608613014, "policy_loss": -0.59375, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.24375000596046448, "policy_sharpness": 9.331642150878906, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.137232303619385, "reward": 0.59375, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24375000596046448, "rewards/accuracy_reward": 0.59375, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24375000596046448, "sentence_full_gradient_variance/max_squared_error": 4257.056640625, "sentence_full_gradient_variance/metric": 1344.1715087890625, "sentence_full_gradient_variance/p75": 1410.185546875, "sentence_full_gradient_variance/p90": 2161.68212890625, "sentence_full_gradient_variance/p95": 2971.548095703125, "sentence_full_gradient_variance/p99": 3385.615478515625, "state_level_variance/metric": 4.0881500244140625, "state_level_variance_full_gradient/metric": 1344.1715087890625, "step": 32 }, { "accuracy_reward": 0.6145833730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.23936405777931213, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 8.159687422448769e-05, "adam_stats/lr_effective_mean": -6.599265023332634e-10, "adam_stats/lr_effective_min": -8.130280184559524e-05, "adam_stats/m_t_max": 0.006358814891427755, "adam_stats/m_t_mean": 7.334534455477604e-13, "adam_stats/m_t_min": -0.006653168238699436, "adam_stats/v_t_max": 7.564546831417829e-05, "adam_stats/v_t_mean": 7.828366427320432e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.6145833730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.23936405777931213, "all_logprobs": -0.026181520894169807, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.84375, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.000553131103515625, "all_logprobs/p25": -8.344650268554688e-07, "all_logprobs/p5": -0.02978515625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.040660008788108826, "clip_ratio": 0.0, "completion_length": 402.0833435058594, "completion_length/correct": 360.1355895996094, "completion_length/correct/max": 999.0, "completion_length/correct/median": 331.0, "completion_length/correct/min": 130.0, "completion_length/correct/p25": 252.0, "completion_length/correct/p75": 415.0, "completion_length/correct/var": 27006.669921875, "completion_length/incorrect": 468.9729919433594, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 359.0, "completion_length/incorrect/min": 179.0, "completion_length/incorrect/p25": 291.0, "completion_length/incorrect/p75": 546.0, "completion_length/incorrect/var": 66985.8046875, "completion_length/max": 1024.0, "completion_length/median": 347.0, "completion_length/min": 130.0, "completion_length/p25": 276.75, "completion_length/p75": 457.25, "completion_length/var": 44707.78515625, "epoch": 0.4177215189873418, "feature_vector_variance/max_squared_error": 119799.1796875, "feature_vector_variance/metric": 28405.015625, "generated_tokens/total": 1683814.0, "grad_norm": 1.1532737016677856, "learning_rate": 1.270993777844248e-05, "loss": -0.6146, "mean_logprobs": -0.024658203125, "mean_logprobs/var": 0.00035858154296875, "num_completions/total": 3168, "per_sentence_gradient_norm": 1.9781935214996338, "per_sentence_gradient_norm/max": 10.446500778198242, "per_sentence_gradient_norm/median": 1.7446306943893433, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.378953218460083, "per_sentence_gradient_norm/p85": 4.153059959411621, "per_sentence_gradient_norm/p90": 4.585960388183594, "per_sentence_gradient_norm/p95": 5.852593898773193, "per_sentence_gradient_norm/p99": 8.148736000061035, "per_sentence_gradient_norm/var": 4.764223575592041, "per_token_feature_norm": 185.81629943847656, "per_token_feature_norm/max": 292.0, "per_token_feature_norm/median": 186.0, "per_token_feature_norm/min": 91.0, "per_token_feature_norm/p25": 177.0, "per_token_feature_norm/p75": 195.0, "per_token_feature_norm/var": 268.8024597167969, "per_token_full_gradient_variance/max_squared_error": 0.453858882188797, "per_token_full_gradient_variance/variance": 0.0027130895759910345, "per_token_gradient_norm": 1.7227967977523804, "per_token_gradient_norm/max": 270.6640625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 232.1024627685547, "per_token_policy_error_norm": 0.014807521365582943, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.014042024500668049, "policy_entropy": 0.02838842198252678, "policy_entropy/max": 2.390625, "policy_entropy/median": 8.475035429000854e-08, "policy_entropy/min": 2.358139725155972e-18, "policy_entropy/p25": 6.148184183984995e-10, "policy_entropy/p75": 1.2636184692382812e-05, "policy_entropy/var": 0.01852237619459629, "policy_error_vector_variance/max_squared_error": 1.9882205724716187, "policy_error_vector_variance/metric": 0.014780675061047077, "policy_loss": -0.6145833730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.23936405777931213, "policy_sharpness": 9.3014497756958, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.3597731590271, "reward": 0.6145833730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.23936405777931213, "rewards/accuracy_reward": 0.6145833730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.23936405777931213, "sentence_full_gradient_variance/max_squared_error": 3254.85205078125, "sentence_full_gradient_variance/metric": 1443.76171875, "sentence_full_gradient_variance/p75": 1927.1883544921875, "sentence_full_gradient_variance/p90": 1954.494140625, "sentence_full_gradient_variance/p95": 2349.352783203125, "sentence_full_gradient_variance/p99": 2991.469482421875, "state_level_variance/metric": 4.764223575592041, "state_level_variance_full_gradient/metric": 1443.76171875, "step": 33 }, { "accuracy_reward": 0.6354166865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.23410090804100037, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 8.225224155467004e-05, "adam_stats/lr_effective_mean": -3.3814723243708045e-10, "adam_stats/lr_effective_min": -7.970562728587538e-05, "adam_stats/m_t_max": 0.009018832817673683, "adam_stats/m_t_mean": 8.253135154456803e-12, "adam_stats/m_t_min": -0.00869470089673996, "adam_stats/v_t_max": 7.639272371307015e-05, "adam_stats/v_t_mean": 8.009861870994506e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.6354166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.23410090804100037, "all_logprobs": -0.021964287385344505, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -5.1875, "all_logprobs/p1": -0.69140625, "all_logprobs/p10": -0.00041618337854743004, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -0.018310546875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.029646949842572212, "clip_ratio": 0.0, "completion_length": 398.1145935058594, "completion_length/correct": 348.9671936035156, "completion_length/correct/max": 983.0, "completion_length/correct/median": 296.0, "completion_length/correct/min": 122.0, "completion_length/correct/p25": 217.0, "completion_length/correct/p75": 369.0, "completion_length/correct/var": 42240.23046875, "completion_length/incorrect": 483.77142333984375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 391.0, "completion_length/incorrect/min": 223.0, "completion_length/incorrect/p25": 300.0, "completion_length/incorrect/p75": 572.0, "completion_length/incorrect/var": 62154.0, "completion_length/max": 1024.0, "completion_length/median": 322.0, "completion_length/min": 122.0, "completion_length/p25": 249.0, "completion_length/p75": 426.5, "completion_length/var": 53176.75390625, "epoch": 0.43037974683544306, "feature_vector_variance/max_squared_error": 96109.953125, "feature_vector_variance/metric": 28462.298828125, "generated_tokens/total": 1722033.0, "grad_norm": 0.925184428691864, "learning_rate": 1.2518479547691437e-05, "loss": -0.6354, "mean_logprobs": -0.021484375, "mean_logprobs/var": 0.00016021728515625, "num_completions/total": 3264, "per_sentence_gradient_norm": 1.9704627990722656, "per_sentence_gradient_norm/max": 6.705019950866699, "per_sentence_gradient_norm/median": 1.9872440099716187, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.4253506660461426, "per_sentence_gradient_norm/p85": 3.9339282512664795, "per_sentence_gradient_norm/p90": 4.143547058105469, "per_sentence_gradient_norm/p95": 5.264770984649658, "per_sentence_gradient_norm/p99": 6.114856243133545, "per_sentence_gradient_norm/var": 3.4590139389038086, "per_token_feature_norm": 187.57284545898438, "per_token_feature_norm/max": 262.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 94.0, "per_token_feature_norm/p25": 178.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 291.5021667480469, "per_token_full_gradient_variance/max_squared_error": 0.6340331435203552, "per_token_full_gradient_variance/variance": 0.0028158272616565228, "per_token_gradient_norm": 1.7675323486328125, "per_token_gradient_norm/max": 279.84375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 234.74562072753906, "per_token_policy_error_norm": 0.01301953848451376, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.012135505676269531, "policy_entropy": 0.024079767987132072, "policy_entropy/max": 2.03125, "policy_entropy/median": 6.05359673500061e-08, "policy_entropy/min": 6.938893903907228e-18, "policy_entropy/p25": 4.984030965715647e-10, "policy_entropy/p75": 8.821487426757812e-06, "policy_entropy/var": 0.01336106937378645, "policy_error_vector_variance/max_squared_error": 1.9734032154083252, "policy_error_vector_variance/metric": 0.013013085350394249, "policy_loss": -0.6354166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.23410090804100037, "policy_sharpness": 9.347577095031738, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.9836010932922363, "reward": 0.6354166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.23410090804100037, "rewards/accuracy_reward": 0.6354166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.23410090804100037, "sentence_full_gradient_variance/max_squared_error": 2859.080078125, "sentence_full_gradient_variance/metric": 1618.9403076171875, "sentence_full_gradient_variance/p75": 2199.302001953125, "sentence_full_gradient_variance/p90": 2199.302001953125, "sentence_full_gradient_variance/p95": 2378.73779296875, "sentence_full_gradient_variance/p99": 2551.76611328125, "state_level_variance/metric": 3.4590139389038086, "state_level_variance_full_gradient/metric": 1618.9403076171875, "step": 34 }, { "accuracy_reward": 0.6354166865348816, "accuracy_reward/correct": 0.9999999403953552, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.23410086333751678, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 8.177275594789535e-05, "adam_stats/lr_effective_mean": -1.6730312968338268e-10, "adam_stats/lr_effective_min": -7.821090548532084e-05, "adam_stats/m_t_max": 0.009283766150474548, "adam_stats/m_t_mean": 2.958577480227653e-11, "adam_stats/m_t_min": -0.011801661923527718, "adam_stats/v_t_max": 8.098928083200008e-05, "adam_stats/v_t_mean": 8.221839008226706e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.6354166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.23410086333751678, "all_logprobs": -0.02516733482480049, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.625, "all_logprobs/p1": -0.7611328363418579, "all_logprobs/p10": -0.00092315673828125, "all_logprobs/p25": -1.1920928955078125e-06, "all_logprobs/p5": -0.03173828125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.036664288491010666, "clip_ratio": 0.0, "completion_length": 384.54168701171875, "completion_length/correct": 310.75408935546875, "completion_length/correct/max": 803.0, "completion_length/correct/median": 277.0, "completion_length/correct/min": 110.0, "completion_length/correct/p25": 212.0, "completion_length/correct/p75": 391.0, "completion_length/correct/var": 19456.955078125, "completion_length/incorrect": 513.1428833007812, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 402.0, "completion_length/incorrect/min": 175.0, "completion_length/incorrect/p25": 274.0, "completion_length/incorrect/p75": 692.5, "completion_length/incorrect/var": 75848.7109375, "completion_length/max": 1024.0, "completion_length/median": 307.0, "completion_length/min": 110.0, "completion_length/p25": 234.5, "completion_length/p75": 438.5, "completion_length/var": 49023.51171875, "epoch": 0.4430379746835443, "feature_vector_variance/max_squared_error": 126132.296875, "feature_vector_variance/metric": 28255.267578125, "generated_tokens/total": 1758949.0, "grad_norm": 0.9791134595870972, "learning_rate": 1.2320907072649045e-05, "loss": -0.6354, "mean_logprobs": -0.0245361328125, "mean_logprobs/var": 0.00023746490478515625, "num_completions/total": 3360, "per_sentence_gradient_norm": 2.2597503662109375, "per_sentence_gradient_norm/max": 10.228638648986816, "per_sentence_gradient_norm/median": 2.2535035610198975, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.695840835571289, "per_sentence_gradient_norm/p85": 4.491446495056152, "per_sentence_gradient_norm/p90": 4.873341083526611, "per_sentence_gradient_norm/p95": 6.179030895233154, "per_sentence_gradient_norm/p99": 8.96586799621582, "per_sentence_gradient_norm/var": 5.2054643630981445, "per_token_feature_norm": 187.69947814941406, "per_token_feature_norm/max": 294.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 93.0, "per_token_feature_norm/p25": 178.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 297.3809814453125, "per_token_full_gradient_variance/max_squared_error": 0.505133867263794, "per_token_full_gradient_variance/variance": 0.0028535446617752314, "per_token_gradient_norm": 1.8090813159942627, "per_token_gradient_norm/max": 300.15625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 240.60459899902344, "per_token_policy_error_norm": 0.014345119707286358, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.013281390070915222, "policy_entropy": 0.028078177943825722, "policy_entropy/max": 2.65625, "policy_entropy/median": 1.1827796697616577e-07, "policy_entropy/min": 4.174178364069192e-18, "policy_entropy/p25": 1.0186340659856796e-09, "policy_entropy/p75": 1.8477439880371094e-05, "policy_entropy/var": 0.016485940665006638, "policy_error_vector_variance/max_squared_error": 1.970550775527954, "policy_error_vector_variance/metric": 0.014334635809063911, "policy_loss": -0.6354166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.23410086333751678, "policy_sharpness": 9.25426197052002, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.50960111618042, "reward": 0.6354166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.23410086333751678, "rewards/accuracy_reward": 0.6354166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.23410086333751678, "sentence_full_gradient_variance/max_squared_error": 2555.013671875, "sentence_full_gradient_variance/metric": 1626.63525390625, "sentence_full_gradient_variance/p75": 2442.71630859375, "sentence_full_gradient_variance/p90": 2442.71630859375, "sentence_full_gradient_variance/p95": 2442.727783203125, "sentence_full_gradient_variance/p99": 2541.58935546875, "state_level_variance/metric": 5.2054643630981445, "state_level_variance_full_gradient/metric": 1626.63525390625, "step": 35 }, { "accuracy_reward": 0.5625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.24868419766426086, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 7.9709330748301e-05, "adam_stats/lr_effective_mean": -1.4276418935921242e-10, "adam_stats/lr_effective_min": -7.761928281979635e-05, "adam_stats/m_t_max": 0.01106830034404993, "adam_stats/m_t_mean": 3.8608869573630855e-11, "adam_stats/m_t_min": -0.012578701600432396, "adam_stats/v_t_max": 8.102368883555755e-05, "adam_stats/v_t_mean": 8.351936330031062e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.5625, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.24868419766426086, "all_logprobs": -0.023905832320451736, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.125, "all_logprobs/p1": -0.734375, "all_logprobs/p10": -0.0006256103515625, "all_logprobs/p25": -9.5367431640625e-07, "all_logprobs/p5": -0.0263671875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.03269638121128082, "clip_ratio": 0.0, "completion_length": 387.34375, "completion_length/correct": 323.870361328125, "completion_length/correct/max": 845.0, "completion_length/correct/median": 290.0, "completion_length/correct/min": 104.0, "completion_length/correct/p25": 228.75, "completion_length/correct/p75": 383.5, "completion_length/correct/var": 22713.587890625, "completion_length/incorrect": 468.952392578125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 382.0, "completion_length/incorrect/min": 147.0, "completion_length/incorrect/p25": 331.0, "completion_length/incorrect/p75": 655.5, "completion_length/incorrect/var": 43156.78125, "completion_length/max": 1024.0, "completion_length/median": 342.0, "completion_length/min": 104.0, "completion_length/p25": 283.0, "completion_length/p75": 427.5, "completion_length/var": 36531.8515625, "epoch": 0.45569620253164556, "feature_vector_variance/max_squared_error": 121200.4296875, "feature_vector_variance/metric": 28181.927734375, "generated_tokens/total": 1796134.0, "grad_norm": 0.8100199103355408, "learning_rate": 1.2117461064942437e-05, "loss": -0.5625, "mean_logprobs": -0.02392578125, "mean_logprobs/var": 0.00019073486328125, "num_completions/total": 3456, "per_sentence_gradient_norm": 1.8822901248931885, "per_sentence_gradient_norm/max": 8.48954963684082, "per_sentence_gradient_norm/median": 1.1853400468826294, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.004997730255127, "per_sentence_gradient_norm/p85": 4.054784774780273, "per_sentence_gradient_norm/p90": 4.7890119552612305, "per_sentence_gradient_norm/p95": 6.136005401611328, "per_sentence_gradient_norm/p99": 8.259586334228516, "per_sentence_gradient_norm/var": 4.8127641677856445, "per_token_feature_norm": 187.94586181640625, "per_token_feature_norm/max": 290.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 90.0, "per_token_feature_norm/p25": 178.0, "per_token_feature_norm/p75": 198.0, "per_token_feature_norm/var": 316.7757568359375, "per_token_full_gradient_variance/max_squared_error": 0.5407852530479431, "per_token_full_gradient_variance/variance": 0.0024329267907887697, "per_token_gradient_norm": 1.5535942316055298, "per_token_gradient_norm/max": 300.375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 206.51669311523438, "per_token_policy_error_norm": 0.0140431709587574, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01303790882229805, "policy_entropy": 0.026488730683922768, "policy_entropy/max": 2.359375, "policy_entropy/median": 8.707866072654724e-08, "policy_entropy/min": 5.3939058081153846e-18, "policy_entropy/p25": 8.076312951743603e-10, "policy_entropy/p75": 1.4483928680419922e-05, "policy_entropy/var": 0.015590637922286987, "policy_error_vector_variance/max_squared_error": 1.9886025190353394, "policy_error_vector_variance/metric": 0.014043338596820831, "policy_loss": -0.5625, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.24868419766426086, "policy_sharpness": 9.297379493713379, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.2683844566345215, "reward": 0.5625, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.24868419766426086, "rewards/accuracy_reward": 0.5625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.24868419766426086, "sentence_full_gradient_variance/max_squared_error": 3688.1142578125, "sentence_full_gradient_variance/metric": 1495.8192138671875, "sentence_full_gradient_variance/p75": 1622.5450439453125, "sentence_full_gradient_variance/p90": 2309.1259765625, "sentence_full_gradient_variance/p95": 3044.814208984375, "sentence_full_gradient_variance/p99": 3404.5869140625, "state_level_variance/metric": 4.8127641677856445, "state_level_variance_full_gradient/metric": 1495.8192138671875, "step": 36 }, { "accuracy_reward": 0.6458333730697632, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.2311403602361679, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 7.78140893089585e-05, "adam_stats/lr_effective_mean": -3.5245720231280586e-10, "adam_stats/lr_effective_min": -7.56446024752222e-05, "adam_stats/m_t_max": 0.008573738858103752, "adam_stats/m_t_mean": 3.18082747641224e-11, "adam_stats/m_t_min": -0.010417510755360126, "adam_stats/v_t_max": 8.111970237223431e-05, "adam_stats/v_t_mean": 8.594163575237346e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.6458333730697632, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.2311403602361679, "all_logprobs": -0.027021512389183044, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.875, "all_logprobs/p1": -0.82421875, "all_logprobs/p10": -0.000911712646484375, "all_logprobs/p25": -9.5367431640625e-07, "all_logprobs/p5": -0.03269040584564209, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.04117234796285629, "clip_ratio": 0.0, "completion_length": 375.22918701171875, "completion_length/correct": 316.56451416015625, "completion_length/correct/max": 952.0, "completion_length/correct/median": 259.0, "completion_length/correct/min": 113.0, "completion_length/correct/p25": 209.25, "completion_length/correct/p75": 365.75, "completion_length/correct/var": 31103.23828125, "completion_length/incorrect": 482.20587158203125, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 379.0, "completion_length/incorrect/min": 189.0, "completion_length/incorrect/p25": 295.75, "completion_length/incorrect/p75": 705.0, "completion_length/incorrect/var": 67318.9609375, "completion_length/max": 1024.0, "completion_length/median": 307.0, "completion_length/min": 113.0, "completion_length/p25": 227.5, "completion_length/p75": 413.0, "completion_length/var": 49697.84375, "epoch": 0.46835443037974683, "feature_vector_variance/max_squared_error": 130388.5234375, "feature_vector_variance/metric": 27796.173828125, "generated_tokens/total": 1832156.0, "grad_norm": 0.9933852553367615, "learning_rate": 1.1908389392193549e-05, "loss": -0.6458, "mean_logprobs": -0.0267333984375, "mean_logprobs/var": 0.0002269744873046875, "num_completions/total": 3552, "per_sentence_gradient_norm": 2.340986967086792, "per_sentence_gradient_norm/max": 9.98052978515625, "per_sentence_gradient_norm/median": 2.0758304595947266, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.8786518573760986, "per_sentence_gradient_norm/p85": 4.873327732086182, "per_sentence_gradient_norm/p90": 5.345749855041504, "per_sentence_gradient_norm/p95": 5.810295581817627, "per_sentence_gradient_norm/p99": 7.171797752380371, "per_sentence_gradient_norm/var": 4.959161281585693, "per_token_feature_norm": 187.6890106201172, "per_token_feature_norm/max": 296.0, "per_token_feature_norm/median": 187.0, "per_token_feature_norm/min": 90.5, "per_token_feature_norm/p25": 178.0, "per_token_feature_norm/p75": 197.0, "per_token_feature_norm/var": 314.5227355957031, "per_token_full_gradient_variance/max_squared_error": 0.5628474950790405, "per_token_full_gradient_variance/variance": 0.0031162777449935675, "per_token_gradient_norm": 2.0200397968292236, "per_token_gradient_norm/max": 267.40625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 280.1816101074219, "per_token_policy_error_norm": 0.015574412420392036, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.014867088757455349, "policy_entropy": 0.02860376611351967, "policy_entropy/max": 2.296875, "policy_entropy/median": 7.916241884231567e-08, "policy_entropy/min": 9.215718466126788e-18, "policy_entropy/p25": 6.693881005048752e-10, "policy_entropy/p75": 1.4066696166992188e-05, "policy_entropy/var": 0.017425380647182465, "policy_error_vector_variance/max_squared_error": 1.9880831241607666, "policy_error_vector_variance/metric": 0.01557009108364582, "policy_loss": -0.6458333730697632, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.2311403602361679, "policy_sharpness": 9.26259708404541, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.50959587097168, "reward": 0.6458333730697632, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.2311403602361679, "rewards/accuracy_reward": 0.6458333730697632, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.2311403602361679, "sentence_full_gradient_variance/max_squared_error": 2889.96484375, "sentence_full_gradient_variance/metric": 1680.3802490234375, "sentence_full_gradient_variance/p75": 2507.5419921875, "sentence_full_gradient_variance/p90": 2507.5419921875, "sentence_full_gradient_variance/p95": 2508.14697265625, "sentence_full_gradient_variance/p99": 2825.587890625, "state_level_variance/metric": 4.959161281585693, "state_level_variance_full_gradient/metric": 1680.3802490234375, "step": 37 }, { "accuracy_reward": 0.5104166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 1.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.25252193212509155, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 7.457016181433573e-05, "adam_stats/lr_effective_mean": -6.439670463542768e-10, "adam_stats/lr_effective_min": -7.590717723360285e-05, "adam_stats/m_t_max": 0.008431266993284225, "adam_stats/m_t_mean": 7.106511212828792e-11, "adam_stats/m_t_min": -0.012842557393014431, "adam_stats/v_t_max": 8.106363384285942e-05, "adam_stats/v_t_mean": 8.71931606777343e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.5104166865348816, "advantages/max": 1.0, "advantages/median": 1.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.25252193212509155, "all_logprobs": -0.030725453048944473, "all_logprobs/max": 0.0, "all_logprobs/median": 0.0, "all_logprobs/min": -6.375, "all_logprobs/p1": -0.97265625, "all_logprobs/p10": -0.00193023681640625, "all_logprobs/p25": -1.7881393432617188e-06, "all_logprobs/p5": -0.06201171875, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.043150290846824646, "clip_ratio": 0.0, "completion_length": 383.97918701171875, "completion_length/correct": 304.16326904296875, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 252.0, "completion_length/correct/min": 116.0, "completion_length/correct/p25": 190.0, "completion_length/correct/p75": 354.0, "completion_length/correct/var": 29402.015625, "completion_length/incorrect": 467.19146728515625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 407.0, "completion_length/incorrect/min": 120.0, "completion_length/incorrect/p25": 244.0, "completion_length/incorrect/p75": 617.5, "completion_length/incorrect/var": 79851.2421875, "completion_length/max": 1024.0, "completion_length/median": 289.0, "completion_length/min": 116.0, "completion_length/p25": 203.75, "completion_length/p75": 462.25, "completion_length/var": 60232.1484375, "epoch": 0.4810126582278481, "feature_vector_variance/max_squared_error": 126157.96875, "feature_vector_variance/metric": 28414.294921875, "generated_tokens/total": 1869018.0, "grad_norm": 0.7577505707740784, "learning_rate": 1.1693946776030601e-05, "loss": -0.5104, "mean_logprobs": -0.0299072265625, "mean_logprobs/var": 0.0002765655517578125, "num_completions/total": 3648, "per_sentence_gradient_norm": 1.8897145986557007, "per_sentence_gradient_norm/max": 8.358036994934082, "per_sentence_gradient_norm/median": 0.7277065515518188, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 3.439554214477539, "per_sentence_gradient_norm/p85": 4.555213451385498, "per_sentence_gradient_norm/p90": 5.077435493469238, "per_sentence_gradient_norm/p95": 5.437321662902832, "per_sentence_gradient_norm/p99": 7.324763774871826, "per_sentence_gradient_norm/var": 4.6839141845703125, "per_token_feature_norm": 190.46726989746094, "per_token_feature_norm/max": 290.0, "per_token_feature_norm/median": 189.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 201.0, "per_token_feature_norm/var": 336.78662109375, "per_token_full_gradient_variance/max_squared_error": 1.873395562171936, "per_token_full_gradient_variance/variance": 0.002655378542840481, "per_token_gradient_norm": 1.603522777557373, "per_token_gradient_norm/max": 320.625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 216.3585968017578, "per_token_policy_error_norm": 0.017791759222745895, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.016442693769931793, "policy_entropy": 0.03399217501282692, "policy_entropy/max": 2.8125, "policy_entropy/median": 1.043081283569336e-07, "policy_entropy/min": 2.4936649967166602e-17, "policy_entropy/p25": 8.958522812463343e-10, "policy_entropy/p75": 2.5391578674316406e-05, "policy_entropy/var": 0.021747173741459846, "policy_error_vector_variance/max_squared_error": 1.990100622177124, "policy_error_vector_variance/metric": 0.01778206042945385, "policy_loss": -0.5104166865348816, "policy_loss/max": 0.0, "policy_loss/median": -1.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.25252193212509155, "policy_sharpness": 9.163908004760742, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.117159843444824, "reward": 0.5104166865348816, "reward/max": 1.0, "reward/median": 1.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.25252193212509155, "rewards/accuracy_reward": 0.5104166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 1.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.25252193212509155, "sentence_full_gradient_variance/max_squared_error": 3657.96728515625, "sentence_full_gradient_variance/metric": 1804.8349609375, "sentence_full_gradient_variance/p75": 2061.67626953125, "sentence_full_gradient_variance/p90": 2754.333251953125, "sentence_full_gradient_variance/p95": 2903.119140625, "sentence_full_gradient_variance/p99": 3645.09814453125, "state_level_variance/metric": 4.6839141845703125, "state_level_variance_full_gradient/metric": 1804.8349609375, "step": 38 }, { "accuracy_reward": 0.3333333432674408, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.224561408162117, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 7.510496652685106e-05, "adam_stats/lr_effective_mean": -7.505654986417198e-10, "adam_stats/lr_effective_min": -7.5168652983848e-05, "adam_stats/m_t_max": 0.0077310046181082726, "adam_stats/m_t_mean": 4.8634633646738035e-12, "adam_stats/m_t_min": -0.015830764546990395, "adam_stats/v_t_max": 8.488877210766077e-05, "adam_stats/v_t_mean": 9.000362087563385e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.3333333432674408, "advantages/max": 1.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.224561408162117, "all_logprobs": -0.04088249430060387, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -8.5, "all_logprobs/p1": -1.2265625, "all_logprobs/p10": -0.0015579219907522202, "all_logprobs/p25": -6.9141387939453125e-06, "all_logprobs/p5": -0.064453125, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.080815888941288, "clip_ratio": 0.0, "completion_length": 406.03125, "completion_length/correct": 241.96875, "completion_length/correct/max": 534.0, "completion_length/correct/median": 222.0, "completion_length/correct/min": 74.0, "completion_length/correct/p25": 183.25, "completion_length/correct/p75": 261.25, "completion_length/correct/var": 11031.2568359375, "completion_length/incorrect": 488.0625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 293.0, "completion_length/incorrect/min": 38.0, "completion_length/incorrect/p25": 185.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 138125.90625, "completion_length/max": 1024.0, "completion_length/median": 252.0, "completion_length/min": 38.0, "completion_length/p25": 183.5, "completion_length/p75": 497.0, "completion_length/var": 108798.8828125, "epoch": 0.4936708860759494, "feature_vector_variance/max_squared_error": 101691.0546875, "feature_vector_variance/metric": 30672.9609375, "generated_tokens/total": 1907997.0, "grad_norm": 0.9893456101417542, "learning_rate": 1.1474394481749037e-05, "loss": -0.3333, "mean_logprobs": -0.06494140625, "mean_logprobs/var": 0.003204345703125, "num_completions/total": 3744, "per_sentence_gradient_norm": 2.979243278503418, "per_sentence_gradient_norm/max": 27.176097869873047, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 5.251232624053955, "per_sentence_gradient_norm/p85": 8.58282470703125, "per_sentence_gradient_norm/p90": 10.287256240844727, "per_sentence_gradient_norm/p95": 12.378087997436523, "per_sentence_gradient_norm/p99": 17.852985382080078, "per_sentence_gradient_norm/var": 26.445003509521484, "per_token_feature_norm": 184.0877685546875, "per_token_feature_norm/max": 296.0, "per_token_feature_norm/median": 188.0, "per_token_feature_norm/min": 91.0, "per_token_feature_norm/p25": 168.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 801.8426513671875, "per_token_full_gradient_variance/max_squared_error": 1.0500428676605225, "per_token_full_gradient_variance/variance": 0.0024233118165284395, "per_token_gradient_norm": 1.719861388206482, "per_token_gradient_norm/max": 352.1015625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 262.454833984375, "per_token_policy_error_norm": 0.02101988159120083, "per_token_policy_error_norm/max": 1.9921875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.020031431689858437, "policy_entropy": 0.04375506564974785, "policy_entropy/max": 3.09375, "policy_entropy/median": 3.680586814880371e-06, "policy_entropy/min": 1.968911145233676e-16, "policy_entropy/p25": 2.3632310330867767e-08, "policy_entropy/p75": 9.274482727050781e-05, "policy_entropy/var": 0.04100248962640762, "policy_error_vector_variance/max_squared_error": 1.9932156801223755, "policy_error_vector_variance/metric": 0.020901428535580635, "policy_loss": -0.3333333432674408, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.224561408162117, "policy_sharpness": 9.138248443603516, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.725141525268555, "reward": 0.3333333432674408, "reward/max": 1.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.224561408162117, "rewards/accuracy_reward": 0.3333333432674408, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.224561408162117, "sentence_full_gradient_variance/max_squared_error": 6236.90576171875, "sentence_full_gradient_variance/metric": 1830.4700927734375, "sentence_full_gradient_variance/p75": 3198.30810546875, "sentence_full_gradient_variance/p90": 4352.26953125, "sentence_full_gradient_variance/p95": 4742.6650390625, "sentence_full_gradient_variance/p99": 6032.32666015625, "state_level_variance/metric": 26.445003509521484, "state_level_variance_full_gradient/metric": 1830.4700927734375, "step": 39 }, { "accuracy_reward": 0.2604166865348816, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 1.0, "accuracy_reward/var": 0.19462718069553375, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 7.323922909563407e-05, "adam_stats/lr_effective_mean": -6.523052653584216e-10, "adam_stats/lr_effective_min": -7.363865006482229e-05, "adam_stats/m_t_max": 0.00912198331207037, "adam_stats/m_t_mean": 1.3396116160802052e-10, "adam_stats/m_t_min": -0.016521867364645004, "adam_stats/v_t_max": 8.994056406663731e-05, "adam_stats/v_t_mean": 9.374506379500325e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.2604166865348816, "advantages/max": 1.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 1.0, "advantages/var": 0.19462718069553375, "all_logprobs": -0.09335028380155563, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -9.75, "all_logprobs/p1": -2.453125, "all_logprobs/p10": -0.06201171875, "all_logprobs/p25": -6.532669067382812e-05, "all_logprobs/p5": -0.462890625, "all_logprobs/p75": 0.0, "all_logprobs/var": 0.20938526093959808, "clip_ratio": 0.0, "completion_length": 310.03125, "completion_length/correct": 229.3199920654297, "completion_length/correct/max": 717.0, "completion_length/correct/median": 161.0, "completion_length/correct/min": 81.0, "completion_length/correct/p25": 121.0, "completion_length/correct/p75": 273.0, "completion_length/correct/var": 25958.142578125, "completion_length/incorrect": 338.45068359375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 228.0, "completion_length/incorrect/min": 4.0, "completion_length/incorrect/p25": 152.5, "completion_length/incorrect/p75": 372.5, "completion_length/incorrect/var": 94038.6015625, "completion_length/max": 1024.0, "completion_length/median": 205.0, "completion_length/min": 4.0, "completion_length/p25": 134.0, "completion_length/p75": 353.5, "completion_length/var": 78167.359375, "epoch": 0.5063291139240507, "feature_vector_variance/max_squared_error": 100654.5703125, "feature_vector_variance/metric": 32547.650390625, "generated_tokens/total": 1937760.0, "grad_norm": 1.4170624017715454, "learning_rate": 1.125e-05, "loss": -0.2604, "mean_logprobs": -0.146484375, "mean_logprobs/var": 0.01953125, "num_completions/total": 3840, "per_sentence_gradient_norm": 3.561354398727417, "per_sentence_gradient_norm/max": 36.535179138183594, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 2.656987190246582, "per_sentence_gradient_norm/p85": 8.239227294921875, "per_sentence_gradient_norm/p90": 13.220216751098633, "per_sentence_gradient_norm/p95": 25.56181526184082, "per_sentence_gradient_norm/p99": 29.33481216430664, "per_sentence_gradient_norm/var": 61.81982421875, "per_token_feature_norm": 187.55197143554688, "per_token_feature_norm/max": 294.0, "per_token_feature_norm/median": 190.0, "per_token_feature_norm/min": 67.0, "per_token_feature_norm/p25": 173.0, "per_token_feature_norm/p75": 205.0, "per_token_feature_norm/var": 739.6951293945312, "per_token_full_gradient_variance/max_squared_error": 0.8607183694839478, "per_token_full_gradient_variance/variance": 0.0027449338231235743, "per_token_gradient_norm": 2.2030413150787354, "per_token_gradient_norm/max": 339.84375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 350.907958984375, "per_token_policy_error_norm": 0.04351407289505005, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.04002102091908455, "policy_entropy": 0.10038894414901733, "policy_entropy/max": 3.671875, "policy_entropy/median": 5.602836608886719e-06, "policy_entropy/min": 1.448494102440634e-16, "policy_entropy/p25": 5.390029400587082e-08, "policy_entropy/p75": 0.00074005126953125, "policy_entropy/var": 0.1173129603266716, "policy_error_vector_variance/max_squared_error": 1.9933149814605713, "policy_error_vector_variance/metric": 0.0430731400847435, "policy_loss": -0.2604166865348816, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": -1.0, "policy_loss/p25": -1.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.19462718069553375, "policy_sharpness": 8.458539962768555, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.898604393005371, "reward": 0.2604166865348816, "reward/max": 1.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 1.0, "reward/var": 0.19462718069553375, "rewards/accuracy_reward": 0.2604166865348816, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 1.0, "rewards/accuracy_reward/var": 0.19462718069553375, "sentence_full_gradient_variance/max_squared_error": 7169.11572265625, "sentence_full_gradient_variance/metric": 1694.8675537109375, "sentence_full_gradient_variance/p75": 566.5732421875, "sentence_full_gradient_variance/p90": 5835.837890625, "sentence_full_gradient_variance/p95": 6554.759765625, "sentence_full_gradient_variance/p99": 7169.037109375, "state_level_variance/metric": 61.81982421875, "state_level_variance_full_gradient/metric": 1694.8675537109375, "step": 40 }, { "accuracy_reward": 0.0625, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.05921052768826485, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 6.99689335306175e-05, "adam_stats/lr_effective_mean": -5.461016083785353e-10, "adam_stats/lr_effective_min": -7.048896077321842e-05, "adam_stats/m_t_max": 0.008141318336129189, "adam_stats/m_t_mean": 1.2836641472002697e-10, "adam_stats/m_t_min": -0.015306082554161549, "adam_stats/v_t_max": 8.991261711344123e-05, "adam_stats/v_t_mean": 9.402247209966408e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0625, "advantages/max": 1.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.05921052768826485, "all_logprobs": -0.05452713370323181, "all_logprobs/max": 0.0, "all_logprobs/median": -5.245208740234375e-06, "all_logprobs/min": -10.375, "all_logprobs/p1": -1.6751561164855957, "all_logprobs/p10": -0.0036773681640625, "all_logprobs/p25": -7.963180541992188e-05, "all_logprobs/p5": -0.10302734375, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.1261974275112152, "clip_ratio": 0.0, "completion_length": 571.4479370117188, "completion_length/correct": 453.3333435058594, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 267.0, "completion_length/correct/min": 252.0, "completion_length/correct/p25": 259.5, "completion_length/correct/p75": 520.0, "completion_length/correct/var": 93670.265625, "completion_length/incorrect": 579.322265625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 473.0, "completion_length/incorrect/min": 10.0, "completion_length/incorrect/p25": 165.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 180599.34375, "completion_length/max": 1024.0, "completion_length/median": 460.0, "completion_length/min": 10.0, "completion_length/p25": 170.75, "completion_length/p75": 1024.0, "completion_length/var": 175062.953125, "epoch": 0.5189873417721519, "feature_vector_variance/max_squared_error": 93976.4140625, "feature_vector_variance/metric": 23790.76171875, "generated_tokens/total": 1992619.0, "grad_norm": 0.3555797040462494, "learning_rate": 1.1021036720894182e-05, "loss": -0.0625, "mean_logprobs": -0.173828125, "mean_logprobs/var": 0.07080078125, "num_completions/total": 3936, "per_sentence_gradient_norm": 0.8421767950057983, "per_sentence_gradient_norm/max": 17.321975708007812, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 6.967432022094727, "per_sentence_gradient_norm/p99": 16.361900329589844, "per_sentence_gradient_norm/var": 11.767982482910156, "per_token_feature_norm": 165.28688049316406, "per_token_feature_norm/max": 294.0, "per_token_feature_norm/median": 163.0, "per_token_feature_norm/min": 65.0, "per_token_feature_norm/p25": 137.0, "per_token_feature_norm/p75": 194.0, "per_token_feature_norm/var": 1246.88134765625, "per_token_full_gradient_variance/max_squared_error": 1.3180063962936401, "per_token_full_gradient_variance/variance": 0.0008957079262472689, "per_token_gradient_norm": 0.6376948952674866, "per_token_gradient_norm/max": 354.609375, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 97.73904418945312, "per_token_policy_error_norm": 0.025080738589167595, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.023376593366265297, "policy_entropy": 0.06093117967247963, "policy_entropy/max": 3.640625, "policy_entropy/median": 7.295608520507812e-05, "policy_entropy/min": 2.3175905639050143e-15, "policy_entropy/p25": 4.76837158203125e-06, "policy_entropy/p75": 0.0008945465087890625, "policy_entropy/var": 0.07407860457897186, "policy_error_vector_variance/max_squared_error": 2.0049850940704346, "policy_error_vector_variance/metric": 0.024690967053174973, "policy_loss": -0.0625, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": -1.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.05921052768826485, "policy_sharpness": 8.825769424438477, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 7.51341438293457, "reward": 0.0625, "reward/max": 1.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.05921052768826485, "rewards/accuracy_reward": 0.0625, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.05921052768826485, "sentence_full_gradient_variance/max_squared_error": 6953.05615234375, "sentence_full_gradient_variance/metric": 307.2703552246094, "sentence_full_gradient_variance/p75": 16.3138370513916, "sentence_full_gradient_variance/p90": 16.3138370513916, "sentence_full_gradient_variance/p95": 546.7892456054688, "sentence_full_gradient_variance/p99": 6863.6953125, "state_level_variance/metric": 11.767982482910156, "state_level_variance_full_gradient/metric": 307.2703552246094, "step": 41 }, { "accuracy_reward": 0.02083333395421505, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.020614037290215492, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 6.93311303621158e-05, "adam_stats/lr_effective_mean": -6.360824644779939e-10, "adam_stats/lr_effective_min": -6.872215453768149e-05, "adam_stats/m_t_max": 0.007491331547498703, "adam_stats/m_t_mean": 1.1857537174364552e-10, "adam_stats/m_t_min": -0.014336997643113136, "adam_stats/v_t_max": 8.982300641946495e-05, "adam_stats/v_t_mean": 9.440801439219992e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.02083333395421505, "advantages/max": 1.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.020614037290215492, "all_logprobs": -0.031355585902929306, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-06, "all_logprobs/min": -7.125, "all_logprobs/p1": -0.9765625, "all_logprobs/p10": -0.0008544921875, "all_logprobs/p25": -4.291534423828125e-05, "all_logprobs/p5": -0.0162353515625, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.06490170955657959, "clip_ratio": 0.0, "completion_length": 711.1041870117188, "completion_length/correct": 129.5, "completion_length/correct/max": 190.0, "completion_length/correct/median": 69.0, "completion_length/correct/min": 69.0, "completion_length/correct/p25": 99.25, "completion_length/correct/p75": 159.75, "completion_length/correct/var": 7320.5, "completion_length/incorrect": 723.4786987304688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 4.0, "completion_length/incorrect/p25": 233.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 183370.359375, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 4.0, "completion_length/p25": 186.25, "completion_length/p75": 1024.0, "completion_length/var": 186859.828125, "epoch": 0.5316455696202531, "feature_vector_variance/max_squared_error": 95766.734375, "feature_vector_variance/metric": 23875.91796875, "generated_tokens/total": 2060885.0, "grad_norm": 0.42493686079978943, "learning_rate": 1.078778360091808e-05, "loss": -0.0208, "mean_logprobs": -0.10546875, "mean_logprobs/var": 0.0242919921875, "num_completions/total": 4032, "per_sentence_gradient_norm": 0.24888932704925537, "per_sentence_gradient_norm/max": 12.077791213989258, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 11.828696250915527, "per_sentence_gradient_norm/var": 2.9424660205841064, "per_token_feature_norm": 169.32696533203125, "per_token_feature_norm/max": 296.0, "per_token_feature_norm/median": 168.0, "per_token_feature_norm/min": 64.5, "per_token_feature_norm/p25": 139.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 1194.1590576171875, "per_token_full_gradient_variance/max_squared_error": 0.5501571893692017, "per_token_full_gradient_variance/variance": 8.554195665055886e-05, "per_token_gradient_norm": 0.045557901263237, "per_token_gradient_norm/max": 262.28125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 6.6277031898498535, "per_token_policy_error_norm": 0.015193182043731213, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.014619700610637665, "policy_entropy": 0.03578752651810646, "policy_entropy/max": 3.4375, "policy_entropy/median": 5.125999450683594e-05, "policy_entropy/min": 4.6629367034256575e-15, "policy_entropy/p25": 4.470348358154297e-06, "policy_entropy/p75": 0.000507354736328125, "policy_entropy/var": 0.037909068167209625, "policy_error_vector_variance/max_squared_error": 2.0021884441375732, "policy_error_vector_variance/metric": 0.014993365854024887, "policy_loss": -0.02083333395421505, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": -1.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.020614037290215492, "policy_sharpness": 9.115581512451172, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.459405899047852, "reward": 0.02083333395421505, "reward/max": 1.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.020614037290215492, "rewards/accuracy_reward": 0.02083333395421505, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.020614037290215492, "sentence_full_gradient_variance/max_squared_error": 11431.15234375, "sentence_full_gradient_variance/metric": 213.99461364746094, "sentence_full_gradient_variance/p75": 4.532228469848633, "sentence_full_gradient_variance/p90": 4.532228469848633, "sentence_full_gradient_variance/p95": 4.532228469848633, "sentence_full_gradient_variance/p99": 8823.5517578125, "state_level_variance/metric": 2.9424660205841064, "state_level_variance_full_gradient/metric": 213.99461364746094, "step": 42 }, { "accuracy_reward": 0.010416666977107525, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": NaN, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0104166679084301, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 6.136806041467935e-05, "adam_stats/lr_effective_mean": -5.890504195527058e-10, "adam_stats/lr_effective_min": -6.0231923271203414e-05, "adam_stats/m_t_max": 0.00667505944147706, "adam_stats/m_t_mean": 1.0259470356599465e-10, "adam_stats/m_t_min": -0.012790381908416748, "adam_stats/v_t_max": 8.975672972155735e-05, "adam_stats/v_t_mean": 9.433303096995083e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.010416666977107525, "advantages/max": 1.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0104166679084301, "all_logprobs": -0.02403979003429413, "all_logprobs/max": 0.0, "all_logprobs/median": -5.364418029785156e-06, "all_logprobs/min": -47.0, "all_logprobs/p1": -0.63671875, "all_logprobs/p10": -0.000492095947265625, "all_logprobs/p25": -3.8623809814453125e-05, "all_logprobs/p5": -0.012457266449928284, "all_logprobs/p75": -9.5367431640625e-07, "all_logprobs/var": 0.07488854229450226, "clip_ratio": 0.0, "completion_length": 775.8229370117188, "completion_length/correct": 428.0, "completion_length/correct/max": 428.0, "completion_length/correct/median": 428.0, "completion_length/correct/min": 428.0, "completion_length/correct/p25": 428.0, "completion_length/correct/p75": 428.0, "completion_length/correct/var": NaN, "completion_length/incorrect": 779.4842529296875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 6.0, "completion_length/incorrect/p25": 350.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 168726.359375, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 6.0, "completion_length/p25": 385.75, "completion_length/p75": 1024.0, "completion_length/var": 168237.1875, "epoch": 0.5443037974683544, "feature_vector_variance/max_squared_error": 77152.3046875, "feature_vector_variance/metric": 27778.845703125, "generated_tokens/total": 2135364.0, "grad_norm": 0.0849023386836052, "learning_rate": 1.0550524823068504e-05, "loss": -0.0104, "mean_logprobs": -0.08642578125, "mean_logprobs/var": 0.025146484375, "num_completions/total": 4128, "per_sentence_gradient_norm": 0.10819250345230103, "per_sentence_gradient_norm/max": 10.386480331420898, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.5193557143211365, "per_sentence_gradient_norm/var": 1.1237393617630005, "per_token_feature_norm": 182.80160522460938, "per_token_feature_norm/max": 280.0, "per_token_feature_norm/median": 190.0, "per_token_feature_norm/min": 72.5, "per_token_feature_norm/p25": 155.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 1070.0531005859375, "per_token_full_gradient_variance/max_squared_error": 0.46738240122795105, "per_token_full_gradient_variance/variance": 9.505638445261866e-05, "per_token_gradient_norm": 0.0596868060529232, "per_token_gradient_norm/max": 283.15625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 8.484573364257812, "per_token_policy_error_norm": 0.011721320450305939, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011396907269954681, "policy_entropy": 0.027826249599456787, "policy_entropy/max": 3.625, "policy_entropy/median": 7.677078247070312e-05, "policy_entropy/min": 2.0261570199409107e-15, "policy_entropy/p25": 1.4662742614746094e-05, "policy_entropy/p75": 0.0004634857177734375, "policy_entropy/var": 0.025287291035056114, "policy_error_vector_variance/max_squared_error": 2.003040075302124, "policy_error_vector_variance/metric": 0.01156323030591011, "policy_loss": -0.010416666977107525, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": -1.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0104166679084301, "policy_sharpness": 9.259340286254883, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.569526672363281, "reward": 0.010416666977107525, "reward/max": 1.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0104166679084301, "rewards/accuracy_reward": 0.010416666977107525, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0104166679084301, "sentence_full_gradient_variance/max_squared_error": 4429.38525390625, "sentence_full_gradient_variance/metric": 46.62510681152344, "sentence_full_gradient_variance/p75": 0.49079054594039917, "sentence_full_gradient_variance/p90": 0.49079054594039917, "sentence_full_gradient_variance/p95": 0.49079054594039917, "sentence_full_gradient_variance/p99": 221.94903564453125, "state_level_variance/metric": 1.1237393617630005, "state_level_variance_full_gradient/metric": 46.62510681152344, "step": 43 }, { "accuracy_reward": 0.010416666977107525, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": NaN, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0104166679084301, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 6.110392132541165e-05, "adam_stats/lr_effective_mean": -4.345036830333271e-10, "adam_stats/lr_effective_min": -6.182179640745744e-05, "adam_stats/m_t_max": 0.005697776563465595, "adam_stats/m_t_mean": 1.0531697736126944e-10, "adam_stats/m_t_min": -0.011451833881437778, "adam_stats/v_t_max": 8.97358768270351e-05, "adam_stats/v_t_mean": 9.43863823904545e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.010416666977107525, "advantages/max": 1.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0104166679084301, "all_logprobs": -0.016958996653556824, "all_logprobs/max": 0.0, "all_logprobs/median": -3.4570693969726562e-06, "all_logprobs/min": -9.5, "all_logprobs/p1": -0.38671875, "all_logprobs/p10": -0.00025253184139728546, "all_logprobs/p25": -2.3603439331054688e-05, "all_logprobs/p5": -0.0023651123046875, "all_logprobs/p75": -5.960464477539062e-07, "all_logprobs/var": 0.03812645003199577, "clip_ratio": 0.0, "completion_length": 958.09375, "completion_length/correct": 296.0, "completion_length/correct/max": 296.0, "completion_length/correct/median": 296.0, "completion_length/correct/min": 296.0, "completion_length/correct/p25": 296.0, "completion_length/correct/p75": 296.0, "completion_length/correct/var": NaN, "completion_length/incorrect": 965.0631713867188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 34.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 51296.33984375, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 34.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 55419.3515625, "epoch": 0.5569620253164557, "feature_vector_variance/max_squared_error": 97932.1484375, "feature_vector_variance/metric": 25331.828125, "generated_tokens/total": 2227341.0, "grad_norm": 0.2220391482114792, "learning_rate": 1.0309549450619342e-05, "loss": -0.0104, "mean_logprobs": -0.03564453125, "mean_logprobs/var": 0.00823974609375, "num_completions/total": 4224, "per_sentence_gradient_norm": 0.20616255700588226, "per_sentence_gradient_norm/max": 19.79160499572754, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.9896406531333923, "per_sentence_gradient_norm/var": 4.080287933349609, "per_token_feature_norm": 184.3118133544922, "per_token_feature_norm/max": 294.0, "per_token_feature_norm/median": 195.0, "per_token_feature_norm/min": 85.0, "per_token_feature_norm/p25": 155.0, "per_token_feature_norm/p75": 209.0, "per_token_feature_norm/var": 1056.150634765625, "per_token_full_gradient_variance/max_squared_error": 0.5899356007575989, "per_token_full_gradient_variance/variance": 9.804289584280923e-05, "per_token_gradient_norm": 0.06369326263666153, "per_token_gradient_norm/max": 275.7421875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 9.346524238586426, "per_token_policy_error_norm": 0.008256309665739536, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.008179628290235996, "policy_entropy": 0.018908899277448654, "policy_entropy/max": 3.59375, "policy_entropy/median": 5.030632019042969e-05, "policy_entropy/min": 5.1958437552457326e-14, "policy_entropy/p25": 1.0788440704345703e-05, "policy_entropy/p75": 0.0002956390380859375, "policy_entropy/var": 0.016946908086538315, "policy_error_vector_variance/max_squared_error": 2.0001347064971924, "policy_error_vector_variance/metric": 0.008161040022969246, "policy_loss": -0.010416666977107525, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": -1.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0104166679084301, "policy_sharpness": 9.412169456481934, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.593924045562744, "reward": 0.010416666977107525, "reward/max": 1.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0104166679084301, "rewards/accuracy_reward": 0.010416666977107525, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0104166679084301, "sentence_full_gradient_variance/max_squared_error": 7091.84423828125, "sentence_full_gradient_variance/metric": 74.6510009765625, "sentence_full_gradient_variance/p75": 0.7857999205589294, "sentence_full_gradient_variance/p90": 0.7857999205589294, "sentence_full_gradient_variance/p95": 0.7857999205589294, "sentence_full_gradient_variance/p99": 355.3603820800781, "state_level_variance/metric": 4.080287933349609, "state_level_variance_full_gradient/metric": 74.6510009765625, "step": 44 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 5.3716317779617384e-05, "adam_stats/lr_effective_mean": -3.819426996898301e-10, "adam_stats/lr_effective_min": -5.43476635357365e-05, "adam_stats/m_t_max": 0.005127999000251293, "adam_stats/m_t_mean": 9.478524215511541e-11, "adam_stats/m_t_min": -0.010306649841368198, "adam_stats/v_t_max": 8.964614244177938e-05, "adam_stats/v_t_mean": 9.42919960861266e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.012348691001534462, "all_logprobs/max": 0.0, "all_logprobs/median": -2.6226043701171875e-06, "all_logprobs/min": -10.0, "all_logprobs/p1": -0.22607421875, "all_logprobs/p10": -0.00014972686767578125, "all_logprobs/p25": -1.5497207641601562e-05, "all_logprobs/p5": -0.00124359130859375, "all_logprobs/p75": -5.960464477539062e-07, "all_logprobs/var": 0.027401775121688843, "clip_ratio": 0.0, "completion_length": 982.8229370117188, "completion_length/incorrect": 982.8229370117188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 10.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 36039.75, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 10.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 36039.75, "epoch": 0.569620253164557, "feature_vector_variance/max_squared_error": 91947.203125, "feature_vector_variance/metric": 21373.173828125, "generated_tokens/total": 2321692.0, "grad_norm": 0.0, "learning_rate": 1.0065151074942516e-05, "loss": 0.0, "mean_logprobs": -0.02197265625, "mean_logprobs/var": 0.0037689208984375, "num_completions/total": 4320, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 187.68887329101562, "per_token_feature_norm/max": 282.0, "per_token_feature_norm/median": 199.0, "per_token_feature_norm/min": 76.5, "per_token_feature_norm/p25": 162.0, "per_token_feature_norm/p75": 210.0, "per_token_feature_norm/var": 908.1309814453125, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.006062696687877178, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.006010676268488169, "policy_entropy": 0.014288565143942833, "policy_entropy/max": 3.53125, "policy_entropy/median": 3.933906555175781e-05, "policy_entropy/min": 1.865174681370263e-14, "policy_entropy/p25": 1.0728836059570312e-05, "policy_entropy/p75": 0.0002002716064453125, "policy_entropy/var": 0.010936564765870571, "policy_error_vector_variance/max_squared_error": 2.0023324489593506, "policy_error_vector_variance/metric": 0.0060136085376143456, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.523794174194336, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 2.8865699768066406, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 45 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 4.717902629636228e-05, "adam_stats/lr_effective_mean": -3.3543542943270666e-10, "adam_stats/lr_effective_min": -4.7733781684655696e-05, "adam_stats/m_t_max": 0.0046151988208293915, "adam_stats/m_t_mean": 8.530677553242327e-11, "adam_stats/m_t_min": -0.009275984950363636, "adam_stats/v_t_max": 8.955649536801502e-05, "adam_stats/v_t_mean": 9.419772253882464e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.008922823704779148, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-06, "all_logprobs/min": -10.6875, "all_logprobs/p1": -0.11767578125, "all_logprobs/p10": -5.984306335449219e-05, "all_logprobs/p25": -1.1086463928222656e-05, "all_logprobs/p5": -0.000522613525390625, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.016913753002882004, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.5822784810126582, "feature_vector_variance/max_squared_error": 90445.2734375, "feature_vector_variance/metric": 17940.4375, "generated_tokens/total": 2419996.0, "grad_norm": 0.0, "learning_rate": 9.817627457812105e-06, "loss": 0.0, "mean_logprobs": -0.0089111328125, "mean_logprobs/var": 4.553794860839844e-05, "num_completions/total": 4416, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 190.3798065185547, "per_token_feature_norm/max": 272.0, "per_token_feature_norm/median": 200.0, "per_token_feature_norm/min": 111.5, "per_token_feature_norm/p25": 180.0, "per_token_feature_norm/p75": 208.0, "per_token_feature_norm/var": 777.6032104492188, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0046983761712908745, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.004650456365197897, "policy_entropy": 0.01017804816365242, "policy_entropy/max": 3.65625, "policy_entropy/median": 3.4809112548828125e-05, "policy_entropy/min": 5.420588422566652e-10, "policy_entropy/p25": 6.794929504394531e-06, "policy_entropy/p75": 0.00014495849609375, "policy_entropy/var": 0.006918000057339668, "policy_error_vector_variance/max_squared_error": 1.9988243579864502, "policy_error_vector_variance/metric": 0.004691194277256727, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.650833129882812, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": 0.040770553052425385, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 2.118744373321533, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 46 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 4.13987654610537e-05, "adam_stats/lr_effective_mean": -2.9431682091463074e-10, "adam_stats/lr_effective_min": -4.188576349406503e-05, "adam_stats/m_t_max": 0.00415367865934968, "adam_stats/m_t_mean": 7.677608548917192e-11, "adam_stats/m_t_min": -0.00834838580340147, "adam_stats/v_t_max": 8.946694288169965e-05, "adam_stats/v_t_mean": 9.410351838046171e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.010479215532541275, "all_logprobs/max": 0.0, "all_logprobs/median": -3.2186508178710938e-06, "all_logprobs/min": -8.3125, "all_logprobs/p1": -0.1318359375, "all_logprobs/p10": -0.00011110305786132812, "all_logprobs/p25": -9.775161743164062e-06, "all_logprobs/p5": -0.002105712890625, "all_logprobs/p75": -4.76837158203125e-07, "all_logprobs/var": 0.022615985944867134, "clip_ratio": 0.0, "completion_length": 997.34375, "completion_length/incorrect": 997.34375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 715.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 6532.4814453125, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 715.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 6532.4814453125, "epoch": 0.5949367088607594, "feature_vector_variance/max_squared_error": 104223.890625, "feature_vector_variance/metric": 16092.8349609375, "generated_tokens/total": 2515741.0, "grad_norm": 0.0, "learning_rate": 9.567280168627493e-06, "loss": 0.0, "mean_logprobs": -0.01080322265625, "mean_logprobs/var": 8.96453857421875e-05, "num_completions/total": 4512, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 192.92245483398438, "per_token_feature_norm/max": 272.0, "per_token_feature_norm/median": 200.0, "per_token_feature_norm/min": 91.5, "per_token_feature_norm/p25": 191.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 567.2166748046875, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.005296458024531603, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.005621721502393484, "policy_entropy": 0.01185651682317257, "policy_entropy/max": 3.59375, "policy_entropy/median": 4.5299530029296875e-05, "policy_entropy/min": 4.912736883966318e-15, "policy_entropy/p25": 7.241964340209961e-06, "policy_entropy/p75": 0.00012874603271484375, "policy_entropy/var": 0.006944267079234123, "policy_error_vector_variance/max_squared_error": 1.9978327751159668, "policy_error_vector_variance/metric": 0.0052240570075809956, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.480667114257812, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 3.4109513759613037, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 47 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 3.6292003642302006e-05, "adam_stats/lr_effective_mean": -2.5799212766131063e-10, "adam_stats/lr_effective_min": -3.6719102354254574e-05, "adam_stats/m_t_max": 0.0037383106537163258, "adam_stats/m_t_mean": 6.909844918467911e-11, "adam_stats/m_t_min": -0.007513546850532293, "adam_stats/v_t_max": 8.937747770687565e-05, "adam_stats/v_t_mean": 9.40094009582726e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.012625663541257381, "all_logprobs/max": 0.0, "all_logprobs/median": -2.86102294921875e-06, "all_logprobs/min": -9.75, "all_logprobs/p1": -0.251953125, "all_logprobs/p10": -0.00034599192440509796, "all_logprobs/p25": -1.1563301086425781e-05, "all_logprobs/p5": -0.00640869140625, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.025985701009631157, "clip_ratio": 0.0, "completion_length": 974.65625, "completion_length/incorrect": 974.65625, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 566.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 12775.9541015625, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 566.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 12775.9541015625, "epoch": 0.6075949367088608, "feature_vector_variance/max_squared_error": 116958.15625, "feature_vector_variance/metric": 15242.013671875, "generated_tokens/total": 2609308.0, "grad_norm": 0.0, "learning_rate": 9.314414216997507e-06, "loss": 0.0, "mean_logprobs": -0.01324462890625, "mean_logprobs/var": 0.0001392364501953125, "num_completions/total": 4608, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 194.76870727539062, "per_token_feature_norm/max": 294.0, "per_token_feature_norm/median": 200.0, "per_token_feature_norm/min": 93.5, "per_token_feature_norm/p25": 194.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 461.99896240234375, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0065216259099543095, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.006782847456634045, "policy_entropy": 0.01476955134421587, "policy_entropy/max": 2.890625, "policy_entropy/median": 4.00543212890625e-05, "policy_entropy/min": 6.800116025829084e-15, "policy_entropy/p25": 6.318092346191406e-06, "policy_entropy/p75": 0.0001506805419921875, "policy_entropy/var": 0.0076024229638278484, "policy_error_vector_variance/max_squared_error": 2.0000829696655273, "policy_error_vector_variance/metric": 0.006458761636167765, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.34635066986084, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 4.033677577972412, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 48 }, { "accuracy_reward": 0.010416666977107525, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": NaN, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.01041666604578495, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 3.94462222175207e-05, "adam_stats/lr_effective_mean": -2.1784112014255896e-10, "adam_stats/lr_effective_min": -3.89592933061067e-05, "adam_stats/m_t_max": 0.0033061145804822445, "adam_stats/m_t_mean": 6.210324615674168e-11, "adam_stats/m_t_min": -0.006787178572267294, "adam_stats/v_t_max": 8.928814349928871e-05, "adam_stats/v_t_mean": 9.391673203018591e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.010416666977107525, "advantages/max": 1.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.01041666604578495, "all_logprobs": -0.014969123527407646, "all_logprobs/max": 0.0, "all_logprobs/median": -2.5033950805664062e-06, "all_logprobs/min": -13.25, "all_logprobs/p1": -0.283203125, "all_logprobs/p10": -0.0015869140625, "all_logprobs/p25": -1.9669532775878906e-05, "all_logprobs/p5": -0.01251220703125, "all_logprobs/p75": -7.152557373046875e-07, "all_logprobs/var": 0.03371544927358627, "clip_ratio": 0.0, "completion_length": 958.1771240234375, "completion_length/correct": 698.0, "completion_length/correct/max": 698.0, "completion_length/correct/median": 698.0, "completion_length/correct/min": 698.0, "completion_length/correct/p25": 698.0, "completion_length/correct/p75": 698.0, "completion_length/correct/var": NaN, "completion_length/incorrect": 960.9158325195312, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 695.0, "completion_length/incorrect/p25": 948.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 12464.80078125, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 695.0, "completion_length/p25": 904.25, "completion_length/p75": 1024.0, "completion_length/var": 13053.6435546875, "epoch": 0.620253164556962, "feature_vector_variance/max_squared_error": 115034.640625, "feature_vector_variance/metric": 13900.2138671875, "generated_tokens/total": 2701293.0, "grad_norm": 0.020190216600894928, "learning_rate": 9.059337681133194e-06, "loss": -0.0104, "mean_logprobs": -0.01531982421875, "mean_logprobs/var": 0.000118255615234375, "num_completions/total": 4704, "per_sentence_gradient_norm": 0.013342383317649364, "per_sentence_gradient_norm/max": 1.2808687686920166, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.06404734402894974, "per_sentence_gradient_norm/var": 0.01708984188735485, "per_token_feature_norm": 195.3380889892578, "per_token_feature_norm/max": 276.0, "per_token_feature_norm/median": 201.0, "per_token_feature_norm/min": 96.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 402.3762512207031, "per_token_full_gradient_variance/max_squared_error": 0.25224000215530396, "per_token_full_gradient_variance/variance": 0.0001404577196808532, "per_token_gradient_norm": 0.009719479829072952, "per_token_gradient_norm/max": 215.15625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 1.1146116256713867, "per_token_policy_error_norm": 0.007636456284672022, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.007964490912854671, "policy_entropy": 0.017796648666262627, "policy_entropy/max": 2.59375, "policy_entropy/median": 3.647804260253906e-05, "policy_entropy/min": 1.2101430968414206e-14, "policy_entropy/p25": 1.1026859283447266e-05, "policy_entropy/p75": 0.0002498626708984375, "policy_entropy/var": 0.00830957479774952, "policy_error_vector_variance/max_squared_error": 2.0011558532714844, "policy_error_vector_variance/metric": 0.0075166248716413975, "policy_loss": -0.010416666977107525, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": -1.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.01041666604578495, "policy_sharpness": 9.04474925994873, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.869333744049072, "reward": 0.010416666977107525, "reward/max": 1.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.01041666604578495, "rewards/accuracy_reward": 0.010416666977107525, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.01041666604578495, "sentence_full_gradient_variance/max_squared_error": 1486.6572265625, "sentence_full_gradient_variance/metric": 15.649023056030273, "sentence_full_gradient_variance/p75": 0.16472652554512024, "sentence_full_gradient_variance/p90": 0.16472652554512024, "sentence_full_gradient_variance/p95": 0.16472652554512024, "sentence_full_gradient_variance/p99": 74.49388885498047, "state_level_variance/metric": 0.01708984188735485, "state_level_variance_full_gradient/metric": 15.649023056030273, "step": 49 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 3.451169322943315e-05, "adam_stats/lr_effective_mean": -1.905747365471555e-10, "adam_stats/lr_effective_min": -3.408553311601281e-05, "adam_stats/m_t_max": 0.002975502982735634, "adam_stats/m_t_mean": 5.58929291738508e-11, "adam_stats/m_t_min": -0.006108460482209921, "adam_stats/v_t_max": 8.919885294744745e-05, "adam_stats/v_t_mean": 9.382279675396177e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.01905687153339386, "all_logprobs/max": 0.0, "all_logprobs/median": -2.5033950805664062e-06, "all_logprobs/min": -10.75, "all_logprobs/p1": -0.404296875, "all_logprobs/p10": -0.004180908203125, "all_logprobs/p25": -3.7670135498046875e-05, "all_logprobs/p5": -0.0233154296875, "all_logprobs/p75": -8.344650268554688e-07, "all_logprobs/var": 0.03982796519994736, "clip_ratio": 0.0, "completion_length": 916.7083740234375, "completion_length/incorrect": 916.7083740234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 699.0, "completion_length/incorrect/p25": 779.75, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 16285.0908203125, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 699.0, "completion_length/p25": 779.75, "completion_length/p75": 1024.0, "completion_length/var": 16285.0908203125, "epoch": 0.6329113924050633, "feature_vector_variance/max_squared_error": 112073.3984375, "feature_vector_variance/metric": 13019.9521484375, "generated_tokens/total": 2789297.0, "grad_norm": 0.0, "learning_rate": 8.80236133250198e-06, "loss": 0.0, "mean_logprobs": -0.019775390625, "mean_logprobs/var": 0.00015926361083984375, "num_completions/total": 4800, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 197.09912109375, "per_token_feature_norm/max": 276.0, "per_token_feature_norm/median": 201.0, "per_token_feature_norm/min": 101.0, "per_token_feature_norm/p25": 197.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 298.1082458496094, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.010067102499306202, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010969030670821667, "policy_entropy": 0.022243505343794823, "policy_entropy/max": 2.515625, "policy_entropy/median": 3.552436828613281e-05, "policy_entropy/min": 2.5646151868841116e-14, "policy_entropy/p25": 1.2695789337158203e-05, "policy_entropy/p75": 0.000446319580078125, "policy_entropy/var": 0.00901518389582634, "policy_error_vector_variance/max_squared_error": 2.0009922981262207, "policy_error_vector_variance/metric": 0.009841587394475937, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 8.862541198730469, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 6.84550666809082, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 50 }, { "accuracy_reward": 0.010416666977107525, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": NaN, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0104166679084301, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 3.7189365684753284e-05, "adam_stats/lr_effective_mean": -1.0793188565116907e-10, "adam_stats/lr_effective_min": -3.752171323867515e-05, "adam_stats/m_t_max": 0.002667984925210476, "adam_stats/m_t_mean": 5.030922206605837e-11, "adam_stats/m_t_min": -0.005506721790879965, "adam_stats/v_t_max": 8.911010081646964e-05, "adam_stats/v_t_mean": 9.376149162632075e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.010416666977107525, "advantages/max": 1.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0104166679084301, "all_logprobs": -0.021490326151251793, "all_logprobs/max": 0.0, "all_logprobs/median": -2.6226043701171875e-06, "all_logprobs/min": -10.1875, "all_logprobs/p1": -0.482421875, "all_logprobs/p10": -0.00689697265625, "all_logprobs/p25": -8.630752563476562e-05, "all_logprobs/p5": -0.031982421875, "all_logprobs/p75": -8.344650268554688e-07, "all_logprobs/var": 0.04635564982891083, "clip_ratio": 0.0, "completion_length": 913.96875, "completion_length/correct": 767.0, "completion_length/correct/max": 767.0, "completion_length/correct/median": 767.0, "completion_length/correct/min": 767.0, "completion_length/correct/p25": 767.0, "completion_length/correct/p75": 767.0, "completion_length/correct/var": NaN, "completion_length/incorrect": 915.5158081054688, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 597.0, "completion_length/incorrect/p25": 779.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 17496.826171875, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 597.0, "completion_length/p25": 778.0, "completion_length/p75": 1024.0, "completion_length/var": 17542.41015625, "epoch": 0.6455696202531646, "feature_vector_variance/max_squared_error": 122065.28125, "feature_vector_variance/metric": 12676.966796875, "generated_tokens/total": 2877038.0, "grad_norm": 0.08676548302173615, "learning_rate": 8.543798257200491e-06, "loss": -0.0104, "mean_logprobs": -0.0220947265625, "mean_logprobs/var": 0.0001392364501953125, "num_completions/total": 4896, "per_sentence_gradient_norm": 0.03983534872531891, "per_sentence_gradient_norm/max": 3.824193239212036, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.19122132658958435, "per_sentence_gradient_norm/var": 0.15233807265758514, "per_token_feature_norm": 196.85177612304688, "per_token_feature_norm/max": 284.0, "per_token_feature_norm/median": 201.0, "per_token_feature_norm/min": 86.0, "per_token_feature_norm/p25": 197.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 296.00213623046875, "per_token_full_gradient_variance/max_squared_error": 0.35273367166519165, "per_token_full_gradient_variance/variance": 0.00016314660024363548, "per_token_gradient_norm": 0.03342970833182335, "per_token_gradient_norm/max": 282.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 5.742141246795654, "per_token_policy_error_norm": 0.011087208054959774, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.011416896246373653, "policy_entropy": 0.026425110176205635, "policy_entropy/max": 2.5, "policy_entropy/median": 3.838539123535156e-05, "policy_entropy/min": 2.3869795029440866e-14, "policy_entropy/p25": 1.3768672943115234e-05, "policy_entropy/p75": 0.0009307861328125, "policy_entropy/var": 0.010501213371753693, "policy_error_vector_variance/max_squared_error": 2.003192663192749, "policy_error_vector_variance/metric": 0.010782566852867603, "policy_loss": -0.010416666977107525, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": -1.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0104166679084301, "policy_sharpness": 8.688817024230957, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 7.7606635093688965, "reward": 0.010416666977107525, "reward/max": 1.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0104166679084301, "rewards/accuracy_reward": 0.010416666977107525, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0104166679084301, "sentence_full_gradient_variance/max_squared_error": 834.8873291015625, "sentence_full_gradient_variance/metric": 8.788288116455078, "sentence_full_gradient_variance/p75": 0.09250830113887787, "sentence_full_gradient_variance/p90": 0.09250830113887787, "sentence_full_gradient_variance/p95": 0.09250830113887787, "sentence_full_gradient_variance/p99": 41.83479690551758, "state_level_variance/metric": 0.15233807265758514, "state_level_variance_full_gradient/metric": 8.788288116455078, "step": 51 }, { "accuracy_reward": 0.010416666977107525, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": NaN, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0104166679084301, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 3.991244011558592e-05, "adam_stats/lr_effective_mean": -1.0957710433467938e-10, "adam_stats/lr_effective_min": -4.009692929685116e-05, "adam_stats/m_t_max": 0.0024826747830957174, "adam_stats/m_t_mean": 4.439178538651056e-11, "adam_stats/m_t_min": -0.00487517798319459, "adam_stats/v_t_max": 8.902630361262709e-05, "adam_stats/v_t_mean": 9.36818764923908e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.010416666977107525, "advantages/max": 1.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0104166679084301, "all_logprobs": -0.019590241834521294, "all_logprobs/max": 0.0, "all_logprobs/median": -2.5033950805664062e-06, "all_logprobs/min": -11.25, "all_logprobs/p1": -0.353515625, "all_logprobs/p10": -0.00927734375, "all_logprobs/p25": -0.00016021728515625, "all_logprobs/p5": -0.032470703125, "all_logprobs/p75": -7.152557373046875e-07, "all_logprobs/var": 0.04671847075223923, "clip_ratio": 0.0, "completion_length": 885.3333740234375, "completion_length/correct": 758.0, "completion_length/correct/max": 758.0, "completion_length/correct/median": 758.0, "completion_length/correct/min": 758.0, "completion_length/correct/p25": 758.0, "completion_length/correct/p75": 758.0, "completion_length/correct/var": NaN, "completion_length/incorrect": 886.6737060546875, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 875.0, "completion_length/incorrect/min": 609.0, "completion_length/incorrect/p25": 758.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 17552.4765625, "completion_length/max": 1024.0, "completion_length/median": 863.0, "completion_length/min": 609.0, "completion_length/p25": 757.75, "completion_length/p75": 1024.0, "completion_length/var": 17540.181640625, "epoch": 0.6582278481012658, "feature_vector_variance/max_squared_error": 116341.53125, "feature_vector_variance/metric": 12003.46875, "generated_tokens/total": 2962030.0, "grad_norm": 0.06010730192065239, "learning_rate": 8.283963474507402e-06, "loss": -0.0104, "mean_logprobs": -0.019775390625, "mean_logprobs/var": 0.00011587142944335938, "num_completions/total": 4992, "per_sentence_gradient_norm": 0.042603250592947006, "per_sentence_gradient_norm/max": 4.089911937713623, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.20450808107852936, "per_sentence_gradient_norm/var": 0.17424356937408447, "per_token_feature_norm": 197.0250701904297, "per_token_feature_norm/max": 286.0, "per_token_feature_norm/median": 201.0, "per_token_feature_norm/min": 97.0, "per_token_feature_norm/p25": 197.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 250.73135375976562, "per_token_full_gradient_variance/max_squared_error": 0.6547530293464661, "per_token_full_gradient_variance/variance": 0.00020422058878466487, "per_token_gradient_norm": 0.03647582232952118, "per_token_gradient_norm/max": 265.78125, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4.939617156982422, "per_token_policy_error_norm": 0.009380500763654709, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.010022565722465515, "policy_entropy": 0.02660648711025715, "policy_entropy/max": 2.9375, "policy_entropy/median": 3.695487976074219e-05, "policy_entropy/min": 2.3314683517128287e-14, "policy_entropy/p25": 1.1444091796875e-05, "policy_entropy/p75": 0.00160980224609375, "policy_entropy/var": 0.00998766254633665, "policy_error_vector_variance/max_squared_error": 2.0047993659973145, "policy_error_vector_variance/metric": 0.009038901887834072, "policy_loss": -0.010416666977107525, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": -1.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0104166679084301, "policy_sharpness": 8.451441764831543, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 9.125, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.085346221923828, "reward": 0.010416666977107525, "reward/max": 1.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0104166679084301, "rewards/accuracy_reward": 0.010416666977107525, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0104166679084301, "sentence_full_gradient_variance/max_squared_error": 902.9407958984375, "sentence_full_gradient_variance/metric": 9.504640579223633, "sentence_full_gradient_variance/p75": 0.10004884004592896, "sentence_full_gradient_variance/p90": 0.10004884004592896, "sentence_full_gradient_variance/p95": 0.10004884004592896, "sentence_full_gradient_variance/p99": 45.244842529296875, "state_level_variance/metric": 0.17424356937408447, "state_level_variance_full_gradient/metric": 9.504640579223633, "step": 52 }, { "accuracy_reward": 0.010416666977107525, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": NaN, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.01041666604578495, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 4.1196250094799325e-05, "adam_stats/lr_effective_mean": -1.0252798610110858e-10, "adam_stats/lr_effective_min": -4.116824857192114e-05, "adam_stats/m_t_max": 0.0024233662988990545, "adam_stats/m_t_mean": 4.2528341959169325e-11, "adam_stats/m_t_min": -0.004307551309466362, "adam_stats/v_t_max": 8.894631173461676e-05, "adam_stats/v_t_mean": 9.359640666672941e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.010416666977107525, "advantages/max": 1.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.01041666604578495, "all_logprobs": -0.01831008307635784, "all_logprobs/max": 0.0, "all_logprobs/median": -1.9073486328125e-06, "all_logprobs/min": -13.0, "all_logprobs/p1": -0.314453125, "all_logprobs/p10": -0.007626324892044067, "all_logprobs/p25": -0.0001609325408935547, "all_logprobs/p5": -0.0242919921875, "all_logprobs/p75": -4.76837158203125e-07, "all_logprobs/var": 0.04686633497476578, "clip_ratio": 0.0, "completion_length": 897.1041870117188, "completion_length/correct": 608.0, "completion_length/correct/max": 608.0, "completion_length/correct/median": 608.0, "completion_length/correct/min": 608.0, "completion_length/correct/p25": 608.0, "completion_length/correct/p75": 608.0, "completion_length/correct/var": NaN, "completion_length/incorrect": 900.1473999023438, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1023.0, "completion_length/incorrect/min": 598.0, "completion_length/incorrect/p25": 766.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 18157.5546875, "completion_length/max": 1024.0, "completion_length/median": 994.0, "completion_length/min": 598.0, "completion_length/p25": 766.0, "completion_length/p75": 1024.0, "completion_length/var": 18855.482421875, "epoch": 0.6708860759493671, "feature_vector_variance/max_squared_error": 132463.03125, "feature_vector_variance/metric": 12444.5576171875, "generated_tokens/total": 3048152.0, "grad_norm": 0.0498359352350235, "learning_rate": 8.02317355308094e-06, "loss": -0.0104, "mean_logprobs": -0.0186767578125, "mean_logprobs/var": 9.775161743164062e-05, "num_completions/total": 5088, "per_sentence_gradient_norm": 0.0298846997320652, "per_sentence_gradient_norm/max": 2.8689310550689697, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.14345531165599823, "per_sentence_gradient_norm/var": 0.08573713898658752, "per_token_feature_norm": 196.82583618164062, "per_token_feature_norm/max": 298.0, "per_token_feature_norm/median": 202.0, "per_token_feature_norm/min": 106.5, "per_token_feature_norm/p25": 196.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 281.5151672363281, "per_token_full_gradient_variance/max_squared_error": 0.21556660532951355, "per_token_full_gradient_variance/variance": 9.178274922305718e-05, "per_token_gradient_norm": 0.020253943279385567, "per_token_gradient_norm/max": 292.546875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 3.0836446285247803, "per_token_policy_error_norm": 0.00867646373808384, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.009718007408082485, "policy_entropy": 0.023300563916563988, "policy_entropy/max": 2.609375, "policy_entropy/median": 2.765655517578125e-05, "policy_entropy/min": 2.731148640577885e-14, "policy_entropy/p25": 8.821487426757812e-06, "policy_entropy/p75": 0.00162506103515625, "policy_entropy/var": 0.00832089502364397, "policy_error_vector_variance/max_squared_error": 2.004382610321045, "policy_error_vector_variance/metric": 0.008437464945018291, "policy_loss": -0.010416666977107525, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": -1.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.01041666604578495, "policy_sharpness": 8.444849967956543, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.078402519226074, "reward": 0.010416666977107525, "reward/max": 1.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.01041666604578495, "rewards/accuracy_reward": 0.010416666977107525, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.01041666604578495, "sentence_full_gradient_variance/max_squared_error": 2340.238525390625, "sentence_full_gradient_variance/metric": 24.634090423583984, "sentence_full_gradient_variance/p75": 0.2593061625957489, "sentence_full_gradient_variance/p90": 0.2593061625957489, "sentence_full_gradient_variance/p95": 0.2593061625957489, "sentence_full_gradient_variance/p99": 117.26541137695312, "state_level_variance/metric": 0.08573713898658752, "state_level_variance_full_gradient/metric": 24.634090423583984, "step": 53 }, { "accuracy_reward": 0.0416666679084301, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.04035087302327156, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 4.1380746552022174e-05, "adam_stats/lr_effective_mean": -1.701094543005155e-10, "adam_stats/lr_effective_min": -4.172290209680796e-05, "adam_stats/m_t_max": 0.0038139529060572386, "adam_stats/m_t_mean": 5.2516675036473615e-11, "adam_stats/m_t_min": -0.004837504122406244, "adam_stats/v_t_max": 8.938402606872842e-05, "adam_stats/v_t_mean": 9.379167581480274e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0416666679084301, "advantages/max": 1.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.04035087302327156, "all_logprobs": -0.018208084627985954, "all_logprobs/max": 0.0, "all_logprobs/median": -1.5497207641601562e-06, "all_logprobs/min": -10.4375, "all_logprobs/p1": -0.29398441314697266, "all_logprobs/p10": -0.0072692930698394775, "all_logprobs/p25": -0.00016498565673828125, "all_logprobs/p5": -0.0247802734375, "all_logprobs/p75": -4.76837158203125e-07, "all_logprobs/var": 0.045340172946453094, "clip_ratio": 0.0, "completion_length": 909.8854370117188, "completion_length/correct": 789.5, "completion_length/correct/max": 916.0, "completion_length/correct/median": 711.0, "completion_length/correct/min": 646.0, "completion_length/correct/p25": 694.75, "completion_length/correct/p75": 892.75, "completion_length/correct/var": 17292.333984375, "completion_length/incorrect": 915.1195678710938, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 583.0, "completion_length/incorrect/p25": 766.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 18456.43359375, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 583.0, "completion_length/p25": 765.75, "completion_length/p75": 1024.0, "completion_length/var": 18862.14453125, "epoch": 0.6835443037974683, "feature_vector_variance/max_squared_error": 128481.6640625, "feature_vector_variance/metric": 13756.8779296875, "generated_tokens/total": 3135501.0, "grad_norm": 0.24119922518730164, "learning_rate": 7.76174622526876e-06, "loss": -0.0417, "mean_logprobs": -0.0184326171875, "mean_logprobs/var": 9.822845458984375e-05, "num_completions/total": 5184, "per_sentence_gradient_norm": 0.08508698642253876, "per_sentence_gradient_norm/max": 2.2600011825561523, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 2.102017641067505, "per_sentence_gradient_norm/var": 0.16948433220386505, "per_token_feature_norm": 196.3035430908203, "per_token_feature_norm/max": 302.0, "per_token_feature_norm/median": 201.0, "per_token_feature_norm/min": 90.5, "per_token_feature_norm/p25": 194.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 292.2860412597656, "per_token_full_gradient_variance/max_squared_error": 0.521711528301239, "per_token_full_gradient_variance/variance": 0.0006355394143611193, "per_token_gradient_norm": 0.0733434185385704, "per_token_gradient_norm/max": 270.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 11.85666275024414, "per_token_policy_error_norm": 0.008644561283290386, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.01008686888962984, "policy_entropy": 0.023007260635495186, "policy_entropy/max": 2.875, "policy_entropy/median": 2.4318695068359375e-05, "policy_entropy/min": 1.2378986724570495e-14, "policy_entropy/p25": 7.62939453125e-06, "policy_entropy/p75": 0.001708984375, "policy_entropy/var": 0.008507993072271347, "policy_error_vector_variance/max_squared_error": 2.001669406890869, "policy_error_vector_variance/metric": 0.008430085144937038, "policy_loss": -0.0416666679084301, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": -1.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.04035087302327156, "policy_sharpness": 8.423958778381348, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 9.304251670837402, "reward": 0.0416666679084301, "reward/max": 1.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.04035087302327156, "rewards/accuracy_reward": 0.0416666679084301, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.04035087302327156, "sentence_full_gradient_variance/max_squared_error": 1745.2164306640625, "sentence_full_gradient_variance/metric": 35.216552734375, "sentence_full_gradient_variance/p75": 1.2287405729293823, "sentence_full_gradient_variance/p90": 1.2287405729293823, "sentence_full_gradient_variance/p95": 1.2287405729293823, "sentence_full_gradient_variance/p99": 1211.5140380859375, "state_level_variance/metric": 0.16948433220386505, "state_level_variance_full_gradient/metric": 35.216552734375, "step": 54 }, { "accuracy_reward": 0.010416666977107525, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": NaN, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0104166679084301, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 4.3001011363230646e-05, "adam_stats/lr_effective_mean": -2.0805179801186569e-10, "adam_stats/lr_effective_min": -4.26955011789687e-05, "adam_stats/m_t_max": 0.005083539057523012, "adam_stats/m_t_mean": 6.056932733145004e-11, "adam_stats/m_t_min": -0.006514398846775293, "adam_stats/v_t_max": 8.97614736459218e-05, "adam_stats/v_t_mean": 9.390454559776718e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.010416666977107525, "advantages/max": 1.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0104166679084301, "all_logprobs": -0.014287373051047325, "all_logprobs/max": 0.0, "all_logprobs/median": -1.0728836059570312e-06, "all_logprobs/min": -9.5625, "all_logprobs/p1": -0.11016607284545898, "all_logprobs/p10": -0.006256103515625, "all_logprobs/p25": -5.7578086853027344e-05, "all_logprobs/p5": -0.0208740234375, "all_logprobs/p75": -4.76837158203125e-07, "all_logprobs/var": 0.03754856064915657, "clip_ratio": 0.0, "completion_length": 930.6979370117188, "completion_length/correct": 818.0, "completion_length/correct/max": 818.0, "completion_length/correct/median": 818.0, "completion_length/correct/min": 818.0, "completion_length/correct/p25": 818.0, "completion_length/correct/p75": 818.0, "completion_length/correct/var": NaN, "completion_length/incorrect": 931.88427734375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 717.0, "completion_length/incorrect/p25": 807.5, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 13698.3818359375, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 717.0, "completion_length/p25": 808.75, "completion_length/p75": 1024.0, "completion_length/var": 13689.28515625, "epoch": 0.6962025316455697, "feature_vector_variance/max_squared_error": 127096.96875, "feature_vector_variance/metric": 10858.01953125, "generated_tokens/total": 3224848.0, "grad_norm": 0.20061138272285461, "learning_rate": 7.5e-06, "loss": -0.0104, "mean_logprobs": -0.014404296875, "mean_logprobs/var": 5.412101745605469e-05, "num_completions/total": 5280, "per_sentence_gradient_norm": 0.02683371864259243, "per_sentence_gradient_norm/max": 2.5760369300842285, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.12880970537662506, "per_sentence_gradient_norm/var": 0.06912465393543243, "per_token_feature_norm": 197.2216339111328, "per_token_feature_norm/max": 284.0, "per_token_feature_norm/median": 201.0, "per_token_feature_norm/min": 88.5, "per_token_feature_norm/p25": 196.0, "per_token_feature_norm/p75": 204.0, "per_token_feature_norm/var": 173.5215606689453, "per_token_full_gradient_variance/max_squared_error": 0.46737638115882874, "per_token_full_gradient_variance/variance": 0.00014652679965365678, "per_token_gradient_norm": 0.023584431037306786, "per_token_gradient_norm/max": 281.875, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 4.087490081787109, "per_token_policy_error_norm": 0.006333955097943544, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.008202489465475082, "policy_entropy": 0.01886001043021679, "policy_entropy/max": 3.53125, "policy_entropy/median": 1.7523765563964844e-05, "policy_entropy/min": 3.552713678800501e-14, "policy_entropy/p25": 7.361173629760742e-06, "policy_entropy/p75": 0.00064849853515625, "policy_entropy/var": 0.005995327141135931, "policy_error_vector_variance/max_squared_error": 2.002094030380249, "policy_error_vector_variance/metric": 0.006227508187294006, "policy_loss": -0.010416666977107525, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": -1.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0104166679084301, "policy_sharpness": 8.618640899658203, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 8.536084175109863, "reward": 0.010416666977107525, "reward/max": 1.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0104166679084301, "rewards/accuracy_reward": 0.010416666977107525, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0104166679084301, "sentence_full_gradient_variance/max_squared_error": 520.394775390625, "sentence_full_gradient_variance/metric": 5.477839469909668, "sentence_full_gradient_variance/p75": 0.057661477476358414, "sentence_full_gradient_variance/p90": 0.057661477476358414, "sentence_full_gradient_variance/p95": 0.057661477476358414, "sentence_full_gradient_variance/p99": 26.07610511779785, "state_level_variance/metric": 0.06912465393543243, "state_level_variance_full_gradient/metric": 5.477839469909668, "step": 55 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 3.736868529813364e-05, "adam_stats/lr_effective_mean": -1.8079490682332278e-10, "adam_stats/lr_effective_min": -3.710320743266493e-05, "adam_stats/m_t_max": 0.004575185012072325, "adam_stats/m_t_mean": 5.451239806775199e-11, "adam_stats/m_t_min": -0.00586295872926712, "adam_stats/v_t_max": 8.967171015683562e-05, "adam_stats/v_t_mean": 9.381063634239517e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.00949817057698965, "all_logprobs/max": 0.0, "all_logprobs/median": -9.5367431640625e-07, "all_logprobs/min": -9.5, "all_logprobs/p1": -0.044677734375, "all_logprobs/p10": -0.005615234375, "all_logprobs/p25": -4.673004150390625e-05, "all_logprobs/p5": -0.0140380859375, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.025007382035255432, "clip_ratio": 0.0, "completion_length": 996.3646240234375, "completion_length/incorrect": 996.3646240234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 766.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 3518.044677734375, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 766.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 3518.044677734375, "epoch": 0.7088607594936709, "feature_vector_variance/max_squared_error": 108177.4921875, "feature_vector_variance/metric": 9335.302734375, "generated_tokens/total": 3320499.0, "grad_norm": 0.0, "learning_rate": 7.238253774731245e-06, "loss": 0.0, "mean_logprobs": -0.00958251953125, "mean_logprobs/var": 2.5033950805664062e-05, "num_completions/total": 5376, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 197.364990234375, "per_token_feature_norm/max": 240.0, "per_token_feature_norm/median": 200.0, "per_token_feature_norm/min": 110.5, "per_token_feature_norm/p25": 197.0, "per_token_feature_norm/p75": 203.0, "per_token_feature_norm/var": 117.90509033203125, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.003946598153561354, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.005340578034520149, "policy_entropy": 0.014144821092486382, "policy_entropy/max": 2.28125, "policy_entropy/median": 1.4603137969970703e-05, "policy_entropy/min": 1.7462298274040222e-08, "policy_entropy/p25": 6.616115570068359e-06, "policy_entropy/p75": 0.00054168701171875, "policy_entropy/var": 0.0034712739288806915, "policy_error_vector_variance/max_squared_error": 2.0025320053100586, "policy_error_vector_variance/metric": 0.0039128512144088745, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 8.640771865844727, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 7.859025001525879, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 56 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 3.243310857214965e-05, "adam_stats/lr_effective_mean": -1.569107066501374e-10, "adam_stats/lr_effective_min": -3.2202711736317724e-05, "adam_stats/m_t_max": 0.004117666278034449, "adam_stats/m_t_mean": 4.906114750569124e-11, "adam_stats/m_t_min": -0.005276662763208151, "adam_stats/v_t_max": 8.958204125519842e-05, "adam_stats/v_t_mean": 9.371682249681434e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.005783304572105408, "all_logprobs/max": 0.0, "all_logprobs/median": -7.152557373046875e-07, "all_logprobs/min": -14.75, "all_logprobs/p1": -0.0280139222741127, "all_logprobs/p10": -0.001617431640625, "all_logprobs/p25": -1.5497207641601562e-05, "all_logprobs/p5": -0.00653076171875, "all_logprobs/p75": -3.5762786865234375e-07, "all_logprobs/var": 0.01599842868745327, "clip_ratio": 0.0, "completion_length": 1017.2083740234375, "completion_length/incorrect": 1017.2083740234375, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 869.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 729.9772338867188, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 869.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 729.9772338867188, "epoch": 0.7215189873417721, "feature_vector_variance/max_squared_error": 61362.91015625, "feature_vector_variance/metric": 7627.29541015625, "generated_tokens/total": 3418151.0, "grad_norm": 0.0, "learning_rate": 6.976826446919061e-06, "loss": 0.0, "mean_logprobs": -0.005828857421875, "mean_logprobs/var": 2.0623207092285156e-05, "num_completions/total": 5472, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 197.9491424560547, "per_token_feature_norm/max": 231.0, "per_token_feature_norm/median": 199.0, "per_token_feature_norm/min": 132.0, "per_token_feature_norm/p25": 197.0, "per_token_feature_norm/p75": 202.0, "per_token_feature_norm/var": 48.93632888793945, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0024544268380850554, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.003071665531024337, "policy_entropy": 0.008523098193109035, "policy_entropy/max": 1.578125, "policy_entropy/median": 1.1026859283447266e-05, "policy_entropy/min": 1.0384246706962585e-07, "policy_entropy/p25": 5.602836608886719e-06, "policy_entropy/p75": 0.00019359588623046875, "policy_entropy/var": 0.0023983391001820564, "policy_error_vector_variance/max_squared_error": 2.001377820968628, "policy_error_vector_variance/metric": 0.002448733663186431, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.021873474121094, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": 0.08773108571767807, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 5.544895648956299, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 57 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 2.8112550353398547e-05, "adam_stats/lr_effective_mean": -1.3600340298403069e-10, "adam_stats/lr_effective_min": -2.7912863515666686e-05, "adam_stats/m_t_max": 0.003705899463966489, "adam_stats/m_t_mean": 4.41550337959562e-11, "adam_stats/m_t_min": -0.00474899634718895, "adam_stats/v_t_max": 8.949245966505259e-05, "adam_stats/v_t_mean": 9.36231040610247e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0034618002828210592, "all_logprobs/max": 0.0, "all_logprobs/median": -4.76837158203125e-07, "all_logprobs/min": -9.8125, "all_logprobs/p1": -0.0113525390625, "all_logprobs/p10": -0.00016603432595729828, "all_logprobs/p25": -2.9802322387695312e-06, "all_logprobs/p5": -0.0010986328125, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.009606746956706047, "clip_ratio": 0.0, "completion_length": 1022.9166870117188, "completion_length/incorrect": 1022.9166870117188, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 959.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 59.298336029052734, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 959.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 59.298336029052734, "epoch": 0.7341772151898734, "feature_vector_variance/max_squared_error": 58616.46875, "feature_vector_variance/metric": 6586.900390625, "generated_tokens/total": 3516351.0, "grad_norm": 0.0, "learning_rate": 6.7160365254926005e-06, "loss": 0.0, "mean_logprobs": -0.00347900390625, "mean_logprobs/var": 1.150369644165039e-05, "num_completions/total": 5568, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 198.85302734375, "per_token_feature_norm/max": 229.0, "per_token_feature_norm/median": 199.0, "per_token_feature_norm/min": 130.0, "per_token_feature_norm/p25": 197.0, "per_token_feature_norm/p75": 201.0, "per_token_feature_norm/var": 21.88013458251953, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0016125214751809835, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0019383702892810106, "policy_entropy": 0.004142402671277523, "policy_entropy/max": 1.7109375, "policy_entropy/median": 8.702278137207031e-06, "policy_entropy/min": 9.685754776000977e-08, "policy_entropy/p25": 4.6193599700927734e-06, "policy_entropy/p75": 4.172325134277344e-05, "policy_entropy/var": 0.0015733788022771478, "policy_error_vector_variance/max_squared_error": 2.0002224445343018, "policy_error_vector_variance/metric": 0.0016132512828335166, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.52045726776123, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 2.5996925830841064, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 58 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 2.4334412955795415e-05, "adam_stats/lr_effective_mean": -1.1772156860434535e-10, "adam_stats/lr_effective_min": -2.416157440165989e-05, "adam_stats/m_t_max": 0.003335309447720647, "adam_stats/m_t_mean": 3.973953111024997e-11, "adam_stats/m_t_min": -0.004274096805602312, "adam_stats/v_t_max": 8.940296538639814e-05, "adam_stats/v_t_mean": 9.352948970864361e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.002523738192394376, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -7.375, "all_logprobs/p1": -0.006438294425606728, "all_logprobs/p10": -1.5854835510253906e-05, "all_logprobs/p25": -1.0728836059570312e-06, "all_logprobs/p5": -0.00020503997802734375, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.005606651306152344, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.7468354430379747, "feature_vector_variance/max_squared_error": 58663.234375, "feature_vector_variance/metric": 5521.65771484375, "generated_tokens/total": 3614655.0, "grad_norm": 0.0, "learning_rate": 6.456201742799511e-06, "loss": 0.0, "mean_logprobs": -0.0025177001953125, "mean_logprobs/var": 6.854534149169922e-06, "num_completions/total": 5664, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 198.8364715576172, "per_token_feature_norm/max": 228.0, "per_token_feature_norm/median": 199.0, "per_token_feature_norm/min": 142.0, "per_token_feature_norm/p25": 198.0, "per_token_feature_norm/p75": 201.0, "per_token_feature_norm/var": 9.811599731445312, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.001247326610609889, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0014045453863218427, "policy_entropy": 0.002957599703222513, "policy_entropy/max": 1.3984375, "policy_entropy/median": 6.467103958129883e-06, "policy_entropy/min": 6.239861249923706e-08, "policy_entropy/p25": 3.6656856536865234e-06, "policy_entropy/p75": 1.6450881958007812e-05, "policy_entropy/var": 0.0014034333871677518, "policy_error_vector_variance/max_squared_error": 1.9990670680999756, "policy_error_vector_variance/metric": 0.0012504273327067494, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.765714645385742, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": 0.125, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 1.1870394945144653, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 59 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 2.1034229575889185e-05, "adam_stats/lr_effective_mean": -1.0175302961323851e-10, "adam_stats/lr_effective_min": -2.0884843252133578e-05, "adam_stats/m_t_max": 0.0030017783865332603, "adam_stats/m_t_mean": 3.5765581468671925e-11, "adam_stats/m_t_min": -0.0038466870319098234, "adam_stats/v_t_max": 8.931356569519266e-05, "adam_stats/v_t_mean": 9.343596209243632e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.002360585378482938, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -9.8125, "all_logprobs/p1": -0.0036163330078125, "all_logprobs/p10": -7.3909759521484375e-06, "all_logprobs/p25": -9.5367431640625e-07, "all_logprobs/p5": -9.529595263302326e-05, "all_logprobs/p75": -2.384185791015625e-07, "all_logprobs/var": 0.006303890608251095, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.759493670886076, "feature_vector_variance/max_squared_error": 56998.05859375, "feature_vector_variance/metric": 5229.984375, "generated_tokens/total": 3712959.0, "grad_norm": 0.0, "learning_rate": 6.197638667498023e-06, "loss": 0.0, "mean_logprobs": -0.0023651123046875, "mean_logprobs/var": 7.450580596923828e-06, "num_completions/total": 5760, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 198.7112274169922, "per_token_feature_norm/max": 227.0, "per_token_feature_norm/median": 199.0, "per_token_feature_norm/min": 166.0, "per_token_feature_norm/p25": 197.0, "per_token_feature_norm/p75": 201.0, "per_token_feature_norm/var": 9.54031753540039, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0011222760658711195, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.001273747649975121, "policy_entropy": 0.002487937919795513, "policy_entropy/max": 1.6328125, "policy_entropy/median": 5.8710575103759766e-06, "policy_entropy/min": 6.612390279769897e-08, "policy_entropy/p25": 3.2782554626464844e-06, "policy_entropy/p75": 1.4841556549072266e-05, "policy_entropy/var": 0.0013171250466257334, "policy_error_vector_variance/max_squared_error": 1.9999969005584717, "policy_error_vector_variance/metric": 0.0011249319650232792, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.824203491210938, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": 0.107269287109375, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.8717421293258667, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 60 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 1.8154814824811183e-05, "adam_stats/lr_effective_mean": -8.78209865939894e-11, "adam_stats/lr_effective_min": -1.8025886674877256e-05, "adam_stats/m_t_max": 0.0027016005478799343, "adam_stats/m_t_mean": 3.218900840318284e-11, "adam_stats/m_t_min": -0.0034620182123035192, "adam_stats/v_t_max": 8.922425331547856e-05, "adam_stats/v_t_mean": 9.334252988602021e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.001572518376633525, "all_logprobs/max": 0.0, "all_logprobs/median": -3.5762786865234375e-07, "all_logprobs/min": -5.125, "all_logprobs/p1": -0.0022125244140625, "all_logprobs/p10": -3.4570693969726562e-06, "all_logprobs/p25": -8.344650268554688e-07, "all_logprobs/p5": -3.600120544433594e-05, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.002055008430033922, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.7721518987341772, "feature_vector_variance/max_squared_error": 47203.09375, "feature_vector_variance/metric": 4854.21435546875, "generated_tokens/total": 3811263.0, "grad_norm": 0.0, "learning_rate": 5.9406623188668065e-06, "loss": 0.0, "mean_logprobs": -0.0015716552734375, "mean_logprobs/var": 2.0116567611694336e-06, "num_completions/total": 5856, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 198.10018920898438, "per_token_feature_norm/max": 210.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 176.0, "per_token_feature_norm/p25": 196.0, "per_token_feature_norm/p75": 200.0, "per_token_feature_norm/var": 11.809164047241211, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0008707444067113101, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.000809347431641072, "policy_entropy": 0.00211352133192122, "policy_entropy/max": 1.515625, "policy_entropy/median": 5.155801773071289e-06, "policy_entropy/min": 3.888271749019623e-08, "policy_entropy/p25": 2.8461217880249023e-06, "policy_entropy/p75": 1.33514404296875e-05, "policy_entropy/var": 0.001144443522207439, "policy_error_vector_variance/max_squared_error": 1.9731088876724243, "policy_error_vector_variance/metric": 0.0008721583872102201, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.87490463256836, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.6208994388580322, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 61 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 1.5645473467884585e-05, "adam_stats/lr_effective_mean": -7.567993454138389e-11, "adam_stats/lr_effective_min": -1.5534375052084215e-05, "adam_stats/m_t_max": 0.0024314403999596834, "adam_stats/m_t_mean": 2.8970112767034983e-11, "adam_stats/m_t_min": -0.0031158162746578455, "adam_stats/v_t_max": 8.913502824725583e-05, "adam_stats/v_t_mean": 9.32491844157779e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.001605576602742076, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -5.125, "all_logprobs/p1": -0.001983642578125, "all_logprobs/p10": -2.384185791015625e-06, "all_logprobs/p25": -7.152557373046875e-07, "all_logprobs/p5": -1.7404556274414062e-05, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.002287495182827115, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.7848101265822784, "feature_vector_variance/max_squared_error": 44576.55078125, "feature_vector_variance/metric": 4775.162109375, "generated_tokens/total": 3909567.0, "grad_norm": 0.0, "learning_rate": 5.685585783002493e-06, "loss": 0.0, "mean_logprobs": -0.0016021728515625, "mean_logprobs/var": 2.0712614059448242e-06, "num_completions/total": 5952, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 197.481689453125, "per_token_feature_norm/max": 210.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 177.0, "per_token_feature_norm/p25": 196.0, "per_token_feature_norm/p75": 200.0, "per_token_feature_norm/var": 10.533915519714355, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0008640686864964664, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0007871701382100582, "policy_entropy": 0.0021298876963555813, "policy_entropy/max": 1.40625, "policy_entropy/median": 4.231929779052734e-06, "policy_entropy/min": 2.584420144557953e-08, "policy_entropy/p25": 2.3096799850463867e-06, "policy_entropy/p75": 1.1742115020751953e-05, "policy_entropy/var": 0.0012056384002789855, "policy_error_vector_variance/max_squared_error": 1.9732478857040405, "policy_error_vector_variance/metric": 0.0008648074581287801, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.89059829711914, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": 0.125, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.5622826814651489, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 62 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 1.3461311027640477e-05, "adam_stats/lr_effective_mean": -6.511260280950282e-11, "adam_stats/lr_effective_min": -1.3365730410441756e-05, "adam_stats/m_t_max": 0.002188296290114522, "adam_stats/m_t_mean": 2.6073107214918956e-11, "adam_stats/m_t_min": -0.0028042346239089966, "adam_stats/v_t_max": 8.904589776648208e-05, "adam_stats/v_t_mean": 9.315594302894414e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.001563018187880516, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -6.5625, "all_logprobs/p1": -0.0014715585857629776, "all_logprobs/p10": -2.1457672119140625e-06, "all_logprobs/p25": -7.152557373046875e-07, "all_logprobs/p5": -1.633167266845703e-05, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.002523513277992606, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.7974683544303798, "feature_vector_variance/max_squared_error": 44500.0703125, "feature_vector_variance/metric": 4848.9189453125, "generated_tokens/total": 4007871.0, "grad_norm": 0.0, "learning_rate": 5.432719831372507e-06, "loss": 0.0, "mean_logprobs": -0.00156402587890625, "mean_logprobs/var": 2.2202730178833008e-06, "num_completions/total": 6048, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 197.5902099609375, "per_token_feature_norm/max": 210.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 171.0, "per_token_feature_norm/p25": 196.0, "per_token_feature_norm/p75": 200.0, "per_token_feature_norm/var": 10.654480934143066, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0008625785703770816, "per_token_policy_error_norm/max": 1.953125, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0007894759182818234, "policy_entropy": 0.0018478457350283861, "policy_entropy/max": 1.3671875, "policy_entropy/median": 4.0531158447265625e-06, "policy_entropy/min": 2.2817403078079224e-08, "policy_entropy/p25": 2.250075340270996e-06, "policy_entropy/p75": 1.0728836059570312e-05, "policy_entropy/var": 0.0010372453834861517, "policy_error_vector_variance/max_squared_error": 1.9547803401947021, "policy_error_vector_variance/metric": 0.0008635101257823408, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.903047561645508, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.4731726050376892, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 63 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 1.1562593499547802e-05, "adam_stats/lr_effective_mean": -5.5926607095413416e-11, "adam_stats/lr_effective_min": -1.1480501598271076e-05, "adam_stats/m_t_max": 0.001969466684386134, "adam_stats/m_t_mean": 2.3465795279120627e-11, "adam_stats/m_t_min": -0.0025238110683858395, "adam_stats/v_t_max": 8.895685459719971e-05, "adam_stats/v_t_mean": 9.306277970466681e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0017470361199229956, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -5.25, "all_logprobs/p1": -0.0013885498046875, "all_logprobs/p10": -2.0265579223632812e-06, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -1.1801719665527344e-05, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0026338498573750257, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.810126582278481, "feature_vector_variance/max_squared_error": 41833.28125, "feature_vector_variance/metric": 4415.64306640625, "generated_tokens/total": 4106175.0, "grad_norm": 0.0, "learning_rate": 5.182372542187895e-06, "loss": 0.0, "mean_logprobs": -0.00174713134765625, "mean_logprobs/var": 3.0994415283203125e-06, "num_completions/total": 6144, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 197.70053100585938, "per_token_feature_norm/max": 208.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 174.0, "per_token_feature_norm/p25": 196.0, "per_token_feature_norm/p75": 200.0, "per_token_feature_norm/var": 10.674193382263184, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0010056495666503906, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.000980474753305316, "policy_entropy": 0.0019427345832809806, "policy_entropy/max": 1.4140625, "policy_entropy/median": 3.606081008911133e-06, "policy_entropy/min": 1.7578713595867157e-08, "policy_entropy/p25": 2.0712614059448242e-06, "policy_entropy/p75": 9.47713851928711e-06, "policy_entropy/var": 0.00121292844414711, "policy_error_vector_variance/max_squared_error": 1.9820688962936401, "policy_error_vector_variance/metric": 0.00100611278321594, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.904273986816406, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.47609949111938477, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 64 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 9.914185284287669e-06, "adam_stats/lr_effective_mean": -4.795191674289434e-11, "adam_stats/lr_effective_min": -9.843803127296269e-06, "adam_stats/m_t_max": 0.0017725200159475207, "adam_stats/m_t_mean": 2.1119215751208564e-11, "adam_stats/m_t_min": -0.002271429868414998, "adam_stats/v_t_max": 8.88678987394087e-05, "adam_stats/v_t_mean": 9.296972913741541e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0013978949282318354, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -6.375, "all_logprobs/p1": -0.000957374693825841, "all_logprobs/p10": -2.0265579223632812e-06, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -1.1682510375976562e-05, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0020397210028022528, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.8227848101265823, "feature_vector_variance/max_squared_error": 42841.4375, "feature_vector_variance/metric": 4390.33203125, "generated_tokens/total": 4204479.0, "grad_norm": 0.0, "learning_rate": 4.934848925057485e-06, "loss": 0.0, "mean_logprobs": -0.00139617919921875, "mean_logprobs/var": 2.0712614059448242e-06, "num_completions/total": 6240, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 197.5994873046875, "per_token_feature_norm/max": 209.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 164.0, "per_token_feature_norm/p25": 196.0, "per_token_feature_norm/p75": 200.0, "per_token_feature_norm/var": 10.296249389648438, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0007661184063181281, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0006558594759553671, "policy_entropy": 0.0017869191942736506, "policy_entropy/max": 1.3125, "policy_entropy/median": 3.5315752029418945e-06, "policy_entropy/min": 1.8510036170482635e-08, "policy_entropy/p25": 2.041459083557129e-06, "policy_entropy/p75": 9.417533874511719e-06, "policy_entropy/var": 0.0010445547522976995, "policy_error_vector_variance/max_squared_error": 1.9971764087677002, "policy_error_vector_variance/metric": 0.0007667482714168727, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.911592483520508, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": 0.125, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.42605331540107727, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 65 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 8.485048056172673e-06, "adam_stats/lr_effective_mean": -4.103825265722172e-11, "adam_stats/lr_effective_min": -8.42481676954776e-06, "adam_stats/m_t_max": 0.0015952680259943008, "adam_stats/m_t_mean": 1.900729400261536e-11, "adam_stats/m_t_min": -0.002044286811724305, "adam_stats/v_t_max": 8.877903019310907e-05, "adam_stats/v_t_mean": 9.287675663272044e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0014414421748369932, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -8.75, "all_logprobs/p1": -0.00104522705078125, "all_logprobs/p10": -1.7881393432617188e-06, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -9.757291991263628e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0026454490143805742, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.8354430379746836, "feature_vector_variance/max_squared_error": 50413.1328125, "feature_vector_variance/metric": 4487.64794921875, "generated_tokens/total": 4302783.0, "grad_norm": 0.0, "learning_rate": 4.6904505493806595e-06, "loss": 0.0, "mean_logprobs": -0.00144195556640625, "mean_logprobs/var": 2.518296241760254e-06, "num_completions/total": 6336, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 197.0341796875, "per_token_feature_norm/max": 208.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 174.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 11.949934005737305, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0007823705673217773, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0007343230536207557, "policy_entropy": 0.0017072854097932577, "policy_entropy/max": 1.5, "policy_entropy/median": 3.3676624298095703e-06, "policy_entropy/min": 2.10711732506752e-08, "policy_entropy/p25": 1.952052116394043e-06, "policy_entropy/p75": 9.417533874511719e-06, "policy_entropy/var": 0.0009251759620383382, "policy_error_vector_variance/max_squared_error": 2.000004291534424, "policy_error_vector_variance/metric": 0.0007828306406736374, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.910285949707031, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.4634132385253906, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 66 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 7.24778101357515e-06, "adam_stats/lr_effective_mean": -3.5052998698104076e-11, "adam_stats/lr_effective_min": -7.196336810011417e-06, "adam_stats/m_t_max": 0.0014357412001118064, "adam_stats/m_t_mean": 1.710657136777538e-11, "adam_stats/m_t_min": -0.0018398581305518746, "adam_stats/v_t_max": 8.86902489583008e-05, "adam_stats/v_t_mean": 9.278387086419926e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0014255766291171312, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -8.75, "all_logprobs/p1": -0.000576019287109375, "all_logprobs/p10": -2.0265579223632812e-06, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -6.4373016357421875e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0029185242019593716, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.8481012658227848, "feature_vector_variance/max_squared_error": 44671.4375, "feature_vector_variance/metric": 4789.04248046875, "generated_tokens/total": 4401087.0, "grad_norm": 0.0, "learning_rate": 4.4494751769315e-06, "loss": 0.0, "mean_logprobs": -0.00142669677734375, "mean_logprobs/var": 2.950429916381836e-06, "num_completions/total": 6432, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 197.0017547607422, "per_token_feature_norm/max": 209.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 159.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 13.293253898620605, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0007471839780919254, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0007360539166256785, "policy_entropy": 0.0015692220767959952, "policy_entropy/max": 1.328125, "policy_entropy/median": 3.3229589462280273e-06, "policy_entropy/min": 3.050081431865692e-08, "policy_entropy/p25": 1.9818544387817383e-06, "policy_entropy/p75": 9.119510650634766e-06, "policy_entropy/var": 0.0009009467903524637, "policy_error_vector_variance/max_squared_error": 2.0001425743103027, "policy_error_vector_variance/metric": 0.0007471549324691296, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.931360244750977, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.3572166860103607, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 67 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 6.178222065500449e-06, "adam_stats/lr_effective_mean": -2.987921715602582e-11, "adam_stats/lr_effective_min": -6.134373052191222e-06, "adam_stats/m_t_max": 0.0012921669986099005, "adam_stats/m_t_mean": 1.5395910241133848e-11, "adam_stats/m_t_min": -0.0016558723291382194, "adam_stats/v_t_max": 8.860156231094152e-05, "adam_stats/v_t_mean": 9.269108917908664e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.001489264308474958, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -4.125, "all_logprobs/p1": -0.000816230894997716, "all_logprobs/p10": -1.7881393432617188e-06, "all_logprobs/p25": -5.960464477539062e-07, "all_logprobs/p5": -6.4373016357421875e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.002157289767637849, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.8607594936708861, "feature_vector_variance/max_squared_error": 42648.95703125, "feature_vector_variance/metric": 4611.5751953125, "generated_tokens/total": 4499391.0, "grad_norm": 0.0, "learning_rate": 4.212216399081919e-06, "loss": 0.0, "mean_logprobs": -0.00148773193359375, "mean_logprobs/var": 2.86102294921875e-06, "num_completions/total": 6528, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.80691528320312, "per_token_feature_norm/max": 209.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 176.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 12.371990203857422, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0008356174221262336, "per_token_policy_error_norm/max": 1.90625, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0007900989730842412, "policy_entropy": 0.0017668355721980333, "policy_entropy/max": 1.234375, "policy_entropy/median": 3.471970558166504e-06, "policy_entropy/min": 2.0838342607021332e-08, "policy_entropy/p25": 2.0116567611694336e-06, "policy_entropy/p75": 9.119510650634766e-06, "policy_entropy/var": 0.000989041873253882, "policy_error_vector_variance/max_squared_error": 1.9087636470794678, "policy_error_vector_variance/metric": 0.000836055027320981, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.919455528259277, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.42503097653388977, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 68 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 5.2550799409800675e-06, "adam_stats/lr_effective_mean": -2.5413848109279158e-11, "adam_stats/lr_effective_min": -5.217785655986518e-06, "adam_stats/m_t_max": 0.0011629502987489104, "adam_stats/m_t_mean": 1.385631800271403e-11, "adam_stats/m_t_min": -0.0014902850380167365, "adam_stats/v_t_max": 8.85129629750736e-05, "adam_stats/v_t_mean": 9.25984029037652e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.001511119189672172, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -4.25, "all_logprobs/p1": -0.0008580782450735569, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -5.841255187988281e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0022047755774110556, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.8734177215189873, "feature_vector_variance/max_squared_error": 42478.1796875, "feature_vector_variance/metric": 4092.85888671875, "generated_tokens/total": 4597695.0, "grad_norm": 0.0, "learning_rate": 3.978963279105821e-06, "loss": 0.0, "mean_logprobs": -0.0015106201171875, "mean_logprobs/var": 1.8775463104248047e-06, "num_completions/total": 6624, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.80908203125, "per_token_feature_norm/max": 208.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 173.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 11.220083236694336, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0008698105812072754, "per_token_policy_error_norm/max": 1.9375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0008252986008301377, "policy_entropy": 0.0016751002985984087, "policy_entropy/max": 1.4296875, "policy_entropy/median": 2.816319465637207e-06, "policy_entropy/min": 2.3399479687213898e-08, "policy_entropy/p25": 1.780688762664795e-06, "policy_entropy/p75": 7.271766662597656e-06, "policy_entropy/var": 0.0009859484853222966, "policy_error_vector_variance/max_squared_error": 1.9408100843429565, "policy_error_vector_variance/metric": 0.0008702994673512876, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.918684005737305, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.40807849168777466, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 69 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 4.459613592189271e-06, "adam_stats/lr_effective_mean": -2.1566219296498268e-11, "adam_stats/lr_effective_min": -4.42796681454638e-06, "adam_stats/m_t_max": 0.0010466552339494228, "adam_stats/m_t_mean": 1.2470683340148891e-11, "adam_stats/m_t_min": -0.001341256545856595, "adam_stats/v_t_max": 8.842445095069706e-05, "adam_stats/v_t_mean": 9.250580336461756e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0013452497078105807, "all_logprobs/max": 0.0, "all_logprobs/median": -2.384185791015625e-07, "all_logprobs/min": -5.375, "all_logprobs/p1": -0.000812530517578125, "all_logprobs/p10": -1.6689300537109375e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -7.510185241699219e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.002053983509540558, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.8860759493670886, "feature_vector_variance/max_squared_error": 45464.9765625, "feature_vector_variance/metric": 4083.99462890625, "generated_tokens/total": 4695999.0, "grad_norm": 0.0, "learning_rate": 3.750000000000002e-06, "loss": 0.0, "mean_logprobs": -0.0013427734375, "mean_logprobs/var": 1.996755599975586e-06, "num_completions/total": 6720, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.7776336669922, "per_token_feature_norm/max": 210.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 178.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 11.138465881347656, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0007440447807312012, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0006774624926038086, "policy_entropy": 0.0016237794188782573, "policy_entropy/max": 1.421875, "policy_entropy/median": 3.203749656677246e-06, "policy_entropy/min": 7.62520357966423e-09, "policy_entropy/p25": 1.7955899238586426e-06, "policy_entropy/p75": 8.165836334228516e-06, "policy_entropy/var": 0.000954570947214961, "policy_error_vector_variance/max_squared_error": 1.983492374420166, "policy_error_vector_variance/metric": 0.0007436721934936941, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.91787338256836, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.4088665246963501, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 70 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 3.7753413835162064e-06, "adam_stats/lr_effective_mean": -1.825654384612907e-11, "adam_stats/lr_effective_min": -3.7485526718228357e-06, "adam_stats/m_t_max": 0.0009419897105544806, "adam_stats/m_t_mean": 1.1223610582589139e-11, "adam_stats/m_t_min": -0.0012071308447048068, "adam_stats/v_t_max": 8.83360262378119e-05, "adam_stats/v_t_mean": 9.24132992352611e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.001494609983637929, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -7.625, "all_logprobs/p1": -0.000728607177734375, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -5.602836608886719e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.002627034904435277, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.8987341772151899, "feature_vector_variance/max_squared_error": 50756.53125, "feature_vector_variance/metric": 4078.94873046875, "generated_tokens/total": 4794303.0, "grad_norm": 0.0, "learning_rate": 3.525605518250964e-06, "loss": 0.0, "mean_logprobs": -0.001495361328125, "mean_logprobs/var": 1.9818544387817383e-06, "num_completions/total": 6816, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 197.1837158203125, "per_token_feature_norm/max": 210.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 177.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 9.839889526367188, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0008483926649205387, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0008253321284428239, "policy_entropy": 0.0015921082813292742, "policy_entropy/max": 1.2109375, "policy_entropy/median": 2.7865171432495117e-06, "policy_entropy/min": 1.885928213596344e-08, "policy_entropy/p25": 1.6987323760986328e-06, "policy_entropy/p75": 7.271766662597656e-06, "policy_entropy/var": 0.0009185682865791023, "policy_error_vector_variance/max_squared_error": 1.9989562034606934, "policy_error_vector_variance/metric": 0.0008482564589940012, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.924370765686035, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.3716598153114319, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 71 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 3.1877841593086487e-06, "adam_stats/lr_effective_mean": -1.541475627697686e-11, "adam_stats/lr_effective_min": -3.165166617691284e-06, "adam_stats/m_t_max": 0.0008477907394990325, "adam_stats/m_t_mean": 1.0101251432526048e-11, "adam_stats/m_t_min": -0.0010864177020266652, "adam_stats/v_t_max": 8.824768883641809e-05, "adam_stats/v_t_mean": 9.232089051569581e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.001167856389656663, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -4.03125, "all_logprobs/p1": -0.000507354736328125, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -4.5299530029296875e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0013749179197475314, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.9113924050632911, "feature_vector_variance/max_squared_error": 46507.55078125, "feature_vector_variance/metric": 4246.5166015625, "generated_tokens/total": 4892607.0, "grad_norm": 0.0, "learning_rate": 3.3060532239694e-06, "loss": 0.0, "mean_logprobs": -0.00116729736328125, "mean_logprobs/var": 1.1846423149108887e-06, "num_completions/total": 6912, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.50865173339844, "per_token_feature_norm/max": 208.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 174.0, "per_token_feature_norm/p25": 194.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 11.565397262573242, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0006742874975316226, "per_token_policy_error_norm/max": 1.921875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0005882452242076397, "policy_entropy": 0.0015175164444372058, "policy_entropy/max": 1.296875, "policy_entropy/median": 2.9802322387695312e-06, "policy_entropy/min": 2.4097971618175507e-08, "policy_entropy/p25": 1.8253922462463379e-06, "policy_entropy/p75": 7.987022399902344e-06, "policy_entropy/var": 0.0008897254010662436, "policy_error_vector_variance/max_squared_error": 1.9189660549163818, "policy_error_vector_variance/metric": 0.0006744303973391652, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.931163787841797, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.3542115092277527, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 72 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 2.6842340048460755e-06, "adam_stats/lr_effective_mean": -1.2979377121691282e-11, "adam_stats/lr_effective_min": -2.665191004780354e-06, "adam_stats/m_t_max": 0.0007630116306245327, "adam_stats/m_t_mean": 9.091125421911705e-12, "adam_stats/m_t_min": -0.0009777758968994021, "adam_stats/v_t_max": 8.815943874651566e-05, "adam_stats/v_t_mean": 9.222856853230432e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0014084557769820094, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -3.765625, "all_logprobs/p1": -0.000659828307107091, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -4.410743713378906e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0020889744628220797, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.9240506329113924, "feature_vector_variance/max_squared_error": 43748.53125, "feature_vector_variance/metric": 4133.8359375, "generated_tokens/total": 4990911.0, "grad_norm": 0.0, "learning_rate": 3.0916106078064522e-06, "loss": 0.0, "mean_logprobs": -0.00141143798828125, "mean_logprobs/var": 2.5331974029541016e-06, "num_completions/total": 7008, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.91232299804688, "per_token_feature_norm/max": 207.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 176.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 11.203548431396484, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.000775714754126966, "per_token_policy_error_norm/max": 1.90625, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0007580532692372799, "policy_entropy": 0.0016692759236320853, "policy_entropy/max": 1.3515625, "policy_entropy/median": 2.6673078536987305e-06, "policy_entropy/min": 2.468004822731018e-08, "policy_entropy/p25": 1.5944242477416992e-06, "policy_entropy/p75": 7.212162017822266e-06, "policy_entropy/var": 0.0009776088409125805, "policy_error_vector_variance/max_squared_error": 1.9081693887710571, "policy_error_vector_variance/metric": 0.000775794149376452, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.925553321838379, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.3805721402168274, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 73 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 2.2535505195264705e-06, "adam_stats/lr_effective_mean": -1.0896475562127694e-11, "adam_stats/lr_effective_min": -2.2375643311534077e-06, "adam_stats/m_t_max": 0.0006867104675620794, "adam_stats/m_t_mean": 8.182014094026968e-12, "adam_stats/m_t_min": -0.0008799982606433332, "adam_stats/v_t_max": 8.807128324406222e-05, "adam_stats/v_t_mean": 9.213633328508664e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0011722188210114837, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -9.0, "all_logprobs/p1": -0.00081634521484375, "all_logprobs/p10": -1.7881393432617188e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -5.7220458984375e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.001931366859935224, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.9367088607594937, "feature_vector_variance/max_squared_error": 46449.15625, "feature_vector_variance/metric": 4452.92041015625, "generated_tokens/total": 5089215.0, "grad_norm": 0.0, "learning_rate": 2.882538935057563e-06, "loss": 0.0, "mean_logprobs": -0.0011749267578125, "mean_logprobs/var": 1.8700957298278809e-06, "num_completions/total": 7104, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.84637451171875, "per_token_feature_norm/max": 209.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 167.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 11.331986427307129, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0005932251806370914, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.00044145077117718756, "policy_entropy": 0.0016788564389571548, "policy_entropy/max": 1.609375, "policy_entropy/median": 2.86102294921875e-06, "policy_entropy/min": 1.3387762010097504e-08, "policy_entropy/p25": 1.7136335372924805e-06, "policy_entropy/p75": 8.046627044677734e-06, "policy_entropy/var": 0.0009794884826987982, "policy_error_vector_variance/max_squared_error": 2.0006160736083984, "policy_error_vector_variance/metric": 0.0005934155196882784, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.921154022216797, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.40236523747444153, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 74 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 1.885977212623402e-06, "adam_stats/lr_effective_mean": -9.118867987101265e-12, "adam_stats/lr_effective_min": -1.8725995687418617e-06, "adam_stats/m_t_max": 0.0006180393975228071, "adam_stats/m_t_mean": 7.363811817262533e-12, "adam_stats/m_t_min": -0.0007919984054751694, "adam_stats/v_t_max": 8.798321505310014e-05, "adam_stats/v_t_mean": 9.204421079489489e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0015771170146763325, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -9.125, "all_logprobs/p1": -0.00067901611328125, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -5.602836608886719e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0031803282909095287, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.9493670886075949, "feature_vector_variance/max_squared_error": 46300.02734375, "feature_vector_variance/metric": 4219.05908203125, "generated_tokens/total": 5187519.0, "grad_norm": 0.0, "learning_rate": 2.6790929273509547e-06, "loss": 0.0, "mean_logprobs": -0.00157928466796875, "mean_logprobs/var": 3.725290298461914e-06, "num_completions/total": 7200, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.95071411132812, "per_token_feature_norm/max": 209.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 176.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 11.470376968383789, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0008603930473327637, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0008259403402917087, "policy_entropy": 0.001658179797232151, "policy_entropy/max": 1.515625, "policy_entropy/median": 2.518296241760254e-06, "policy_entropy/min": 1.4784745872020721e-08, "policy_entropy/p25": 1.5422701835632324e-06, "policy_entropy/p75": 7.152557373046875e-06, "policy_entropy/var": 0.0009874920360744, "policy_error_vector_variance/max_squared_error": 1.9966578483581543, "policy_error_vector_variance/metric": 0.0008610013755969703, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.925493240356445, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.38981157541275024, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 75 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 1.572979385855433e-06, "adam_stats/lr_effective_mean": -7.605239354946391e-12, "adam_stats/lr_effective_min": -1.5618230690961354e-06, "adam_stats/m_t_max": 0.000556235434487462, "adam_stats/m_t_mean": 6.6274299416468896e-12, "adam_stats/m_t_min": -0.0007127985591068864, "adam_stats/v_t_max": 8.789523417362943e-05, "adam_stats/v_t_mean": 9.195217504087694e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.001347726909443736, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -3.53125, "all_logprobs/p1": -0.0006561279296875, "all_logprobs/p10": -1.6689300537109375e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -6.318092346191406e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0018941767048090696, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.9620253164556962, "feature_vector_variance/max_squared_error": 41078.46875, "feature_vector_variance/metric": 4246.94091796875, "generated_tokens/total": 5285823.0, "grad_norm": 0.0, "learning_rate": 2.4815204523085656e-06, "loss": 0.0, "mean_logprobs": -0.00135040283203125, "mean_logprobs/var": 1.8551945686340332e-06, "num_completions/total": 7296, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.79742431640625, "per_token_feature_norm/max": 208.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 178.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 12.164896965026855, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0007886489620432258, "per_token_policy_error_norm/max": 1.875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0007877589669078588, "policy_entropy": 0.0015358495293185115, "policy_entropy/max": 1.265625, "policy_entropy/median": 2.816319465637207e-06, "policy_entropy/min": 2.3515895009040833e-08, "policy_entropy/p25": 1.6316771507263184e-06, "policy_entropy/p75": 7.68899917602539e-06, "policy_entropy/var": 0.0008439738885499537, "policy_error_vector_variance/max_squared_error": 1.8806627988815308, "policy_error_vector_variance/metric": 0.0007891890127211809, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.9247407913208, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": 0.125, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.381611168384552, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 76 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 1.3071005469100783e-06, "adam_stats/lr_effective_mean": -6.319524330916648e-12, "adam_stats/lr_effective_min": -1.2978306358490954e-06, "adam_stats/m_t_max": 0.000500611902680248, "adam_stats/m_t_mean": 5.96468751126733e-12, "adam_stats/m_t_min": -0.0006415186799131334, "adam_stats/v_t_max": 8.78073406056501e-05, "adam_stats/v_t_mean": 9.186020867579803e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0012923224130645394, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -4.5, "all_logprobs/p1": -0.000617866637185216, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -5.4836273193359375e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0020511564798653126, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.9746835443037974, "feature_vector_variance/max_squared_error": 40709.1640625, "feature_vector_variance/metric": 4169.1240234375, "generated_tokens/total": 5384127.0, "grad_norm": 0.0, "learning_rate": 2.29006222155752e-06, "loss": 0.0, "mean_logprobs": -0.00128936767578125, "mean_logprobs/var": 1.55717134475708e-06, "num_completions/total": 7392, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.4083709716797, "per_token_feature_norm/max": 208.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 173.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 10.842228889465332, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0007292429800145328, "per_token_policy_error_norm/max": 1.953125, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0007834411226212978, "policy_entropy": 0.001445906120352447, "policy_entropy/max": 1.140625, "policy_entropy/median": 2.8014183044433594e-06, "policy_entropy/min": 2.3748725652694702e-08, "policy_entropy/p25": 1.646578311920166e-06, "policy_entropy/p75": 7.331371307373047e-06, "policy_entropy/var": 0.0007417658343911171, "policy_error_vector_variance/max_squared_error": 1.9545326232910156, "policy_error_vector_variance/metric": 0.0007297663833014667, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.924959182739258, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": 0.0969339907169342, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.392056941986084, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 77 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 1.08183337488299e-06, "adam_stats/lr_effective_mean": -5.230239852327401e-12, "adam_stats/lr_effective_min": -1.0741617870735354e-06, "adam_stats/m_t_max": 0.00045055069494992495, "adam_stats/m_t_mean": 5.368219323925727e-12, "adam_stats/m_t_min": -0.0005773667944595218, "adam_stats/v_t_max": 8.771953434916213e-05, "adam_stats/v_t_mean": 9.176834639412768e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.001128701725974679, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -4.625, "all_logprobs/p1": -0.0007476806640625, "all_logprobs/p10": -1.7881393432617188e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -7.62939453125e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0013999217189848423, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 0.9873417721518988, "feature_vector_variance/max_squared_error": 45146.73828125, "feature_vector_variance/metric": 4345.7734375, "generated_tokens/total": 5482431.0, "grad_norm": 0.0, "learning_rate": 2.104951497460118e-06, "loss": 0.0, "mean_logprobs": -0.001129150390625, "mean_logprobs/var": 1.4454126358032227e-06, "num_completions/total": 7488, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.59774780273438, "per_token_feature_norm/max": 210.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 168.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 12.937766075134277, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0006206035614013672, "per_token_policy_error_norm/max": 1.953125, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0005184065666981041, "policy_entropy": 0.0015532083343714476, "policy_entropy/max": 1.234375, "policy_entropy/median": 2.905726432800293e-06, "policy_entropy/min": 1.5832483768463135e-08, "policy_entropy/p25": 1.773238182067871e-06, "policy_entropy/p75": 7.987022399902344e-06, "policy_entropy/var": 0.0008568221237510443, "policy_error_vector_variance/max_squared_error": 1.9577735662460327, "policy_error_vector_variance/metric": 0.000620961538515985, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.919830322265625, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.40978729724884033, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 78 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 8.915062608139124e-07, "adam_stats/lr_effective_mean": -4.309938256980006e-12, "adam_stats/lr_effective_min": -8.851848178892396e-07, "adam_stats/m_t_max": 0.0004054956079926342, "adam_stats/m_t_mean": 4.831396567539503e-12, "adam_stats/m_t_min": -0.0005196300917305052, "adam_stats/v_t_max": 8.763181540416554e-05, "adam_stats/v_t_mean": 9.167658819586588e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.001370035344734788, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -5.09375, "all_logprobs/p1": -0.00063323974609375, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -5.4836273193359375e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.00212600314989686, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.0126582278481013, "feature_vector_variance/max_squared_error": 42714.6875, "feature_vector_variance/metric": 4379.169921875, "generated_tokens/total": 5580735.0, "grad_norm": 0.0, "learning_rate": 1.9264138089195424e-06, "loss": 0.0, "mean_logprobs": -0.001373291015625, "mean_logprobs/var": 1.5795230865478516e-06, "num_completions/total": 7584, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.85458374023438, "per_token_feature_norm/max": 208.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 174.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 12.680943489074707, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0007822513580322266, "per_token_policy_error_norm/max": 1.96875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0007582782418467104, "policy_entropy": 0.0015415309462696314, "policy_entropy/max": 1.4296875, "policy_entropy/median": 2.7865171432495117e-06, "policy_entropy/min": 2.6775524020195007e-08, "policy_entropy/p25": 1.6540288925170898e-06, "policy_entropy/p75": 7.331371307373047e-06, "policy_entropy/var": 0.0008780162315815687, "policy_error_vector_variance/max_squared_error": 1.969393253326416, "policy_error_vector_variance/metric": 0.0007824554922990501, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.926562309265137, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.37187522649765015, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 79 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 7.31182808522135e-07, "adam_stats/lr_effective_mean": -3.5347438951532917e-12, "adam_stats/lr_effective_min": -7.259985750351916e-07, "adam_stats/m_t_max": 0.00036494602682068944, "adam_stats/m_t_mean": 4.348256563840858e-12, "adam_stats/m_t_min": -0.00046766706509515643, "adam_stats/v_t_max": 8.754418377066031e-05, "adam_stats/v_t_mean": 9.158491673377789e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0013371192617341876, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -6.125, "all_logprobs/p1": -0.000743865966796875, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -5.841255187988281e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.002106194384396076, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.0253164556962024, "feature_vector_variance/max_squared_error": 41041.46875, "feature_vector_variance/metric": 3975.19482421875, "generated_tokens/total": 5679039.0, "grad_norm": 0.0, "learning_rate": 1.7546666766076658e-06, "loss": 0.0, "mean_logprobs": -0.00133514404296875, "mean_logprobs/var": 1.9222497940063477e-06, "num_completions/total": 7680, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.4464111328125, "per_token_feature_norm/max": 207.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 178.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 10.801092147827148, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0007420381298288703, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0007244767039082944, "policy_entropy": 0.0015834863297641277, "policy_entropy/max": 1.265625, "policy_entropy/median": 2.8312206268310547e-06, "policy_entropy/min": 2.5262124836444855e-08, "policy_entropy/p25": 1.646578311920166e-06, "policy_entropy/p75": 7.241964340209961e-06, "policy_entropy/var": 0.0008823096286505461, "policy_error_vector_variance/max_squared_error": 1.9880857467651367, "policy_error_vector_variance/metric": 0.000743066833820194, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.919441223144531, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": 0.22267548739910126, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.4131036400794983, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 80 }, { "accuracy_reward": 0.010416666977107525, "accuracy_reward/correct": 1.0, "accuracy_reward/correct/max": 1.0, "accuracy_reward/correct/median": 1.0, "accuracy_reward/correct/min": 1.0, "accuracy_reward/correct/p25": 1.0, "accuracy_reward/correct/p75": 1.0, "accuracy_reward/correct/var": NaN, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 1.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0104166679084301, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 6.797574769734638e-07, "adam_stats/lr_effective_mean": -3.017134068625249e-12, "adam_stats/lr_effective_min": -5.997587777528679e-07, "adam_stats/m_t_max": 0.00032984616700559855, "adam_stats/m_t_mean": 3.990508878198851e-12, "adam_stats/m_t_min": -0.00042073792428709567, "adam_stats/v_t_max": 8.745663944864646e-05, "adam_stats/v_t_mean": 9.149338404956797e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.010416666977107525, "advantages/max": 1.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0104166679084301, "all_logprobs": -0.001211169408634305, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -6.125, "all_logprobs/p1": -0.00049591064453125, "all_logprobs/p10": -1.430511474609375e-06, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -3.6954879760742188e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.002036802005022764, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/correct": 1024.0, "completion_length/correct/max": 1024.0, "completion_length/correct/median": 1024.0, "completion_length/correct/min": 1024.0, "completion_length/correct/p25": 1024.0, "completion_length/correct/p75": 1024.0, "completion_length/correct/var": NaN, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.0379746835443038, "feature_vector_variance/max_squared_error": 42202.1640625, "feature_vector_variance/metric": 4310.4912109375, "generated_tokens/total": 5777343.0, "grad_norm": 0.0037976743187755346, "learning_rate": 1.5899193479495858e-06, "loss": -0.0104, "mean_logprobs": -0.00121307373046875, "mean_logprobs/var": 2.60770320892334e-06, "num_completions/total": 7776, "per_sentence_gradient_norm": 0.0007018744945526123, "per_sentence_gradient_norm/max": 0.06737995147705078, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0033692033030092716, "per_sentence_gradient_norm/var": 4.7292265662690625e-05, "per_token_feature_norm": 196.86839294433594, "per_token_feature_norm/max": 210.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 171.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 11.794970512390137, "per_token_full_gradient_variance/max_squared_error": 0.14941908419132233, "per_token_full_gradient_variance/variance": 3.086908463956206e-06, "per_token_gradient_norm": 0.0007018744945526123, "per_token_gradient_norm/max": 48.7265625, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.028332019224762917, "per_token_policy_error_norm": 0.0006388028850778937, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0005737829487770796, "policy_entropy": 0.001519693760201335, "policy_entropy/max": 1.109375, "policy_entropy/median": 2.60770320892334e-06, "policy_entropy/min": 1.4726538211107254e-08, "policy_entropy/p25": 1.5944242477416992e-06, "policy_entropy/p75": 6.496906280517578e-06, "policy_entropy/var": 0.0008317633182741702, "policy_error_vector_variance/max_squared_error": 1.988290548324585, "policy_error_vector_variance/metric": 0.00063881347887218, "policy_loss": -0.010416666977107525, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": -1.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0104166679084301, "policy_sharpness": 9.934637069702148, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.3411438465118408, "reward": 0.010416666977107525, "reward/max": 1.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0104166679084301, "rewards/accuracy_reward": 0.010416666977107525, "rewards/accuracy_reward/max": 1.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0104166679084301, "sentence_full_gradient_variance/max_squared_error": 3.079623311919022e-08, "sentence_full_gradient_variance/metric": 3.241708845802549e-10, "sentence_full_gradient_variance/p75": 3.412325543655781e-12, "sentence_full_gradient_variance/p90": 3.412325543655781e-12, "sentence_full_gradient_variance/p95": 3.412325543655781e-12, "sentence_full_gradient_variance/p99": 1.5431473876503787e-09, "state_level_variance/metric": 4.7292265662690625e-05, "state_level_variance_full_gradient/metric": 3.241708845802549e-10, "step": 81 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 5.512652023753617e-07, "adam_stats/lr_effective_mean": -2.4474732136792188e-12, "adam_stats/lr_effective_min": -4.865364076067635e-07, "adam_stats/m_t_max": 0.0002968615444842726, "adam_stats/m_t_mean": 3.5914585107960084e-12, "adam_stats/m_t_min": -0.00037866411730647087, "adam_stats/v_t_max": 8.736918243812397e-05, "adam_stats/v_t_mean": 9.140190340706233e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0012292175088077784, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -6.625, "all_logprobs/p1": -0.00038909912109375, "all_logprobs/p10": -1.430511474609375e-06, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -3.2186508178710938e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0020413941238075495, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.0506329113924051, "feature_vector_variance/max_squared_error": 40354.97265625, "feature_vector_variance/metric": 4416.17138671875, "generated_tokens/total": 5875647.0, "grad_norm": 0.0, "learning_rate": 1.432372542187895e-06, "loss": 0.0, "mean_logprobs": -0.00122833251953125, "mean_logprobs/var": 2.0563602447509766e-06, "num_completions/total": 7872, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.76048278808594, "per_token_feature_norm/max": 210.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 176.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 12.517765045166016, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0006769299507141113, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0006677560741081834, "policy_entropy": 0.001434129080735147, "policy_entropy/max": 1.1953125, "policy_entropy/median": 2.5480985641479492e-06, "policy_entropy/min": 1.0710209608078003e-08, "policy_entropy/p25": 1.5869736671447754e-06, "policy_entropy/p75": 6.377696990966797e-06, "policy_entropy/var": 0.0008034584461711347, "policy_error_vector_variance/max_squared_error": 1.9975757598876953, "policy_error_vector_variance/metric": 0.0006769231986254454, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.939227104187012, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": 0.09470813721418381, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.3301800787448883, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 82 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 4.4421403799788095e-07, "adam_stats/lr_effective_mean": -1.9727247179551766e-12, "adam_stats/lr_effective_min": -3.9217431435645267e-07, "adam_stats/m_t_max": 0.0002671753754839301, "adam_stats/m_t_mean": 3.2323119875110606e-12, "adam_stats/m_t_min": -0.0003407977055758238, "adam_stats/v_t_max": 8.728181273909286e-05, "adam_stats/v_t_mean": 9.131048347987836e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.001378181274048984, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -5.875, "all_logprobs/p1": -0.0004710579523816705, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -3.814697265625e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0024080565199255943, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.0632911392405062, "feature_vector_variance/max_squared_error": 44331.58203125, "feature_vector_variance/metric": 4369.1220703125, "generated_tokens/total": 5973951.0, "grad_norm": 0.0, "learning_rate": 1.282218205837188e-06, "loss": 0.0, "mean_logprobs": -0.00138092041015625, "mean_logprobs/var": 2.0116567611694336e-06, "num_completions/total": 7968, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.8302001953125, "per_token_feature_norm/max": 212.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 175.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 11.270445823669434, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0007786552305333316, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0007935278699733317, "policy_entropy": 0.0014651156961917877, "policy_entropy/max": 1.25, "policy_entropy/median": 2.562999725341797e-06, "policy_entropy/min": 5.966285243630409e-09, "policy_entropy/p25": 1.5124678611755371e-06, "policy_entropy/p75": 7.241964340209961e-06, "policy_entropy/var": 0.0008341191569343209, "policy_error_vector_variance/max_squared_error": 1.9864015579223633, "policy_error_vector_variance/metric": 0.0007782634347677231, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.936161994934082, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.32896649837493896, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 83 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 3.5540489307095413e-07, "adam_stats/lr_effective_mean": -1.5787543798315151e-12, "adam_stats/lr_effective_min": -3.1386466048388684e-07, "adam_stats/m_t_max": 0.00024045782629400492, "adam_stats/m_t_mean": 2.909079661189695e-12, "adam_stats/m_t_min": -0.0003067179350182414, "adam_stats/v_t_max": 8.719453035155311e-05, "adam_stats/v_t_mean": 9.12191849833377e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.001117423176765442, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -5.5, "all_logprobs/p1": -0.0003985787043347955, "all_logprobs/p10": -1.6689300537109375e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -4.5299530029296875e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0017363748047500849, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.0759493670886076, "feature_vector_variance/max_squared_error": 42504.35546875, "feature_vector_variance/metric": 4549.23486328125, "generated_tokens/total": 6072255.0, "grad_norm": 0.0, "learning_rate": 1.1396392788268054e-06, "loss": 0.0, "mean_logprobs": -0.0011138916015625, "mean_logprobs/var": 1.5124678611755371e-06, "num_completions/total": 8064, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.5986328125, "per_token_feature_norm/max": 210.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 178.0, "per_token_feature_norm/p25": 194.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 12.33397102355957, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0006183783407323062, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0005955758388154209, "policy_entropy": 0.001358278444968164, "policy_entropy/max": 1.3125, "policy_entropy/median": 2.9355287551879883e-06, "policy_entropy/min": 1.0360963642597198e-08, "policy_entropy/p25": 1.644715666770935e-06, "policy_entropy/p75": 7.987022399902344e-06, "policy_entropy/var": 0.000720761890988797, "policy_error_vector_variance/max_squared_error": 1.9843292236328125, "policy_error_vector_variance/metric": 0.0006183407385833561, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.938522338867188, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.32046809792518616, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 84 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 2.820754900767497e-07, "adam_stats/lr_effective_mean": -1.2533532154843097e-12, "adam_stats/lr_effective_min": -2.491819657279848e-07, "adam_stats/m_t_max": 0.00021641203784383833, "adam_stats/m_t_mean": 2.6181717384388126e-12, "adam_stats/m_t_min": -0.0002760461356956512, "adam_stats/v_t_max": 8.710733527550474e-05, "adam_stats/v_t_mean": 9.112796454935346e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0012267716228961945, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -5.625, "all_logprobs/p1": -0.000663643004372716, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -5.364418029785156e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0019336050609126687, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.0886075949367089, "feature_vector_variance/max_squared_error": 44764.08984375, "feature_vector_variance/metric": 4053.181640625, "generated_tokens/total": 6170559.0, "grad_norm": 0.0, "learning_rate": 1.0048094716167097e-06, "loss": 0.0, "mean_logprobs": -0.00122833251953125, "mean_logprobs/var": 1.8030405044555664e-06, "num_completions/total": 8160, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.3652801513672, "per_token_feature_norm/max": 209.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 177.0, "per_token_feature_norm/p25": 194.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 12.186312675476074, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.000670313835144043, "per_token_policy_error_norm/max": 1.9765625, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0006397555116564035, "policy_entropy": 0.0014425761764869094, "policy_entropy/max": 1.203125, "policy_entropy/median": 2.8014183044433594e-06, "policy_entropy/min": 3.4458935260772705e-08, "policy_entropy/p25": 1.6689300537109375e-06, "policy_entropy/p75": 7.241964340209961e-06, "policy_entropy/var": 0.0007785593043081462, "policy_error_vector_variance/max_squared_error": 1.9777321815490723, "policy_error_vector_variance/metric": 0.0006708243163302541, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.923876762390137, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.38849756121635437, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 85 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 2.2184458714491484e-07, "adam_stats/lr_effective_mean": -9.859937302389432e-13, "adam_stats/lr_effective_min": -1.9603442069637822e-07, "adam_stats/m_t_max": 0.00019477082241792232, "adam_stats/m_t_mean": 2.356355995741799e-12, "adam_stats/m_t_min": -0.00024844150175340474, "adam_stats/v_t_max": 8.702022751094773e-05, "adam_stats/v_t_mean": 9.103681350430826e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0013442023191601038, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -9.4375, "all_logprobs/p1": -0.000667572021484375, "all_logprobs/p10": -1.7881393432617188e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -7.152557373046875e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0027366280555725098, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.1012658227848102, "feature_vector_variance/max_squared_error": 43432.7578125, "feature_vector_variance/metric": 4196.435546875, "generated_tokens/total": 6268863.0, "grad_norm": 0.0, "learning_rate": 8.778930535580476e-07, "loss": 0.0, "mean_logprobs": -0.0013427734375, "mean_logprobs/var": 2.205371856689453e-06, "num_completions/total": 8256, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.65451049804688, "per_token_feature_norm/max": 209.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 166.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 10.574146270751953, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0007157127256505191, "per_token_policy_error_norm/max": 1.9140625, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0007109412690624595, "policy_entropy": 0.0014920876128599048, "policy_entropy/max": 1.28125, "policy_entropy/median": 2.7567148208618164e-06, "policy_entropy/min": 1.885928213596344e-08, "policy_entropy/p25": 1.6093254089355469e-06, "policy_entropy/p75": 7.510185241699219e-06, "policy_entropy/var": 0.0008288748795166612, "policy_error_vector_variance/max_squared_error": 1.9106366634368896, "policy_error_vector_variance/metric": 0.0007160566747188568, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.925298690795898, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.378288209438324, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 86 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 1.7266334850774e-07, "adam_stats/lr_effective_mean": -7.676128070850663e-13, "adam_stats/lr_effective_min": -1.526215669400699e-07, "adam_stats/m_t_max": 0.00017529373872093856, "adam_stats/m_t_mean": 2.1207189650207514e-12, "adam_stats/m_t_min": -0.00022359735157806426, "adam_stats/v_t_max": 8.69332070578821e-05, "adam_stats/v_t_mean": 9.094579256352375e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0012057042913511395, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -6.25, "all_logprobs/p1": -0.0005645751953125, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -4.887580871582031e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.002048750873655081, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.1139240506329113, "feature_vector_variance/max_squared_error": 45080.953125, "feature_vector_variance/metric": 4230.0478515625, "generated_tokens/total": 6367167.0, "grad_norm": 0.0, "learning_rate": 7.59044652756249e-07, "loss": 0.0, "mean_logprobs": -0.0012054443359375, "mean_logprobs/var": 1.7881393432617188e-06, "num_completions/total": 8352, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 197.20175170898438, "per_token_feature_norm/max": 209.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 174.0, "per_token_feature_norm/p25": 196.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 10.8583984375, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0006305178394541144, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0005408169818110764, "policy_entropy": 0.0015067053027451038, "policy_entropy/max": 1.6484375, "policy_entropy/median": 2.60770320892334e-06, "policy_entropy/min": 1.6880221664905548e-08, "policy_entropy/p25": 1.5273690223693848e-06, "policy_entropy/p75": 7.241964340209961e-06, "policy_entropy/var": 0.0008755249436944723, "policy_error_vector_variance/max_squared_error": 1.9889107942581177, "policy_error_vector_variance/metric": 0.0006309144664555788, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.932409286499023, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.3359198272228241, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 87 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 1.3277229982122662e-07, "adam_stats/lr_effective_mean": -5.904278801982521e-13, "adam_stats/lr_effective_min": -1.1739658134501951e-07, "adam_stats/m_t_max": 0.00015776435611769557, "adam_stats/m_t_mean": 1.9086479358804143e-12, "adam_stats/m_t_min": -0.00020123760623391718, "adam_stats/v_t_max": 8.684627391630784e-05, "adam_stats/v_t_mean": 9.085484968529567e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0013076174072921276, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -7.625, "all_logprobs/p1": -0.0005035400390625, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -4.410743713378906e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0022927664685994387, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.1265822784810127, "feature_vector_variance/max_squared_error": 45244.8515625, "feature_vector_variance/metric": 4206.07275390625, "generated_tokens/total": 6465471.0, "grad_norm": 0.0, "learning_rate": 6.484090676804927e-07, "loss": 0.0, "mean_logprobs": -0.00130462646484375, "mean_logprobs/var": 2.2202730178833008e-06, "num_completions/total": 8448, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 197.13108825683594, "per_token_feature_norm/max": 208.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 157.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 10.319514274597168, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0007213751669041812, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0007021957426331937, "policy_entropy": 0.00146616087295115, "policy_entropy/max": 1.28125, "policy_entropy/median": 2.592802047729492e-06, "policy_entropy/min": 1.792795956134796e-08, "policy_entropy/p25": 1.4826655387878418e-06, "policy_entropy/p75": 7.3909759521484375e-06, "policy_entropy/var": 0.0008315572631545365, "policy_error_vector_variance/max_squared_error": 1.9992882013320923, "policy_error_vector_variance/metric": 0.0007217072416096926, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.93484878540039, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": 0.125, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.3292525112628937, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 88 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 1.0066370492722854e-07, "adam_stats/lr_effective_mean": -4.477638420631591e-13, "adam_stats/lr_effective_min": -8.903344905775157e-08, "adam_stats/m_t_max": 0.00014198791177477688, "adam_stats/m_t_mean": 1.7177826001912866e-12, "adam_stats/m_t_min": -0.0001811138354241848, "adam_stats/v_t_max": 8.675942808622494e-05, "adam_stats/v_t_mean": 9.076400221685876e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0014334185980260372, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -6.375, "all_logprobs/p1": -0.0005300142802298069, "all_logprobs/p10": -1.6689300537109375e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -5.602836608886719e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.002432498848065734, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.139240506329114, "feature_vector_variance/max_squared_error": 51273.75390625, "feature_vector_variance/metric": 4329.306640625, "generated_tokens/total": 6563775.0, "grad_norm": 0.0, "learning_rate": 5.461210907490952e-07, "loss": 0.0, "mean_logprobs": -0.001434326171875, "mean_logprobs/var": 2.0265579223632812e-06, "num_completions/total": 8544, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 197.00228881835938, "per_token_feature_norm/max": 208.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 147.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 11.022295951843262, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.000796417414676398, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0007809430826455355, "policy_entropy": 0.001569574000313878, "policy_entropy/max": 1.46875, "policy_entropy/median": 2.6971101760864258e-06, "policy_entropy/min": 2.0023435354232788e-08, "policy_entropy/p25": 1.5869736671447754e-06, "policy_entropy/p75": 7.867813110351562e-06, "policy_entropy/var": 0.0009263121755793691, "policy_error_vector_variance/max_squared_error": 1.996675729751587, "policy_error_vector_variance/metric": 0.000796565436758101, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.933850288391113, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": 0.35979098081588745, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.3404132127761841, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 89 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 7.504835508598262e-08, "adam_stats/lr_effective_mean": -3.3391385901963833e-13, "adam_stats/lr_effective_min": -6.639780991690714e-08, "adam_stats/m_t_max": 0.00012778911332134157, "adam_stats/m_t_mean": 1.5460051858498525e-12, "adam_stats/m_t_min": -0.00016300244897138327, "adam_stats/v_t_max": 8.667266956763342e-05, "adam_stats/v_t_mean": 9.067323281097828e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0012781916884705424, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -6.5, "all_logprobs/p1": -0.000472965301014483, "all_logprobs/p10": -1.430511474609375e-06, "all_logprobs/p25": -3.5762786865234375e-07, "all_logprobs/p5": -3.814697265625e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0018666473915800452, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.1518987341772151, "feature_vector_variance/max_squared_error": 40638.6171875, "feature_vector_variance/metric": 4430.76806640625, "generated_tokens/total": 6662079.0, "grad_norm": 0.0, "learning_rate": 4.5230534410568764e-07, "loss": 0.0, "mean_logprobs": -0.00128173828125, "mean_logprobs/var": 1.9073486328125e-06, "num_completions/total": 8640, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.83270263671875, "per_token_feature_norm/max": 209.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 176.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 10.615572929382324, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.00073186558438465, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0006744173006154597, "policy_entropy": 0.0015397905372083187, "policy_entropy/max": 1.2421875, "policy_entropy/median": 2.4586915969848633e-06, "policy_entropy/min": 1.862645149230957e-08, "policy_entropy/p25": 1.4826655387878418e-06, "policy_entropy/p75": 6.407499313354492e-06, "policy_entropy/var": 0.0008879601373337209, "policy_error_vector_variance/max_squared_error": 1.9973162412643433, "policy_error_vector_variance/metric": 0.0007316320552490652, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.93533992767334, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.3329399824142456, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 90 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 5.48265717270624e-08, "adam_stats/lr_effective_mean": -2.4400652939154943e-13, "adam_stats/lr_effective_min": -4.852170221170127e-08, "adam_stats/m_t_max": 0.00011501019616844133, "adam_stats/m_t_mean": 1.3914039191653682e-12, "adam_stats/m_t_min": -0.00014670219388790429, "adam_stats/v_t_max": 8.658599836053327e-05, "adam_stats/v_t_mean": 9.058258483574111e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0013616823125630617, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -8.0, "all_logprobs/p1": -0.000747566344216466, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -6.318092346191406e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.002853318816050887, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.1645569620253164, "feature_vector_variance/max_squared_error": 42224.125, "feature_vector_variance/metric": 4261.97412109375, "generated_tokens/total": 6760383.0, "grad_norm": 0.0, "learning_rate": 3.6707612778634855e-07, "loss": 0.0, "mean_logprobs": -0.0013580322265625, "mean_logprobs/var": 2.205371856689453e-06, "num_completions/total": 8736, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.935546875, "per_token_feature_norm/max": 210.0, "per_token_feature_norm/median": 198.0, "per_token_feature_norm/min": 175.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 11.007400512695312, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0007161101093515754, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0006685727275907993, "policy_entropy": 0.0015482649905607104, "policy_entropy/max": 1.328125, "policy_entropy/median": 2.771615982055664e-06, "policy_entropy/min": 2.0605511963367462e-08, "policy_entropy/p25": 1.6093254089355469e-06, "policy_entropy/p75": 7.987022399902344e-06, "policy_entropy/var": 0.0008772742003202438, "policy_error_vector_variance/max_squared_error": 1.9995198249816895, "policy_error_vector_variance/metric": 0.0007169221062213182, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.921610832214355, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.38640257716178894, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 91 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 3.906270507059162e-08, "adam_stats/lr_effective_mean": -1.7389599959657942e-13, "adam_stats/lr_effective_min": -3.458116282217816e-08, "adam_stats/m_t_max": 0.00010350917727919295, "adam_stats/m_t_mean": 1.2522631586200927e-12, "adam_stats/m_t_min": -0.0001320319715887308, "adam_stats/v_t_max": 8.649941446492448e-05, "adam_stats/v_t_mean": 9.049197155497346e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0012539196759462357, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -6.625, "all_logprobs/p1": -0.0004405975341796875, "all_logprobs/p10": -1.430511474609375e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -3.4570693969726562e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.002478215144947171, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.1772151898734178, "feature_vector_variance/max_squared_error": 44716.59765625, "feature_vector_variance/metric": 4327.79736328125, "generated_tokens/total": 6858687.0, "grad_norm": 0.0, "learning_rate": 2.905372804626083e-07, "loss": 0.0, "mean_logprobs": -0.001251220703125, "mean_logprobs/var": 2.205371856689453e-06, "num_completions/total": 8832, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.67709350585938, "per_token_feature_norm/max": 208.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 170.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 10.820406913757324, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0006587505340576172, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0006638310733251274, "policy_entropy": 0.0013947405386716127, "policy_entropy/max": 1.1171875, "policy_entropy/median": 2.4437904357910156e-06, "policy_entropy/min": 9.19681042432785e-09, "policy_entropy/p25": 1.475214958190918e-06, "policy_entropy/p75": 7.12275505065918e-06, "policy_entropy/var": 0.0007520728395320475, "policy_error_vector_variance/max_squared_error": 1.9887655973434448, "policy_error_vector_variance/metric": 0.0006586128729395568, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.936911582946777, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.3363208472728729, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 92 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 2.6962867494262355e-08, "adam_stats/lr_effective_mean": -1.2006341016876365e-13, "adam_stats/lr_effective_min": -2.3876777888176548e-08, "adam_stats/m_t_max": 9.315825445810333e-05, "adam_stats/m_t_mean": 1.1270370487564962e-12, "adam_stats/m_t_min": -0.00011882877151947469, "adam_stats/v_t_max": 8.641291788080707e-05, "adam_stats/v_t_mean": 9.040147970484913e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0013620754471048713, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -5.75, "all_logprobs/p1": -0.00045955670066177845, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -3.814697265625e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.002312556141987443, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.189873417721519, "feature_vector_variance/max_squared_error": 43498.34375, "feature_vector_variance/metric": 4469.5126953125, "generated_tokens/total": 6956991.0, "grad_norm": 0.0, "learning_rate": 2.2278205293002645e-07, "loss": 0.0, "mean_logprobs": -0.0013580322265625, "mean_logprobs/var": 2.0116567611694336e-06, "num_completions/total": 8928, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.8154754638672, "per_token_feature_norm/max": 210.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 177.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 11.154048919677734, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0007555087795481086, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.000739626120775938, "policy_entropy": 0.001437291968613863, "policy_entropy/max": 1.359375, "policy_entropy/median": 2.4884939193725586e-06, "policy_entropy/min": 1.2514647096395493e-08, "policy_entropy/p25": 1.4528632164001465e-06, "policy_entropy/p75": 7.241964340209961e-06, "policy_entropy/var": 0.0008647143840789795, "policy_error_vector_variance/max_squared_error": 1.9859257936477661, "policy_error_vector_variance/metric": 0.0007559629157185555, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.935869216918945, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.32765793800354004, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 93 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 1.785548597865727e-08, "adam_stats/lr_effective_mean": -7.953045096402087e-14, "adam_stats/lr_effective_min": -1.581662090188729e-08, "adam_stats/m_t_max": 8.384242391912267e-05, "adam_stats/m_t_mean": 1.0143332571446728e-12, "adam_stats/m_t_min": -0.00010694588854676113, "adam_stats/v_t_max": 8.632650860818103e-05, "adam_stats/v_t_mean": 9.031109193813336e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0012818974209949374, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -5.75, "all_logprobs/p1": -0.000659942626953125, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -5.602836608886719e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.002185943303629756, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.2025316455696202, "feature_vector_variance/max_squared_error": 46517.83984375, "feature_vector_variance/metric": 4338.119140625, "generated_tokens/total": 7055295.0, "grad_norm": 0.0, "learning_rate": 1.6389299449645734e-07, "loss": 0.0, "mean_logprobs": -0.00128173828125, "mean_logprobs/var": 1.8551945686340332e-06, "num_completions/total": 9024, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.80667114257812, "per_token_feature_norm/max": 210.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 178.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 12.604589462280273, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0006962220068089664, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0007030580891296268, "policy_entropy": 0.0014905984280630946, "policy_entropy/max": 1.140625, "policy_entropy/median": 2.6673078536987305e-06, "policy_entropy/min": 1.234002411365509e-08, "policy_entropy/p25": 1.5795230865478516e-06, "policy_entropy/p75": 7.420778274536133e-06, "policy_entropy/var": 0.0007944641401991248, "policy_error_vector_variance/max_squared_error": 1.9860706329345703, "policy_error_vector_variance/metric": 0.0006964779458940029, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.927627563476562, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.36981502175331116, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 94 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 1.1174290825977096e-08, "adam_stats/lr_effective_mean": -4.978502893683394e-14, "adam_stats/lr_effective_min": -9.901350850327617e-09, "adam_stats/m_t_max": 7.545817788923159e-05, "adam_stats/m_t_mean": 9.128998555360535e-13, "adam_stats/m_t_min": -9.625130041968077e-05, "adam_stats/v_t_max": 8.624018664704636e-05, "adam_stats/v_t_mean": 9.0220782233974e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0010819993913173676, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -6.0, "all_logprobs/p1": -0.000507240416482091, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -4.154455382376909e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0016765872715041041, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.2151898734177216, "feature_vector_variance/max_squared_error": 43154.7890625, "feature_vector_variance/metric": 4376.0830078125, "generated_tokens/total": 7153599.0, "grad_norm": 0.0, "learning_rate": 1.1394185240843985e-07, "loss": 0.0, "mean_logprobs": -0.0010833740234375, "mean_logprobs/var": 1.5944242477416992e-06, "num_completions/total": 9120, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.74142456054688, "per_token_feature_norm/max": 208.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 167.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 11.831058502197266, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0005882183904759586, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0005288991378620267, "policy_entropy": 0.001391516998410225, "policy_entropy/max": 1.0859375, "policy_entropy/median": 2.562999725341797e-06, "policy_entropy/min": 1.3504177331924438e-08, "policy_entropy/p25": 1.5422701835632324e-06, "policy_entropy/p75": 7.3015689849853516e-06, "policy_entropy/var": 0.0007528093410655856, "policy_error_vector_variance/max_squared_error": 1.9876108169555664, "policy_error_vector_variance/metric": 0.0005886361468583345, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.932273864746094, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": 0.125, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.34232139587402344, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 95 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 6.443507505338175e-09, "adam_stats/lr_effective_mean": -2.871561969870588e-14, "adam_stats/lr_effective_min": -5.711225714577495e-09, "adam_stats/m_t_max": 6.79123550071381e-05, "adam_stats/m_t_mean": 8.216097073521222e-13, "adam_stats/m_t_min": -8.662616892252117e-05, "adam_stats/v_t_max": 8.615394472144544e-05, "adam_stats/v_t_mean": 9.013055926598845e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0011673311237245798, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -9.875, "all_logprobs/p1": -0.000472965301014483, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -3.5762786865234375e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0023727233055979013, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.2278481012658227, "feature_vector_variance/max_squared_error": 43922.72265625, "feature_vector_variance/metric": 4475.57763671875, "generated_tokens/total": 7251903.0, "grad_norm": 0.0, "learning_rate": 7.298948443822229e-08, "loss": 0.0, "mean_logprobs": -0.00116729736328125, "mean_logprobs/var": 2.041459083557129e-06, "num_completions/total": 9216, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.39170837402344, "per_token_feature_norm/max": 208.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 174.0, "per_token_feature_norm/p25": 194.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 13.309099197387695, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0006107091903686523, "per_token_policy_error_norm/max": 1.9921875, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0005579852731898427, "policy_entropy": 0.0014145191526040435, "policy_entropy/max": 1.1640625, "policy_entropy/median": 2.7567148208618164e-06, "policy_entropy/min": 1.5599653124809265e-08, "policy_entropy/p25": 1.6689300537109375e-06, "policy_entropy/p75": 7.599592208862305e-06, "policy_entropy/var": 0.0007848397362977266, "policy_error_vector_variance/max_squared_error": 1.992361068725586, "policy_error_vector_variance/metric": 0.0006105691427364945, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.935483932495117, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": 0.24609375, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.3402467668056488, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 96 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 3.264968295724202e-09, "adam_stats/lr_effective_mean": -1.4554324881247728e-14, "adam_stats/lr_effective_min": -2.894799067121312e-09, "adam_stats/m_t_max": 6.112111441325396e-05, "adam_stats/m_t_mean": 7.394487583009535e-13, "adam_stats/m_t_min": -7.796355203026906e-05, "adam_stats/v_t_max": 8.60677901073359e-05, "adam_stats/v_t_mean": 9.004044038141146e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0015182655770331621, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -4.71875, "all_logprobs/p1": -0.0006866455078125, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -5.0067901611328125e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.002564897295087576, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.240506329113924, "feature_vector_variance/max_squared_error": 43090.453125, "feature_vector_variance/metric": 4094.163818359375, "generated_tokens/total": 7350207.0, "grad_norm": 0.0, "learning_rate": 4.108578473795033e-08, "loss": 0.0, "mean_logprobs": -0.00151824951171875, "mean_logprobs/var": 2.250075340270996e-06, "num_completions/total": 9312, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.39599609375, "per_token_feature_norm/max": 208.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 176.0, "per_token_feature_norm/p25": 194.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 10.883561134338379, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0008535782690159976, "per_token_policy_error_norm/max": 1.953125, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0008769070846028626, "policy_entropy": 0.0016066529788076878, "policy_entropy/max": 1.4140625, "policy_entropy/median": 2.60770320892334e-06, "policy_entropy/min": 3.3527612686157227e-08, "policy_entropy/p25": 1.5869736671447754e-06, "policy_entropy/p75": 7.12275505065918e-06, "policy_entropy/var": 0.0009220250649377704, "policy_error_vector_variance/max_squared_error": 1.9516218900680542, "policy_error_vector_variance/metric": 0.0008545710588805377, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.92408561706543, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.38726943731307983, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 97 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 1.3068998105936203e-09, "adam_stats/lr_effective_mean": -5.8273694131586156e-15, "adam_stats/lr_effective_min": -1.1590821635820703e-09, "adam_stats/m_t_max": 5.500900078914128e-05, "adam_stats/m_t_mean": 6.655040017330971e-13, "adam_stats/m_t_min": -7.016719609964639e-05, "adam_stats/v_t_max": 8.598172280471772e-05, "adam_stats/v_t_mean": 8.99503908857735e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0015303425025194883, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -7.28125, "all_logprobs/p1": -0.0006103515625, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -6.556510925292969e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0032147751189768314, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.2531645569620253, "feature_vector_variance/max_squared_error": 41480.92578125, "feature_vector_variance/metric": 4338.0888671875, "generated_tokens/total": 7448511.0, "grad_norm": 0.0, "learning_rate": 1.8269623051318517e-08, "loss": 0.0, "mean_logprobs": -0.00153350830078125, "mean_logprobs/var": 3.471970558166504e-06, "num_completions/total": 9408, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.6282196044922, "per_token_feature_norm/max": 210.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 174.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 12.371005058288574, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0008120139827951789, "per_token_policy_error_norm/max": 1.984375, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0008223685435950756, "policy_entropy": 0.001559280208311975, "policy_entropy/max": 1.203125, "policy_entropy/median": 2.637505531311035e-06, "policy_entropy/min": 2.2118911147117615e-08, "policy_entropy/p25": 1.6093254089355469e-06, "policy_entropy/p75": 7.18235969543457e-06, "policy_entropy/var": 0.0009060195297934115, "policy_error_vector_variance/max_squared_error": 1.9889070987701416, "policy_error_vector_variance/metric": 0.0008123097941279411, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.929672241210938, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": 0.15043869614601135, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.3528544306755066, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 98 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": 2.941981380732983e-10, "adam_stats/lr_effective_mean": -1.3121620319976861e-15, "adam_stats/lr_effective_min": -2.6100235861470367e-10, "adam_stats/m_t_max": 4.9508100346429273e-05, "adam_stats/m_t_mean": 5.989537208220264e-13, "adam_stats/m_t_min": -6.31504735792987e-05, "adam_stats/v_t_max": 8.589574281359091e-05, "adam_stats/v_t_mean": 8.98604454735441e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0011719821486622095, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -9.4375, "all_logprobs/p1": -0.000492095947265625, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -5.7220458984375e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.0023700501769781113, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.2658227848101267, "feature_vector_variance/max_squared_error": 40846.73828125, "feature_vector_variance/metric": 4530.20947265625, "generated_tokens/total": 7546815.0, "grad_norm": 0.0, "learning_rate": 4.568797356781784e-09, "loss": 0.0, "mean_logprobs": -0.0011749267578125, "mean_logprobs/var": 1.8998980522155762e-06, "num_completions/total": 9504, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.4979705810547, "per_token_feature_norm/max": 210.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 171.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 11.784880638122559, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0006043116445653141, "per_token_policy_error_norm/max": 1.90625, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0005469759926199913, "policy_entropy": 0.0013955303002148867, "policy_entropy/max": 1.296875, "policy_entropy/median": 2.726912498474121e-06, "policy_entropy/min": 1.525040715932846e-08, "policy_entropy/p25": 1.5720725059509277e-06, "policy_entropy/p75": 7.927417755126953e-06, "policy_entropy/var": 0.0007612555054947734, "policy_error_vector_variance/max_squared_error": 1.9082715511322021, "policy_error_vector_variance/metric": 0.0006050632800906897, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.93250560760498, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": -0.0, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.35288166999816895, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 99 }, { "accuracy_reward": 0.0, "accuracy_reward/incorrect": 0.0, "accuracy_reward/incorrect/max": 0.0, "accuracy_reward/incorrect/median": 0.0, "accuracy_reward/incorrect/min": 0.0, "accuracy_reward/incorrect/p25": 0.0, "accuracy_reward/incorrect/p75": 0.0, "accuracy_reward/incorrect/var": 0.0, "accuracy_reward/max": 0.0, "accuracy_reward/median": 0.0, "accuracy_reward/min": 0.0, "accuracy_reward/p25": 0.0, "accuracy_reward/p75": 0.0, "accuracy_reward/var": 0.0, "action_level_variance/metric": NaN, "action_level_variance_full_gradient/metric": 0.0, "adam_stats/lr_effective_max": -0.0, "adam_stats/lr_effective_mean": 0.0, "adam_stats/lr_effective_min": -0.0, "adam_stats/m_t_max": 4.455728776520118e-05, "adam_stats/m_t_mean": 5.390583270557803e-13, "adam_stats/m_t_min": -5.6835426221368834e-05, "adam_stats/v_t_max": 8.580985013395548e-05, "adam_stats/v_t_mean": 8.97705867974885e-12, "adam_stats/v_t_min": 0.0, "advantages": 0.0, "advantages/max": 0.0, "advantages/median": 0.0, "advantages/min": 0.0, "advantages/p25": 0.0, "advantages/p75": 0.0, "advantages/var": 0.0, "all_logprobs": -0.0016299986746162176, "all_logprobs/max": 0.0, "all_logprobs/median": -1.1920928955078125e-07, "all_logprobs/min": -8.5, "all_logprobs/p1": -0.000682830810546875, "all_logprobs/p10": -1.5497207641601562e-06, "all_logprobs/p25": -4.76837158203125e-07, "all_logprobs/p5": -6.4373016357421875e-06, "all_logprobs/p75": -1.1920928955078125e-07, "all_logprobs/var": 0.003999890293926001, "clip_ratio": 0.0, "completion_length": 1024.0, "completion_length/incorrect": 1024.0, "completion_length/incorrect/max": 1024.0, "completion_length/incorrect/median": 1024.0, "completion_length/incorrect/min": 1024.0, "completion_length/incorrect/p25": 1024.0, "completion_length/incorrect/p75": 1024.0, "completion_length/incorrect/var": 0.0, "completion_length/max": 1024.0, "completion_length/median": 1024.0, "completion_length/min": 1024.0, "completion_length/p25": 1024.0, "completion_length/p75": 1024.0, "completion_length/var": 0.0, "epoch": 1.2784810126582278, "feature_vector_variance/max_squared_error": 41136.21484375, "feature_vector_variance/metric": 4148.2470703125, "generated_tokens/total": 7645119.0, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.0, "mean_logprobs": -0.0016326904296875, "mean_logprobs/var": 3.874301910400391e-06, "num_completions/total": 9600, "per_sentence_gradient_norm": 0.0, "per_sentence_gradient_norm/max": 0.0, "per_sentence_gradient_norm/median": 0.0, "per_sentence_gradient_norm/min": 0.0, "per_sentence_gradient_norm/p25": 0.0, "per_sentence_gradient_norm/p75": 0.0, "per_sentence_gradient_norm/p85": 0.0, "per_sentence_gradient_norm/p90": 0.0, "per_sentence_gradient_norm/p95": 0.0, "per_sentence_gradient_norm/p99": 0.0, "per_sentence_gradient_norm/var": 0.0, "per_token_feature_norm": 196.48910522460938, "per_token_feature_norm/max": 209.0, "per_token_feature_norm/median": 197.0, "per_token_feature_norm/min": 163.0, "per_token_feature_norm/p25": 195.0, "per_token_feature_norm/p75": 199.0, "per_token_feature_norm/var": 11.054245948791504, "per_token_full_gradient_variance/max_squared_error": 0.0, "per_token_full_gradient_variance/variance": 0.0, "per_token_gradient_norm": 0.0, "per_token_gradient_norm/max": 0.0, "per_token_gradient_norm/median": 0.0, "per_token_gradient_norm/min": 0.0, "per_token_gradient_norm/p1": 0.0, "per_token_gradient_norm/p10": 0.0, "per_token_gradient_norm/p25": 0.0, "per_token_gradient_norm/p5": 0.0, "per_token_gradient_norm/p75": 0.0, "per_token_gradient_norm/var": 0.0, "per_token_policy_error_norm": 0.0008255640859715641, "per_token_policy_error_norm/max": 2.0, "per_token_policy_error_norm/median": 0.0, "per_token_policy_error_norm/min": 0.0, "per_token_policy_error_norm/p25": 0.0, "per_token_policy_error_norm/p75": 0.0, "per_token_policy_error_norm/var": 0.0008417711360380054, "policy_entropy": 0.0016228508902713656, "policy_entropy/max": 1.3203125, "policy_entropy/median": 2.5779008865356445e-06, "policy_entropy/min": 1.641456037759781e-08, "policy_entropy/p25": 1.5497207641601562e-06, "policy_entropy/p75": 7.271766662597656e-06, "policy_entropy/var": 0.0009355874499306083, "policy_error_vector_variance/max_squared_error": 1.9996933937072754, "policy_error_vector_variance/metric": 0.0008259442402049899, "policy_loss": 0.0, "policy_loss/max": 0.0, "policy_loss/median": 0.0, "policy_loss/min": 0.0, "policy_loss/p25": 0.0, "policy_loss/p75": 0.0, "policy_loss/var": 0.0, "policy_sharpness": 9.922906875610352, "policy_sharpness/max": 10.0, "policy_sharpness/median": 10.0, "policy_sharpness/min": 0.0841960534453392, "policy_sharpness/p25": 10.0, "policy_sharpness/p75": 10.0, "policy_sharpness/var": 0.39256152510643005, "reward": 0.0, "reward/max": 0.0, "reward/median": 0.0, "reward/min": 0.0, "reward/p25": 0.0, "reward/p75": 0.0, "reward/var": 0.0, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/max": 0.0, "rewards/accuracy_reward/median": 0.0, "rewards/accuracy_reward/min": 0.0, "rewards/accuracy_reward/p25": 0.0, "rewards/accuracy_reward/p75": 0.0, "rewards/accuracy_reward/var": 0.0, "sentence_full_gradient_variance/max_squared_error": 0.0, "sentence_full_gradient_variance/metric": 0.0, "sentence_full_gradient_variance/p75": 0.0, "sentence_full_gradient_variance/p90": 0.0, "sentence_full_gradient_variance/p95": 0.0, "sentence_full_gradient_variance/p99": 0.0, "state_level_variance/metric": 0.0, "state_level_variance_full_gradient/metric": 0.0, "step": 100 }, { "adam_stats/lr_effective_max": -0.0, "adam_stats/lr_effective_mean": 0.0, "adam_stats/lr_effective_min": -0.0, "adam_stats/m_t_max": 4.455728776520118e-05, "adam_stats/m_t_mean": 5.390583270557803e-13, "adam_stats/m_t_min": -5.6835426221368834e-05, "adam_stats/v_t_max": 8.580985013395548e-05, "adam_stats/v_t_mean": 8.97705867974885e-12, "adam_stats/v_t_min": 0.0, "epoch": 1.2784810126582278, "step": 100, "total_flos": 0.0, "train_loss": -0.2709375082887709, "train_runtime": 17046.2867, "train_samples_per_second": 0.563, "train_steps_per_second": 0.006 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 24, "trial_name": null, "trial_params": null }