{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.048209366391184574, "eval_steps": 500, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 249.375, "epoch": 6.887052341597796e-05, "grad_norm": 0.14106940223836606, "kl": 0.3515625, "learning_rate": 0.0, "loss": 0.2208, "reward": 3.954012870788574, "reward_std": 0.7584215998649597, "rewards/accuracy_reward": 0.7194315195083618, "rewards/accuracy_reward/std": 0.386247843503952, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.10690449178218842, "rewards/ngram_similarity_reward": 0.2725999057292938, "rewards/ngram_similarity_reward/std": 0.1362065076828003, "rewards/sql_execution_reward_func": 0.3062499761581421, "rewards/sql_execution_reward_func/std": 0.1237436905503273, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0001377410468319559, "grad_norm": 0.14136185175847865, "kl": 0.3515625, "learning_rate": 6.666666666666667e-09, "loss": 0.2208, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 198.5, "epoch": 0.00020661157024793388, "grad_norm": 0.05394932040958392, "kl": 0.306640625, "learning_rate": 1.3333333333333334e-08, "loss": -0.0002, "reward": 4.688358306884766, "reward_std": 0.32646387815475464, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.37557196617126465, "rewards/ngram_similarity_reward/std": 0.21272745728492737, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0006297228974290192, "clip_ratio/high_mean": 0.0006297228974290192, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006297228974290192, "epoch": 0.0002754820936639118, "grad_norm": 0.05459109341502823, "kl": 0.306640625, "learning_rate": 2e-08, "loss": -0.0004, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 170.125, "epoch": 0.0003443526170798898, "grad_norm": 0.08526141673940196, "kl": 1.015625, "learning_rate": 2.6666666666666667e-08, "loss": 0.0142, "reward": 5.02903938293457, "reward_std": 0.4545746445655823, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.6026931405067444, "rewards/ngram_similarity_reward/std": 0.2891167402267456, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00041322314049586776, "grad_norm": 0.08500782967524835, "kl": 1.015625, "learning_rate": 3.3333333333333334e-08, "loss": 0.014, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 290.5, "epoch": 0.00048209366391184575, "grad_norm": 0.37886074059236424, "kl": 0.181640625, "learning_rate": 4e-08, "loss": 0.8005, "reward": 4.805356979370117, "reward_std": 1.6273210048675537, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward_func": 0.7821428775787354, "rewards/format_reward_func/std": 0.11823531985282898, "rewards/ngram_similarity_reward": 0.75, "rewards/ngram_similarity_reward/std": 0.4629100561141968, "rewards/sql_execution_reward_func": 0.40714287757873535, "rewards/sql_execution_reward_func/std": 0.1064261868596077, "rewards/xml_reward_func": 0.9910714626312256, "rewards/xml_reward_func/std": 0.025253823027014732, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005509641873278236, "grad_norm": 0.3784673622925976, "kl": 0.181640625, "learning_rate": 4.666666666666667e-08, "loss": 0.8002, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 273.875, "epoch": 0.0006198347107438017, "grad_norm": 0.09643978915920703, "kl": 2.265625, "learning_rate": 5.3333333333333334e-08, "loss": -0.0655, "reward": 4.893341064453125, "reward_std": 0.42967408895492554, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8166666626930237, "rewards/format_reward_func/std": 0.07766430824995041, "rewards/ngram_similarity_reward": 0.48861628770828247, "rewards/ngram_similarity_reward/std": 0.22403031587600708, "rewards/sql_execution_reward_func": 0.34375, "rewards/sql_execution_reward_func/std": 0.06781013309955597, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.00045641258475370705, "clip_ratio/high_mean": 0.00045641258475370705, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00045641258475370705, "epoch": 0.0006887052341597796, "grad_norm": 0.09785552082555604, "kl": 2.265625, "learning_rate": 6e-08, "loss": -0.0657, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 527.625, "epoch": 0.0007575757575757576, "grad_norm": 0.025669645081916652, "kl": 0.01904296875, "learning_rate": 6.666666666666667e-08, "loss": 0.031, "reward": 3.1130290031433105, "reward_std": 0.22338490188121796, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7708333730697632, "rewards/format_reward_func/std": 0.12010246515274048, "rewards/ngram_similarity_reward": 0.3781304359436035, "rewards/ngram_similarity_reward/std": 0.10036895424127579, "rewards/sql_execution_reward_func": 0.2750000059604645, "rewards/sql_execution_reward_func/std": 0.0963624119758606, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008264462809917355, "grad_norm": 0.025342042305390697, "kl": 0.0189208984375, "learning_rate": 7.333333333333333e-08, "loss": 0.031, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 148.25, "epoch": 0.0008953168044077135, "grad_norm": 0.12952172501374104, "kl": 3.484375, "learning_rate": 8e-08, "loss": 0.092, "reward": 5.364823341369629, "reward_std": 0.27514246106147766, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7958333492279053, "rewards/format_reward_func/std": 0.08807913959026337, "rewards/ngram_similarity_reward": 0.8293269276618958, "rewards/ngram_similarity_reward/std": 0.19596454501152039, "rewards/sql_execution_reward_func": 0.32499998807907104, "rewards/sql_execution_reward_func/std": 0.05345224216580391, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0009641873278236915, "grad_norm": 0.13015943186069562, "kl": 3.484375, "learning_rate": 8.666666666666666e-08, "loss": 0.0924, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 591.75, "epoch": 0.0010330578512396695, "grad_norm": 0.04169713045939567, "kl": 1.703125, "learning_rate": 9.333333333333334e-08, "loss": 0.313, "reward": 2.6369152069091797, "reward_std": 0.33855679631233215, "rewards/accuracy_reward": 0.21875, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/format_reward_func": 0.7666666507720947, "rewards/format_reward_func/std": 0.1511857807636261, "rewards/ngram_similarity_reward": 0.12183237820863724, "rewards/ngram_similarity_reward/std": 0.1122971773147583, "rewards/sql_execution_reward_func": 0.25, "rewards/sql_execution_reward_func/std": 0.12535662949085236, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0011019283746556473, "grad_norm": 0.041853380201235815, "kl": 1.703125, "learning_rate": 1e-07, "loss": 0.3129, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 147.75, "epoch": 0.0011707988980716253, "grad_norm": 0.09619757245127532, "kl": 3.265625, "learning_rate": 1.0666666666666667e-07, "loss": -0.0167, "reward": 3.7678802013397217, "reward_std": 0.1859426647424698, "rewards/accuracy_reward": 0.46875, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/format_reward_func": 0.8250000476837158, "rewards/format_reward_func/std": 0.034503273665905, "rewards/ngram_similarity_reward": 0.4244202971458435, "rewards/ngram_similarity_reward/std": 0.14735734462738037, "rewards/sql_execution_reward_func": 0.3687500059604645, "rewards/sql_execution_reward_func/std": 0.025877464562654495, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0012396694214876034, "grad_norm": 0.09573294054574594, "kl": 3.265625, "learning_rate": 1.1333333333333332e-07, "loss": -0.0168, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 8.247422374552116e-05, "clip_ratio/low_min": 8.247422374552116e-05, "clip_ratio/region_mean": 8.247422374552116e-05, "completion_length": 1515.625, "epoch": 0.0013085399449035812, "grad_norm": 0.06824618081038158, "kl": 0.03857421875, "learning_rate": 1.2e-07, "loss": 0.4451, "reward": 2.742703437805176, "reward_std": 0.8349890112876892, "rewards/accuracy_reward": 0.2604166567325592, "rewards/accuracy_reward/std": 0.18600596487522125, "rewards/format_reward_func": 0.7102083563804626, "rewards/format_reward_func/std": 0.18320870399475098, "rewards/ngram_similarity_reward": 0.1538604199886322, "rewards/ngram_similarity_reward/std": 0.20534415543079376, "rewards/sql_execution_reward_func": 0.39180871844291687, "rewards/sql_execution_reward_func/std": 0.33968406915664673, "rewards/xml_reward_func": 0.8890625238418579, "rewards/xml_reward_func/std": 0.1831565499305725, "step": 19 }, { "clip_ratio/high_max": 8.247422374552116e-05, "clip_ratio/high_mean": 8.247422374552116e-05, "clip_ratio/low_mean": 0.00016494844749104232, "clip_ratio/low_min": 0.00016494844749104232, "clip_ratio/region_mean": 0.0002474226930644363, "epoch": 0.0013774104683195593, "grad_norm": 0.06793192793354551, "kl": 0.03857421875, "learning_rate": 1.2666666666666666e-07, "loss": 0.4448, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 923.25, "epoch": 0.001446280991735537, "grad_norm": 0.22284656218085624, "kl": 0.474609375, "learning_rate": 1.3333333333333334e-07, "loss": 1.5775, "reward": 3.047518730163574, "reward_std": 1.4419806003570557, "rewards/accuracy_reward": 0.44843751192092896, "rewards/accuracy_reward/std": 0.3488141596317291, "rewards/format_reward_func": 0.7041666507720947, "rewards/format_reward_func/std": 0.26333484053611755, "rewards/ngram_similarity_reward": 0.19765129685401917, "rewards/ngram_similarity_reward/std": 0.1177549660205841, "rewards/sql_execution_reward_func": 0.3375000059604645, "rewards/sql_execution_reward_func/std": 0.14577379822731018, "rewards/xml_reward_func": 0.8125, "rewards/xml_reward_func/std": 0.3720119297504425, "step": 21 }, { "clip_ratio/high_max": 0.0001353912812191993, "clip_ratio/high_mean": 0.0001353912812191993, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001353912812191993, "epoch": 0.0015151515151515152, "grad_norm": 0.2278612330212295, "kl": 0.474609375, "learning_rate": 1.4e-07, "loss": 1.5768, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 192.25, "epoch": 0.0015840220385674932, "grad_norm": 0.037016919597560226, "kl": 0.1357421875, "learning_rate": 1.4666666666666666e-07, "loss": 0.0179, "reward": 4.911600112915039, "reward_std": 0.2595466375350952, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8083333969116211, "rewards/format_reward_func/std": 0.0235702246427536, "rewards/ngram_similarity_reward": 0.5146780014038086, "rewards/ngram_similarity_reward/std": 0.1712336391210556, "rewards/sql_execution_reward_func": 0.33125001192092896, "rewards/sql_execution_reward_func/std": 0.0530330091714859, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.001652892561983471, "grad_norm": 0.03706101218810853, "kl": 0.1357421875, "learning_rate": 1.533333333333333e-07, "loss": 0.0179, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00044503781828098, "clip_ratio/low_min": 0.00044503781828098, "clip_ratio/region_mean": 0.00044503781828098, "completion_length": 280.875, "epoch": 0.001721763085399449, "grad_norm": 0.2756667112375752, "kl": 0.015380859375, "learning_rate": 1.6e-07, "loss": 0.1488, "reward": 4.010749340057373, "reward_std": 1.2190486192703247, "rewards/accuracy_reward": 0.753125011920929, "rewards/accuracy_reward/std": 0.3709152638912201, "rewards/format_reward_func": 0.7583333253860474, "rewards/format_reward_func/std": 0.1178511381149292, "rewards/ngram_similarity_reward": 0.29188868403434753, "rewards/ngram_similarity_reward/std": 0.2935604453086853, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 0.9583333730697632, "rewards/xml_reward_func/std": 0.117851123213768, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.001790633608815427, "grad_norm": 0.2745062045577718, "kl": 0.0155029296875, "learning_rate": 1.6666666666666665e-07, "loss": 0.1493, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 938.25, "epoch": 0.001859504132231405, "grad_norm": 0.10782750178330544, "kl": 3.703125, "learning_rate": 1.7333333333333332e-07, "loss": 0.1627, "reward": 3.540351629257202, "reward_std": 1.1052250862121582, "rewards/accuracy_reward": 0.5625, "rewards/accuracy_reward/std": 0.47715675830841064, "rewards/format_reward_func": 0.808055579662323, "rewards/format_reward_func/std": 0.12314479798078537, "rewards/ngram_similarity_reward": 0.2758462131023407, "rewards/ngram_similarity_reward/std": 0.18577206134796143, "rewards/sql_execution_reward_func": 0.21852678060531616, "rewards/sql_execution_reward_func/std": 0.06137052923440933, "rewards/xml_reward_func": 0.9750000238418579, "rewards/xml_reward_func/std": 0.0707106739282608, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.001928374655647383, "grad_norm": 0.10813204661781033, "kl": 3.703125, "learning_rate": 1.8e-07, "loss": 0.1628, "step": 28 }, { "clip_ratio/high_max": 0.00024313153699040413, "clip_ratio/high_mean": 0.00024313153699040413, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024313153699040413, "completion_length": 514.125, "epoch": 0.001997245179063361, "grad_norm": 0.05110035960988501, "kl": 0.06591796875, "learning_rate": 1.8666666666666667e-07, "loss": 0.0678, "reward": 2.8232598304748535, "reward_std": 0.3983207046985626, "rewards/accuracy_reward": 0.21482065320014954, "rewards/accuracy_reward/std": 0.1354871243238449, "rewards/format_reward_func": 0.8916667103767395, "rewards/format_reward_func/std": 0.09385906159877777, "rewards/ngram_similarity_reward": 0.12630115449428558, "rewards/ngram_similarity_reward/std": 0.11305492371320724, "rewards/sql_execution_reward_func": 0.3125, "rewards/sql_execution_reward_func/std": 0.13024702668190002, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 29 }, { "clip_ratio/high_max": 0.00024313153699040413, "clip_ratio/high_mean": 0.00024313153699040413, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024313153699040413, "epoch": 0.002066115702479339, "grad_norm": 0.05180116005716909, "kl": 0.06591796875, "learning_rate": 1.9333333333333332e-07, "loss": 0.0678, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0004379242309369147, "clip_ratio/low_min": 0.0004379242309369147, "clip_ratio/region_mean": 0.0004379242309369147, "completion_length": 1141.75, "epoch": 0.002134986225895317, "grad_norm": 0.3637775197159897, "kl": 0.130859375, "learning_rate": 2e-07, "loss": 1.4956, "reward": 2.9458391666412354, "reward_std": 1.6958247423171997, "rewards/accuracy_reward": 0.53125, "rewards/accuracy_reward/std": 0.4317220449447632, "rewards/format_reward_func": 0.6066666841506958, "rewards/format_reward_func/std": 0.2630227208137512, "rewards/ngram_similarity_reward": 0.1997261345386505, "rewards/ngram_similarity_reward/std": 0.1182771772146225, "rewards/sql_execution_reward_func": 0.22291666269302368, "rewards/sql_execution_reward_func/std": 0.15169347822666168, "rewards/xml_reward_func": 0.7541666626930237, "rewards/xml_reward_func/std": 0.44066599011421204, "step": 31 }, { "clip_ratio/high_max": 0.00010948105773422867, "clip_ratio/high_mean": 0.00010948105773422867, "clip_ratio/low_mean": 0.00032844318775460124, "clip_ratio/low_min": 0.00032844318775460124, "clip_ratio/region_mean": 0.0004379242309369147, "epoch": 0.0022038567493112946, "grad_norm": 0.40458628409251646, "kl": 0.130859375, "learning_rate": 2.0666666666666666e-07, "loss": 1.4956, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 648.375, "epoch": 0.0022727272727272726, "grad_norm": 0.11782456474919771, "kl": 0.06884765625, "learning_rate": 2.1333333333333334e-07, "loss": 0.3303, "reward": 3.084779977798462, "reward_std": 0.9518809914588928, "rewards/accuracy_reward": 0.29189324378967285, "rewards/accuracy_reward/std": 0.31050875782966614, "rewards/format_reward_func": 0.7148550748825073, "rewards/format_reward_func/std": 0.1123131588101387, "rewards/ngram_similarity_reward": 0.2901430130004883, "rewards/ngram_similarity_reward/std": 0.21228431165218353, "rewards/sql_execution_reward_func": 0.35092389583587646, "rewards/sql_execution_reward_func/std": 0.1605135202407837, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 33 }, { "clip_ratio/high_max": 0.0003855793329421431, "clip_ratio/high_mean": 0.0003855793329421431, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003855793329421431, "epoch": 0.0023415977961432507, "grad_norm": 0.11760934237835333, "kl": 0.06884765625, "learning_rate": 2.1999999999999998e-07, "loss": 0.3299, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 258.0, "epoch": 0.0024104683195592287, "grad_norm": 0.14741203197795408, "kl": 1.4296875, "learning_rate": 2.2666666666666663e-07, "loss": -0.1719, "reward": 4.517668724060059, "reward_std": 1.101088523864746, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.8250000476837158, "rewards/format_reward_func/std": 0.034503273665905, "rewards/ngram_similarity_reward": 0.4159455895423889, "rewards/ngram_similarity_reward/std": 0.3752118945121765, "rewards/sql_execution_reward_func": 0.3187499940395355, "rewards/sql_execution_reward_func/std": 0.0752970352768898, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00048449612222611904, "clip_ratio/low_min": 0.00048449612222611904, "clip_ratio/region_mean": 0.00048449612222611904, "epoch": 0.0024793388429752068, "grad_norm": 0.14762912722918978, "kl": 1.4296875, "learning_rate": 2.3333333333333333e-07, "loss": -0.172, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0003392130311112851, "clip_ratio/low_min": 0.0003392130311112851, "clip_ratio/region_mean": 0.0003392130311112851, "completion_length": 368.5, "epoch": 0.0025482093663911844, "grad_norm": 0.15669020494220293, "kl": 1.203125, "learning_rate": 2.4e-07, "loss": 0.2235, "reward": 3.1981868743896484, "reward_std": 1.0118329524993896, "rewards/accuracy_reward": 0.31562501192092896, "rewards/accuracy_reward/std": 0.3691054582595825, "rewards/format_reward_func": 0.8083333373069763, "rewards/format_reward_func/std": 0.18665815889835358, "rewards/ngram_similarity_reward": 0.25990235805511475, "rewards/ngram_similarity_reward/std": 0.27765896916389465, "rewards/sql_execution_reward_func": 0.3687500059604645, "rewards/sql_execution_reward_func/std": 0.03720119222998619, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0003392130311112851, "clip_ratio/high_mean": 0.0003392130311112851, "clip_ratio/low_mean": 0.0003392130311112851, "clip_ratio/low_min": 0.0003392130311112851, "clip_ratio/region_mean": 0.0006784260622225702, "epoch": 0.0026170798898071624, "grad_norm": 0.15477088907847442, "kl": 1.203125, "learning_rate": 2.4666666666666665e-07, "loss": 0.2235, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 262.25, "epoch": 0.0026859504132231405, "grad_norm": 0.1256628831129127, "kl": 2.125, "learning_rate": 2.533333333333333e-07, "loss": -0.1082, "reward": 4.095180511474609, "reward_std": 0.9146755933761597, "rewards/accuracy_reward": 0.71875, "rewards/accuracy_reward/std": 0.38816189765930176, "rewards/format_reward_func": 0.7916666865348816, "rewards/format_reward_func/std": 0.08309489488601685, "rewards/ngram_similarity_reward": 0.3690091073513031, "rewards/ngram_similarity_reward/std": 0.1661357879638672, "rewards/sql_execution_reward_func": 0.3125, "rewards/sql_execution_reward_func/std": 0.06943650543689728, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0027548209366391185, "grad_norm": 0.12396827062803148, "kl": 2.125, "learning_rate": 2.6e-07, "loss": -0.1081, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00019805901683866978, "clip_ratio/low_min": 0.00019805901683866978, "clip_ratio/region_mean": 0.00019805901683866978, "completion_length": 1262.25, "epoch": 0.0028236914600550966, "grad_norm": 0.10707796225481378, "kl": 5.71875, "learning_rate": 2.6666666666666667e-07, "loss": 0.4822, "reward": 2.7226686477661133, "reward_std": 0.9622436761856079, "rewards/accuracy_reward": 0.19143739342689514, "rewards/accuracy_reward/std": 0.34808802604675293, "rewards/format_reward_func": 0.8017659187316895, "rewards/format_reward_func/std": 0.17648838460445404, "rewards/ngram_similarity_reward": 0.12260061502456665, "rewards/ngram_similarity_reward/std": 0.11804450303316116, "rewards/sql_execution_reward_func": 0.4048214256763458, "rewards/sql_execution_reward_func/std": 0.21673692762851715, "rewards/xml_reward_func": 0.949305534362793, "rewards/xml_reward_func/std": 0.13677559792995453, "step": 41 }, { "clip_ratio/high_max": 9.902950841933489e-05, "clip_ratio/high_mean": 9.902950841933489e-05, "clip_ratio/low_mean": 9.902950841933489e-05, "clip_ratio/low_min": 9.902950841933489e-05, "clip_ratio/region_mean": 0.00019805901683866978, "epoch": 0.002892561983471074, "grad_norm": 0.10432352423260693, "kl": 5.53125, "learning_rate": 2.733333333333333e-07, "loss": 0.4819, "step": 42 }, { "clip_ratio/high_max": 0.00011611704394454136, "clip_ratio/high_mean": 0.00011611704394454136, "clip_ratio/low_mean": 0.00011611704394454136, "clip_ratio/low_min": 0.00011611704394454136, "clip_ratio/region_mean": 0.00023223408788908273, "completion_length": 1076.5, "epoch": 0.0029614325068870523, "grad_norm": 0.23572456106351736, "kl": 0.25, "learning_rate": 2.8e-07, "loss": 0.6941, "reward": 2.2679529190063477, "reward_std": 0.8697607517242432, "rewards/accuracy_reward": 0.09375, "rewards/accuracy_reward/std": 0.0578637570142746, "rewards/format_reward_func": 0.7783333659172058, "rewards/format_reward_func/std": 0.27315327525138855, "rewards/ngram_similarity_reward": 0.0951630026102066, "rewards/ngram_similarity_reward/std": 0.07827582955360413, "rewards/sql_execution_reward_func": 0.31562501192092896, "rewards/sql_execution_reward_func/std": 0.20041808485984802, "rewards/xml_reward_func": 0.84375, "rewards/xml_reward_func/std": 0.35197150707244873, "step": 43 }, { "clip_ratio/high_max": 0.00023223408788908273, "clip_ratio/high_mean": 0.00023223408788908273, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023223408788908273, "epoch": 0.0030303030303030303, "grad_norm": 0.22211517600123934, "kl": 0.25, "learning_rate": 2.866666666666667e-07, "loss": 0.6939, "step": 44 }, { "clip_ratio/high_max": 0.0001763512846082449, "clip_ratio/high_mean": 0.0001763512846082449, "clip_ratio/low_mean": 0.00026452692691236734, "clip_ratio/low_min": 0.00026452692691236734, "clip_ratio/region_mean": 0.0004408782406244427, "completion_length": 1417.625, "epoch": 0.0030991735537190084, "grad_norm": 0.23099340186897963, "kl": 2.8125, "learning_rate": 2.933333333333333e-07, "loss": 0.3594, "reward": 2.0684800148010254, "reward_std": 0.9137807488441467, "rewards/accuracy_reward": 0.0887838751077652, "rewards/accuracy_reward/std": 0.21191981434822083, "rewards/format_reward_func": 0.8311904668807983, "rewards/format_reward_func/std": 0.2879602313041687, "rewards/ngram_similarity_reward": 0.03874307870864868, "rewards/ngram_similarity_reward/std": 0.04205494001507759, "rewards/sql_execution_reward_func": 0.145357146859169, "rewards/sql_execution_reward_func/std": 0.1280505359172821, "rewards/xml_reward_func": 0.856249988079071, "rewards/xml_reward_func/std": 0.34993621706962585, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00026452692691236734, "clip_ratio/low_min": 0.00026452692691236734, "clip_ratio/region_mean": 0.00026452692691236734, "epoch": 0.0031680440771349864, "grad_norm": 0.2301543777272379, "kl": 2.8125, "learning_rate": 3e-07, "loss": 0.3591, "step": 46 }, { "clip_ratio/high_max": 0.0004037141625303775, "clip_ratio/high_mean": 0.0004037141625303775, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004037141625303775, "completion_length": 309.625, "epoch": 0.003236914600550964, "grad_norm": 0.15856078056273973, "kl": 0.7734375, "learning_rate": 3.066666666666666e-07, "loss": 0.2139, "reward": 4.2303667068481445, "reward_std": 0.7305248379707336, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.815000057220459, "rewards/format_reward_func/std": 0.04242641106247902, "rewards/ngram_similarity_reward": 0.20399489998817444, "rewards/ngram_similarity_reward/std": 0.16360625624656677, "rewards/sql_execution_reward_func": 0.359375, "rewards/sql_execution_reward_func/std": 0.018600599840283394, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0004037141625303775, "clip_ratio/high_mean": 0.0004037141625303775, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004037141625303775, "epoch": 0.003305785123966942, "grad_norm": 0.1574006908263827, "kl": 0.78125, "learning_rate": 3.1333333333333333e-07, "loss": 0.2137, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 130.625, "epoch": 0.00337465564738292, "grad_norm": 0.028947810268013723, "kl": 0.322265625, "learning_rate": 3.2e-07, "loss": -0.0106, "reward": 4.402675628662109, "reward_std": 0.11463428288698196, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.75, "rewards/format_reward_func/std": 0.09258200973272324, "rewards/ngram_similarity_reward": 0.19761735200881958, "rewards/ngram_similarity_reward/std": 0.041315577924251556, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003443526170798898, "grad_norm": 0.029603525092829052, "kl": 0.322265625, "learning_rate": 3.2666666666666663e-07, "loss": -0.0108, "step": 50 }, { "clip_ratio/high_max": 0.0001478415069868788, "clip_ratio/high_mean": 0.0001478415069868788, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001478415069868788, "completion_length": 845.5, "epoch": 0.0035123966942148762, "grad_norm": 0.17300846922395424, "kl": 0.77734375, "learning_rate": 3.333333333333333e-07, "loss": 1.2508, "reward": 3.305680274963379, "reward_std": 1.5203808546066284, "rewards/accuracy_reward": 0.53125, "rewards/accuracy_reward/std": 0.5077524185180664, "rewards/format_reward_func": 0.7262986898422241, "rewards/format_reward_func/std": 0.21407900750637054, "rewards/ngram_similarity_reward": 0.204013854265213, "rewards/ngram_similarity_reward/std": 0.20759092271327972, "rewards/sql_execution_reward_func": 0.32307690382003784, "rewards/sql_execution_reward_func/std": 0.15890049934387207, "rewards/xml_reward_func": 0.8877841234207153, "rewards/xml_reward_func/std": 0.2704475522041321, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003581267217630854, "grad_norm": 0.17071130335981644, "kl": 0.78125, "learning_rate": 3.4000000000000003e-07, "loss": 1.2511, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00044749403605237603, "clip_ratio/low_min": 0.00044749403605237603, "clip_ratio/region_mean": 0.00044749403605237603, "completion_length": 838.0, "epoch": 0.003650137741046832, "grad_norm": 0.327046960467791, "kl": 5.9375, "learning_rate": 3.4666666666666665e-07, "loss": 1.0972, "reward": 2.6913039684295654, "reward_std": 1.0348117351531982, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.155838742852211, "rewards/format_reward_func": 0.7596726417541504, "rewards/format_reward_func/std": 0.24319575726985931, "rewards/ngram_similarity_reward": 0.09889017790555954, "rewards/ngram_similarity_reward/std": 0.08150958269834518, "rewards/sql_execution_reward_func": 0.40625, "rewards/sql_execution_reward_func/std": 0.25972169637680054, "rewards/xml_reward_func": 0.8770461082458496, "rewards/xml_reward_func/std": 0.3353174924850464, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00014916468353476375, "clip_ratio/low_min": 0.00014916468353476375, "clip_ratio/region_mean": 0.00014916468353476375, "epoch": 0.00371900826446281, "grad_norm": 0.34919298486704026, "kl": 5.9375, "learning_rate": 3.533333333333333e-07, "loss": 1.0974, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0005341880605556071, "clip_ratio/low_min": 0.0005341880605556071, "clip_ratio/region_mean": 0.0005341880605556071, "completion_length": 234.0, "epoch": 0.003787878787878788, "grad_norm": 0.19891768559817385, "kl": 0.625, "learning_rate": 3.6e-07, "loss": 0.114, "reward": 4.1376800537109375, "reward_std": 0.6557698249816895, "rewards/accuracy_reward": 0.878125011920929, "rewards/accuracy_reward/std": 0.34471458196640015, "rewards/format_reward_func": 0.8083333969116211, "rewards/format_reward_func/std": 0.10947203636169434, "rewards/ngram_similarity_reward": 0.14456453919410706, "rewards/ngram_similarity_reward/std": 0.0657598078250885, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.08210402727127075, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0005341880605556071, "clip_ratio/low_min": 0.0005341880605556071, "clip_ratio/region_mean": 0.0005341880605556071, "epoch": 0.003856749311294766, "grad_norm": 0.1984196776641716, "kl": 0.625, "learning_rate": 3.666666666666666e-07, "loss": 0.1138, "step": 56 }, { "clip_ratio/high_max": 0.0001610565377632156, "clip_ratio/high_mean": 0.0001610565377632156, "clip_ratio/low_mean": 0.0001610565377632156, "clip_ratio/low_min": 0.0001610565377632156, "clip_ratio/region_mean": 0.0003221130755264312, "completion_length": 776.125, "epoch": 0.003925619834710744, "grad_norm": 0.09956202917896016, "kl": 2.90625, "learning_rate": 3.7333333333333334e-07, "loss": 0.6066, "reward": 3.5108048915863037, "reward_std": 0.8940889239311218, "rewards/accuracy_reward": 0.578125, "rewards/accuracy_reward/std": 0.4952339828014374, "rewards/format_reward_func": 0.7544872164726257, "rewards/format_reward_func/std": 0.1463930308818817, "rewards/ngram_similarity_reward": 0.06478879600763321, "rewards/ngram_similarity_reward/std": 0.04108985885977745, "rewards/sql_execution_reward_func": 0.512499988079071, "rewards/sql_execution_reward_func/std": 0.2705813944339752, "rewards/xml_reward_func": 0.990384578704834, "rewards/xml_reward_func/std": 0.027196412906050682, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003994490358126722, "grad_norm": 0.10062685255699713, "kl": 2.921875, "learning_rate": 3.7999999999999996e-07, "loss": 0.607, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 196.5, "epoch": 0.004063360881542699, "grad_norm": 0.06043336477069392, "kl": 1.7578125, "learning_rate": 3.8666666666666664e-07, "loss": -0.0252, "reward": 3.042389392852783, "reward_std": 0.16437175869941711, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8083333969116211, "rewards/format_reward_func/std": 0.04272466152906418, "rewards/ngram_similarity_reward": 0.27062076330184937, "rewards/ngram_similarity_reward/std": 0.1361408531665802, "rewards/sql_execution_reward_func": 0.34375, "rewards/sql_execution_reward_func/std": 0.06232117488980293, "rewards/xml_reward_func": 0.984375, "rewards/xml_reward_func/std": 0.04419417306780815, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004132231404958678, "grad_norm": 0.058006460601756155, "kl": 1.65625, "learning_rate": 3.933333333333333e-07, "loss": -0.0253, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 243.125, "epoch": 0.004201101928374655, "grad_norm": 0.02799029503774224, "kl": 0.0390625, "learning_rate": 4e-07, "loss": -0.0077, "reward": 4.547506332397461, "reward_std": 0.23133958876132965, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.0, "rewards/ngram_similarity_reward": 0.2650041878223419, "rewards/ngram_similarity_reward/std": 0.1542264223098755, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004269972451790634, "grad_norm": 0.027959556299648514, "kl": 0.038818359375, "learning_rate": 4.0666666666666666e-07, "loss": -0.0077, "step": 62 }, { "clip_ratio/high_max": 0.0005293806316331029, "clip_ratio/high_mean": 0.0005293806316331029, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005293806316331029, "completion_length": 236.125, "epoch": 0.0043388429752066115, "grad_norm": 0.28525092529734797, "kl": 9.3125, "learning_rate": 4.1333333333333333e-07, "loss": 0.1052, "reward": 4.49778938293457, "reward_std": 0.6910950541496277, "rewards/accuracy_reward": 0.90625, "rewards/accuracy_reward/std": 0.2651650309562683, "rewards/format_reward_func": 0.8083333969116211, "rewards/format_reward_func/std": 0.0235702246427536, "rewards/ngram_similarity_reward": 0.347137451171875, "rewards/ngram_similarity_reward/std": 0.2834882140159607, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004407713498622589, "grad_norm": 0.29500405826021614, "kl": 9.3125, "learning_rate": 4.1999999999999995e-07, "loss": 0.1055, "step": 64 }, { "clip_ratio/high_max": 0.000170444865943864, "clip_ratio/high_mean": 0.000170444865943864, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000170444865943864, "completion_length": 733.375, "epoch": 0.004476584022038568, "grad_norm": 0.08470451703580886, "kl": 0.259765625, "learning_rate": 4.266666666666667e-07, "loss": 0.0018, "reward": 3.56166934967041, "reward_std": 1.0119047164916992, "rewards/accuracy_reward": 0.6000000238418579, "rewards/accuracy_reward/std": 0.43260011076927185, "rewards/format_reward_func": 0.7875000238418579, "rewards/format_reward_func/std": 0.1726888120174408, "rewards/ngram_similarity_reward": 0.2559044361114502, "rewards/ngram_similarity_reward/std": 0.11542002111673355, "rewards/sql_execution_reward_func": 0.21375001966953278, "rewards/sql_execution_reward_func/std": 0.13179394602775574, "rewards/xml_reward_func": 0.9765625, "rewards/xml_reward_func/std": 0.06629125773906708, "step": 65 }, { "clip_ratio/high_max": 0.000170444865943864, "clip_ratio/high_mean": 0.000170444865943864, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000170444865943864, "epoch": 0.004545454545454545, "grad_norm": 0.0845384991302133, "kl": 0.259765625, "learning_rate": 4.3333333333333335e-07, "loss": 0.0016, "step": 66 }, { "clip_ratio/high_max": 0.00023707919172011316, "clip_ratio/high_mean": 0.00023707919172011316, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023707919172011316, "completion_length": 527.25, "epoch": 0.004614325068870524, "grad_norm": 0.6727349287612319, "kl": 6.1875, "learning_rate": 4.3999999999999997e-07, "loss": 2.9328, "reward": 5.024999618530273, "reward_std": 1.6884057521820068, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.7232142686843872, "rewards/format_reward_func/std": 0.15328045189380646, "rewards/ngram_similarity_reward": 0.875, "rewards/ngram_similarity_reward/std": 0.3535533845424652, "rewards/sql_execution_reward_func": 0.3062499761581421, "rewards/sql_execution_reward_func/std": 0.1237436905503273, "rewards/xml_reward_func": 0.9330357313156128, "rewards/xml_reward_func/std": 0.18940360844135284, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004683195592286501, "grad_norm": 0.6728575450923183, "kl": 6.1875, "learning_rate": 4.4666666666666664e-07, "loss": 2.9319, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 417.25, "epoch": 0.004752066115702479, "grad_norm": 1.8041438208766032, "kl": 0.78515625, "learning_rate": 4.5333333333333326e-07, "loss": 2.1213, "reward": 4.571249961853027, "reward_std": 2.113943099975586, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward_func": 0.7275000214576721, "rewards/format_reward_func/std": 0.2456333190202713, "rewards/ngram_similarity_reward": 0.75, "rewards/ngram_similarity_reward/std": 0.4629100561141968, "rewards/sql_execution_reward_func": 0.34375, "rewards/sql_execution_reward_func/std": 0.14744853973388672, "rewards/xml_reward_func": 0.875, "rewards/xml_reward_func/std": 0.3535533845424652, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0005991611978970468, "clip_ratio/low_min": 0.0005991611978970468, "clip_ratio/region_mean": 0.0005991611978970468, "epoch": 0.0048209366391184574, "grad_norm": 1.7843753785966043, "kl": 0.80078125, "learning_rate": 4.6e-07, "loss": 2.1212, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.000530222721863538, "clip_ratio/low_min": 0.000530222721863538, "clip_ratio/region_mean": 0.000530222721863538, "completion_length": 235.75, "epoch": 0.004889807162534435, "grad_norm": 0.1175581468295319, "kl": 0.0128173828125, "learning_rate": 4.6666666666666666e-07, "loss": -0.1061, "reward": 3.0656545162200928, "reward_std": 0.7050446271896362, "rewards/accuracy_reward": 0.31562501192092896, "rewards/accuracy_reward/std": 0.2875193953514099, "rewards/format_reward_func": 0.75, "rewards/format_reward_func/std": 0.09258200973272324, "rewards/ngram_similarity_reward": 0.22293633222579956, "rewards/ngram_similarity_reward/std": 0.09389791637659073, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0049586776859504135, "grad_norm": 0.11772150018914145, "kl": 0.0128173828125, "learning_rate": 4.733333333333333e-07, "loss": -0.106, "step": 72 }, { "clip_ratio/high_max": 0.0005170630756765604, "clip_ratio/high_mean": 0.0005170630756765604, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005170630756765604, "completion_length": 241.75, "epoch": 0.005027548209366391, "grad_norm": 0.027260633513646246, "kl": 0.82421875, "learning_rate": 4.8e-07, "loss": -0.011, "reward": 4.458961486816406, "reward_std": 0.15348279476165771, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8416666984558105, "rewards/format_reward_func/std": 0.04960158094763756, "rewards/ngram_similarity_reward": 0.19319681823253632, "rewards/ngram_similarity_reward/std": 0.1257815808057785, "rewards/sql_execution_reward_func": 0.32749998569488525, "rewards/sql_execution_reward_func/std": 0.075922891497612, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0005170630756765604, "clip_ratio/high_mean": 0.0005170630756765604, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005170630756765604, "epoch": 0.005096418732782369, "grad_norm": 0.027340785268170707, "kl": 0.82421875, "learning_rate": 4.866666666666666e-07, "loss": -0.011, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 268.5, "epoch": 0.005165289256198347, "grad_norm": 0.034844267602954074, "kl": 1.171875, "learning_rate": 4.933333333333333e-07, "loss": -0.0224, "reward": 3.2034854888916016, "reward_std": 0.13698825240135193, "rewards/accuracy_reward": 0.4479166567325592, "rewards/accuracy_reward/std": 0.043129101395606995, "rewards/format_reward_func": 0.8041666746139526, "rewards/format_reward_func/std": 0.05175492540001869, "rewards/ngram_similarity_reward": 0.12107368558645248, "rewards/ngram_similarity_reward/std": 0.03844968229532242, "rewards/sql_execution_reward_func": 0.3374999761581421, "rewards/sql_execution_reward_func/std": 0.058248236775398254, "rewards/xml_reward_func": 0.984375, "rewards/xml_reward_func/std": 0.04419417306780815, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005234159779614325, "grad_norm": 0.03483879536793551, "kl": 1.171875, "learning_rate": 5e-07, "loss": -0.0225, "step": 76 }, { "clip_ratio/high_max": 0.0011098779505118728, "clip_ratio/high_mean": 0.0011098779505118728, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011098779505118728, "completion_length": 112.625, "epoch": 0.005303030303030303, "grad_norm": 0.04986517759328865, "kl": 1.78125, "learning_rate": 5.066666666666667e-07, "loss": -0.0089, "reward": 2.9128785133361816, "reward_std": 0.0815652534365654, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.19191919267177582, "rewards/ngram_similarity_reward/std": 0.018703434616327286, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005371900826446281, "grad_norm": 0.04969167372610882, "kl": 1.78125, "learning_rate": 5.133333333333333e-07, "loss": -0.0088, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0004739336436614394, "clip_ratio/low_min": 0.0004739336436614394, "clip_ratio/region_mean": 0.0004739336436614394, "completion_length": 263.75, "epoch": 0.005440771349862259, "grad_norm": 0.08716182717309644, "kl": 3.046875, "learning_rate": 5.2e-07, "loss": 0.0137, "reward": 2.8807897567749023, "reward_std": 0.1985308676958084, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.0, "rewards/ngram_similarity_reward": 0.15385979413986206, "rewards/ngram_similarity_reward/std": 0.13235393166542053, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005509641873278237, "grad_norm": 0.08214743117255396, "kl": 3.046875, "learning_rate": 5.266666666666666e-07, "loss": 0.0136, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 402.875, "epoch": 0.005578512396694215, "grad_norm": 0.201367065690436, "kl": 8.125, "learning_rate": 5.333333333333333e-07, "loss": 0.0973, "reward": 2.80938720703125, "reward_std": 0.2742134928703308, "rewards/accuracy_reward": 0.19062699377536774, "rewards/accuracy_reward/std": 0.1101452112197876, "rewards/format_reward_func": 0.8915384411811829, "rewards/format_reward_func/std": 0.16307899355888367, "rewards/ngram_similarity_reward": 0.09314659237861633, "rewards/ngram_similarity_reward/std": 0.049852821975946426, "rewards/sql_execution_reward_func": 0.4281249940395355, "rewards/sql_execution_reward_func/std": 0.1319073736667633, "rewards/xml_reward_func": 0.96875, "rewards/xml_reward_func/std": 0.0883883461356163, "step": 81 }, { "clip_ratio/high_max": 0.0003102699411101639, "clip_ratio/high_mean": 0.0003102699411101639, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003102699411101639, "epoch": 0.005647382920110193, "grad_norm": 0.20551767251361705, "kl": 8.125, "learning_rate": 5.4e-07, "loss": 0.0973, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00032341526821255684, "clip_ratio/low_min": 0.00032341526821255684, "clip_ratio/region_mean": 0.00032341526821255684, "completion_length": 1159.5, "epoch": 0.005716253443526171, "grad_norm": 0.21954272573223427, "kl": 0.12158203125, "learning_rate": 5.466666666666666e-07, "loss": 1.3082, "reward": 2.2462821006774902, "reward_std": 1.0935583114624023, "rewards/accuracy_reward": 0.1875, "rewards/accuracy_reward/std": 0.1157275140285492, "rewards/format_reward_func": 0.6670833826065063, "rewards/format_reward_func/std": 0.26833075284957886, "rewards/ngram_similarity_reward": 0.07988256216049194, "rewards/ngram_similarity_reward/std": 0.05662178620696068, "rewards/sql_execution_reward_func": 0.3187500238418579, "rewards/sql_execution_reward_func/std": 0.13076014816761017, "rewards/xml_reward_func": 0.765625, "rewards/xml_reward_func/std": 0.43526214361190796, "step": 83 }, { "clip_ratio/high_max": 0.00010780509182950482, "clip_ratio/high_mean": 0.00010780509182950482, "clip_ratio/low_mean": 0.00021561018365900964, "clip_ratio/low_min": 0.00021561018365900964, "clip_ratio/region_mean": 0.00032341526821255684, "epoch": 0.005785123966942148, "grad_norm": 0.22158839339489528, "kl": 0.12060546875, "learning_rate": 5.533333333333334e-07, "loss": 1.308, "step": 84 }, { "clip_ratio/high_max": 8.220641757361591e-05, "clip_ratio/high_mean": 8.220641757361591e-05, "clip_ratio/low_mean": 0.00016441283514723182, "clip_ratio/low_min": 0.00016441283514723182, "clip_ratio/region_mean": 0.0002466192527208477, "completion_length": 3041.125, "epoch": 0.005853994490358127, "grad_norm": 0.039936946926428525, "kl": 0.6328125, "learning_rate": 5.6e-07, "loss": 0.0823, "reward": 1.3784091472625732, "reward_std": 0.6346496939659119, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.6340888142585754, "rewards/format_reward_func/std": 0.2518434524536133, "rewards/ngram_similarity_reward": 0.0, "rewards/ngram_similarity_reward/std": 0.0, "rewards/sql_execution_reward_func": 0.008333333767950535, "rewards/sql_execution_reward_func/std": 0.0235702283680439, "rewards/xml_reward_func": 0.73598712682724, "rewards/xml_reward_func/std": 0.4066542387008667, "step": 85 }, { "clip_ratio/high_max": 0.00032882567029446363, "clip_ratio/high_mean": 0.00032882567029446363, "clip_ratio/low_mean": 0.0002466192527208477, "clip_ratio/low_min": 0.0002466192527208477, "clip_ratio/region_mean": 0.0005754449521191418, "epoch": 0.0059228650137741045, "grad_norm": 0.03611894158675422, "kl": 0.65625, "learning_rate": 5.666666666666666e-07, "loss": 0.0823, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 425.125, "epoch": 0.005991735537190083, "grad_norm": 0.15325151410995547, "kl": 3.796875, "learning_rate": 5.733333333333334e-07, "loss": -0.1494, "reward": 4.136569023132324, "reward_std": 0.9584982991218567, "rewards/accuracy_reward": 0.8125, "rewards/accuracy_reward/std": 0.3720119297504425, "rewards/format_reward_func": 0.8387500047683716, "rewards/format_reward_func/std": 0.10651680827140808, "rewards/ngram_similarity_reward": 0.27007386088371277, "rewards/ngram_similarity_reward/std": 0.18431302905082703, "rewards/sql_execution_reward_func": 0.27291667461395264, "rewards/sql_execution_reward_func/std": 0.16449478268623352, "rewards/xml_reward_func": 0.9947916269302368, "rewards/xml_reward_func/std": 0.014731398783624172, "step": 87 }, { "clip_ratio/high_max": 0.0002940311678685248, "clip_ratio/high_mean": 0.0002940311678685248, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002940311678685248, "epoch": 0.006060606060606061, "grad_norm": 0.15405417706436955, "kl": 3.796875, "learning_rate": 5.8e-07, "loss": -0.1496, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 256.625, "epoch": 0.006129476584022038, "grad_norm": 0.28401746617098306, "kl": 1.0390625, "learning_rate": 5.866666666666666e-07, "loss": 0.0808, "reward": 3.965121030807495, "reward_std": 1.430607795715332, "rewards/accuracy_reward": 0.505244255065918, "rewards/accuracy_reward/std": 0.4352155029773712, "rewards/format_reward_func": 0.7708333730697632, "rewards/format_reward_func/std": 0.1174294650554657, "rewards/ngram_similarity_reward": 0.5600329041481018, "rewards/ngram_similarity_reward/std": 0.4559308588504791, "rewards/sql_execution_reward_func": 0.34375, "rewards/sql_execution_reward_func/std": 0.06781014055013657, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.006198347107438017, "grad_norm": 0.2661541736857027, "kl": 1.046875, "learning_rate": 5.933333333333334e-07, "loss": 0.0808, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 510.875, "epoch": 0.006267217630853994, "grad_norm": 0.8402162424042238, "kl": 0.75, "learning_rate": 6e-07, "loss": 2.267, "reward": 4.346341133117676, "reward_std": 1.585435390472412, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.6677966117858887, "rewards/format_reward_func/std": 0.1645776778459549, "rewards/ngram_similarity_reward": 0.4977022111415863, "rewards/ngram_similarity_reward/std": 0.42641595005989075, "rewards/sql_execution_reward_func": 0.26249998807907104, "rewards/sql_execution_reward_func/std": 0.16201850771903992, "rewards/xml_reward_func": 0.9194915294647217, "rewards/xml_reward_func/std": 0.22771236300468445, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.006336088154269973, "grad_norm": 0.8532444118940503, "kl": 0.7421875, "learning_rate": 6.066666666666666e-07, "loss": 2.268, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 992.875, "epoch": 0.00640495867768595, "grad_norm": 0.33260141950412137, "kl": 10.125, "learning_rate": 6.133333333333332e-07, "loss": 1.0539, "reward": 2.841299057006836, "reward_std": 1.0840891599655151, "rewards/accuracy_reward": 0.21875, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/format_reward_func": 0.668749988079071, "rewards/format_reward_func/std": 0.23812055587768555, "rewards/ngram_similarity_reward": 0.301838219165802, "rewards/ngram_similarity_reward/std": 0.17548206448554993, "rewards/sql_execution_reward_func": 0.4125000238418579, "rewards/sql_execution_reward_func/std": 0.19775526225566864, "rewards/xml_reward_func": 0.8697916269302368, "rewards/xml_reward_func/std": 0.3517512381076813, "step": 93 }, { "clip_ratio/high_max": 0.0002517940301913768, "clip_ratio/high_mean": 0.0002517940301913768, "clip_ratio/low_mean": 0.0001258970150956884, "clip_ratio/low_min": 0.0001258970150956884, "clip_ratio/region_mean": 0.00037769105983898044, "epoch": 0.006473829201101928, "grad_norm": 0.3070080874225156, "kl": 10.0, "learning_rate": 6.2e-07, "loss": 1.0535, "step": 94 }, { "clip_ratio/high_max": 0.000106236053397879, "clip_ratio/high_mean": 0.000106236053397879, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000106236053397879, "completion_length": 1176.625, "epoch": 0.0065426997245179065, "grad_norm": 0.2916776146975469, "kl": 0.041259765625, "learning_rate": 6.266666666666667e-07, "loss": 1.9501, "reward": 3.1202855110168457, "reward_std": 1.7458795309066772, "rewards/accuracy_reward": 0.5625, "rewards/accuracy_reward/std": 0.47715675830841064, "rewards/format_reward_func": 0.6511111259460449, "rewards/format_reward_func/std": 0.2657444477081299, "rewards/ngram_similarity_reward": 0.16139402985572815, "rewards/ngram_similarity_reward/std": 0.195029616355896, "rewards/sql_execution_reward_func": 0.33125001192092896, "rewards/sql_execution_reward_func/std": 0.14376941323280334, "rewards/xml_reward_func": 0.7708333730697632, "rewards/xml_reward_func/std": 0.4245939254760742, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.000212472106795758, "clip_ratio/low_min": 0.000212472106795758, "clip_ratio/region_mean": 0.000212472106795758, "epoch": 0.006611570247933884, "grad_norm": 0.2787642261716525, "kl": 0.041259765625, "learning_rate": 6.333333333333332e-07, "loss": 1.9498, "step": 96 }, { "clip_ratio/high_max": 0.0005230125389061868, "clip_ratio/high_mean": 0.0005230125389061868, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005230125389061868, "completion_length": 239.0, "epoch": 0.006680440771349863, "grad_norm": 0.06569167118769814, "kl": 0.50390625, "learning_rate": 6.4e-07, "loss": 0.0327, "reward": 4.369965553283691, "reward_std": 0.44746655225753784, "rewards/accuracy_reward": 0.9375, "rewards/accuracy_reward/std": 0.1767766922712326, "rewards/format_reward_func": 0.75, "rewards/format_reward_func/std": 0.09258200973272324, "rewards/ngram_similarity_reward": 0.29247725009918213, "rewards/ngram_similarity_reward/std": 0.23834285140037537, "rewards/sql_execution_reward_func": 0.3062499761581421, "rewards/sql_execution_reward_func/std": 0.1237436905503273, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0010460250778123736, "clip_ratio/high_mean": 0.0010460250778123736, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010460250778123736, "epoch": 0.00674931129476584, "grad_norm": 0.06628629285986629, "kl": 0.486328125, "learning_rate": 6.466666666666666e-07, "loss": 0.0326, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 595.875, "epoch": 0.006818181818181818, "grad_norm": 0.04327913565476697, "kl": 0.033935546875, "learning_rate": 6.533333333333333e-07, "loss": 0.1156, "reward": 2.5874176025390625, "reward_std": 0.2622741460800171, "rewards/accuracy_reward": 0.1562846601009369, "rewards/accuracy_reward/std": 0.12941601872444153, "rewards/format_reward_func": 0.8283333778381348, "rewards/format_reward_func/std": 0.11589540541172028, "rewards/ngram_similarity_reward": 0.06100991368293762, "rewards/ngram_similarity_reward/std": 0.08329952508211136, "rewards/sql_execution_reward_func": 0.35499998927116394, "rewards/sql_execution_reward_func/std": 0.06984677165746689, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.006887052341597796, "grad_norm": 0.04332362825681914, "kl": 0.033935546875, "learning_rate": 6.6e-07, "loss": 0.1154, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 388.0, "epoch": 0.006955922865013774, "grad_norm": 0.07851354307217957, "kl": 0.8203125, "learning_rate": 6.666666666666666e-07, "loss": 0.086, "reward": 3.0345215797424316, "reward_std": 0.6461819410324097, "rewards/accuracy_reward": 0.34375, "rewards/accuracy_reward/std": 0.2651650309562683, "rewards/format_reward_func": 0.7900000214576721, "rewards/format_reward_func/std": 0.12359688431024551, "rewards/ngram_similarity_reward": 0.23454222083091736, "rewards/ngram_similarity_reward/std": 0.19957302510738373, "rewards/sql_execution_reward_func": 0.234375, "rewards/sql_execution_reward_func/std": 0.12315604090690613, "rewards/xml_reward_func": 0.9708333015441895, "rewards/xml_reward_func/std": 0.056869376450777054, "step": 101 }, { "clip_ratio/high_max": 0.0003221649385523051, "clip_ratio/high_mean": 0.0003221649385523051, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003221649385523051, "epoch": 0.0070247933884297524, "grad_norm": 0.07726623231176699, "kl": 0.828125, "learning_rate": 6.733333333333333e-07, "loss": 0.0862, "step": 102 }, { "clip_ratio/high_max": 0.000259807740803808, "clip_ratio/high_mean": 0.000259807740803808, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000259807740803808, "completion_length": 481.125, "epoch": 0.00709366391184573, "grad_norm": 0.05222081939263264, "kl": 0.2041015625, "learning_rate": 6.800000000000001e-07, "loss": -0.3157, "reward": 4.196127891540527, "reward_std": 0.7428631782531738, "rewards/accuracy_reward": 0.90625, "rewards/accuracy_reward/std": 0.2651650309562683, "rewards/format_reward_func": 0.720512866973877, "rewards/format_reward_func/std": 0.12362514436244965, "rewards/ngram_similarity_reward": 0.18791000545024872, "rewards/ngram_similarity_reward/std": 0.13248643279075623, "rewards/sql_execution_reward_func": 0.3812499940395355, "rewards/sql_execution_reward_func/std": 0.22028793394565582, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.007162534435261708, "grad_norm": 0.05184185262944727, "kl": 0.1962890625, "learning_rate": 6.866666666666666e-07, "loss": -0.3159, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 776.75, "epoch": 0.007231404958677686, "grad_norm": 0.08821642842292891, "kl": 0.7734375, "learning_rate": 6.933333333333333e-07, "loss": 0.1458, "reward": 3.051666259765625, "reward_std": 0.6166271567344666, "rewards/accuracy_reward": 0.3843750059604645, "rewards/accuracy_reward/std": 0.2150321900844574, "rewards/format_reward_func": 0.8524999618530273, "rewards/format_reward_func/std": 0.11135172098875046, "rewards/ngram_similarity_reward": 0.19666633009910583, "rewards/ngram_similarity_reward/std": 0.2626209557056427, "rewards/sql_execution_reward_func": 0.171875, "rewards/sql_execution_reward_func/std": 0.09118026494979858, "rewards/xml_reward_func": 0.9635416269302368, "rewards/xml_reward_func/std": 0.06842003017663956, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.007300275482093664, "grad_norm": 0.08744979516453764, "kl": 0.76171875, "learning_rate": 7e-07, "loss": 0.1457, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 266.5, "epoch": 0.007369146005509642, "grad_norm": 0.305356893492606, "kl": 6.65625, "learning_rate": 7.066666666666666e-07, "loss": 0.3461, "reward": 4.048697471618652, "reward_std": 1.3266526460647583, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward_func": 0.7583333253860474, "rewards/format_reward_func/std": 0.1003960371017456, "rewards/ngram_similarity_reward": 0.3352430462837219, "rewards/ngram_similarity_reward/std": 0.26802879571914673, "rewards/sql_execution_reward_func": 0.2874999940395355, "rewards/sql_execution_reward_func/std": 0.12747548520565033, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00743801652892562, "grad_norm": 0.31646887754915626, "kl": 6.65625, "learning_rate": 7.133333333333333e-07, "loss": 0.3453, "step": 108 }, { "clip_ratio/high_max": 0.00024378352100029588, "clip_ratio/high_mean": 0.00024378352100029588, "clip_ratio/low_mean": 0.00024378352100029588, "clip_ratio/low_min": 0.00024378352100029588, "clip_ratio/region_mean": 0.00048756704200059175, "completion_length": 512.75, "epoch": 0.0075068870523415975, "grad_norm": 0.03326509984497858, "kl": 0.027587890625, "learning_rate": 7.2e-07, "loss": 0.0021, "reward": 2.612682342529297, "reward_std": 0.30551669001579285, "rewards/accuracy_reward": 0.19374999403953552, "rewards/accuracy_reward/std": 0.10415475815534592, "rewards/format_reward_func": 0.7791666984558105, "rewards/format_reward_func/std": 0.11400013417005539, "rewards/ngram_similarity_reward": 0.0806771069765091, "rewards/ngram_similarity_reward/std": 0.08174490183591843, "rewards/sql_execution_reward_func": 0.32499998807907104, "rewards/sql_execution_reward_func/std": 0.13627703487873077, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0007313505630008876, "clip_ratio/high_mean": 0.0007313505630008876, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007313505630008876, "epoch": 0.007575757575757576, "grad_norm": 0.03327098606234379, "kl": 0.027099609375, "learning_rate": 7.266666666666667e-07, "loss": 0.0021, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 198.5, "epoch": 0.007644628099173554, "grad_norm": 0.2209322713080709, "kl": 1.4921875, "learning_rate": 7.333333333333332e-07, "loss": 0.1317, "reward": 5.1804704666137695, "reward_std": 0.9620435237884521, "rewards/accuracy_reward": 0.90625, "rewards/accuracy_reward/std": 0.2651650309562683, "rewards/format_reward_func": 0.6928571462631226, "rewards/format_reward_func/std": 0.13552618026733398, "rewards/ngram_similarity_reward": 0.8709090948104858, "rewards/ngram_similarity_reward/std": 0.27220383286476135, "rewards/sql_execution_reward_func": 0.3687500059604645, "rewards/sql_execution_reward_func/std": 0.03720119222998619, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.007713498622589532, "grad_norm": 0.2206283045180695, "kl": 1.4921875, "learning_rate": 7.4e-07, "loss": 0.133, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 257.75, "epoch": 0.00778236914600551, "grad_norm": 0.24039169374495728, "kl": 1.375, "learning_rate": 7.466666666666667e-07, "loss": -0.2162, "reward": 3.014768600463867, "reward_std": 1.033610224723816, "rewards/accuracy_reward": 0.2757352888584137, "rewards/accuracy_reward/std": 0.3284810483455658, "rewards/format_reward_func": 0.7575000524520874, "rewards/format_reward_func/std": 0.14360162615776062, "rewards/ngram_similarity_reward": 0.22226819396018982, "rewards/ngram_similarity_reward/std": 0.26429617404937744, "rewards/sql_execution_reward_func": 0.40625, "rewards/sql_execution_reward_func/std": 0.07288689911365509, "rewards/xml_reward_func": 0.9661458730697632, "rewards/xml_reward_func/std": 0.049723315984010696, "step": 113 }, { "clip_ratio/high_max": 0.0004849660617765039, "clip_ratio/high_mean": 0.0004849660617765039, "clip_ratio/low_mean": 0.0004849660617765039, "clip_ratio/low_min": 0.0004849660617765039, "clip_ratio/region_mean": 0.0009699321235530078, "epoch": 0.007851239669421488, "grad_norm": 0.23710377904727936, "kl": 1.3515625, "learning_rate": 7.533333333333332e-07, "loss": -0.2163, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 216.0, "epoch": 0.007920110192837466, "grad_norm": 0.2351971306094996, "kl": 1.25, "learning_rate": 7.599999999999999e-07, "loss": -0.1941, "reward": 3.604112386703491, "reward_std": 1.259140968322754, "rewards/accuracy_reward": 0.5687500238418579, "rewards/accuracy_reward/std": 0.4688035845756531, "rewards/format_reward_func": 0.737500011920929, "rewards/format_reward_func/std": 0.11877348273992538, "rewards/ngram_similarity_reward": 0.28190839290618896, "rewards/ngram_similarity_reward/std": 0.22604499757289886, "rewards/sql_execution_reward_func": 0.3062499761581421, "rewards/sql_execution_reward_func/std": 0.08210402727127075, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.007988980716253443, "grad_norm": 0.23273556060433148, "kl": 1.2421875, "learning_rate": 7.666666666666667e-07, "loss": -0.1935, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 8.279516623588279e-05, "clip_ratio/low_min": 8.279516623588279e-05, "clip_ratio/region_mean": 8.279516623588279e-05, "completion_length": 1509.75, "epoch": 0.008057851239669421, "grad_norm": 0.11941991349295344, "kl": 4.71875, "learning_rate": 7.733333333333333e-07, "loss": 0.2557, "reward": 3.039119243621826, "reward_std": 0.7640610337257385, "rewards/accuracy_reward": 0.4437499940395355, "rewards/accuracy_reward/std": 0.3110322654247284, "rewards/format_reward_func": 0.8026785850524902, "rewards/format_reward_func/std": 0.09983333945274353, "rewards/ngram_similarity_reward": 0.15872830152511597, "rewards/ngram_similarity_reward/std": 0.07471704483032227, "rewards/sql_execution_reward_func": 0.11642857640981674, "rewards/sql_execution_reward_func/std": 0.08350206911563873, "rewards/xml_reward_func": 0.9944196939468384, "rewards/xml_reward_func/std": 0.010363386943936348, "step": 117 }, { "clip_ratio/high_max": 8.279516623588279e-05, "clip_ratio/high_mean": 8.279516623588279e-05, "clip_ratio/low_mean": 8.279516623588279e-05, "clip_ratio/low_min": 8.279516623588279e-05, "clip_ratio/region_mean": 0.00016559033247176558, "epoch": 0.008126721763085399, "grad_norm": 0.12006406368913197, "kl": 4.6875, "learning_rate": 7.799999999999999e-07, "loss": 0.2558, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 296.875, "epoch": 0.008195592286501378, "grad_norm": 0.035331902981728, "kl": 0.65234375, "learning_rate": 7.866666666666666e-07, "loss": -0.0013, "reward": 4.717468738555908, "reward_std": 0.20619268715381622, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8083333969116211, "rewards/format_reward_func/std": 0.10947203636169434, "rewards/ngram_similarity_reward": 0.3685903549194336, "rewards/ngram_similarity_reward/std": 0.16604401171207428, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.008264462809917356, "grad_norm": 0.035024904870290974, "kl": 0.6171875, "learning_rate": 7.933333333333333e-07, "loss": -0.0014, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0004582951369229704, "clip_ratio/low_min": 0.0004582951369229704, "clip_ratio/region_mean": 0.0004582951369229704, "completion_length": 545.5, "epoch": 0.008333333333333333, "grad_norm": 0.9156455411817438, "kl": 4.71875, "learning_rate": 8e-07, "loss": 3.3899, "reward": 4.423295497894287, "reward_std": 1.7320020198822021, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.675000011920929, "rewards/format_reward_func/std": 0.2121320515871048, "rewards/ngram_similarity_reward": 0.5738636255264282, "rewards/ngram_similarity_reward/std": 0.27872639894485474, "rewards/sql_execution_reward_func": 0.26249998807907104, "rewards/sql_execution_reward_func/std": 0.16201850771903992, "rewards/xml_reward_func": 0.875, "rewards/xml_reward_func/std": 0.3535533845424652, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00840220385674931, "grad_norm": 0.9505196117059289, "kl": 4.71875, "learning_rate": 8.066666666666666e-07, "loss": 3.3907, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 736.375, "epoch": 0.008471074380165288, "grad_norm": 0.1497376437549018, "kl": 4.375, "learning_rate": 8.133333333333333e-07, "loss": -0.1334, "reward": 3.610806465148926, "reward_std": 1.0240850448608398, "rewards/accuracy_reward": 0.7189555764198303, "rewards/accuracy_reward/std": 0.410252183675766, "rewards/format_reward_func": 0.7924450635910034, "rewards/format_reward_func/std": 0.18630468845367432, "rewards/ngram_similarity_reward": 0.2304389327764511, "rewards/ngram_similarity_reward/std": 0.2699585258960724, "rewards/sql_execution_reward_func": 0.10249999910593033, "rewards/sql_execution_reward_func/std": 0.12429803609848022, "rewards/xml_reward_func": 0.9322916269302368, "rewards/xml_reward_func/std": 0.1752796769142151, "step": 123 }, { "clip_ratio/high_max": 0.00016975046310108155, "clip_ratio/high_mean": 0.00016975046310108155, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016975046310108155, "epoch": 0.008539944903581268, "grad_norm": 0.15086752273893142, "kl": 4.40625, "learning_rate": 8.199999999999999e-07, "loss": -0.1335, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0006605020025745034, "clip_ratio/low_min": 0.0006605020025745034, "clip_ratio/region_mean": 0.0006605020025745034, "completion_length": 567.75, "epoch": 0.008608815426997245, "grad_norm": 0.18858755871144714, "kl": 1.8984375, "learning_rate": 8.266666666666667e-07, "loss": 0.2975, "reward": 4.766119956970215, "reward_std": 1.2788139581680298, "rewards/accuracy_reward": 0.8145833015441895, "rewards/accuracy_reward/std": 0.37005338072776794, "rewards/format_reward_func": 0.9275000095367432, "rewards/format_reward_func/std": 0.13853828608989716, "rewards/ngram_similarity_reward": 0.5979688167572021, "rewards/ngram_similarity_reward/std": 0.44342783093452454, "rewards/sql_execution_reward_func": 0.328125, "rewards/sql_execution_reward_func/std": 0.10809841752052307, "rewards/xml_reward_func": 0.984375, "rewards/xml_reward_func/std": 0.04419417306780815, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.008677685950413223, "grad_norm": 0.18921200015809797, "kl": 1.859375, "learning_rate": 8.333333333333333e-07, "loss": 0.2969, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00027048957417719066, "clip_ratio/low_min": 0.00027048957417719066, "clip_ratio/region_mean": 0.00027048957417719066, "completion_length": 924.25, "epoch": 0.0087465564738292, "grad_norm": 0.34160303810848325, "kl": 0.111328125, "learning_rate": 8.399999999999999e-07, "loss": 0.8917, "reward": 2.225067138671875, "reward_std": 0.9235139489173889, "rewards/accuracy_reward": 0.13124999403953552, "rewards/accuracy_reward/std": 0.12730026245117188, "rewards/format_reward_func": 0.6983333826065063, "rewards/format_reward_func/std": 0.2552745044231415, "rewards/ngram_similarity_reward": 0.12323930859565735, "rewards/ngram_similarity_reward/std": 0.18214455246925354, "rewards/sql_execution_reward_func": 0.2043749988079071, "rewards/sql_execution_reward_func/std": 0.1328650712966919, "rewards/xml_reward_func": 0.875, "rewards/xml_reward_func/std": 0.3535533845424652, "step": 127 }, { "clip_ratio/high_max": 0.00013524478708859533, "clip_ratio/high_mean": 0.00013524478708859533, "clip_ratio/low_mean": 0.0004057343758177012, "clip_ratio/low_min": 0.0004057343758177012, "clip_ratio/region_mean": 0.0005409791483543813, "epoch": 0.008815426997245178, "grad_norm": 0.3261029718027581, "kl": 0.11181640625, "learning_rate": 8.466666666666667e-07, "loss": 0.8914, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 275.375, "epoch": 0.008884297520661158, "grad_norm": 0.17744852388913124, "kl": 1.9140625, "learning_rate": 8.533333333333334e-07, "loss": 0.0994, "reward": 4.5745110511779785, "reward_std": 1.0768170356750488, "rewards/accuracy_reward": 0.78125, "rewards/accuracy_reward/std": 0.31160587072372437, "rewards/format_reward_func": 0.7416666746139526, "rewards/format_reward_func/std": 0.12051476538181305, "rewards/ngram_similarity_reward": 0.6418964266777039, "rewards/ngram_similarity_reward/std": 0.3862665891647339, "rewards/sql_execution_reward_func": 0.3075000047683716, "rewards/sql_execution_reward_func/std": 0.09866972267627716, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.008953168044077135, "grad_norm": 0.17959944708411982, "kl": 1.953125, "learning_rate": 8.599999999999999e-07, "loss": 0.0991, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 337.125, "epoch": 0.009022038567493113, "grad_norm": 0.15512666315377324, "kl": 4.0625, "learning_rate": 8.666666666666667e-07, "loss": 0.2697, "reward": 3.7707176208496094, "reward_std": 0.9612404108047485, "rewards/accuracy_reward": 0.71875, "rewards/accuracy_reward/std": 0.38816189765930176, "rewards/format_reward_func": 0.6958333849906921, "rewards/format_reward_func/std": 0.13266199827194214, "rewards/ngram_similarity_reward": 0.21936722099781036, "rewards/ngram_similarity_reward/std": 0.10582280158996582, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0707106739282608, "rewards/xml_reward_func": 0.9583333730697632, "rewards/xml_reward_func/std": 0.117851123213768, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00037078236346133053, "clip_ratio/low_min": 0.00037078236346133053, "clip_ratio/region_mean": 0.00037078236346133053, "epoch": 0.00909090909090909, "grad_norm": 0.1482371218998594, "kl": 3.8125, "learning_rate": 8.733333333333333e-07, "loss": 0.2693, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 435.375, "epoch": 0.009159779614325068, "grad_norm": 0.15889414323678797, "kl": 0.052490234375, "learning_rate": 8.799999999999999e-07, "loss": 0.7304, "reward": 4.086208820343018, "reward_std": 1.169477939605713, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.737500011920929, "rewards/format_reward_func/std": 0.1597989797592163, "rewards/ngram_similarity_reward": 0.272055983543396, "rewards/ngram_similarity_reward/std": 0.19724683463573456, "rewards/sql_execution_reward_func": 0.26875001192092896, "rewards/sql_execution_reward_func/std": 0.166770800948143, "rewards/xml_reward_func": 0.921875, "rewards/xml_reward_func/std": 0.17598575353622437, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.009228650137741047, "grad_norm": 0.16266405870796483, "kl": 0.0537109375, "learning_rate": 8.866666666666667e-07, "loss": 0.7314, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 728.375, "epoch": 0.009297520661157025, "grad_norm": 0.11965239423349422, "kl": 2.203125, "learning_rate": 8.933333333333333e-07, "loss": -0.0256, "reward": 3.127617835998535, "reward_std": 0.658403217792511, "rewards/accuracy_reward": 0.3125, "rewards/accuracy_reward/std": 0.29124119877815247, "rewards/format_reward_func": 0.7676136493682861, "rewards/format_reward_func/std": 0.10847873240709305, "rewards/ngram_similarity_reward": 0.2348538637161255, "rewards/ngram_similarity_reward/std": 0.16488397121429443, "rewards/sql_execution_reward_func": 0.39053571224212646, "rewards/sql_execution_reward_func/std": 0.07929697632789612, "rewards/xml_reward_func": 0.9921875, "rewards/xml_reward_func/std": 0.022097086533904076, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.009366391184573003, "grad_norm": 0.11783360698726582, "kl": 1.9453125, "learning_rate": 9e-07, "loss": -0.0264, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 1402.125, "epoch": 0.00943526170798898, "grad_norm": 0.1721969495846286, "kl": 7.78125, "learning_rate": 9.066666666666665e-07, "loss": 0.0406, "reward": 2.4565823078155518, "reward_std": 0.7244934439659119, "rewards/accuracy_reward": 0.012500000186264515, "rewards/accuracy_reward/std": 0.013363063335418701, "rewards/format_reward_func": 0.6883601546287537, "rewards/format_reward_func/std": 0.14038214087486267, "rewards/ngram_similarity_reward": 0.11798766255378723, "rewards/ngram_similarity_reward/std": 0.10340042412281036, "rewards/sql_execution_reward_func": 0.56971275806427, "rewards/sql_execution_reward_func/std": 0.7074722051620483, "rewards/xml_reward_func": 0.9965277910232544, "rewards/xml_reward_func/std": 0.009820932522416115, "step": 137 }, { "clip_ratio/high_max": 8.915039506973699e-05, "clip_ratio/high_mean": 8.915039506973699e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 8.915039506973699e-05, "epoch": 0.009504132231404958, "grad_norm": 0.17936977365613171, "kl": 7.96875, "learning_rate": 9.133333333333333e-07, "loss": 0.0409, "step": 138 }, { "clip_ratio/high_max": 0.00016423058696091175, "clip_ratio/high_mean": 0.00016423058696091175, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016423058696091175, "completion_length": 761.125, "epoch": 0.009573002754820937, "grad_norm": 0.06905954425523524, "kl": 1.3828125, "learning_rate": 9.2e-07, "loss": 0.2511, "reward": 2.6955413818359375, "reward_std": 0.5710647106170654, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8611111044883728, "rewards/format_reward_func/std": 0.19363778829574585, "rewards/ngram_similarity_reward": 0.3372226655483246, "rewards/ngram_similarity_reward/std": 0.20394711196422577, "rewards/sql_execution_reward_func": 0.3466517925262451, "rewards/sql_execution_reward_func/std": 0.12747704982757568, "rewards/xml_reward_func": 0.9819444417953491, "rewards/xml_reward_func/std": 0.029806841164827347, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.009641873278236915, "grad_norm": 0.06820348348097877, "kl": 1.4140625, "learning_rate": 9.266666666666665e-07, "loss": 0.2513, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 676.0, "epoch": 0.009710743801652892, "grad_norm": 0.2844180859381576, "kl": 0.6875, "learning_rate": 9.333333333333333e-07, "loss": 1.8021, "reward": 2.951951742172241, "reward_std": 1.7420177459716797, "rewards/accuracy_reward": 0.3656249940395355, "rewards/accuracy_reward/std": 0.4346463084220886, "rewards/format_reward_func": 0.612500011920929, "rewards/format_reward_func/std": 0.20310096442699432, "rewards/ngram_similarity_reward": 0.31796780228614807, "rewards/ngram_similarity_reward/std": 0.32343313097953796, "rewards/sql_execution_reward_func": 0.2562499940395355, "rewards/sql_execution_reward_func/std": 0.1801537126302719, "rewards/xml_reward_func": 0.875, "rewards/xml_reward_func/std": 0.3535533845424652, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00977961432506887, "grad_norm": 0.29314265072091794, "kl": 0.6953125, "learning_rate": 9.399999999999999e-07, "loss": 1.803, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 261.25, "epoch": 0.009848484848484848, "grad_norm": 0.03895740577825041, "kl": 0.01708984375, "learning_rate": 9.466666666666666e-07, "loss": 0.0008, "reward": 4.4947967529296875, "reward_std": 0.2941080629825592, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7583333253860474, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.2618090510368347, "rewards/ngram_similarity_reward/std": 0.1598150134086609, "rewards/sql_execution_reward_func": 0.34375, "rewards/sql_execution_reward_func/std": 0.06232117488980293, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.009917355371900827, "grad_norm": 0.03922774356369609, "kl": 0.01708984375, "learning_rate": 9.533333333333333e-07, "loss": 0.0007, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 305.5, "epoch": 0.009986225895316805, "grad_norm": 0.0963700766849368, "kl": 1.5546875, "learning_rate": 9.6e-07, "loss": 0.0202, "reward": 2.5837576389312744, "reward_std": 0.5013255476951599, "rewards/accuracy_reward": 0.15937499701976776, "rewards/accuracy_reward/std": 0.12531210482120514, "rewards/format_reward_func": 0.7333333492279053, "rewards/format_reward_func/std": 0.11268723756074905, "rewards/ngram_similarity_reward": 0.20236624777317047, "rewards/ngram_similarity_reward/std": 0.12598712742328644, "rewards/sql_execution_reward_func": 0.24374999105930328, "rewards/sql_execution_reward_func/std": 0.12938730418682098, "rewards/xml_reward_func": 0.984375, "rewards/xml_reward_func/std": 0.04419417306780815, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.010055096418732782, "grad_norm": 0.10129862971464607, "kl": 1.4765625, "learning_rate": 9.666666666666666e-07, "loss": 0.0202, "step": 146 }, { "clip_ratio/high_max": 0.0004051863797940314, "clip_ratio/high_mean": 0.0004051863797940314, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004051863797940314, "completion_length": 308.5, "epoch": 0.01012396694214876, "grad_norm": 0.10490626611779665, "kl": 0.08984375, "learning_rate": 9.733333333333333e-07, "loss": -0.0934, "reward": 4.065784454345703, "reward_std": 0.8407307863235474, "rewards/accuracy_reward": 0.71875, "rewards/accuracy_reward/std": 0.38816189765930176, "rewards/format_reward_func": 0.8125, "rewards/format_reward_func/std": 0.050198011100292206, "rewards/ngram_similarity_reward": 0.2938562035560608, "rewards/ngram_similarity_reward/std": 0.12533415853977203, "rewards/sql_execution_reward_func": 0.375, "rewards/sql_execution_reward_func/std": 0.03779644891619682, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.010192837465564738, "grad_norm": 0.10473638593979362, "kl": 0.0927734375, "learning_rate": 9.8e-07, "loss": -0.0933, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 524.0, "epoch": 0.010261707988980717, "grad_norm": 0.10697755318876968, "kl": 4.0, "learning_rate": 9.866666666666666e-07, "loss": 0.236, "reward": 3.2960402965545654, "reward_std": 0.34962838888168335, "rewards/accuracy_reward": 0.3385416567325592, "rewards/accuracy_reward/std": 0.14846687018871307, "rewards/format_reward_func": 0.7722222805023193, "rewards/format_reward_func/std": 0.07856741547584534, "rewards/ngram_similarity_reward": 0.28208234906196594, "rewards/ngram_similarity_reward/std": 0.1938333809375763, "rewards/sql_execution_reward_func": 0.4375, "rewards/sql_execution_reward_func/std": 0.2474873661994934, "rewards/xml_reward_func": 0.9861111044883728, "rewards/xml_reward_func/std": 0.03928370773792267, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.010330578512396695, "grad_norm": 0.10579386722388626, "kl": 4.0, "learning_rate": 9.933333333333333e-07, "loss": 0.2361, "step": 150 }, { "clip_ratio/high_max": 6.276676140259951e-05, "clip_ratio/high_mean": 6.276676140259951e-05, "clip_ratio/low_mean": 0.0001883002696558833, "clip_ratio/low_min": 0.0001883002696558833, "clip_ratio/region_mean": 0.00025106704561039805, "completion_length": 1991.5, "epoch": 0.010399449035812672, "grad_norm": 0.11509031806107387, "kl": 0.8828125, "learning_rate": 1e-06, "loss": 0.6577, "reward": 2.034876823425293, "reward_std": 1.3124228715896606, "rewards/accuracy_reward": 0.16249999403953552, "rewards/accuracy_reward/std": 0.18077215552330017, "rewards/format_reward_func": 0.6897193789482117, "rewards/format_reward_func/std": 0.32449084520339966, "rewards/ngram_similarity_reward": 0.05488854646682739, "rewards/ngram_similarity_reward/std": 0.06693465262651443, "rewards/sql_execution_reward_func": 0.20892857015132904, "rewards/sql_execution_reward_func/std": 0.2945941388607025, "rewards/xml_reward_func": 0.7288960814476013, "rewards/xml_reward_func/std": 0.44225022196769714, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00012553352280519903, "clip_ratio/low_min": 0.00012553352280519903, "clip_ratio/region_mean": 0.00012553352280519903, "epoch": 0.01046831955922865, "grad_norm": 0.11426886996615056, "kl": 0.8828125, "learning_rate": 9.999987815308708e-07, "loss": 0.658, "step": 152 }, { "clip_ratio/high_max": 0.0009813542710617185, "clip_ratio/high_mean": 0.0009813542710617185, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009813542710617185, "completion_length": 127.375, "epoch": 0.010537190082644627, "grad_norm": 0.2855315636207493, "kl": 0.9765625, "learning_rate": 9.999951261300815e-07, "loss": 0.1756, "reward": 5.374999523162842, "reward_std": 0.7161403894424438, "rewards/accuracy_reward": 0.90625, "rewards/accuracy_reward/std": 0.2651650309562683, "rewards/format_reward_func": 0.8250000476837158, "rewards/format_reward_func/std": 0.12817397713661194, "rewards/ngram_similarity_reward": 0.9375, "rewards/ngram_similarity_reward/std": 0.1767766922712326, "rewards/sql_execution_reward_func": 0.33125001192092896, "rewards/sql_execution_reward_func/std": 0.1412634402513504, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.010606060606060607, "grad_norm": 0.2863441844202871, "kl": 0.98046875, "learning_rate": 9.999890338174275e-07, "loss": 0.1765, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 151.625, "epoch": 0.010674931129476584, "grad_norm": 0.13253712706656853, "kl": 0.0252685546875, "learning_rate": 9.999805046259016e-07, "loss": 0.1484, "reward": 4.343276023864746, "reward_std": 0.4626372456550598, "rewards/accuracy_reward": 0.8984721302986145, "rewards/accuracy_reward/std": 0.18799307942390442, "rewards/format_reward_func": 0.75, "rewards/format_reward_func/std": 0.1414213478565216, "rewards/ngram_similarity_reward": 0.2975543737411499, "rewards/ngram_similarity_reward/std": 0.11051194369792938, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0008244023192673922, "clip_ratio/low_min": 0.0008244023192673922, "clip_ratio/region_mean": 0.0008244023192673922, "epoch": 0.010743801652892562, "grad_norm": 0.13507503336803517, "kl": 0.0252685546875, "learning_rate": 9.99969538601693e-07, "loss": 0.1485, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 562.5, "epoch": 0.01081267217630854, "grad_norm": 0.14723546482373662, "kl": 0.05224609375, "learning_rate": 9.999561358041868e-07, "loss": 0.6764, "reward": 3.5154531002044678, "reward_std": 1.0232155323028564, "rewards/accuracy_reward": 0.503125011920929, "rewards/accuracy_reward/std": 0.43924397230148315, "rewards/format_reward_func": 0.7677083611488342, "rewards/format_reward_func/std": 0.13717946410179138, "rewards/ngram_similarity_reward": 0.23094442486763, "rewards/ngram_similarity_reward/std": 0.16094525158405304, "rewards/sql_execution_reward_func": 0.4087499976158142, "rewards/sql_execution_reward_func/std": 0.1470119059085846, "rewards/xml_reward_func": 0.986328125, "rewards/xml_reward_func/std": 0.038669902831315994, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.010881542699724517, "grad_norm": 0.14642120899523517, "kl": 0.05224609375, "learning_rate": 9.99940296305965e-07, "loss": 0.676, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 238.75, "epoch": 0.010950413223140497, "grad_norm": 0.3108072312273219, "kl": 0.376953125, "learning_rate": 9.999220201928054e-07, "loss": 0.6961, "reward": 4.436520576477051, "reward_std": 1.1481634378433228, "rewards/accuracy_reward": 0.7875000238418579, "rewards/accuracy_reward/std": 0.39708760380744934, "rewards/format_reward_func": 0.8333333730697632, "rewards/format_reward_func/std": 0.07126966118812561, "rewards/ngram_similarity_reward": 0.49379149079322815, "rewards/ngram_similarity_reward/std": 0.274742066860199, "rewards/sql_execution_reward_func": 0.2874999940395355, "rewards/sql_execution_reward_func/std": 0.12747548520565033, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 159 }, { "clip_ratio/high_max": 0.0005235602147877216, "clip_ratio/high_mean": 0.0005235602147877216, "clip_ratio/low_mean": 0.0005235602147877216, "clip_ratio/low_min": 0.0005235602147877216, "clip_ratio/region_mean": 0.0010471204295754433, "epoch": 0.011019283746556474, "grad_norm": 0.3150277962834983, "kl": 0.376953125, "learning_rate": 9.999013075636804e-07, "loss": 0.6954, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 176.5, "epoch": 0.011088154269972452, "grad_norm": 0.061518336393438, "kl": 0.0181884765625, "learning_rate": 9.998781585307575e-07, "loss": 0.0411, "reward": 3.26299786567688, "reward_std": 0.2810080349445343, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.45449864864349365, "rewards/ngram_similarity_reward/std": 0.08995026350021362, "rewards/sql_execution_reward_func": 0.3062499761581421, "rewards/sql_execution_reward_func/std": 0.1237436905503273, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.00141643057577312, "clip_ratio/high_mean": 0.00141643057577312, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00141643057577312, "epoch": 0.01115702479338843, "grad_norm": 0.06039516529750827, "kl": 0.02001953125, "learning_rate": 9.99852573219399e-07, "loss": 0.0411, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 99.75, "epoch": 0.011225895316804407, "grad_norm": 0.02419258650803721, "kl": 0.2001953125, "learning_rate": 9.998245517681593e-07, "loss": -0.0251, "reward": 5.59999942779541, "reward_std": 0.09258192032575607, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.75, "rewards/format_reward_func/std": 0.09258200973272324, "rewards/ngram_similarity_reward": 1.0, "rewards/ngram_similarity_reward/std": 0.0, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.011294765840220386, "grad_norm": 0.023582249340761568, "kl": 0.1875, "learning_rate": 9.99794094328787e-07, "loss": -0.0251, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 169.875, "epoch": 0.011363636363636364, "grad_norm": 0.06258194983235658, "kl": 0.0712890625, "learning_rate": 9.997612010662212e-07, "loss": -0.0232, "reward": 4.709610939025879, "reward_std": 0.2689416706562042, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.10690449178218842, "rewards/ngram_similarity_reward": 0.3730742335319519, "rewards/ngram_similarity_reward/std": 0.1198696717619896, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.011432506887052342, "grad_norm": 0.06202876288039677, "kl": 0.0673828125, "learning_rate": 9.997258721585931e-07, "loss": -0.0231, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 342.75, "epoch": 0.01150137741046832, "grad_norm": 0.04203987476384058, "kl": 0.345703125, "learning_rate": 9.996881077972233e-07, "loss": 0.027, "reward": 4.336270332336426, "reward_std": 0.2974494993686676, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.11233452707529068, "rewards/ngram_similarity_reward": 0.19084684550762177, "rewards/ngram_similarity_reward/std": 0.1818958967924118, "rewards/sql_execution_reward_func": 0.2750000059604645, "rewards/sql_execution_reward_func/std": 0.0963624119758606, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.011570247933884297, "grad_norm": 0.04239337188493323, "kl": 0.3515625, "learning_rate": 9.996479081866218e-07, "loss": 0.0272, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00041254126699641347, "clip_ratio/low_min": 0.00041254126699641347, "clip_ratio/region_mean": 0.00041254126699641347, "completion_length": 303.0, "epoch": 0.011639118457300276, "grad_norm": 0.06531419317038895, "kl": 0.05712890625, "learning_rate": 9.996052735444862e-07, "loss": 0.1013, "reward": 4.452203750610352, "reward_std": 0.35249993205070496, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7583333253860474, "rewards/format_reward_func/std": 0.1178511381149292, "rewards/ngram_similarity_reward": 0.2570248246192932, "rewards/ngram_similarity_reward/std": 0.1396777480840683, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 0.9583333730697632, "rewards/xml_reward_func/std": 0.117851123213768, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00041254126699641347, "clip_ratio/low_min": 0.00041254126699641347, "clip_ratio/region_mean": 0.00041254126699641347, "epoch": 0.011707988980716254, "grad_norm": 0.06615508329651217, "kl": 0.05712890625, "learning_rate": 9.995602041017011e-07, "loss": 0.1013, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0001071926235454157, "clip_ratio/low_min": 0.0001071926235454157, "clip_ratio/region_mean": 0.0001071926235454157, "completion_length": 1166.125, "epoch": 0.011776859504132231, "grad_norm": 0.4747944022635659, "kl": 0.703125, "learning_rate": 9.99512700102336e-07, "loss": 1.6769, "reward": 2.5870847702026367, "reward_std": 1.6720845699310303, "rewards/accuracy_reward": 0.44062501192092896, "rewards/accuracy_reward/std": 0.4739532768726349, "rewards/format_reward_func": 0.6266190409660339, "rewards/format_reward_func/std": 0.26563340425491333, "rewards/ngram_similarity_reward": 0.07709609717130661, "rewards/ngram_similarity_reward/std": 0.06477025896310806, "rewards/sql_execution_reward_func": 0.21250000596046448, "rewards/sql_execution_reward_func/std": 0.1597989797592163, "rewards/xml_reward_func": 0.7510714530944824, "rewards/xml_reward_func/std": 0.44009390473365784, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.011845730027548209, "grad_norm": 0.483723403079627, "kl": 0.70703125, "learning_rate": 9.994627618036452e-07, "loss": 1.6768, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00025136154727078974, "clip_ratio/low_min": 0.00025136154727078974, "clip_ratio/region_mean": 0.00025136154727078974, "completion_length": 1491.875, "epoch": 0.011914600550964187, "grad_norm": 0.1894965925041846, "kl": 1.7109375, "learning_rate": 9.994103894760656e-07, "loss": 0.948, "reward": 2.1622610092163086, "reward_std": 1.0960407257080078, "rewards/accuracy_reward": 0.16875000298023224, "rewards/accuracy_reward/std": 0.1412634402513504, "rewards/format_reward_func": 0.6234068870544434, "rewards/format_reward_func/std": 0.2943829298019409, "rewards/ngram_similarity_reward": 0.0906611904501915, "rewards/ngram_similarity_reward/std": 0.14983177185058594, "rewards/sql_execution_reward_func": 0.36250001192092896, "rewards/sql_execution_reward_func/std": 0.32376137375831604, "rewards/xml_reward_func": 0.7028623819351196, "rewards/xml_reward_func/std": 0.4090360105037689, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 8.378717757295817e-05, "clip_ratio/low_min": 8.378717757295817e-05, "clip_ratio/region_mean": 8.378717757295817e-05, "epoch": 0.011983471074380166, "grad_norm": 0.2015628889821955, "kl": 1.7109375, "learning_rate": 9.99355583403215e-07, "loss": 0.948, "step": 174 }, { "clip_ratio/high_max": 7.475517486454919e-05, "clip_ratio/high_mean": 7.475517486454919e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 7.475517486454919e-05, "completion_length": 1672.125, "epoch": 0.012052341597796144, "grad_norm": 0.07173227109781513, "kl": 0.232421875, "learning_rate": 9.992983438818915e-07, "loss": 0.4571, "reward": 1.9555273056030273, "reward_std": 0.9328388571739197, "rewards/accuracy_reward": 0.08770386129617691, "rewards/accuracy_reward/std": 0.1333673596382141, "rewards/format_reward_func": 0.6711904406547546, "rewards/format_reward_func/std": 0.2309214472770691, "rewards/ngram_similarity_reward": 0.09692495316267014, "rewards/ngram_similarity_reward/std": 0.08439066261053085, "rewards/sql_execution_reward_func": 0.14687499403953552, "rewards/sql_execution_reward_func/std": 0.17165035009384155, "rewards/xml_reward_func": 0.8166666626930237, "rewards/xml_reward_func/std": 0.36817872524261475, "step": 175 }, { "clip_ratio/high_max": 7.475517486454919e-05, "clip_ratio/high_mean": 7.475517486454919e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 7.475517486454919e-05, "epoch": 0.012121212121212121, "grad_norm": 0.07099366368574264, "kl": 0.232421875, "learning_rate": 9.992386712220707e-07, "loss": 0.4571, "step": 176 }, { "clip_ratio/high_max": 0.00022366360644809902, "clip_ratio/high_mean": 0.00022366360644809902, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022366360644809902, "completion_length": 1117.75, "epoch": 0.012190082644628099, "grad_norm": 0.08957467777683123, "kl": 2.5, "learning_rate": 9.99176565746905e-07, "loss": 0.4114, "reward": 2.2939205169677734, "reward_std": 0.6930707097053528, "rewards/accuracy_reward": 0.0625, "rewards/accuracy_reward/std": 0.06681530922651291, "rewards/format_reward_func": 0.7983332872390747, "rewards/format_reward_func/std": 0.23043783009052277, "rewards/ngram_similarity_reward": 0.04223490506410599, "rewards/ngram_similarity_reward/std": 0.054903291165828705, "rewards/sql_execution_reward_func": 0.42125001549720764, "rewards/sql_execution_reward_func/std": 0.16745468974113464, "rewards/xml_reward_func": 0.8859848976135254, "rewards/xml_reward_func/std": 0.23376266658306122, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00011183180322404951, "clip_ratio/low_min": 0.00011183180322404951, "clip_ratio/region_mean": 0.00011183180322404951, "epoch": 0.012258953168044076, "grad_norm": 0.09222104952853645, "kl": 2.5, "learning_rate": 9.991120277927223e-07, "loss": 0.4112, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 357.5, "epoch": 0.012327823691460056, "grad_norm": 0.0821152008467046, "kl": 1.5234375, "learning_rate": 9.990450577090216e-07, "loss": 0.2705, "reward": 3.4503684043884277, "reward_std": 0.716273844242096, "rewards/accuracy_reward": 0.5260416865348816, "rewards/accuracy_reward/std": 0.40638354420661926, "rewards/format_reward_func": 0.7725000381469727, "rewards/format_reward_func/std": 0.07778175920248032, "rewards/ngram_similarity_reward": 0.11719012260437012, "rewards/ngram_similarity_reward/std": 0.11048425734043121, "rewards/sql_execution_reward_func": 0.45625001192092896, "rewards/sql_execution_reward_func/std": 0.3005203902721405, "rewards/xml_reward_func": 0.9937499761581421, "rewards/xml_reward_func/std": 0.01767767407000065, "step": 179 }, { "clip_ratio/high_max": 0.00034965036320500076, "clip_ratio/high_mean": 0.00034965036320500076, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034965036320500076, "epoch": 0.012396694214876033, "grad_norm": 0.07898000290779261, "kl": 1.5, "learning_rate": 9.989756558584744e-07, "loss": 0.2707, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 126.625, "epoch": 0.012465564738292011, "grad_norm": 0.046041725363662966, "kl": 0.039794921875, "learning_rate": 9.989038226169207e-07, "loss": -0.0549, "reward": 2.6563644409179688, "reward_std": 0.19764670729637146, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7250000238418579, "rewards/format_reward_func/std": 0.1035098284482956, "rewards/ngram_similarity_reward": 0.08340965211391449, "rewards/ngram_similarity_reward/std": 0.01470770500600338, "rewards/sql_execution_reward_func": 0.3062499761581421, "rewards/sql_execution_reward_func/std": 0.1237436905503273, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.012534435261707989, "grad_norm": 0.046291881356591597, "kl": 0.03955078125, "learning_rate": 9.988295583733678e-07, "loss": -0.0549, "step": 182 }, { "clip_ratio/high_max": 0.0010471204295754433, "clip_ratio/high_mean": 0.0010471204295754433, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010471204295754433, "completion_length": 238.75, "epoch": 0.012603305785123966, "grad_norm": 0.32686918025401224, "kl": 0.0159912109375, "learning_rate": 9.987528635299873e-07, "loss": -0.2005, "reward": 4.2284417152404785, "reward_std": 1.1578679084777832, "rewards/accuracy_reward": 0.6875, "rewards/accuracy_reward/std": 0.4381372928619385, "rewards/format_reward_func": 0.7900000214576721, "rewards/format_reward_func/std": 0.08750510215759277, "rewards/ngram_similarity_reward": 0.483961284160614, "rewards/ngram_similarity_reward/std": 0.28696224093437195, "rewards/sql_execution_reward_func": 0.3374999761581421, "rewards/sql_execution_reward_func/std": 0.0353553369641304, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0010471204295754433, "clip_ratio/high_mean": 0.0010471204295754433, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010471204295754433, "epoch": 0.012672176308539946, "grad_norm": 0.2905579277743501, "kl": 0.0159912109375, "learning_rate": 9.98673738502114e-07, "loss": -0.2006, "step": 184 }, { "clip_ratio/high_max": 7.911392458481714e-05, "clip_ratio/high_mean": 7.911392458481714e-05, "clip_ratio/low_mean": 0.0004746835329569876, "clip_ratio/low_min": 0.0004746835329569876, "clip_ratio/region_mean": 0.0005537974648177624, "completion_length": 1580.0, "epoch": 0.012741046831955923, "grad_norm": 0.27680804916319846, "kl": 1.921875, "learning_rate": 9.985921837182433e-07, "loss": 0.5432, "reward": 2.845797061920166, "reward_std": 1.278908133506775, "rewards/accuracy_reward": 0.38365381956100464, "rewards/accuracy_reward/std": 0.3052564859390259, "rewards/format_reward_func": 0.7723268270492554, "rewards/format_reward_func/std": 0.2741811275482178, "rewards/ngram_similarity_reward": 0.2434617280960083, "rewards/ngram_similarity_reward/std": 0.2661808729171753, "rewards/sql_execution_reward_func": 0.13465909659862518, "rewards/sql_execution_reward_func/std": 0.12990586459636688, "rewards/xml_reward_func": 0.8063108921051025, "rewards/xml_reward_func/std": 0.33744677901268005, "step": 185 }, { "clip_ratio/high_max": 7.911392458481714e-05, "clip_ratio/high_mean": 7.911392458481714e-05, "clip_ratio/low_mean": 7.911392458481714e-05, "clip_ratio/low_min": 7.911392458481714e-05, "clip_ratio/region_mean": 0.00015822784916963428, "epoch": 0.0128099173553719, "grad_norm": 0.33690670156953545, "kl": 1.953125, "learning_rate": 9.985081996200277e-07, "loss": 0.5432, "step": 186 }, { "clip_ratio/high_max": 0.0008748906548134983, "clip_ratio/high_mean": 0.0008748906548134983, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008748906548134983, "completion_length": 285.75, "epoch": 0.012878787878787878, "grad_norm": 0.07555530037896556, "kl": 4.65625, "learning_rate": 9.98421786662277e-07, "loss": -0.0645, "reward": 2.866915225982666, "reward_std": 0.28682687878608704, "rewards/accuracy_reward": 0.21875, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/format_reward_func": 0.8050000667572021, "rewards/format_reward_func/std": 0.18071286380290985, "rewards/ngram_similarity_reward": 0.14544343948364258, "rewards/ngram_similarity_reward/std": 0.10195522010326385, "rewards/sql_execution_reward_func": 0.40625, "rewards/sql_execution_reward_func/std": 0.06781014055013657, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 187 }, { "clip_ratio/high_max": 0.0008748906548134983, "clip_ratio/high_mean": 0.0008748906548134983, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008748906548134983, "epoch": 0.012947658402203856, "grad_norm": 0.07575126167474401, "kl": 4.6875, "learning_rate": 9.98332945312953e-07, "loss": -0.0646, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 190.875, "epoch": 0.013016528925619835, "grad_norm": 0.06380003798881202, "kl": 2.078125, "learning_rate": 9.982416760531691e-07, "loss": -0.0155, "reward": 4.446132183074951, "reward_std": 0.09322249889373779, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.75, "rewards/format_reward_func/std": 0.09258200973272324, "rewards/ngram_similarity_reward": 0.23075491189956665, "rewards/ngram_similarity_reward/std": 0.028478043153882027, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.013085399449035813, "grad_norm": 0.06301359175917012, "kl": 2.078125, "learning_rate": 9.981479793771866e-07, "loss": -0.0155, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 232.25, "epoch": 0.01315426997245179, "grad_norm": 0.2389237332272315, "kl": 7.71875, "learning_rate": 9.98051855792412e-07, "loss": 0.2101, "reward": 4.202084064483643, "reward_std": 0.3171129524707794, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7300000190734863, "rewards/format_reward_func/std": 0.13648653030395508, "rewards/ngram_similarity_reward": 0.13555604219436646, "rewards/ngram_similarity_reward/std": 0.04481219872832298, "rewards/sql_execution_reward_func": 0.33125001192092896, "rewards/sql_execution_reward_func/std": 0.0530330091714859, "rewards/xml_reward_func": 0.9375, "rewards/xml_reward_func/std": 0.1767766922712326, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.013223140495867768, "grad_norm": 0.24053231620288973, "kl": 7.71875, "learning_rate": 9.979533058193946e-07, "loss": 0.2103, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00023282886832021177, "clip_ratio/low_min": 0.00023282886832021177, "clip_ratio/region_mean": 0.00023282886832021177, "completion_length": 1073.75, "epoch": 0.013292011019283746, "grad_norm": 0.13144510399711226, "kl": 2.203125, "learning_rate": 9.97852329991824e-07, "loss": 0.7493, "reward": 3.065859317779541, "reward_std": 1.160332202911377, "rewards/accuracy_reward": 0.34756767749786377, "rewards/accuracy_reward/std": 0.43907371163368225, "rewards/format_reward_func": 0.7553752064704895, "rewards/format_reward_func/std": 0.1666983664035797, "rewards/ngram_similarity_reward": 0.33041059970855713, "rewards/ngram_similarity_reward/std": 0.3034749925136566, "rewards/sql_execution_reward_func": 0.18214286863803864, "rewards/sql_execution_reward_func/std": 0.13471706211566925, "rewards/xml_reward_func": 0.9375902414321899, "rewards/xml_reward_func/std": 0.11284366995096207, "step": 193 }, { "clip_ratio/high_max": 0.00011641443416010588, "clip_ratio/high_mean": 0.00011641443416010588, "clip_ratio/low_mean": 0.00011641443416010588, "clip_ratio/low_min": 0.00011641443416010588, "clip_ratio/region_mean": 0.00023282886832021177, "epoch": 0.013360881542699725, "grad_norm": 0.1325361538653418, "kl": 2.078125, "learning_rate": 9.977489288565263e-07, "loss": 0.7493, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 260.75, "epoch": 0.013429752066115703, "grad_norm": 0.07896437783204392, "kl": 0.08203125, "learning_rate": 9.976431029734622e-07, "loss": 0.0584, "reward": 4.883208274841309, "reward_std": 0.5054306983947754, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.5054721832275391, "rewards/ngram_similarity_reward/std": 0.32215070724487305, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.01349862258953168, "grad_norm": 0.07766807702127282, "kl": 0.08056640625, "learning_rate": 9.975348529157229e-07, "loss": 0.0584, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 208.375, "epoch": 0.013567493112947658, "grad_norm": 0.23761697950719138, "kl": 0.8046875, "learning_rate": 9.97424179269528e-07, "loss": 0.2097, "reward": 3.493976354598999, "reward_std": 1.352766513824463, "rewards/accuracy_reward": 0.40937501192092896, "rewards/accuracy_reward/std": 0.3727162778377533, "rewards/format_reward_func": 0.8083333969116211, "rewards/format_reward_func/std": 0.0235702246427536, "rewards/ngram_similarity_reward": 0.35709550976753235, "rewards/ngram_similarity_reward/std": 0.40240058302879333, "rewards/sql_execution_reward_func": 0.33125001192092896, "rewards/sql_execution_reward_func/std": 0.0530330091714859, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.013636363636363636, "grad_norm": 0.24173293016673775, "kl": 0.79296875, "learning_rate": 9.973110826342211e-07, "loss": 0.2092, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 575.625, "epoch": 0.013705234159779615, "grad_norm": 0.12101053812622053, "kl": 0.95703125, "learning_rate": 9.971955636222684e-07, "loss": 0.4597, "reward": 2.971817970275879, "reward_std": 0.9585478901863098, "rewards/accuracy_reward": 0.3864583373069763, "rewards/accuracy_reward/std": 0.3667190968990326, "rewards/format_reward_func": 0.7458333373069763, "rewards/format_reward_func/std": 0.19184774160385132, "rewards/ngram_similarity_reward": 0.11190657317638397, "rewards/ngram_similarity_reward/std": 0.10878849029541016, "rewards/sql_execution_reward_func": 0.3268750011920929, "rewards/sql_execution_reward_func/std": 0.10067407041788101, "rewards/xml_reward_func": 0.9583333730697632, "rewards/xml_reward_func/std": 0.117851123213768, "step": 199 }, { "clip_ratio/high_max": 0.00021715526236221194, "clip_ratio/high_mean": 0.00021715526236221194, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021715526236221194, "epoch": 0.013774104683195593, "grad_norm": 0.12132770193399849, "kl": 0.91796875, "learning_rate": 9.970776228592532e-07, "loss": 0.4595, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 265.0, "epoch": 0.01384297520661157, "grad_norm": 0.04433232538785716, "kl": 0.87109375, "learning_rate": 9.969572609838744e-07, "loss": -0.0017, "reward": 4.561840534210205, "reward_std": 0.2613115906715393, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7833333611488342, "rewards/format_reward_func/std": 0.077664315700531, "rewards/ngram_similarity_reward": 0.3106715679168701, "rewards/ngram_similarity_reward/std": 0.2356560230255127, "rewards/sql_execution_reward_func": 0.3125, "rewards/sql_execution_reward_func/std": 0.06943650543689728, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.013911845730027548, "grad_norm": 0.044133942752731074, "kl": 0.8125, "learning_rate": 9.968344786479415e-07, "loss": -0.0018, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00012411567149683833, "clip_ratio/low_min": 0.00012411567149683833, "clip_ratio/region_mean": 0.00012411567149683833, "completion_length": 1007.125, "epoch": 0.013980716253443526, "grad_norm": 0.112653928076259, "kl": 1.1015625, "learning_rate": 9.967092765163728e-07, "loss": 0.4645, "reward": 2.879596471786499, "reward_std": 0.7631068825721741, "rewards/accuracy_reward": 0.23700104653835297, "rewards/accuracy_reward/std": 0.21389921009540558, "rewards/format_reward_func": 0.8033333420753479, "rewards/format_reward_func/std": 0.2283342182636261, "rewards/ngram_similarity_reward": 0.15215134620666504, "rewards/ngram_similarity_reward/std": 0.09951012581586838, "rewards/sql_execution_reward_func": 0.4674999713897705, "rewards/sql_execution_reward_func/std": 0.08548182994127274, "rewards/xml_reward_func": 0.9065340757369995, "rewards/xml_reward_func/std": 0.16637073457241058, "step": 203 }, { "clip_ratio/high_max": 0.00012411567149683833, "clip_ratio/high_mean": 0.00012411567149683833, "clip_ratio/low_mean": 0.00024823134299367666, "clip_ratio/low_min": 0.00024823134299367666, "clip_ratio/region_mean": 0.000372347014490515, "epoch": 0.014049586776859505, "grad_norm": 0.11090351386063793, "kl": 1.1015625, "learning_rate": 9.965816552671897e-07, "loss": 0.4646, "step": 204 }, { "clip_ratio/high_max": 0.0001310959632974118, "clip_ratio/high_mean": 0.0001310959632974118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001310959632974118, "completion_length": 953.5, "epoch": 0.014118457300275482, "grad_norm": 0.1273496264186867, "kl": 2.890625, "learning_rate": 9.964516155915151e-07, "loss": 0.7385, "reward": 3.015195369720459, "reward_std": 1.2466422319412231, "rewards/accuracy_reward": 0.28125, "rewards/accuracy_reward/std": 0.4519304037094116, "rewards/format_reward_func": 0.8216666579246521, "rewards/format_reward_func/std": 0.1847434937953949, "rewards/ngram_similarity_reward": 0.27068597078323364, "rewards/ngram_similarity_reward/std": 0.26945096254348755, "rewards/sql_execution_reward_func": 0.22500000894069672, "rewards/sql_execution_reward_func/std": 0.18662413954734802, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0001310959632974118, "clip_ratio/high_mean": 0.0001310959632974118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001310959632974118, "epoch": 0.01418732782369146, "grad_norm": 0.12631313320674065, "kl": 2.984375, "learning_rate": 9.963191581935677e-07, "loss": 0.7386, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00013823610788676888, "clip_ratio/low_min": 0.00013823610788676888, "clip_ratio/region_mean": 0.00013823610788676888, "completion_length": 904.25, "epoch": 0.014256198347107438, "grad_norm": 0.8940590175730418, "kl": 1.90625, "learning_rate": 9.961842837906603e-07, "loss": 1.414, "reward": 3.4299964904785156, "reward_std": 1.7683619260787964, "rewards/accuracy_reward": 0.5332596302032471, "rewards/accuracy_reward/std": 0.5065104961395264, "rewards/format_reward_func": 0.7633333206176758, "rewards/format_reward_func/std": 0.2594989240169525, "rewards/ngram_similarity_reward": 0.3821473717689514, "rewards/ngram_similarity_reward/std": 0.34406864643096924, "rewards/sql_execution_reward_func": 0.1519230753183365, "rewards/sql_execution_reward_func/std": 0.11958914995193481, "rewards/xml_reward_func": 0.875, "rewards/xml_reward_func/std": 0.3535533845424652, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00013823610788676888, "clip_ratio/low_min": 0.00013823610788676888, "clip_ratio/region_mean": 0.00013823610788676888, "epoch": 0.014325068870523415, "grad_norm": 0.8564828075074082, "kl": 1.890625, "learning_rate": 9.960469931131936e-07, "loss": 1.4133, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 223.875, "epoch": 0.014393939393939395, "grad_norm": 0.10757541150699883, "kl": 0.6953125, "learning_rate": 9.959072869046547e-07, "loss": -0.1629, "reward": 3.7501542568206787, "reward_std": 0.695688009262085, "rewards/accuracy_reward": 0.44062501192092896, "rewards/accuracy_reward/std": 0.16793785989284515, "rewards/format_reward_func": 0.7250000238418579, "rewards/format_reward_func/std": 0.10947203636169434, "rewards/ngram_similarity_reward": 0.5376030206680298, "rewards/ngram_similarity_reward/std": 0.2553205192089081, "rewards/sql_execution_reward_func": 0.3375000059604645, "rewards/sql_execution_reward_func/std": 0.058248236775398254, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.014462809917355372, "grad_norm": 0.10763761629040448, "kl": 0.66015625, "learning_rate": 9.957651659216108e-07, "loss": -0.163, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 199.375, "epoch": 0.01453168044077135, "grad_norm": 0.2497526864474372, "kl": 0.953125, "learning_rate": 9.956206309337066e-07, "loss": -0.0875, "reward": 3.599432945251465, "reward_std": 1.1212992668151855, "rewards/accuracy_reward": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward_func": 0.8166667222976685, "rewards/format_reward_func/std": 0.030860668048262596, "rewards/ngram_similarity_reward": 0.11351090669631958, "rewards/ngram_similarity_reward/std": 0.07506603747606277, "rewards/sql_execution_reward_func": 0.36250001192092896, "rewards/sql_execution_reward_func/std": 0.023145508021116257, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.014600550964187328, "grad_norm": 0.245012172665473, "kl": 0.94921875, "learning_rate": 9.954736827236595e-07, "loss": -0.0881, "step": 212 }, { "clip_ratio/high_max": 0.00035536603536456823, "clip_ratio/high_mean": 0.00035536603536456823, "clip_ratio/low_mean": 0.00017768301768228412, "clip_ratio/low_min": 0.00017768301768228412, "clip_ratio/region_mean": 0.0005330490530468524, "completion_length": 703.5, "epoch": 0.014669421487603305, "grad_norm": 0.1888728334908304, "kl": 0.93359375, "learning_rate": 9.953243220872554e-07, "loss": 0.86, "reward": 2.2215094566345215, "reward_std": 0.6006115674972534, "rewards/accuracy_reward": 0.02187500149011612, "rewards/accuracy_reward/std": 0.008838835172355175, "rewards/format_reward_func": 0.7354166507720947, "rewards/format_reward_func/std": 0.17648182809352875, "rewards/ngram_similarity_reward": 0.1722387969493866, "rewards/ngram_similarity_reward/std": 0.0703953355550766, "rewards/sql_execution_reward_func": 0.2875000238418579, "rewards/sql_execution_reward_func/std": 0.13024702668190002, "rewards/xml_reward_func": 0.896484375, "rewards/xml_reward_func/std": 0.2927864193916321, "step": 213 }, { "clip_ratio/high_max": 0.00017768301768228412, "clip_ratio/high_mean": 0.00017768301768228412, "clip_ratio/low_mean": 0.0007107320707291365, "clip_ratio/low_min": 0.0007107320707291365, "clip_ratio/region_mean": 0.0008884150884114206, "epoch": 0.014738292011019285, "grad_norm": 0.17656734436488838, "kl": 0.921875, "learning_rate": 9.951725498333448e-07, "loss": 0.8597, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 376.5, "epoch": 0.014807162534435262, "grad_norm": 0.0458489341453127, "kl": 1.46875, "learning_rate": 9.950183667838379e-07, "loss": -0.0344, "reward": 2.579881191253662, "reward_std": 0.24052265286445618, "rewards/accuracy_reward": 0.125, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7816666960716248, "rewards/format_reward_func/std": 0.0963953509926796, "rewards/ngram_similarity_reward": 0.1529764086008072, "rewards/ngram_similarity_reward/std": 0.13643908500671387, "rewards/sql_execution_reward_func": 0.3187500238418579, "rewards/sql_execution_reward_func/std": 0.15338444709777832, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.01487603305785124, "grad_norm": 0.04590559501148063, "kl": 1.4765625, "learning_rate": 9.948617737737001e-07, "loss": -0.0345, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 171.25, "epoch": 0.014944903581267217, "grad_norm": 0.2745582578190712, "kl": 8.0625, "learning_rate": 9.947027716509488e-07, "loss": 0.1295, "reward": 5.409134387969971, "reward_std": 0.4864734709262848, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.0, "rewards/ngram_similarity_reward": 0.8394230604171753, "rewards/ngram_similarity_reward/std": 0.32431572675704956, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.015013774104683195, "grad_norm": 0.2746776149210642, "kl": 8.0625, "learning_rate": 9.945413612766464e-07, "loss": 0.129, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0003995205624960363, "clip_ratio/low_min": 0.0003995205624960363, "clip_ratio/region_mean": 0.0003995205624960363, "completion_length": 312.875, "epoch": 0.015082644628099174, "grad_norm": 0.032302987023826606, "kl": 0.74609375, "learning_rate": 9.943775435248978e-07, "loss": -0.0608, "reward": 4.245344638824463, "reward_std": 0.1083558052778244, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7416666746139526, "rewards/format_reward_func/std": 0.09041351079940796, "rewards/ngram_similarity_reward": 0.11078531295061111, "rewards/ngram_similarity_reward/std": 0.12893661856651306, "rewards/sql_execution_reward_func": 0.3375000059604645, "rewards/sql_execution_reward_func/std": 0.058248236775398254, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0007990411249920726, "clip_ratio/low_min": 0.0007990411249920726, "clip_ratio/region_mean": 0.0007990411249920726, "epoch": 0.015151515151515152, "grad_norm": 0.0347289003247738, "kl": 0.7421875, "learning_rate": 9.942113192828444e-07, "loss": -0.0608, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 301.625, "epoch": 0.01522038567493113, "grad_norm": 0.19131480744413784, "kl": 0.515625, "learning_rate": 9.940426894506606e-07, "loss": 0.4297, "reward": 3.7734522819519043, "reward_std": 0.9909169673919678, "rewards/accuracy_reward": 0.78125, "rewards/accuracy_reward/std": 0.41052016615867615, "rewards/format_reward_func": 0.7599999904632568, "rewards/format_reward_func/std": 0.10253918170928955, "rewards/ngram_similarity_reward": 0.11521834135055542, "rewards/ngram_similarity_reward/std": 0.026367785409092903, "rewards/sql_execution_reward_func": 0.27812498807907104, "rewards/sql_execution_reward_func/std": 0.13721561431884766, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.015289256198347107, "grad_norm": 0.19203709295806282, "kl": 0.51953125, "learning_rate": 9.93871654941547e-07, "loss": 0.4296, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0003396739193703979, "clip_ratio/low_min": 0.0003396739193703979, "clip_ratio/region_mean": 0.0003396739193703979, "completion_length": 368.0, "epoch": 0.015358126721763085, "grad_norm": 0.18485142332940963, "kl": 0.0703125, "learning_rate": 9.93698216681727e-07, "loss": 0.1525, "reward": 4.109217643737793, "reward_std": 1.0879193544387817, "rewards/accuracy_reward": 0.6875, "rewards/accuracy_reward/std": 0.4381372928619385, "rewards/format_reward_func": 0.8333333730697632, "rewards/format_reward_func/std": 0.13333332538604736, "rewards/ngram_similarity_reward": 0.38808947801589966, "rewards/ngram_similarity_reward/std": 0.22400052845478058, "rewards/sql_execution_reward_func": 0.3187499940395355, "rewards/sql_execution_reward_func/std": 0.13871219754219055, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.015426997245179064, "grad_norm": 0.17289784915139014, "kl": 0.07080078125, "learning_rate": 9.935223756104419e-07, "loss": 0.1527, "step": 224 }, { "clip_ratio/high_max": 0.00017109615146182477, "clip_ratio/high_mean": 0.00017109615146182477, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017109615146182477, "completion_length": 2191.75, "epoch": 0.015495867768595042, "grad_norm": 0.07218121464254233, "kl": 1.2109375, "learning_rate": 9.93344132679944e-07, "loss": 0.2439, "reward": 2.139040946960449, "reward_std": 0.8310713768005371, "rewards/accuracy_reward": 0.140625, "rewards/accuracy_reward/std": 0.11947616934776306, "rewards/format_reward_func": 0.8064935207366943, "rewards/format_reward_func/std": 0.2743750512599945, "rewards/ngram_similarity_reward": 0.02747032232582569, "rewards/ngram_similarity_reward/std": 0.019747428596019745, "rewards/sql_execution_reward_func": 0.14077380299568176, "rewards/sql_execution_reward_func/std": 0.14696717262268066, "rewards/xml_reward_func": 0.8693181872367859, "rewards/xml_reward_func/std": 0.3516175448894501, "step": 225 }, { "clip_ratio/high_max": 5.703205169993453e-05, "clip_ratio/high_mean": 5.703205169993453e-05, "clip_ratio/low_mean": 5.703205169993453e-05, "clip_ratio/low_min": 5.703205169993453e-05, "clip_ratio/region_mean": 0.00011406410339986905, "epoch": 0.01556473829201102, "grad_norm": 0.06948982827353982, "kl": 1.1953125, "learning_rate": 9.931634888554935e-07, "loss": 0.2436, "step": 226 }, { "clip_ratio/high_max": 0.00011410314618842676, "clip_ratio/high_mean": 0.00011410314618842676, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00011410314618842676, "completion_length": 1095.5, "epoch": 0.015633608815426997, "grad_norm": 0.25411664555686436, "kl": 11.875, "learning_rate": 9.929804451153525e-07, "loss": 0.981, "reward": 3.008777141571045, "reward_std": 1.4382257461547852, "rewards/accuracy_reward": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward_func": 0.7437499761581421, "rewards/format_reward_func/std": 0.1654377281665802, "rewards/ngram_similarity_reward": 0.27691134810447693, "rewards/ngram_similarity_reward/std": 0.2543497085571289, "rewards/sql_execution_reward_func": 0.20330607891082764, "rewards/sql_execution_reward_func/std": 0.14535415172576904, "rewards/xml_reward_func": 0.8963541984558105, "rewards/xml_reward_func/std": 0.23816676437854767, "step": 227 }, { "clip_ratio/high_max": 0.00011410314618842676, "clip_ratio/high_mean": 0.00011410314618842676, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00011410314618842676, "epoch": 0.015702479338842976, "grad_norm": 0.25692193158539267, "kl": 11.875, "learning_rate": 9.927950024507794e-07, "loss": 0.9806, "step": 228 }, { "clip_ratio/high_max": 0.0001247349428012967, "clip_ratio/high_mean": 0.0001247349428012967, "clip_ratio/low_mean": 0.0002494698856025934, "clip_ratio/low_min": 0.0002494698856025934, "clip_ratio/region_mean": 0.00037420482840389013, "completion_length": 1002.125, "epoch": 0.015771349862258952, "grad_norm": 0.14925138809286262, "kl": 0.01055908203125, "learning_rate": 9.926071618660237e-07, "loss": 1.1214, "reward": 2.6593165397644043, "reward_std": 1.489763855934143, "rewards/accuracy_reward": 0.25312501192092896, "rewards/accuracy_reward/std": 0.3247079849243164, "rewards/format_reward_func": 0.6833333373069763, "rewards/format_reward_func/std": 0.2867441773414612, "rewards/ngram_similarity_reward": 0.29857224225997925, "rewards/ngram_similarity_reward/std": 0.3485589921474457, "rewards/sql_execution_reward_func": 0.22499999403953552, "rewards/sql_execution_reward_func/std": 0.16256867349147797, "rewards/xml_reward_func": 0.796875, "rewards/xml_reward_func/std": 0.36558112502098083, "step": 229 }, { "clip_ratio/high_max": 0.0002494698856025934, "clip_ratio/high_mean": 0.0002494698856025934, "clip_ratio/low_mean": 0.0002494698856025934, "clip_ratio/low_min": 0.0002494698856025934, "clip_ratio/region_mean": 0.0004989397712051868, "epoch": 0.01584022038567493, "grad_norm": 0.13711842593508042, "kl": 0.010986328125, "learning_rate": 9.924169243783205e-07, "loss": 1.1215, "step": 230 }, { "clip_ratio/high_max": 0.00018563207413535565, "clip_ratio/high_mean": 0.00018563207413535565, "clip_ratio/low_mean": 0.00018563207413535565, "clip_ratio/low_min": 0.00018563207413535565, "clip_ratio/region_mean": 0.0003712641482707113, "completion_length": 673.375, "epoch": 0.015909090909090907, "grad_norm": 0.3455238642962306, "kl": 0.259765625, "learning_rate": 9.922242910178859e-07, "loss": 1.7497, "reward": 2.861886501312256, "reward_std": 1.261361002922058, "rewards/accuracy_reward": 0.3125, "rewards/accuracy_reward/std": 0.29124119877815247, "rewards/format_reward_func": 0.6875, "rewards/format_reward_func/std": 0.22320714592933655, "rewards/ngram_similarity_reward": 0.24750769138336182, "rewards/ngram_similarity_reward/std": 0.17447251081466675, "rewards/sql_execution_reward_func": 0.3187499940395355, "rewards/sql_execution_reward_func/std": 0.13346347212791443, "rewards/xml_reward_func": 0.859375, "rewards/xml_reward_func/std": 0.3499840497970581, "step": 231 }, { "clip_ratio/high_max": 0.00018563207413535565, "clip_ratio/high_mean": 0.00018563207413535565, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018563207413535565, "epoch": 0.015977961432506887, "grad_norm": 0.3421840617651558, "kl": 0.25390625, "learning_rate": 9.9202926282791e-07, "loss": 1.7499, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 292.25, "epoch": 0.016046831955922866, "grad_norm": 0.2074888551012889, "kl": 1.0859375, "learning_rate": 9.918318408645516e-07, "loss": 0.316, "reward": 4.275868892669678, "reward_std": 1.0786798000335693, "rewards/accuracy_reward": 0.8295454382896423, "rewards/accuracy_reward/std": 0.31771767139434814, "rewards/format_reward_func": 0.7625000476837158, "rewards/format_reward_func/std": 0.1505940705537796, "rewards/ngram_similarity_reward": 0.3986855149269104, "rewards/ngram_similarity_reward/std": 0.38036665320396423, "rewards/sql_execution_reward_func": 0.3187499940395355, "rewards/sql_execution_reward_func/std": 0.13346347212791443, "rewards/xml_reward_func": 0.9375, "rewards/xml_reward_func/std": 0.1767766922712326, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.016115702479338842, "grad_norm": 0.20819841617454488, "kl": 1.09375, "learning_rate": 9.916320261969338e-07, "loss": 0.3162, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 275.625, "epoch": 0.01618457300275482, "grad_norm": 0.10351769844482332, "kl": 0.1630859375, "learning_rate": 9.91429819907136e-07, "loss": 0.1567, "reward": 4.4869232177734375, "reward_std": 0.582209050655365, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7083333730697632, "rewards/format_reward_func/std": 0.13303537666797638, "rewards/ngram_similarity_reward": 0.3239211440086365, "rewards/ngram_similarity_reward/std": 0.3311314582824707, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 0.9427083730697632, "rewards/xml_reward_func/std": 0.11980783939361572, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00045351474545896053, "clip_ratio/low_min": 0.00045351474545896053, "clip_ratio/region_mean": 0.00045351474545896053, "epoch": 0.016253443526170797, "grad_norm": 0.0995182314997501, "kl": 0.1728515625, "learning_rate": 9.912252230901906e-07, "loss": 0.1565, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 107.625, "epoch": 0.016322314049586777, "grad_norm": 0.36708106708338595, "kl": 0.2158203125, "learning_rate": 9.910182368540745e-07, "loss": -0.0107, "reward": 4.714914321899414, "reward_std": 1.2763879299163818, "rewards/accuracy_reward": 0.784375011920929, "rewards/accuracy_reward/std": 0.4037630259990692, "rewards/format_reward_func": 0.75, "rewards/format_reward_func/std": 0.09258200973272324, "rewards/ngram_similarity_reward": 0.6974431872367859, "rewards/ngram_similarity_reward/std": 0.35245269536972046, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.016391184573002756, "grad_norm": 0.35700468897585297, "kl": 0.2138671875, "learning_rate": 9.908088623197048e-07, "loss": -0.0126, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 232.25, "epoch": 0.016460055096418732, "grad_norm": 0.07948704361127844, "kl": 0.228515625, "learning_rate": 9.90597100620932e-07, "loss": 0.099, "reward": 4.398736953735352, "reward_std": 0.30003252625465393, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7416666746139526, "rewards/format_reward_func/std": 0.13540063798427582, "rewards/ngram_similarity_reward": 0.2449910193681717, "rewards/ngram_similarity_reward/std": 0.08882871270179749, "rewards/sql_execution_reward_func": 0.33125001192092896, "rewards/sql_execution_reward_func/std": 0.0530330091714859, "rewards/xml_reward_func": 0.9583333730697632, "rewards/xml_reward_func/std": 0.117851123213768, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.01652892561983471, "grad_norm": 0.07947043968888683, "kl": 0.224609375, "learning_rate": 9.90382952904535e-07, "loss": 0.0991, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00021308331633917987, "clip_ratio/low_min": 0.00021308331633917987, "clip_ratio/region_mean": 0.00021308331633917987, "completion_length": 1173.25, "epoch": 0.016597796143250687, "grad_norm": 0.10884414581681921, "kl": 0.08935546875, "learning_rate": 9.901664203302124e-07, "loss": 0.5031, "reward": 3.169909954071045, "reward_std": 1.3330878019332886, "rewards/accuracy_reward": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward_func": 0.7939996123313904, "rewards/format_reward_func/std": 0.16313748061656952, "rewards/ngram_similarity_reward": 0.337741881608963, "rewards/ngram_similarity_reward/std": 0.38212454319000244, "rewards/sql_execution_reward_func": 0.13433441519737244, "rewards/sql_execution_reward_func/std": 0.13120117783546448, "rewards/xml_reward_func": 0.984963059425354, "rewards/xml_reward_func/std": 0.028858603909611702, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00010654165816958994, "clip_ratio/low_min": 0.00010654165816958994, "clip_ratio/region_mean": 0.00010654165816958994, "epoch": 0.016666666666666666, "grad_norm": 0.10842086037989926, "kl": 0.08935546875, "learning_rate": 9.899475040705794e-07, "loss": 0.5026, "step": 242 }, { "clip_ratio/high_max": 0.00038491145824082196, "clip_ratio/high_mean": 0.00038491145824082196, "clip_ratio/low_mean": 0.00019245572912041098, "clip_ratio/low_min": 0.00019245572912041098, "clip_ratio/region_mean": 0.0005773672019131482, "completion_length": 649.5, "epoch": 0.016735537190082646, "grad_norm": 0.16056851218805188, "kl": 0.2001953125, "learning_rate": 9.897262053111585e-07, "loss": 1.2482, "reward": 2.621795654296875, "reward_std": 0.7668780088424683, "rewards/accuracy_reward": 0.21875, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/format_reward_func": 0.7401515245437622, "rewards/format_reward_func/std": 0.13893911242485046, "rewards/ngram_similarity_reward": 0.1473587155342102, "rewards/ngram_similarity_reward/std": 0.11145709455013275, "rewards/sql_execution_reward_func": 0.3187499940395355, "rewards/sql_execution_reward_func/std": 0.13076014816761017, "rewards/xml_reward_func": 0.904356062412262, "rewards/xml_reward_func/std": 0.2386385202407837, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.01680440771349862, "grad_norm": 0.16998891643863648, "kl": 0.1962890625, "learning_rate": 9.895025252503755e-07, "loss": 1.2483, "step": 244 }, { "clip_ratio/high_max": 0.00031411967938765883, "clip_ratio/high_mean": 0.00031411967938765883, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031411967938765883, "completion_length": 795.875, "epoch": 0.0168732782369146, "grad_norm": 0.06091566208269815, "kl": 1.8984375, "learning_rate": 9.892764650995512e-07, "loss": 0.0852, "reward": 2.959446668624878, "reward_std": 0.44855254888534546, "rewards/accuracy_reward": 0.1875, "rewards/accuracy_reward/std": 0.16366341710090637, "rewards/format_reward_func": 0.8949999809265137, "rewards/format_reward_func/std": 0.13417589664459229, "rewards/ngram_similarity_reward": 0.2838374078273773, "rewards/ngram_similarity_reward/std": 0.11781128495931625, "rewards/sql_execution_reward_func": 0.2668154835700989, "rewards/sql_execution_reward_func/std": 0.21058352291584015, "rewards/xml_reward_func": 0.996874988079071, "rewards/xml_reward_func/std": 0.008838826790452003, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.016942148760330577, "grad_norm": 0.06023127389074478, "kl": 1.8828125, "learning_rate": 9.890480260828965e-07, "loss": 0.0851, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 259.125, "epoch": 0.017011019283746556, "grad_norm": 0.20317248166958926, "kl": 1.046875, "learning_rate": 9.888172094375033e-07, "loss": 0.2558, "reward": 3.9622440338134766, "reward_std": 0.8244420886039734, "rewards/accuracy_reward": 0.78125, "rewards/accuracy_reward/std": 0.41052016615867615, "rewards/format_reward_func": 0.7833333015441895, "rewards/format_reward_func/std": 0.11683660745620728, "rewards/ngram_similarity_reward": 0.1526072919368744, "rewards/ngram_similarity_reward/std": 0.12506651878356934, "rewards/sql_execution_reward_func": 0.38749998807907104, "rewards/sql_execution_reward_func/std": 0.06943651288747787, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 247 }, { "clip_ratio/high_max": 0.0009647853439673781, "clip_ratio/high_mean": 0.0009647853439673781, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009647853439673781, "epoch": 0.017079889807162536, "grad_norm": 0.20351895763029695, "kl": 1.1171875, "learning_rate": 9.885840164133413e-07, "loss": 0.2551, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 268.75, "epoch": 0.01714876033057851, "grad_norm": 0.22206511876575424, "kl": 8.5, "learning_rate": 9.883484482732472e-07, "loss": 0.0041, "reward": 3.3958568572998047, "reward_std": 0.7988384962081909, "rewards/accuracy_reward": 0.53125, "rewards/accuracy_reward/std": 0.38816189765930176, "rewards/format_reward_func": 0.8041666746139526, "rewards/format_reward_func/std": 0.1060660108923912, "rewards/ngram_similarity_reward": 0.12779349088668823, "rewards/ngram_similarity_reward/std": 0.051636677235364914, "rewards/sql_execution_reward_func": 0.3375000059604645, "rewards/sql_execution_reward_func/std": 0.07440237700939178, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.01721763085399449, "grad_norm": 0.22382119668659292, "kl": 8.5, "learning_rate": 9.881105062929221e-07, "loss": 0.004, "step": 250 }, { "clip_ratio/high_max": 0.0005387931014411151, "clip_ratio/high_mean": 0.0005387931014411151, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005387931014411151, "completion_length": 232.0, "epoch": 0.017286501377410467, "grad_norm": 0.38208091437232405, "kl": 11.8125, "learning_rate": 9.878701917609207e-07, "loss": 0.0901, "reward": 4.153555393218994, "reward_std": 0.810690701007843, "rewards/accuracy_reward": 0.878125011920929, "rewards/accuracy_reward/std": 0.34471458196640015, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.18153707683086395, "rewards/ngram_similarity_reward/std": 0.12443697452545166, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.017355371900826446, "grad_norm": 0.35382595085301605, "kl": 10.8125, "learning_rate": 9.876275059786468e-07, "loss": 0.088, "step": 252 }, { "clip_ratio/high_max": 8.982305007521063e-05, "clip_ratio/high_mean": 8.982305007521063e-05, "clip_ratio/low_mean": 8.982305007521063e-05, "clip_ratio/low_min": 8.982305007521063e-05, "clip_ratio/region_mean": 0.00017964610015042126, "completion_length": 1391.625, "epoch": 0.017424242424242425, "grad_norm": 0.36533296253936887, "kl": 6.71875, "learning_rate": 9.873824502603459e-07, "loss": 0.5048, "reward": 2.5791056156158447, "reward_std": 1.1102229356765747, "rewards/accuracy_reward": 0.19610387086868286, "rewards/accuracy_reward/std": 0.16865099966526031, "rewards/format_reward_func": 0.6632097959518433, "rewards/format_reward_func/std": 0.20942039787769318, "rewards/ngram_similarity_reward": 0.29990190267562866, "rewards/ngram_similarity_reward/std": 0.2614748775959015, "rewards/sql_execution_reward_func": 0.1896306872367859, "rewards/sql_execution_reward_func/std": 0.1134599968791008, "rewards/xml_reward_func": 0.8842045664787292, "rewards/xml_reward_func/std": 0.31167104840278625, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00017964610015042126, "clip_ratio/low_min": 0.00017964610015042126, "clip_ratio/region_mean": 0.00017964610015042126, "epoch": 0.0174931129476584, "grad_norm": 0.33159412452954534, "kl": 6.71875, "learning_rate": 9.871350259330965e-07, "loss": 0.5042, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 307.625, "epoch": 0.01756198347107438, "grad_norm": 0.1638773036493552, "kl": 5.46875, "learning_rate": 9.868852343368053e-07, "loss": -0.0041, "reward": 2.853452205657959, "reward_std": 0.12225552648305893, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8208333253860474, "rewards/format_reward_func/std": 0.08533314615488052, "rewards/ngram_similarity_reward": 0.1259126514196396, "rewards/ngram_similarity_reward/std": 0.04669121652841568, "rewards/sql_execution_reward_func": 0.34375, "rewards/sql_execution_reward_func/std": 0.06781013309955597, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.017630853994490357, "grad_norm": 0.16695873285237786, "kl": 5.46875, "learning_rate": 9.866330768241983e-07, "loss": -0.0041, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00045330915600061417, "clip_ratio/low_min": 0.00045330915600061417, "clip_ratio/region_mean": 0.00045330915600061417, "completion_length": 275.75, "epoch": 0.017699724517906336, "grad_norm": 0.057588277148672325, "kl": 0.0576171875, "learning_rate": 9.863785547608138e-07, "loss": -0.0222, "reward": 5.116064071655273, "reward_std": 0.44596555829048157, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8083333969116211, "rewards/format_reward_func/std": 0.0235702246427536, "rewards/ngram_similarity_reward": 0.6509870886802673, "rewards/ngram_similarity_reward/std": 0.30348721146583557, "rewards/sql_execution_reward_func": 0.33125001192092896, "rewards/sql_execution_reward_func/std": 0.0530330091714859, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0009066183120012283, "clip_ratio/low_min": 0.0009066183120012283, "clip_ratio/region_mean": 0.0009066183120012283, "epoch": 0.017768595041322315, "grad_norm": 0.05804443398295835, "kl": 0.0546875, "learning_rate": 9.861216695249954e-07, "loss": -0.0222, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 206.75, "epoch": 0.01783746556473829, "grad_norm": 0.20087301579342204, "kl": 0.1845703125, "learning_rate": 9.85862422507884e-07, "loss": -0.0194, "reward": 3.836207628250122, "reward_std": 0.9346718788146973, "rewards/accuracy_reward": 0.6875, "rewards/accuracy_reward/std": 0.4381372928619385, "rewards/format_reward_func": 0.8166666626930237, "rewards/format_reward_func/std": 0.1321374773979187, "rewards/ngram_similarity_reward": 0.204694002866745, "rewards/ngram_similarity_reward/std": 0.08984899520874023, "rewards/sql_execution_reward_func": 0.3374999761581421, "rewards/sql_execution_reward_func/std": 0.058248236775398254, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.01790633608815427, "grad_norm": 0.19830227711134946, "kl": 0.1845703125, "learning_rate": 9.856008151134105e-07, "loss": -0.0196, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 273.625, "epoch": 0.017975206611570246, "grad_norm": 0.11036992751014373, "kl": 0.00885009765625, "learning_rate": 9.853368487582886e-07, "loss": -0.0434, "reward": 3.529776096343994, "reward_std": 0.7265958189964294, "rewards/accuracy_reward": 0.625, "rewards/accuracy_reward/std": 0.40089187026023865, "rewards/format_reward_func": 0.7083333730697632, "rewards/format_reward_func/std": 0.13303537666797638, "rewards/ngram_similarity_reward": 0.1754063069820404, "rewards/ngram_similarity_reward/std": 0.03375350311398506, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 0.9583333730697632, "rewards/xml_reward_func/std": 0.117851123213768, "step": 261 }, { "clip_ratio/high_max": 0.0004568296135403216, "clip_ratio/high_mean": 0.0004568296135403216, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004568296135403216, "epoch": 0.018044077134986226, "grad_norm": 0.10901394426104408, "kl": 0.0086669921875, "learning_rate": 9.850705248720068e-07, "loss": -0.0435, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 267.125, "epoch": 0.018112947658402205, "grad_norm": 0.050474858566836776, "kl": 1.5234375, "learning_rate": 9.848018448968196e-07, "loss": 0.0234, "reward": 2.776127338409424, "reward_std": 0.13444402813911438, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8333333730697632, "rewards/format_reward_func/std": 0.09258200228214264, "rewards/ngram_similarity_reward": 0.07714031636714935, "rewards/ngram_similarity_reward/std": 0.06956751644611359, "rewards/sql_execution_reward_func": 0.3375000059604645, "rewards/sql_execution_reward_func/std": 0.06408699601888657, "rewards/xml_reward_func": 0.9895833730697632, "rewards/xml_reward_func/std": 0.029462775215506554, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.01818181818181818, "grad_norm": 0.05089158366937775, "kl": 1.53125, "learning_rate": 9.845308102877422e-07, "loss": 0.0235, "step": 264 }, { "clip_ratio/high_max": 7.820442988304421e-05, "clip_ratio/high_mean": 7.820442988304421e-05, "clip_ratio/low_mean": 7.820442988304421e-05, "clip_ratio/low_min": 7.820442988304421e-05, "clip_ratio/region_mean": 0.00015640885976608843, "completion_length": 1598.375, "epoch": 0.01825068870523416, "grad_norm": 0.23668555971318758, "kl": 3.734375, "learning_rate": 9.8425742251254e-07, "loss": 1.4662, "reward": 2.628861904144287, "reward_std": 1.9213978052139282, "rewards/accuracy_reward": 0.40625, "rewards/accuracy_reward/std": 0.4988826811313629, "rewards/format_reward_func": 0.624404788017273, "rewards/format_reward_func/std": 0.292076975107193, "rewards/ngram_similarity_reward": 0.16338811814785004, "rewards/ngram_similarity_reward/std": 0.18861299753189087, "rewards/sql_execution_reward_func": 0.19910714030265808, "rewards/sql_execution_reward_func/std": 0.19667193293571472, "rewards/xml_reward_func": 0.7477678656578064, "rewards/xml_reward_func/std": 0.4615734815597534, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0003910221275873482, "clip_ratio/low_min": 0.0003910221275873482, "clip_ratio/region_mean": 0.0003910221275873482, "epoch": 0.018319559228650136, "grad_norm": 0.1937235212817833, "kl": 3.734375, "learning_rate": 9.839816830517225e-07, "loss": 1.466, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 266.875, "epoch": 0.018388429752066116, "grad_norm": 0.256278083230878, "kl": 8.5, "learning_rate": 9.837035933985339e-07, "loss": 0.0887, "reward": 4.571894645690918, "reward_std": 0.6190200448036194, "rewards/accuracy_reward": 0.8825408816337585, "rewards/accuracy_reward/std": 0.2174919992685318, "rewards/format_reward_func": 0.8416666984558105, "rewards/format_reward_func/std": 0.0636209025979042, "rewards/ngram_similarity_reward": 0.37676405906677246, "rewards/ngram_similarity_reward/std": 0.2266584038734436, "rewards/sql_execution_reward_func": 0.4000000059604645, "rewards/sql_execution_reward_func/std": 0.04629099741578102, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.018457300275482095, "grad_norm": 0.25532844242247005, "kl": 8.5, "learning_rate": 9.83423155058946e-07, "loss": 0.0884, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 389.5, "epoch": 0.01852617079889807, "grad_norm": 0.3414570571147013, "kl": 5.75, "learning_rate": 9.8314036955165e-07, "loss": 0.8334, "reward": 4.335766792297363, "reward_std": 1.2123948335647583, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward_func": 0.8316667079925537, "rewards/format_reward_func/std": 0.070979543030262, "rewards/ngram_similarity_reward": 0.42773348093032837, "rewards/ngram_similarity_reward/std": 0.37550103664398193, "rewards/sql_execution_reward_func": 0.375, "rewards/sql_execution_reward_func/std": 0.16256865859031677, "rewards/xml_reward_func": 0.987500011920929, "rewards/xml_reward_func/std": 0.0353553481400013, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.000320924271363765, "clip_ratio/low_min": 0.000320924271363765, "clip_ratio/region_mean": 0.000320924271363765, "epoch": 0.01859504132231405, "grad_norm": 0.3358423412496733, "kl": 5.46875, "learning_rate": 9.82855238408048e-07, "loss": 0.8318, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00020695364219136536, "clip_ratio/low_min": 0.00020695364219136536, "clip_ratio/region_mean": 0.00020695364219136536, "completion_length": 604.0, "epoch": 0.018663911845730026, "grad_norm": 1.651932836563191, "kl": 5.65625, "learning_rate": 9.825677631722435e-07, "loss": 1.7718, "reward": 3.017164707183838, "reward_std": 1.304434895515442, "rewards/accuracy_reward": 0.4360119104385376, "rewards/accuracy_reward/std": 0.3671466112136841, "rewards/format_reward_func": 0.7233333587646484, "rewards/format_reward_func/std": 0.23047226667404175, "rewards/ngram_similarity_reward": 0.181205153465271, "rewards/ngram_similarity_reward/std": 0.09516214579343796, "rewards/sql_execution_reward_func": 0.2750000059604645, "rewards/sql_execution_reward_func/std": 0.12535662949085236, "rewards/xml_reward_func": 0.875, "rewards/xml_reward_func/std": 0.3535533845424652, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010347681818529963, "clip_ratio/low_min": 0.0010347681818529963, "clip_ratio/region_mean": 0.0010347681818529963, "epoch": 0.018732782369146005, "grad_norm": 1.397676857484213, "kl": 5.96875, "learning_rate": 9.822779454010358e-07, "loss": 1.7711, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 461.125, "epoch": 0.018801652892561985, "grad_norm": 0.3773076709725868, "kl": 14.25, "learning_rate": 9.819857866639093e-07, "loss": 0.0641, "reward": 4.805474281311035, "reward_std": 0.8327004313468933, "rewards/accuracy_reward": 0.7048611044883728, "rewards/accuracy_reward/std": 0.45253974199295044, "rewards/format_reward_func": 0.8028408885002136, "rewards/format_reward_func/std": 0.10238669812679291, "rewards/ngram_similarity_reward": 0.605555534362793, "rewards/ngram_similarity_reward/std": 0.3852139413356781, "rewards/sql_execution_reward_func": 0.6845779418945312, "rewards/sql_execution_reward_func/std": 1.1368353366851807, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.01887052341597796, "grad_norm": 0.37224079476635247, "kl": 14.3125, "learning_rate": 9.816912885430258e-07, "loss": 0.0644, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 808.75, "epoch": 0.01893939393939394, "grad_norm": 0.09768466216417614, "kl": 3.609375, "learning_rate": 9.813944526332158e-07, "loss": -0.1999, "reward": 3.8548834323883057, "reward_std": 0.8441895246505737, "rewards/accuracy_reward": 0.71875, "rewards/accuracy_reward/std": 0.38816189765930176, "rewards/format_reward_func": 0.8689177632331848, "rewards/format_reward_func/std": 0.0829489603638649, "rewards/ngram_similarity_reward": 0.24200202524662018, "rewards/ngram_similarity_reward/std": 0.16438433527946472, "rewards/sql_execution_reward_func": 0.20705357193946838, "rewards/sql_execution_reward_func/std": 0.10714791715145111, "rewards/xml_reward_func": 0.9784091114997864, "rewards/xml_reward_func/std": 0.05249106511473656, "step": 275 }, { "clip_ratio/high_max": 0.00015455950051546097, "clip_ratio/high_mean": 0.00015455950051546097, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015455950051546097, "epoch": 0.019008264462809916, "grad_norm": 0.09666253462909943, "kl": 3.6875, "learning_rate": 9.8109528054197e-07, "loss": -0.2001, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 213.0, "epoch": 0.019077134986225895, "grad_norm": 0.053817978145184575, "kl": 0.006317138671875, "learning_rate": 9.807937738894303e-07, "loss": 0.0338, "reward": 4.50139045715332, "reward_std": 0.24483630061149597, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.75, "rewards/format_reward_func/std": 0.09258200973272324, "rewards/ngram_similarity_reward": 0.26759400963783264, "rewards/ngram_similarity_reward/std": 0.122084841132164, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.019146005509641875, "grad_norm": 0.0539195375901155, "kl": 0.00628662109375, "learning_rate": 9.804899343083813e-07, "loss": 0.0337, "step": 278 }, { "clip_ratio/high_max": 0.00043936731526628137, "clip_ratio/high_mean": 0.00043936731526628137, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00043936731526628137, "completion_length": 284.5, "epoch": 0.01921487603305785, "grad_norm": 0.06990143302757458, "kl": 0.01507568359375, "learning_rate": 9.801837634442417e-07, "loss": -0.0313, "reward": 4.705854892730713, "reward_std": 0.4992641508579254, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7833333611488342, "rewards/format_reward_func/std": 0.077664315700531, "rewards/ngram_similarity_reward": 0.39418119192123413, "rewards/ngram_similarity_reward/std": 0.3067473769187927, "rewards/sql_execution_reward_func": 0.33125001192092896, "rewards/sql_execution_reward_func/std": 0.0530330091714859, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 279 }, { "clip_ratio/high_max": 0.00043936731526628137, "clip_ratio/high_mean": 0.00043936731526628137, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00043936731526628137, "epoch": 0.01928374655647383, "grad_norm": 0.06967400951389334, "kl": 0.01519775390625, "learning_rate": 9.798752629550546e-07, "loss": -0.0314, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 1281.5, "epoch": 0.019352617079889806, "grad_norm": 0.366535947353781, "kl": 1.4453125, "learning_rate": 9.795644345114794e-07, "loss": 0.3969, "reward": 2.879608154296875, "reward_std": 1.6010526418685913, "rewards/accuracy_reward": 0.40937501192092896, "rewards/accuracy_reward/std": 0.4960446357727051, "rewards/format_reward_func": 0.7082750797271729, "rewards/format_reward_func/std": 0.263462096452713, "rewards/ngram_similarity_reward": 0.17306822538375854, "rewards/ngram_similarity_reward/std": 0.13373208045959473, "rewards/sql_execution_reward_func": 0.23000000417232513, "rewards/sql_execution_reward_func/std": 0.25961509346961975, "rewards/xml_reward_func": 0.8629807829856873, "rewards/xml_reward_func/std": 0.35031646490097046, "step": 281 }, { "clip_ratio/high_max": 9.75419461610727e-05, "clip_ratio/high_mean": 9.75419461610727e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 9.75419461610727e-05, "epoch": 0.019421487603305785, "grad_norm": 0.3185536011081735, "kl": 1.5078125, "learning_rate": 9.79251279796782e-07, "loss": 0.3962, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 228.75, "epoch": 0.019490358126721764, "grad_norm": 0.03529079430842732, "kl": 0.2392578125, "learning_rate": 9.78935800506826e-07, "loss": 0.0028, "reward": 4.413466930389404, "reward_std": 0.20829373598098755, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.19231119751930237, "rewards/ngram_similarity_reward/std": 0.1225782036781311, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 283 }, { "clip_ratio/high_max": 0.0005464481073431671, "clip_ratio/high_mean": 0.0005464481073431671, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005464481073431671, "epoch": 0.01955922865013774, "grad_norm": 0.03463219360483411, "kl": 0.255859375, "learning_rate": 9.786179983500642e-07, "loss": 0.0028, "step": 284 }, { "clip_ratio/high_max": 0.0006635700119659305, "clip_ratio/high_mean": 0.0006635700119659305, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006635700119659305, "completion_length": 376.75, "epoch": 0.01962809917355372, "grad_norm": 0.11211755368362075, "kl": 0.359375, "learning_rate": 9.782978750475281e-07, "loss": 0.1889, "reward": 4.046504020690918, "reward_std": 0.6567471027374268, "rewards/accuracy_reward": 0.71875, "rewards/accuracy_reward/std": 0.38816189765930176, "rewards/format_reward_func": 0.815000057220459, "rewards/format_reward_func/std": 0.050615258514881134, "rewards/ngram_similarity_reward": 0.2654470205307007, "rewards/ngram_similarity_reward/std": 0.15136471390724182, "rewards/sql_execution_reward_func": 0.3958333134651184, "rewards/sql_execution_reward_func/std": 0.1314600557088852, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.019696969696969695, "grad_norm": 0.11102718414735378, "kl": 0.365234375, "learning_rate": 9.779754323328192e-07, "loss": 0.1888, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 256.5, "epoch": 0.019765840220385675, "grad_norm": 0.06595681904205325, "kl": 0.1806640625, "learning_rate": 9.776506719520994e-07, "loss": -0.0255, "reward": 3.1194748878479004, "reward_std": 0.46252527832984924, "rewards/accuracy_reward": 0.17499999701976776, "rewards/accuracy_reward/std": 0.10350984334945679, "rewards/format_reward_func": 0.8083333969116211, "rewards/format_reward_func/std": 0.0235702246427536, "rewards/ngram_similarity_reward": 0.40326112508773804, "rewards/ngram_similarity_reward/std": 0.25340536236763, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 287 }, { "clip_ratio/high_max": 0.000487329438328743, "clip_ratio/high_mean": 0.000487329438328743, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000487329438328743, "epoch": 0.019834710743801654, "grad_norm": 0.06785289174147488, "kl": 0.1708984375, "learning_rate": 9.773235956640826e-07, "loss": -0.0256, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00020781380590051413, "clip_ratio/low_min": 0.00020781380590051413, "clip_ratio/region_mean": 0.00020781380590051413, "completion_length": 601.5, "epoch": 0.01990358126721763, "grad_norm": 0.11845967024651463, "kl": 3.578125, "learning_rate": 9.769942052400235e-07, "loss": 0.3727, "reward": 3.4547295570373535, "reward_std": 0.7407901883125305, "rewards/accuracy_reward": 0.5625, "rewards/accuracy_reward/std": 0.3666396141052246, "rewards/format_reward_func": 0.7333333492279053, "rewards/format_reward_func/std": 0.16714218258857727, "rewards/ngram_similarity_reward": 0.2140655815601349, "rewards/ngram_similarity_reward/std": 0.11735667288303375, "rewards/sql_execution_reward_func": 0.3169642686843872, "rewards/sql_execution_reward_func/std": 0.12145482003688812, "rewards/xml_reward_func": 0.9583333730697632, "rewards/xml_reward_func/std": 0.117851123213768, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.01997245179063361, "grad_norm": 0.11559873779881855, "kl": 3.515625, "learning_rate": 9.766625024637086e-07, "loss": 0.3724, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0006322111585177481, "clip_ratio/low_min": 0.0006322111585177481, "clip_ratio/region_mean": 0.0006322111585177481, "completion_length": 790.875, "epoch": 0.020041322314049585, "grad_norm": 0.1666659144099162, "kl": 3.328125, "learning_rate": 9.76328489131448e-07, "loss": 1.2105, "reward": 2.487016201019287, "reward_std": 1.2102782726287842, "rewards/accuracy_reward": 0.16249999403953552, "rewards/accuracy_reward/std": 0.12100767344236374, "rewards/format_reward_func": 0.637499988079071, "rewards/format_reward_func/std": 0.2386719286441803, "rewards/ngram_similarity_reward": 0.2823164761066437, "rewards/ngram_similarity_reward/std": 0.3131355941295624, "rewards/sql_execution_reward_func": 0.29374998807907104, "rewards/sql_execution_reward_func/std": 0.13211873173713684, "rewards/xml_reward_func": 0.8072916269302368, "rewards/xml_reward_func/std": 0.35280725359916687, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0007902639335952699, "clip_ratio/low_min": 0.0007902639335952699, "clip_ratio/region_mean": 0.0007902639335952699, "epoch": 0.020110192837465565, "grad_norm": 0.16408125819515512, "kl": 3.328125, "learning_rate": 9.759921670520634e-07, "loss": 1.2103, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 207.75, "epoch": 0.020179063360881544, "grad_norm": 0.15232944285858455, "kl": 0.173828125, "learning_rate": 9.75653538046879e-07, "loss": 0.0929, "reward": 4.200943470001221, "reward_std": 0.8406234979629517, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/std": 0.26726123690605164, "rewards/format_reward_func": 0.8166667222976685, "rewards/format_reward_func/std": 0.030860668048262596, "rewards/ngram_similarity_reward": 0.3811846375465393, "rewards/ngram_similarity_reward/std": 0.20364835858345032, "rewards/sql_execution_reward_func": 0.3125, "rewards/sql_execution_reward_func/std": 0.06943650543689728, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02024793388429752, "grad_norm": 0.15170664330122455, "kl": 0.1728515625, "learning_rate": 9.753126039497133e-07, "loss": 0.0929, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 246.75, "epoch": 0.0203168044077135, "grad_norm": 0.07320600602110572, "kl": 0.55078125, "learning_rate": 9.749693666068663e-07, "loss": -0.0752, "reward": 4.819070339202881, "reward_std": 0.47666218876838684, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8208333253860474, "rewards/format_reward_func/std": 0.03959115594625473, "rewards/ngram_similarity_reward": 0.4363248348236084, "rewards/ngram_similarity_reward/std": 0.33983099460601807, "rewards/sql_execution_reward_func": 0.34375, "rewards/sql_execution_reward_func/std": 0.06781013309955597, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.020385674931129475, "grad_norm": 0.07450673779820566, "kl": 0.53515625, "learning_rate": 9.746238278771125e-07, "loss": -0.0754, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 585.5, "epoch": 0.020454545454545454, "grad_norm": 0.22260106421580023, "kl": 2.859375, "learning_rate": 9.742759896316883e-07, "loss": 0.6453, "reward": 4.373322486877441, "reward_std": 0.8700640201568604, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.817307710647583, "rewards/format_reward_func/std": 0.06054990366101265, "rewards/ngram_similarity_reward": 0.3572149872779846, "rewards/ngram_similarity_reward/std": 0.21708711981773376, "rewards/sql_execution_reward_func": 0.2798076868057251, "rewards/sql_execution_reward_func/std": 0.10413573682308197, "rewards/xml_reward_func": 0.990384578704834, "rewards/xml_reward_func/std": 0.027196412906050682, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.020523415977961434, "grad_norm": 0.2081727080059269, "kl": 2.9375, "learning_rate": 9.739258537542835e-07, "loss": 0.6455, "step": 298 }, { "clip_ratio/high_max": 0.0011709601385518909, "clip_ratio/high_mean": 0.0011709601385518909, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011709601385518909, "completion_length": 106.75, "epoch": 0.02059228650137741, "grad_norm": 0.10352325686230922, "kl": 0.248046875, "learning_rate": 9.735734221410302e-07, "loss": -0.0484, "reward": 5.276041507720947, "reward_std": 0.364528089761734, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7250000238418579, "rewards/format_reward_func/std": 0.1035098284482956, "rewards/ngram_similarity_reward": 0.800694465637207, "rewards/ngram_similarity_reward/std": 0.2141425460577011, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02066115702479339, "grad_norm": 0.10274186927845054, "kl": 0.248046875, "learning_rate": 9.73218696700493e-07, "loss": -0.0483, "step": 300 }, { "clip_ratio/high_max": 0.0004597701190505177, "clip_ratio/high_mean": 0.0004597701190505177, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004597701190505177, "completion_length": 271.875, "epoch": 0.020730027548209365, "grad_norm": 0.16956996396187504, "kl": 0.0155029296875, "learning_rate": 9.728616793536587e-07, "loss": 0.156, "reward": 4.037558555603027, "reward_std": 0.9323105812072754, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.75, "rewards/format_reward_func/std": 0.09258200973272324, "rewards/ngram_similarity_reward": 0.1542058289051056, "rewards/ngram_similarity_reward/std": 0.08503308892250061, "rewards/sql_execution_reward_func": 0.3062499761581421, "rewards/sql_execution_reward_func/std": 0.1237436905503273, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.020798898071625344, "grad_norm": 0.1709022982606798, "kl": 0.0155029296875, "learning_rate": 9.725023720339255e-07, "loss": 0.1564, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 509.75, "epoch": 0.020867768595041324, "grad_norm": 0.188113349023809, "kl": 1.15625, "learning_rate": 9.72140776687093e-07, "loss": 0.2582, "reward": 3.821199893951416, "reward_std": 1.2219619750976562, "rewards/accuracy_reward": 0.659375011920929, "rewards/accuracy_reward/std": 0.4758334159851074, "rewards/format_reward_func": 0.6941666603088379, "rewards/format_reward_func/std": 0.17150823771953583, "rewards/ngram_similarity_reward": 0.31501299142837524, "rewards/ngram_similarity_reward/std": 0.3074829876422882, "rewards/sql_execution_reward_func": 0.3843749761581421, "rewards/sql_execution_reward_func/std": 0.13020415604114532, "rewards/xml_reward_func": 0.9513888955116272, "rewards/xml_reward_func/std": 0.09122669696807861, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0002452182525303215, "clip_ratio/low_min": 0.0002452182525303215, "clip_ratio/region_mean": 0.0002452182525303215, "epoch": 0.0209366391184573, "grad_norm": 0.1873469110954264, "kl": 1.1796875, "learning_rate": 9.717768952713511e-07, "loss": 0.2581, "step": 304 }, { "clip_ratio/high_max": 0.00032195751555263996, "clip_ratio/high_mean": 0.00032195751555263996, "clip_ratio/low_mean": 0.00032195751555263996, "clip_ratio/low_min": 0.00032195751555263996, "clip_ratio/region_mean": 0.0006439150311052799, "completion_length": 776.5, "epoch": 0.02100550964187328, "grad_norm": 0.21426724450708798, "kl": 0.55859375, "learning_rate": 9.7141072975727e-07, "loss": 0.254, "reward": 3.4095072746276855, "reward_std": 1.3546323776245117, "rewards/accuracy_reward": 0.53125, "rewards/accuracy_reward/std": 0.5077524185180664, "rewards/format_reward_func": 0.7977564334869385, "rewards/format_reward_func/std": 0.09526634961366653, "rewards/ngram_similarity_reward": 0.19917990267276764, "rewards/ngram_similarity_reward/std": 0.24220845103263855, "rewards/sql_execution_reward_func": 0.26730769872665405, "rewards/sql_execution_reward_func/std": 0.11456208676099777, "rewards/xml_reward_func": 0.9831730723381042, "rewards/xml_reward_func/std": 0.04759372025728226, "step": 305 }, { "clip_ratio/high_max": 0.00032195751555263996, "clip_ratio/high_mean": 0.00032195751555263996, "clip_ratio/low_mean": 0.00016097875777631998, "clip_ratio/low_min": 0.00016097875777631998, "clip_ratio/region_mean": 0.0004829362442251295, "epoch": 0.021074380165289255, "grad_norm": 0.18408433406169067, "kl": 0.55859375, "learning_rate": 9.71042282127789e-07, "loss": 0.2535, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 353.75, "epoch": 0.021143250688705234, "grad_norm": 0.1929294470406731, "kl": 0.74609375, "learning_rate": 9.706715543782064e-07, "loss": -0.0664, "reward": 4.2974419593811035, "reward_std": 1.3654481172561646, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward_func": 0.8333333730697632, "rewards/format_reward_func/std": 0.07126966118812561, "rewards/ngram_similarity_reward": 0.4219057857990265, "rewards/ngram_similarity_reward/std": 0.35618212819099426, "rewards/sql_execution_reward_func": 0.33125001192092896, "rewards/sql_execution_reward_func/std": 0.0530330091714859, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 307 }, { "clip_ratio/high_max": 0.0003533568815328181, "clip_ratio/high_mean": 0.0003533568815328181, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003533568815328181, "epoch": 0.021212121212121213, "grad_norm": 0.19486566086029505, "kl": 0.74609375, "learning_rate": 9.70298548516168e-07, "loss": -0.0662, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0001417434395989403, "clip_ratio/low_min": 0.0001417434395989403, "clip_ratio/region_mean": 0.0001417434395989403, "completion_length": 881.875, "epoch": 0.02128099173553719, "grad_norm": 0.043109306429845576, "kl": 1.1171875, "learning_rate": 9.699232665616563e-07, "loss": 0.1874, "reward": 2.5363893508911133, "reward_std": 0.3066645562648773, "rewards/accuracy_reward": 0.16249999403953552, "rewards/accuracy_reward/std": 0.12174328416585922, "rewards/format_reward_func": 0.7481944561004639, "rewards/format_reward_func/std": 0.14728456735610962, "rewards/ngram_similarity_reward": 0.17671330273151398, "rewards/ngram_similarity_reward/std": 0.08643222600221634, "rewards/sql_execution_reward_func": 0.19812500476837158, "rewards/sql_execution_reward_func/std": 0.12603677809238434, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0001417434395989403, "clip_ratio/low_min": 0.0001417434395989403, "clip_ratio/region_mean": 0.0001417434395989403, "epoch": 0.02134986225895317, "grad_norm": 0.043842307705139485, "kl": 1.125, "learning_rate": 9.695457105469804e-07, "loss": 0.1874, "step": 310 }, { "clip_ratio/high_max": 0.00047281323350034654, "clip_ratio/high_mean": 0.00047281323350034654, "clip_ratio/low_mean": 0.0003152088320348412, "clip_ratio/low_min": 0.0003152088320348412, "clip_ratio/region_mean": 0.0007880220655351877, "completion_length": 793.125, "epoch": 0.021418732782369145, "grad_norm": 2.7497975361737494, "kl": 133.0, "learning_rate": 9.69165882516764e-07, "loss": 1.205, "reward": 2.455181121826172, "reward_std": 1.1181678771972656, "rewards/accuracy_reward": 0.28125, "rewards/accuracy_reward/std": 0.31160587072372437, "rewards/format_reward_func": 0.6833333969116211, "rewards/format_reward_func/std": 0.22182504832744598, "rewards/ngram_similarity_reward": 0.11456508934497833, "rewards/ngram_similarity_reward/std": 0.05233549326658249, "rewards/sql_execution_reward_func": 0.16249999403953552, "rewards/sql_execution_reward_func/std": 0.1157275140285492, "rewards/xml_reward_func": 0.875, "rewards/xml_reward_func/std": 0.3535533845424652, "step": 311 }, { "clip_ratio/high_max": 0.0001576044160174206, "clip_ratio/high_mean": 0.0001576044160174206, "clip_ratio/low_mean": 0.0003152088320348412, "clip_ratio/low_min": 0.0003152088320348412, "clip_ratio/region_mean": 0.00047281323350034654, "epoch": 0.021487603305785124, "grad_norm": 2.7517787506895983, "kl": 133.0, "learning_rate": 9.687837845279347e-07, "loss": 1.2056, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0001049428028636612, "clip_ratio/low_min": 0.0001049428028636612, "clip_ratio/region_mean": 0.0001049428028636612, "completion_length": 1191.125, "epoch": 0.021556473829201103, "grad_norm": 0.08743899460383785, "kl": 0.12353515625, "learning_rate": 9.683994186497132e-07, "loss": 0.3593, "reward": 2.660045623779297, "reward_std": 1.03121018409729, "rewards/accuracy_reward": 0.3187499940395355, "rewards/accuracy_reward/std": 0.4333733916282654, "rewards/format_reward_func": 0.7281798124313354, "rewards/format_reward_func/std": 0.14829671382904053, "rewards/ngram_similarity_reward": 0.11201776564121246, "rewards/ngram_similarity_reward/std": 0.1155858039855957, "rewards/sql_execution_reward_func": 0.14196428656578064, "rewards/sql_execution_reward_func/std": 0.12699931859970093, "rewards/xml_reward_func": 0.984375, "rewards/xml_reward_func/std": 0.04419417306780815, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0001049428028636612, "clip_ratio/low_min": 0.0001049428028636612, "clip_ratio/region_mean": 0.0001049428028636612, "epoch": 0.02162534435261708, "grad_norm": 0.08907869159149241, "kl": 0.12353515625, "learning_rate": 9.68012786963601e-07, "loss": 0.3596, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 246.25, "epoch": 0.02169421487603306, "grad_norm": 0.06016493502773692, "kl": 0.703125, "learning_rate": 9.67623891563371e-07, "loss": 0.0734, "reward": 3.9316442012786865, "reward_std": 0.29109570384025574, "rewards/accuracy_reward": 0.5, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8083333373069763, "rewards/format_reward_func/std": 0.15710677206516266, "rewards/ngram_similarity_reward": 0.49887385964393616, "rewards/ngram_similarity_reward/std": 0.2202184796333313, "rewards/sql_execution_reward_func": 0.375, "rewards/sql_execution_reward_func/std": 0.0267261303961277, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.021763085399449034, "grad_norm": 0.059380912508235635, "kl": 0.7265625, "learning_rate": 9.672327345550543e-07, "loss": 0.0735, "step": 316 }, { "clip_ratio/high_max": 0.000487329438328743, "clip_ratio/high_mean": 0.000487329438328743, "clip_ratio/low_mean": 0.000487329438328743, "clip_ratio/low_min": 0.000487329438328743, "clip_ratio/region_mean": 0.000974658876657486, "completion_length": 256.5, "epoch": 0.021831955922865014, "grad_norm": 0.11680898918214606, "kl": 0.01324462890625, "learning_rate": 9.668393180569296e-07, "loss": 0.0061, "reward": 4.515913963317871, "reward_std": 0.5775894522666931, "rewards/accuracy_reward": 0.90625, "rewards/accuracy_reward/std": 0.2651650309562683, "rewards/format_reward_func": 0.7833333015441895, "rewards/format_reward_func/std": 0.077664315700531, "rewards/ngram_similarity_reward": 0.3925539255142212, "rewards/ngram_similarity_reward/std": 0.2424614578485489, "rewards/sql_execution_reward_func": 0.33125001192092896, "rewards/sql_execution_reward_func/std": 0.0530330091714859, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.000487329438328743, "clip_ratio/high_mean": 0.000487329438328743, "clip_ratio/low_mean": 0.000974658876657486, "clip_ratio/low_min": 0.000974658876657486, "clip_ratio/region_mean": 0.001461988314986229, "epoch": 0.021900826446280993, "grad_norm": 0.11839719211517827, "kl": 0.01251220703125, "learning_rate": 9.66443644199512e-07, "loss": 0.0062, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 225.125, "epoch": 0.02196969696969697, "grad_norm": 0.1044638346068409, "kl": 1.0390625, "learning_rate": 9.66045715125541e-07, "loss": -0.0605, "reward": 3.3308491706848145, "reward_std": 0.5633618235588074, "rewards/accuracy_reward": 0.34375, "rewards/accuracy_reward/std": 0.2651650309562683, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.3455662131309509, "rewards/ngram_similarity_reward/std": 0.15105798840522766, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02203856749311295, "grad_norm": 0.10565186824755948, "kl": 0.9765625, "learning_rate": 9.656455329899686e-07, "loss": -0.0609, "step": 320 }, { "clip_ratio/high_max": 5.4045289289206266e-05, "clip_ratio/high_mean": 5.4045289289206266e-05, "clip_ratio/low_mean": 5.4045289289206266e-05, "clip_ratio/low_min": 5.4045289289206266e-05, "clip_ratio/region_mean": 0.00010809057857841253, "completion_length": 2312.875, "epoch": 0.022107438016528924, "grad_norm": 0.09760953249381277, "kl": 0.28125, "learning_rate": 9.65243099959949e-07, "loss": 0.4766, "reward": 1.688037633895874, "reward_std": 1.1651062965393066, "rewards/accuracy_reward": 0.06562499701976776, "rewards/accuracy_reward/std": 0.11412516981363297, "rewards/format_reward_func": 0.551317572593689, "rewards/format_reward_func/std": 0.27137237787246704, "rewards/ngram_similarity_reward": 0.011440116912126541, "rewards/ngram_similarity_reward/std": 0.0180190522223711, "rewards/sql_execution_reward_func": 0.33207419514656067, "rewards/sql_execution_reward_func/std": 0.4278336763381958, "rewards/xml_reward_func": 0.6562356948852539, "rewards/xml_reward_func/std": 0.4604273736476898, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0001621358678676188, "clip_ratio/low_min": 0.0001621358678676188, "clip_ratio/region_mean": 0.0001621358678676188, "epoch": 0.022176308539944904, "grad_norm": 0.09509095024237005, "kl": 0.28515625, "learning_rate": 9.648384182148252e-07, "loss": 0.4766, "step": 322 }, { "clip_ratio/high_max": 0.00021896211546845734, "clip_ratio/high_mean": 0.00021896211546845734, "clip_ratio/low_mean": 0.00021896211546845734, "clip_ratio/low_min": 0.00021896211546845734, "clip_ratio/region_mean": 0.0004379242309369147, "completion_length": 570.875, "epoch": 0.022245179063360883, "grad_norm": 0.044572749473926254, "kl": 0.83984375, "learning_rate": 9.64431489946118e-07, "loss": 0.0332, "reward": 2.8154218196868896, "reward_std": 0.263384073972702, "rewards/accuracy_reward": 0.24895831942558289, "rewards/accuracy_reward/std": 0.09133463352918625, "rewards/format_reward_func": 0.8233333230018616, "rewards/format_reward_func/std": 0.11618266999721527, "rewards/ngram_similarity_reward": 0.1263228952884674, "rewards/ngram_similarity_reward/std": 0.10620111227035522, "rewards/sql_execution_reward_func": 0.3203125, "rewards/sql_execution_reward_func/std": 0.10435548424720764, "rewards/xml_reward_func": 0.984375, "rewards/xml_reward_func/std": 0.04419417306780815, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02231404958677686, "grad_norm": 0.04160665914681532, "kl": 0.71484375, "learning_rate": 9.640223173575146e-07, "loss": 0.0329, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 391.0, "epoch": 0.022382920110192838, "grad_norm": 0.18927743366864128, "kl": 0.9453125, "learning_rate": 9.636109026648554e-07, "loss": 0.5646, "reward": 3.9817495346069336, "reward_std": 0.8420621156692505, "rewards/accuracy_reward": 0.71875, "rewards/accuracy_reward/std": 0.38816189765930176, "rewards/format_reward_func": 0.7371212244033813, "rewards/format_reward_func/std": 0.13070760667324066, "rewards/ngram_similarity_reward": 0.2551310062408447, "rewards/ngram_similarity_reward/std": 0.22724129259586334, "rewards/sql_execution_reward_func": 0.44999998807907104, "rewards/sql_execution_reward_func/std": 0.16475090384483337, "rewards/xml_reward_func": 0.9744318127632141, "rewards/xml_reward_func/std": 0.07231773436069489, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.022451790633608814, "grad_norm": 0.18904583595314628, "kl": 0.921875, "learning_rate": 9.631972480961233e-07, "loss": 0.5642, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 282.125, "epoch": 0.022520661157024793, "grad_norm": 0.09022119407958597, "kl": 1.515625, "learning_rate": 9.627813558914305e-07, "loss": 0.0038, "reward": 4.593855857849121, "reward_std": 0.5141752362251282, "rewards/accuracy_reward": 0.953125, "rewards/accuracy_reward/std": 0.13258251547813416, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.3750706911087036, "rewards/ngram_similarity_reward/std": 0.3809676468372345, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0004430660046637058, "clip_ratio/high_mean": 0.0004430660046637058, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004430660046637058, "epoch": 0.022589531680440773, "grad_norm": 0.0843159776418538, "kl": 1.4921875, "learning_rate": 9.623632283030077e-07, "loss": 0.0038, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0003490401431918144, "clip_ratio/low_min": 0.0003490401431918144, "clip_ratio/region_mean": 0.0003490401431918144, "completion_length": 1074.375, "epoch": 0.02265840220385675, "grad_norm": 0.7477747236914997, "kl": 8.0625, "learning_rate": 9.619428675951906e-07, "loss": 1.0198, "reward": 2.697984218597412, "reward_std": 1.4204814434051514, "rewards/accuracy_reward": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward_func": 0.7457863092422485, "rewards/format_reward_func/std": 0.2316998541355133, "rewards/ngram_similarity_reward": 0.08548162877559662, "rewards/ngram_similarity_reward/std": 0.057873088866472244, "rewards/sql_execution_reward_func": 0.23375000059604645, "rewards/sql_execution_reward_func/std": 0.17062385380268097, "rewards/xml_reward_func": 0.8402255773544312, "rewards/xml_reward_func/std": 0.31797489523887634, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00023269343364518136, "clip_ratio/low_min": 0.00023269343364518136, "clip_ratio/region_mean": 0.00023269343364518136, "epoch": 0.022727272727272728, "grad_norm": 0.6117073371946714, "kl": 7.5625, "learning_rate": 9.615202760444081e-07, "loss": 1.0189, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 278.75, "epoch": 0.022796143250688704, "grad_norm": 0.16243231552966778, "kl": 3.453125, "learning_rate": 9.610954559391704e-07, "loss": 0.1586, "reward": 3.506500720977783, "reward_std": 0.7975364923477173, "rewards/accuracy_reward": 0.546875, "rewards/accuracy_reward/std": 0.3775951862335205, "rewards/format_reward_func": 0.7833333015441895, "rewards/format_reward_func/std": 0.077664315700531, "rewards/ngram_similarity_reward": 0.198778435587883, "rewards/ngram_similarity_reward/std": 0.07498889416456223, "rewards/sql_execution_reward_func": 0.33125001192092896, "rewards/sql_execution_reward_func/std": 0.0530330091714859, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.022865013774104683, "grad_norm": 0.15128602046381495, "kl": 3.453125, "learning_rate": 9.606684095800558e-07, "loss": 0.1585, "step": 332 }, { "clip_ratio/high_max": 0.0006666666595265269, "clip_ratio/high_mean": 0.0006666666595265269, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006666666595265269, "completion_length": 187.5, "epoch": 0.022933884297520662, "grad_norm": 0.03360208106671057, "kl": 0.92578125, "learning_rate": 9.602391392796997e-07, "loss": -0.0163, "reward": 5.615475654602051, "reward_std": 0.07315368950366974, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7654762268066406, "rewards/format_reward_func/std": 0.08889596909284592, "rewards/ngram_similarity_reward": 1.0, "rewards/ngram_similarity_reward/std": 0.0, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.08017837256193161, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 333 }, { "clip_ratio/high_max": 0.0006666666595265269, "clip_ratio/high_mean": 0.0006666666595265269, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006666666595265269, "epoch": 0.02300275482093664, "grad_norm": 0.03391788579439823, "kl": 0.92578125, "learning_rate": 9.598076473627796e-07, "loss": -0.0163, "step": 334 }, { "clip_ratio/high_max": 6.0452181060099974e-05, "clip_ratio/high_mean": 6.0452181060099974e-05, "clip_ratio/low_mean": 6.0452181060099974e-05, "clip_ratio/low_min": 6.0452181060099974e-05, "clip_ratio/region_mean": 0.00012090436212019995, "completion_length": 2067.75, "epoch": 0.023071625344352618, "grad_norm": 0.0714615734157065, "kl": 0.69921875, "learning_rate": 9.59373936166005e-07, "loss": 0.3213, "reward": 1.5725713968276978, "reward_std": 0.8416710495948792, "rewards/accuracy_reward": 0.06875000149011612, "rewards/accuracy_reward/std": 0.11319231986999512, "rewards/format_reward_func": 0.5, "rewards/format_reward_func/std": 0.1885618269443512, "rewards/ngram_similarity_reward": 0.047634243965148926, "rewards/ngram_similarity_reward/std": 0.06826825439929962, "rewards/sql_execution_reward_func": 0.2252272665500641, "rewards/sql_execution_reward_func/std": 0.24166716635227203, "rewards/xml_reward_func": 0.6383928656578064, "rewards/xml_reward_func/std": 0.38486185669898987, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.023140495867768594, "grad_norm": 0.06348466383641654, "kl": 0.67578125, "learning_rate": 9.589380080381038e-07, "loss": 0.3211, "step": 336 }, { "clip_ratio/high_max": 0.0003302510012872517, "clip_ratio/high_mean": 0.0003302510012872517, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003302510012872517, "completion_length": 1135.5, "epoch": 0.023209366391184573, "grad_norm": 0.07651693363906689, "kl": 2.21875, "learning_rate": 9.58499865339809e-07, "loss": 0.0147, "reward": 4.060192108154297, "reward_std": 0.8464688062667847, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.8195726275444031, "rewards/format_reward_func/std": 0.0831306129693985, "rewards/ngram_similarity_reward": 0.2036893367767334, "rewards/ngram_similarity_reward/std": 0.2072262316942215, "rewards/sql_execution_reward_func": 0.2033143937587738, "rewards/sql_execution_reward_func/std": 0.09972408413887024, "rewards/xml_reward_func": 0.9817708730697632, "rewards/xml_reward_func/std": 0.043753545731306076, "step": 337 }, { "clip_ratio/high_max": 0.00022016733419150114, "clip_ratio/high_mean": 0.00022016733419150114, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022016733419150114, "epoch": 0.023278236914600552, "grad_norm": 0.07575569434675095, "kl": 2.21875, "learning_rate": 9.580595104438462e-07, "loss": 0.0146, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 160.125, "epoch": 0.023347107438016528, "grad_norm": 0.08670231810956475, "kl": 2.28125, "learning_rate": 9.576169457349219e-07, "loss": -0.011, "reward": 4.602577209472656, "reward_std": 0.2995913028717041, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.0, "rewards/ngram_similarity_reward": 0.30171817541122437, "rewards/ngram_similarity_reward/std": 0.19972752034664154, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.023415977961432508, "grad_norm": 0.0851888207572447, "kl": 2.140625, "learning_rate": 9.571721736097088e-07, "loss": -0.0111, "step": 340 }, { "clip_ratio/high_max": 6.423845479730517e-05, "clip_ratio/high_mean": 6.423845479730517e-05, "clip_ratio/low_mean": 0.0001927153643919155, "clip_ratio/low_min": 0.0001927153643919155, "clip_ratio/region_mean": 0.00025695381918922067, "completion_length": 1945.875, "epoch": 0.023484848484848483, "grad_norm": 0.09581788406585201, "kl": 0.32421875, "learning_rate": 9.567251964768342e-07, "loss": 0.4503, "reward": 1.9446253776550293, "reward_std": 1.2536516189575195, "rewards/accuracy_reward": 0.16875000298023224, "rewards/accuracy_reward/std": 0.3463457226753235, "rewards/format_reward_func": 0.5920833349227905, "rewards/format_reward_func/std": 0.24655041098594666, "rewards/ngram_similarity_reward": 0.10502804815769196, "rewards/ngram_similarity_reward/std": 0.12203266471624374, "rewards/sql_execution_reward_func": 0.08249999582767487, "rewards/sql_execution_reward_func/std": 0.13101252913475037, "rewards/xml_reward_func": 0.7749999761581421, "rewards/xml_reward_func/std": 0.4070802330970764, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.023553719008264463, "grad_norm": 0.10044000518804955, "kl": 0.32421875, "learning_rate": 9.562760167568664e-07, "loss": 0.4505, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 622.75, "epoch": 0.023622589531680442, "grad_norm": 1.4988785838141658, "kl": 3.953125, "learning_rate": 9.55824636882301e-07, "loss": 1.4597, "reward": 4.103126049041748, "reward_std": 1.3231440782546997, "rewards/accuracy_reward": 0.8020833134651184, "rewards/accuracy_reward/std": 0.3830161988735199, "rewards/format_reward_func": 0.7482143044471741, "rewards/format_reward_func/std": 0.1901261806488037, "rewards/ngram_similarity_reward": 0.30754032731056213, "rewards/ngram_similarity_reward/std": 0.21356160938739777, "rewards/sql_execution_reward_func": 0.40625, "rewards/sql_execution_reward_func/std": 0.1237436905503273, "rewards/xml_reward_func": 0.883184552192688, "rewards/xml_reward_func/std": 0.2832936644554138, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00020072259940207005, "clip_ratio/low_min": 0.00020072259940207005, "clip_ratio/region_mean": 0.00020072259940207005, "epoch": 0.023691460055096418, "grad_norm": 1.4062711274014508, "kl": 3.890625, "learning_rate": 9.553710592975495e-07, "loss": 1.4593, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0004161464748904109, "clip_ratio/low_min": 0.0004161464748904109, "clip_ratio/region_mean": 0.0004161464748904109, "completion_length": 300.375, "epoch": 0.023760330578512397, "grad_norm": 0.20672644036823945, "kl": 0.4609375, "learning_rate": 9.54915286458924e-07, "loss": 0.2182, "reward": 4.089803695678711, "reward_std": 0.736841082572937, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.8525000214576721, "rewards/format_reward_func/std": 0.04733366519212723, "rewards/ngram_similarity_reward": 0.11236937344074249, "rewards/ngram_similarity_reward/std": 0.06095876172184944, "rewards/sql_execution_reward_func": 0.3187499940395355, "rewards/sql_execution_reward_func/std": 0.10066741704940796, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0004161464748904109, "clip_ratio/high_mean": 0.0004161464748904109, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004161464748904109, "epoch": 0.023829201101928373, "grad_norm": 0.20294215532918822, "kl": 0.462890625, "learning_rate": 9.54457320834625e-07, "loss": 0.2182, "step": 346 }, { "clip_ratio/high_max": 0.0008022462716326118, "clip_ratio/high_mean": 0.0008022462716326118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008022462716326118, "completion_length": 311.625, "epoch": 0.023898071625344353, "grad_norm": 0.08624843708482818, "kl": 0.30859375, "learning_rate": 9.539971649047284e-07, "loss": -0.0396, "reward": 4.049992561340332, "reward_std": 0.5394740700721741, "rewards/accuracy_reward": 0.90625, "rewards/accuracy_reward/std": 0.2651650309562683, "rewards/format_reward_func": 0.8166666626930237, "rewards/format_reward_func/std": 0.11126972734928131, "rewards/ngram_similarity_reward": 0.07638402283191681, "rewards/ngram_similarity_reward/std": 0.023988937959074974, "rewards/sql_execution_reward_func": 0.3062499761581421, "rewards/sql_execution_reward_func/std": 0.06781013309955597, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0004011231358163059, "clip_ratio/high_mean": 0.0004011231358163059, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004011231358163059, "epoch": 0.023966942148760332, "grad_norm": 0.08537055514091033, "kl": 0.30859375, "learning_rate": 9.535348211611701e-07, "loss": -0.0399, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 276.875, "epoch": 0.024035812672176308, "grad_norm": 0.253871363499583, "kl": 0.2890625, "learning_rate": 9.530702921077358e-07, "loss": 0.5162, "reward": 4.037322998046875, "reward_std": 0.9758275151252747, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.75, "rewards/format_reward_func/std": 0.1414213627576828, "rewards/ngram_similarity_reward": 0.19154879450798035, "rewards/ngram_similarity_reward/std": 0.08005400747060776, "rewards/sql_execution_reward_func": 0.3125, "rewards/sql_execution_reward_func/std": 0.06943650543689728, "rewards/xml_reward_func": 0.9375, "rewards/xml_reward_func/std": 0.12400396168231964, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.024104683195592287, "grad_norm": 0.2595267528250474, "kl": 0.2734375, "learning_rate": 9.526035802600442e-07, "loss": 0.5171, "step": 350 }, { "clip_ratio/high_max": 0.00012140342005295679, "clip_ratio/high_mean": 0.00012140342005295679, "clip_ratio/low_mean": 0.0003642102819867432, "clip_ratio/low_min": 0.0003642102819867432, "clip_ratio/region_mean": 0.00048561368021182716, "completion_length": 1029.625, "epoch": 0.024173553719008263, "grad_norm": 0.5207479460497908, "kl": 0.94140625, "learning_rate": 9.521346881455354e-07, "loss": 1.2143, "reward": 2.549398899078369, "reward_std": 1.3494876623153687, "rewards/accuracy_reward": 0.1875, "rewards/accuracy_reward/std": 0.1157275140285492, "rewards/format_reward_func": 0.639386773109436, "rewards/format_reward_func/std": 0.283730149269104, "rewards/ngram_similarity_reward": 0.29260239005088806, "rewards/ngram_similarity_reward/std": 0.22540690004825592, "rewards/sql_execution_reward_func": 0.34375, "rewards/sql_execution_reward_func/std": 0.142521932721138, "rewards/xml_reward_func": 0.7523584961891174, "rewards/xml_reward_func/std": 0.4585707187652588, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0003642102819867432, "clip_ratio/low_min": 0.0003642102819867432, "clip_ratio/region_mean": 0.0003642102819867432, "epoch": 0.024242424242424242, "grad_norm": 0.5045815226521788, "kl": 0.9453125, "learning_rate": 9.516636183034564e-07, "loss": 1.2133, "step": 352 }, { "clip_ratio/high_max": 0.00016963528469204903, "clip_ratio/high_mean": 0.00016963528469204903, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016963528469204903, "completion_length": 736.875, "epoch": 0.024311294765840222, "grad_norm": 0.08685387551005407, "kl": 1.5234375, "learning_rate": 9.511903732848475e-07, "loss": 0.2358, "reward": 4.090473651885986, "reward_std": 0.7192865610122681, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.7892857193946838, "rewards/format_reward_func/std": 0.10524778068065643, "rewards/ngram_similarity_reward": 0.2146015465259552, "rewards/ngram_similarity_reward/std": 0.07470176368951797, "rewards/sql_execution_reward_func": 0.23375000059604645, "rewards/sql_execution_reward_func/std": 0.10322478413581848, "rewards/xml_reward_func": 0.9955357313156128, "rewards/xml_reward_func/std": 0.012626901268959045, "step": 353 }, { "clip_ratio/high_max": 0.00016963528469204903, "clip_ratio/high_mean": 0.00016963528469204903, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016963528469204903, "epoch": 0.024380165289256198, "grad_norm": 0.08671797403015574, "kl": 1.5, "learning_rate": 9.50714955652528e-07, "loss": 0.2353, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 206.0, "epoch": 0.024449035812672177, "grad_norm": 0.16795670781260705, "kl": 0.0311279296875, "learning_rate": 9.502373679810839e-07, "loss": 0.1664, "reward": 3.6245086193084717, "reward_std": 0.9226840138435364, "rewards/accuracy_reward": 0.625, "rewards/accuracy_reward/std": 0.40089187026023865, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.0, "rewards/ngram_similarity_reward": 0.1496725082397461, "rewards/ngram_similarity_reward/std": 0.12341858446598053, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.024517906336088153, "grad_norm": 0.17048437320223953, "kl": 0.03125, "learning_rate": 9.497576128568518e-07, "loss": 0.1657, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00017268174269702286, "clip_ratio/low_min": 0.00017268174269702286, "clip_ratio/region_mean": 0.00017268174269702286, "completion_length": 723.875, "epoch": 0.024586776859504132, "grad_norm": 0.6105309509801486, "kl": 1.8046875, "learning_rate": 9.492756928779066e-07, "loss": 2.1153, "reward": 3.932663679122925, "reward_std": 1.5167485475540161, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.7166666984558105, "rewards/format_reward_func/std": 0.22466908395290375, "rewards/ngram_similarity_reward": 0.2148313820362091, "rewards/ngram_similarity_reward/std": 0.1180344820022583, "rewards/sql_execution_reward_func": 0.26875001192092896, "rewards/sql_execution_reward_func/std": 0.12799972295761108, "rewards/xml_reward_func": 0.875, "rewards/xml_reward_func/std": 0.3535533845424652, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001554135698825121, "clip_ratio/low_min": 0.001554135698825121, "clip_ratio/region_mean": 0.001554135698825121, "epoch": 0.02465564738292011, "grad_norm": 0.30481056403059753, "kl": 2.09375, "learning_rate": 9.487916106540465e-07, "loss": 2.1146, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 134.875, "epoch": 0.024724517906336087, "grad_norm": 0.15985589489916077, "kl": 2.34375, "learning_rate": 9.483053688067795e-07, "loss": -0.0714, "reward": 4.076248645782471, "reward_std": 0.770804762840271, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.7000000476837158, "rewards/format_reward_func/std": 0.12848322093486786, "rewards/ngram_similarity_reward": 0.2369435727596283, "rewards/ngram_similarity_reward/std": 0.05708030238747597, "rewards/sql_execution_reward_func": 0.3125, "rewards/sql_execution_reward_func/std": 0.12747548520565033, "rewards/xml_reward_func": 0.9583333730697632, "rewards/xml_reward_func/std": 0.117851123213768, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.024793388429752067, "grad_norm": 0.15605492171854984, "kl": 2.40625, "learning_rate": 9.478169699693083e-07, "loss": -0.0714, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 406.0, "epoch": 0.024862258953168043, "grad_norm": 0.13906506610041114, "kl": 1.203125, "learning_rate": 9.473264167865171e-07, "loss": 0.0626, "reward": 3.092243194580078, "reward_std": 0.6647549867630005, "rewards/accuracy_reward": 0.0062500000931322575, "rewards/accuracy_reward/std": 0.011572751216590405, "rewards/format_reward_func": 0.8545454740524292, "rewards/format_reward_func/std": 0.2010597586631775, "rewards/ngram_similarity_reward": 0.5167984366416931, "rewards/ngram_similarity_reward/std": 0.5173434615135193, "rewards/sql_execution_reward_func": 0.45000001788139343, "rewards/sql_execution_reward_func/std": 0.14392459392547607, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.024931129476584022, "grad_norm": 0.14208423483141416, "kl": 1.21875, "learning_rate": 9.468337119149565e-07, "loss": 0.0627, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 1185.25, "epoch": 0.025, "grad_norm": 0.016243563445688263, "kl": 0.53515625, "learning_rate": 9.463388580228297e-07, "loss": -0.0032, "reward": 2.8859641551971436, "reward_std": 0.14726129174232483, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.9078571796417236, "rewards/format_reward_func/std": 0.14858515560626984, "rewards/ngram_similarity_reward": 0.18249797821044922, "rewards/ngram_similarity_reward/std": 0.09047423303127289, "rewards/sql_execution_reward_func": 0.220505952835083, "rewards/sql_execution_reward_func/std": 0.09926605224609375, "rewards/xml_reward_func": 0.9838541746139526, "rewards/xml_reward_func/std": 0.02444193884730339, "step": 363 }, { "clip_ratio/high_max": 0.00021092596580274403, "clip_ratio/high_mean": 0.00021092596580274403, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021092596580274403, "epoch": 0.025068870523415977, "grad_norm": 0.0160155095093054, "kl": 0.54296875, "learning_rate": 9.458418577899774e-07, "loss": -0.0032, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 207.875, "epoch": 0.025137741046831957, "grad_norm": 0.019132583158603906, "kl": 0.0498046875, "learning_rate": 9.453427139078638e-07, "loss": 0.0038, "reward": 4.83601188659668, "reward_std": 0.08680284768342972, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7833333611488342, "rewards/format_reward_func/std": 0.077664315700531, "rewards/ngram_similarity_reward": 0.4642857015132904, "rewards/ngram_similarity_reward/std": 0.0, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.025206611570247933, "grad_norm": 0.016845003691845283, "kl": 0.047119140625, "learning_rate": 9.448414290795618e-07, "loss": 0.0038, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0002090300986310467, "clip_ratio/low_min": 0.0002090300986310467, "clip_ratio/region_mean": 0.0002090300986310467, "completion_length": 598.0, "epoch": 0.025275482093663912, "grad_norm": 0.05636061214075263, "kl": 0.439453125, "learning_rate": 9.443380060197385e-07, "loss": -0.0726, "reward": 3.0256850719451904, "reward_std": 0.4178740978240967, "rewards/accuracy_reward": 0.03125, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/format_reward_func": 0.9233332872390747, "rewards/format_reward_func/std": 0.1399773210287094, "rewards/ngram_similarity_reward": 0.42240118980407715, "rewards/ngram_similarity_reward/std": 0.28715354204177856, "rewards/sql_execution_reward_func": 0.4531250298023224, "rewards/sql_execution_reward_func/std": 0.18537485599517822, "rewards/xml_reward_func": 0.953125, "rewards/xml_reward_func/std": 0.06842003017663956, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02534435261707989, "grad_norm": 0.056700247746230545, "kl": 0.43359375, "learning_rate": 9.438324474546405e-07, "loss": -0.0725, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00038925651460886, "clip_ratio/low_min": 0.00038925651460886, "clip_ratio/region_mean": 0.00038925651460886, "completion_length": 321.125, "epoch": 0.025413223140495867, "grad_norm": 0.09370350593993994, "kl": 0.36328125, "learning_rate": 9.433247561220788e-07, "loss": 0.0942, "reward": 3.3635754585266113, "reward_std": 0.38260596990585327, "rewards/accuracy_reward": 0.39444440603256226, "rewards/accuracy_reward/std": 0.08121649920940399, "rewards/format_reward_func": 0.815000057220459, "rewards/format_reward_func/std": 0.04242641106247902, "rewards/ngram_similarity_reward": 0.29187440872192383, "rewards/ngram_similarity_reward/std": 0.19853217899799347, "rewards/sql_execution_reward_func": 0.3218749761581421, "rewards/sql_execution_reward_func/std": 0.07954951375722885, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00038925651460886, "clip_ratio/low_min": 0.00038925651460886, "clip_ratio/region_mean": 0.00038925651460886, "epoch": 0.025482093663911846, "grad_norm": 0.09382823406356984, "kl": 0.3515625, "learning_rate": 9.428149347714143e-07, "loss": 0.0944, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00016005121869966388, "clip_ratio/low_min": 0.00016005121869966388, "clip_ratio/region_mean": 0.00016005121869966388, "completion_length": 781.0, "epoch": 0.025550964187327822, "grad_norm": 0.11186879851188897, "kl": 2.28125, "learning_rate": 9.42302986163543e-07, "loss": 0.3079, "reward": 3.333596706390381, "reward_std": 1.108310580253601, "rewards/accuracy_reward": 0.48124998807907104, "rewards/accuracy_reward/std": 0.43747448921203613, "rewards/format_reward_func": 0.8391071557998657, "rewards/format_reward_func/std": 0.09134436398744583, "rewards/ngram_similarity_reward": 0.20785903930664062, "rewards/ngram_similarity_reward/std": 0.06544888764619827, "rewards/sql_execution_reward_func": 0.22410714626312256, "rewards/sql_execution_reward_func/std": 0.20418143272399902, "rewards/xml_reward_func": 0.99609375, "rewards/xml_reward_func/std": 0.011048543266952038, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00016005121869966388, "clip_ratio/low_min": 0.00016005121869966388, "clip_ratio/region_mean": 0.00016005121869966388, "epoch": 0.0256198347107438, "grad_norm": 0.113600090156519, "kl": 2.25, "learning_rate": 9.417889130708808e-07, "loss": 0.3079, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0006949270609766245, "clip_ratio/low_min": 0.0006949270609766245, "clip_ratio/region_mean": 0.0006949270609766245, "completion_length": 179.875, "epoch": 0.02568870523415978, "grad_norm": 0.12966089855992685, "kl": 0.328125, "learning_rate": 9.412727182773486e-07, "loss": 0.0711, "reward": 5.129166603088379, "reward_std": 0.6573002338409424, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.2314550280570984, "rewards/format_reward_func": 0.7583333253860474, "rewards/format_reward_func/std": 0.1178511381149292, "rewards/ngram_similarity_reward": 0.875, "rewards/ngram_similarity_reward/std": 0.17251639068126678, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 0.9583333730697632, "rewards/xml_reward_func/std": 0.117851123213768, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.025757575757575757, "grad_norm": 0.1294433766629443, "kl": 0.3203125, "learning_rate": 9.40754404578357e-07, "loss": 0.0709, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 329.25, "epoch": 0.025826446280991736, "grad_norm": 0.2000132885622516, "kl": 8.5625, "learning_rate": 9.402339747807915e-07, "loss": 0.1391, "reward": 3.1576616764068604, "reward_std": 0.531465470790863, "rewards/accuracy_reward": 0.2218540757894516, "rewards/accuracy_reward/std": 0.08964689821004868, "rewards/format_reward_func": 0.7875000238418579, "rewards/format_reward_func/std": 0.08345228433609009, "rewards/ngram_similarity_reward": 0.38221901655197144, "rewards/ngram_similarity_reward/std": 0.301504909992218, "rewards/sql_execution_reward_func": 0.3531249761581421, "rewards/sql_execution_reward_func/std": 0.008838837035000324, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.025895316804407712, "grad_norm": 0.19857531319961372, "kl": 8.5625, "learning_rate": 9.397114317029974e-07, "loss": 0.1389, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 202.875, "epoch": 0.02596418732782369, "grad_norm": 0.10842481203933643, "kl": 0.09033203125, "learning_rate": 9.391867781747639e-07, "loss": 0.0152, "reward": 5.040841102600098, "reward_std": 0.6243661046028137, "rewards/accuracy_reward": 0.9375, "rewards/accuracy_reward/std": 0.1767766922712326, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.0, "rewards/ngram_similarity_reward": 0.6772273778915405, "rewards/ngram_similarity_reward/std": 0.22394612431526184, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 377 }, { "clip_ratio/high_max": 0.0006161429337225854, "clip_ratio/high_mean": 0.0006161429337225854, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006161429337225854, "epoch": 0.02603305785123967, "grad_norm": 0.10752442091852112, "kl": 0.09033203125, "learning_rate": 9.386600170373094e-07, "loss": 0.0155, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 945.25, "epoch": 0.026101928374655647, "grad_norm": 0.15740197753766058, "kl": 2.109375, "learning_rate": 9.381311511432658e-07, "loss": 1.0893, "reward": 2.5683176517486572, "reward_std": 0.9997466206550598, "rewards/accuracy_reward": 0.21875, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/format_reward_func": 0.675000011920929, "rewards/format_reward_func/std": 0.22092878818511963, "rewards/ngram_similarity_reward": 0.18679505586624146, "rewards/ngram_similarity_reward/std": 0.15740740299224854, "rewards/sql_execution_reward_func": 0.3006249964237213, "rewards/sql_execution_reward_func/std": 0.15538977086544037, "rewards/xml_reward_func": 0.875, "rewards/xml_reward_func/std": 0.3535533845424652, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00013224015128798783, "clip_ratio/low_min": 0.00013224015128798783, "clip_ratio/region_mean": 0.00013224015128798783, "epoch": 0.026170798898071626, "grad_norm": 0.15476228882572243, "kl": 2.09375, "learning_rate": 9.376001833566633e-07, "loss": 1.0892, "step": 380 }, { "clip_ratio/high_max": 0.0008271298720501363, "clip_ratio/high_mean": 0.0008271298720501363, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008271298720501363, "completion_length": 151.125, "epoch": 0.026239669421487602, "grad_norm": 0.02673312406989442, "kl": 0.029541015625, "learning_rate": 9.370671165529144e-07, "loss": -0.016, "reward": 4.709374904632568, "reward_std": 0.11692698299884796, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.3895833492279053, "rewards/ngram_similarity_reward/std": 0.08240555226802826, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.0008271298720501363, "clip_ratio/high_mean": 0.0008271298720501363, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008271298720501363, "epoch": 0.02630853994490358, "grad_norm": 0.025058468499111492, "kl": 0.0303955078125, "learning_rate": 9.36531953618799e-07, "loss": -0.016, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 571.5, "epoch": 0.02637741046831956, "grad_norm": 0.36487038019473317, "kl": 1.890625, "learning_rate": 9.359946974524481e-07, "loss": 2.2309, "reward": 3.872239112854004, "reward_std": 1.3457363843917847, "rewards/accuracy_reward": 0.7856706976890564, "rewards/accuracy_reward/std": 0.4022504687309265, "rewards/format_reward_func": 0.6979166865348816, "rewards/format_reward_func/std": 0.15595011413097382, "rewards/ngram_similarity_reward": 0.24295960366725922, "rewards/ngram_similarity_reward/std": 0.13969774544239044, "rewards/sql_execution_reward_func": 0.3062499761581421, "rewards/sql_execution_reward_func/std": 0.1237436905503273, "rewards/xml_reward_func": 0.9322916865348816, "rewards/xml_reward_func/std": 0.19150808453559875, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.026446280991735537, "grad_norm": 0.36233490044408684, "kl": 1.890625, "learning_rate": 9.354553509633288e-07, "loss": 2.2297, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00040799673297442496, "clip_ratio/low_min": 0.00040799673297442496, "clip_ratio/region_mean": 0.00040799673297442496, "completion_length": 306.375, "epoch": 0.026515151515151516, "grad_norm": 0.16757274606261868, "kl": 3.421875, "learning_rate": 9.34913917072228e-07, "loss": 0.0615, "reward": 3.948106050491333, "reward_std": 0.741007387638092, "rewards/accuracy_reward": 0.7604166865348816, "rewards/accuracy_reward/std": 0.27163344621658325, "rewards/format_reward_func": 0.8375000357627869, "rewards/format_reward_func/std": 0.041547439992427826, "rewards/ngram_similarity_reward": 0.19734862446784973, "rewards/ngram_similarity_reward/std": 0.19364126026630402, "rewards/sql_execution_reward_func": 0.29374998807907104, "rewards/sql_execution_reward_func/std": 0.09425459057092667, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00040799673297442496, "clip_ratio/low_min": 0.00040799673297442496, "clip_ratio/region_mean": 0.00040799673297442496, "epoch": 0.026584022038567492, "grad_norm": 0.1662692181058001, "kl": 3.421875, "learning_rate": 9.343703987112365e-07, "loss": 0.0615, "step": 386 }, { "clip_ratio/high_max": 0.0005605381447821856, "clip_ratio/high_mean": 0.0005605381447821856, "clip_ratio/low_mean": 0.0005605381447821856, "clip_ratio/low_min": 0.0005605381447821856, "clip_ratio/region_mean": 0.0011210762895643711, "completion_length": 223.0, "epoch": 0.02665289256198347, "grad_norm": 0.1703024569811615, "kl": 5.125, "learning_rate": 9.338247988237337e-07, "loss": -0.0047, "reward": 3.981877326965332, "reward_std": 0.4769638180732727, "rewards/accuracy_reward": 0.6875, "rewards/accuracy_reward/std": 0.25877460837364197, "rewards/format_reward_func": 0.75, "rewards/format_reward_func/std": 0.09258200973272324, "rewards/ngram_similarity_reward": 0.3337515890598297, "rewards/ngram_similarity_reward/std": 0.2190747708082199, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02672176308539945, "grad_norm": 0.17406092906761086, "kl": 5.0625, "learning_rate": 9.332771203643714e-07, "loss": -0.0045, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0006899724248796701, "clip_ratio/low_min": 0.0006899724248796701, "clip_ratio/region_mean": 0.0006899724248796701, "completion_length": 543.5, "epoch": 0.026790633608815426, "grad_norm": 4.973006461532154, "kl": 40.75, "learning_rate": 9.327273662990573e-07, "loss": 3.0182, "reward": 4.235201835632324, "reward_std": 1.6828643083572388, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.699999988079071, "rewards/format_reward_func/std": 0.21380899846553802, "rewards/ngram_similarity_reward": 0.4026346206665039, "rewards/ngram_similarity_reward/std": 0.3102334439754486, "rewards/sql_execution_reward_func": 0.3062499761581421, "rewards/sql_execution_reward_func/std": 0.1237436905503273, "rewards/xml_reward_func": 0.875, "rewards/xml_reward_func/std": 0.3535533845424652, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0004599815874826163, "clip_ratio/low_min": 0.0004599815874826163, "clip_ratio/region_mean": 0.0004599815874826163, "epoch": 0.026859504132231406, "grad_norm": 5.155735596941493, "kl": 41.75, "learning_rate": 9.3217553960494e-07, "loss": 3.0203, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.000369822490029037, "clip_ratio/low_min": 0.000369822490029037, "clip_ratio/region_mean": 0.000369822490029037, "completion_length": 338.0, "epoch": 0.02692837465564738, "grad_norm": 0.12736333694027752, "kl": 0.384765625, "learning_rate": 9.316216432703916e-07, "loss": 0.1774, "reward": 3.651826858520508, "reward_std": 0.8147109746932983, "rewards/accuracy_reward": 0.6256038546562195, "rewards/accuracy_reward/std": 0.40024736523628235, "rewards/format_reward_func": 0.7708333730697632, "rewards/format_reward_func/std": 0.11189208924770355, "rewards/ngram_similarity_reward": 0.1740240454673767, "rewards/ngram_similarity_reward/std": 0.09049734473228455, "rewards/sql_execution_reward_func": 0.3687499761581421, "rewards/sql_execution_reward_func/std": 0.03720119222998619, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.000369822490029037, "clip_ratio/low_min": 0.000369822490029037, "clip_ratio/region_mean": 0.000369822490029037, "epoch": 0.02699724517906336, "grad_norm": 0.12769420103493132, "kl": 0.384765625, "learning_rate": 9.310656802949928e-07, "loss": 0.1775, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 210.0, "epoch": 0.02706611570247934, "grad_norm": 0.35818179421128427, "kl": 0.265625, "learning_rate": 9.305076536895153e-07, "loss": 0.2976, "reward": 4.524670124053955, "reward_std": 0.9872885346412659, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.7925000190734863, "rewards/format_reward_func/std": 0.08548181504011154, "rewards/ngram_similarity_reward": 0.41311341524124146, "rewards/ngram_similarity_reward/std": 0.3206789195537567, "rewards/sql_execution_reward_func": 0.36250001192092896, "rewards/sql_execution_reward_func/std": 0.058248236775398254, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.027134986225895316, "grad_norm": 0.35833730969470756, "kl": 0.26953125, "learning_rate": 9.299475664759068e-07, "loss": 0.2979, "step": 394 }, { "clip_ratio/high_max": 0.00029761905898340046, "clip_ratio/high_mean": 0.00029761905898340046, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029761905898340046, "completion_length": 420.0, "epoch": 0.027203856749311296, "grad_norm": 0.08837180508481383, "kl": 0.984375, "learning_rate": 9.293854216872739e-07, "loss": 0.0058, "reward": 2.544029712677002, "reward_std": 0.4850391447544098, "rewards/accuracy_reward": 0.16249999403953552, "rewards/accuracy_reward/std": 0.12100767344236374, "rewards/format_reward_func": 0.7541667222976685, "rewards/format_reward_func/std": 0.16803012788295746, "rewards/ngram_similarity_reward": 0.1067836582660675, "rewards/ngram_similarity_reward/std": 0.17285500466823578, "rewards/sql_execution_reward_func": 0.3125, "rewards/sql_execution_reward_func/std": 0.13822858035564423, "rewards/xml_reward_func": 0.9921875, "rewards/xml_reward_func/std": 0.022097086533904076, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02727272727272727, "grad_norm": 0.08911613671924666, "kl": 0.9765625, "learning_rate": 9.288212223678658e-07, "loss": 0.0057, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00012130034156143665, "clip_ratio/low_min": 0.00012130034156143665, "clip_ratio/region_mean": 0.00012130034156143665, "completion_length": 1030.5, "epoch": 0.02734159779614325, "grad_norm": 2.335805779663917, "kl": 14.0625, "learning_rate": 9.282549715730579e-07, "loss": 0.0374, "reward": 2.570875883102417, "reward_std": 1.6052484512329102, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.804754912853241, "rewards/format_reward_func/std": 0.28595787286758423, "rewards/ngram_similarity_reward": 0.023544974625110626, "rewards/ngram_similarity_reward/std": 0.04030204191803932, "rewards/sql_execution_reward_func": 0.8558036088943481, "rewards/sql_execution_reward_func/std": 1.457813024520874, "rewards/xml_reward_func": 0.875, "rewards/xml_reward_func/std": 0.3535533845424652, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02741046831955923, "grad_norm": 2.037839895924472, "kl": 12.625, "learning_rate": 9.27686672369335e-07, "loss": 0.0357, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 8.656509453430772e-05, "clip_ratio/low_min": 8.656509453430772e-05, "clip_ratio/region_mean": 8.656509453430772e-05, "completion_length": 1444.0, "epoch": 0.027479338842975206, "grad_norm": 0.31611310991852115, "kl": 4.5, "learning_rate": 9.271163278342752e-07, "loss": 0.9607, "reward": 1.913440465927124, "reward_std": 1.1536670923233032, "rewards/accuracy_reward": 0.1303606927394867, "rewards/accuracy_reward/std": 0.13293883204460144, "rewards/format_reward_func": 0.601822018623352, "rewards/format_reward_func/std": 0.2633495330810547, "rewards/ngram_similarity_reward": 0.1224920004606247, "rewards/ngram_similarity_reward/std": 0.15821650624275208, "rewards/sql_execution_reward_func": 0.14625000953674316, "rewards/sql_execution_reward_func/std": 0.15061184763908386, "rewards/xml_reward_func": 0.7209091186523438, "rewards/xml_reward_func/std": 0.41154301166534424, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.027548209366391185, "grad_norm": 0.2615140586345956, "kl": 4.34375, "learning_rate": 9.265439410565328e-07, "loss": 0.9604, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 347.75, "epoch": 0.02761707988980716, "grad_norm": 0.17012797376498356, "kl": 3.5625, "learning_rate": 9.259695151358214e-07, "loss": -0.0526, "reward": 3.9652810096740723, "reward_std": 0.7549468278884888, "rewards/accuracy_reward": 0.7931570410728455, "rewards/accuracy_reward/std": 0.3885672688484192, "rewards/format_reward_func": 0.7916666269302368, "rewards/format_reward_func/std": 0.15301208198070526, "rewards/ngram_similarity_reward": 0.18320021033287048, "rewards/ngram_similarity_reward/std": 0.09522945433855057, "rewards/sql_execution_reward_func": 0.3125, "rewards/sql_execution_reward_func/std": 0.12747548520565033, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 401 }, { "clip_ratio/high_max": 0.000359453639248386, "clip_ratio/high_mean": 0.000359453639248386, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000359453639248386, "epoch": 0.02768595041322314, "grad_norm": 0.16810087700209847, "kl": 3.546875, "learning_rate": 9.253930531828978e-07, "loss": -0.0532, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0004464855301193893, "clip_ratio/low_min": 0.0004464855301193893, "clip_ratio/region_mean": 0.0004464855301193893, "completion_length": 1959.75, "epoch": 0.02775482093663912, "grad_norm": 0.36413252550272635, "kl": 3.96875, "learning_rate": 9.248145583195447e-07, "loss": 0.9707, "reward": 2.0477967262268066, "reward_std": 1.7018312215805054, "rewards/accuracy_reward": 0.19062499701976776, "rewards/accuracy_reward/std": 0.37031295895576477, "rewards/format_reward_func": 0.5498768091201782, "rewards/format_reward_func/std": 0.30500349402427673, "rewards/ngram_similarity_reward": 0.04684477671980858, "rewards/ngram_similarity_reward/std": 0.08722365647554398, "rewards/sql_execution_reward_func": 0.416194349527359, "rewards/sql_execution_reward_func/std": 0.4046666622161865, "rewards/xml_reward_func": 0.6302083134651184, "rewards/xml_reward_func/std": 0.5105229616165161, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00012756729847751558, "clip_ratio/low_min": 0.00012756729847751558, "clip_ratio/region_mean": 0.00012756729847751558, "epoch": 0.027823691460055096, "grad_norm": 0.32195453472529584, "kl": 3.40625, "learning_rate": 9.242340336785538e-07, "loss": 0.9701, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 1214.625, "epoch": 0.027892561983471075, "grad_norm": 0.08077958073953749, "kl": 1.03125, "learning_rate": 9.236514824037088e-07, "loss": 0.4271, "reward": 2.9059340953826904, "reward_std": 0.7738859057426453, "rewards/accuracy_reward": 0.26603370904922485, "rewards/accuracy_reward/std": 0.19134071469306946, "rewards/format_reward_func": 0.8072131872177124, "rewards/format_reward_func/std": 0.13504864275455475, "rewards/ngram_similarity_reward": 0.20925813913345337, "rewards/ngram_similarity_reward/std": 0.17259958386421204, "rewards/sql_execution_reward_func": 0.35357141494750977, "rewards/sql_execution_reward_func/std": 0.19456911087036133, "rewards/xml_reward_func": 0.8991950750350952, "rewards/xml_reward_func/std": 0.15505477786064148, "step": 405 }, { "clip_ratio/high_max": 0.00020582485012710094, "clip_ratio/high_mean": 0.00020582485012710094, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020582485012710094, "epoch": 0.02796143250688705, "grad_norm": 0.08048869609908245, "kl": 1.0234375, "learning_rate": 9.230669076497687e-07, "loss": 0.4268, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00031476235017180443, "clip_ratio/low_min": 0.00031476235017180443, "clip_ratio/region_mean": 0.00031476235017180443, "completion_length": 794.25, "epoch": 0.02803030303030303, "grad_norm": 0.2961929859652113, "kl": 6.15625, "learning_rate": 9.224803125824501e-07, "loss": 1.5195, "reward": 3.727477550506592, "reward_std": 1.7553353309631348, "rewards/accuracy_reward": 0.6701388955116272, "rewards/accuracy_reward/std": 0.4686930477619171, "rewards/format_reward_func": 0.7027777433395386, "rewards/format_reward_func/std": 0.21543197333812714, "rewards/ngram_similarity_reward": 0.26608526706695557, "rewards/ngram_similarity_reward/std": 0.2814880609512329, "rewards/sql_execution_reward_func": 0.4102941155433655, "rewards/sql_execution_reward_func/std": 0.3011307120323181, "rewards/xml_reward_func": 0.875, "rewards/xml_reward_func/std": 0.3535533845424652, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00015738117508590221, "clip_ratio/low_min": 0.00015738117508590221, "clip_ratio/region_mean": 0.00015738117508590221, "epoch": 0.02809917355371901, "grad_norm": 0.3609942810009184, "kl": 5.9375, "learning_rate": 9.218917003784111e-07, "loss": 1.5188, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00014112333883531392, "clip_ratio/low_min": 0.00014112333883531392, "clip_ratio/region_mean": 0.00014112333883531392, "completion_length": 885.75, "epoch": 0.028168044077134986, "grad_norm": 0.08846855554985787, "kl": 1.0703125, "learning_rate": 9.213010742252327e-07, "loss": 0.2007, "reward": 2.7666313648223877, "reward_std": 0.66302889585495, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.841867208480835, "rewards/format_reward_func/std": 0.2674364745616913, "rewards/ngram_similarity_reward": 0.08378894627094269, "rewards/ngram_similarity_reward/std": 0.053589724004268646, "rewards/sql_execution_reward_func": 0.44062498211860657, "rewards/sql_execution_reward_func/std": 0.38124269247055054, "rewards/xml_reward_func": 0.8584558963775635, "rewards/xml_reward_func/std": 0.31185364723205566, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.028236914600550965, "grad_norm": 0.10688640463307629, "kl": 0.98046875, "learning_rate": 9.207084373214028e-07, "loss": 0.2008, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0002768549311440438, "clip_ratio/low_min": 0.0002768549311440438, "clip_ratio/region_mean": 0.0002768549311440438, "completion_length": 451.5, "epoch": 0.02830578512396694, "grad_norm": 0.22929679802764377, "kl": 4.03125, "learning_rate": 9.20113792876298e-07, "loss": 0.0868, "reward": 3.6796579360961914, "reward_std": 1.1927443742752075, "rewards/accuracy_reward": 0.3423624038696289, "rewards/accuracy_reward/std": 0.4161439538002014, "rewards/format_reward_func": 0.8624999523162842, "rewards/format_reward_func/std": 0.16850179433822632, "rewards/ngram_similarity_reward": 0.4841219186782837, "rewards/ngram_similarity_reward/std": 0.2749311327934265, "rewards/sql_execution_reward_func": 0.40625, "rewards/sql_execution_reward_func/std": 0.06781013309955597, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02837465564738292, "grad_norm": 0.23209786392064166, "kl": 4.03125, "learning_rate": 9.195171441101668e-07, "loss": 0.0869, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0001970443408936262, "clip_ratio/low_min": 0.0001970443408936262, "clip_ratio/region_mean": 0.0001970443408936262, "completion_length": 634.375, "epoch": 0.0284435261707989, "grad_norm": 0.11861165812562736, "kl": 0.8359375, "learning_rate": 9.189184942541119e-07, "loss": 0.2391, "reward": 3.327702045440674, "reward_std": 0.9040876030921936, "rewards/accuracy_reward": 0.484375, "rewards/accuracy_reward/std": 0.4454006254673004, "rewards/format_reward_func": 0.8279762268066406, "rewards/format_reward_func/std": 0.1370341032743454, "rewards/ngram_similarity_reward": 0.11350773274898529, "rewards/ngram_similarity_reward/std": 0.0898231789469719, "rewards/sql_execution_reward_func": 0.38749998807907104, "rewards/sql_execution_reward_func/std": 0.10938138514757156, "rewards/xml_reward_func": 0.9732142686843872, "rewards/xml_reward_func/std": 0.07576145231723785, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0001970443408936262, "clip_ratio/low_min": 0.0001970443408936262, "clip_ratio/region_mean": 0.0001970443408936262, "epoch": 0.028512396694214875, "grad_norm": 0.11750667338368127, "kl": 0.88671875, "learning_rate": 9.183178465500726e-07, "loss": 0.2389, "step": 414 }, { "clip_ratio/high_max": 0.0003762227133847773, "clip_ratio/high_mean": 0.0003762227133847773, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003762227133847773, "completion_length": 332.25, "epoch": 0.028581267217630855, "grad_norm": 0.10586154590111242, "kl": 1.4921875, "learning_rate": 9.177152042508077e-07, "loss": -0.1937, "reward": 4.088635444641113, "reward_std": 0.8016102910041809, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.8316667079925537, "rewards/format_reward_func/std": 0.04998411610722542, "rewards/ngram_similarity_reward": 0.11645147204399109, "rewards/ngram_similarity_reward/std": 0.11542514711618423, "rewards/sql_execution_reward_func": 0.3531249761581421, "rewards/sql_execution_reward_func/std": 0.10892125219106674, "rewards/xml_reward_func": 0.9791666269302368, "rewards/xml_reward_func/std": 0.0589255727827549, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02865013774104683, "grad_norm": 0.10606794656635311, "kl": 1.4921875, "learning_rate": 9.171105706198774e-07, "loss": -0.1941, "step": 416 }, { "clip_ratio/high_max": 0.000583090353757143, "clip_ratio/high_mean": 0.000583090353757143, "clip_ratio/low_mean": 0.000583090353757143, "clip_ratio/low_min": 0.000583090353757143, "clip_ratio/region_mean": 0.001166180707514286, "completion_length": 214.375, "epoch": 0.02871900826446281, "grad_norm": 0.22215402526882377, "kl": 6.4375, "learning_rate": 9.165039489316257e-07, "loss": 0.1084, "reward": 3.3529000282287598, "reward_std": 0.31212934851646423, "rewards/accuracy_reward": 0.5, "rewards/accuracy_reward/std": 0.117851123213768, "rewards/format_reward_func": 0.8083333969116211, "rewards/format_reward_func/std": 0.10947203636169434, "rewards/ngram_similarity_reward": 0.14221110939979553, "rewards/ngram_similarity_reward/std": 0.06953488290309906, "rewards/sql_execution_reward_func": 0.33125001192092896, "rewards/sql_execution_reward_func/std": 0.0530330091714859, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02878787878787879, "grad_norm": 0.21908017369399613, "kl": 6.40625, "learning_rate": 9.158953424711624e-07, "loss": 0.1084, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 157.75, "epoch": 0.028856749311294765, "grad_norm": 0.14600308965119155, "kl": 3.96875, "learning_rate": 9.152847545343466e-07, "loss": -0.0009, "reward": 5.41249942779541, "reward_std": 0.2825266420841217, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.8583333492279053, "rewards/ngram_similarity_reward/std": 0.19659805297851562, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.028925619834710745, "grad_norm": 0.1451443416641911, "kl": 3.96875, "learning_rate": 9.146721884277674e-07, "loss": -0.0007, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 628.125, "epoch": 0.02899449035812672, "grad_norm": 0.24560984010193077, "kl": 0.177734375, "learning_rate": 9.140576474687263e-07, "loss": 1.1502, "reward": 3.4407706260681152, "reward_std": 1.3766350746154785, "rewards/accuracy_reward": 0.5854166746139526, "rewards/accuracy_reward/std": 0.47862547636032104, "rewards/format_reward_func": 0.7837499976158142, "rewards/format_reward_func/std": 0.1916133463382721, "rewards/ngram_similarity_reward": 0.2108313888311386, "rewards/ngram_similarity_reward/std": 0.2759905755519867, "rewards/sql_execution_reward_func": 0.23504464328289032, "rewards/sql_execution_reward_func/std": 0.14201901853084564, "rewards/xml_reward_func": 0.9348958730697632, "rewards/xml_reward_func/std": 0.16038669645786285, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0290633608815427, "grad_norm": 0.247102206264298, "kl": 0.177734375, "learning_rate": 9.134411349852197e-07, "loss": 1.1511, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 550.5, "epoch": 0.02913223140495868, "grad_norm": 0.23145939396279372, "kl": 0.185546875, "learning_rate": 9.128226543159209e-07, "loss": 1.5076, "reward": 4.373437404632568, "reward_std": 1.5570740699768066, "rewards/accuracy_reward": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward_func": 0.8050000071525574, "rewards/format_reward_func/std": 0.11988089233636856, "rewards/ngram_similarity_reward": 0.5356249809265137, "rewards/ngram_similarity_reward/std": 0.49760451912879944, "rewards/sql_execution_reward_func": 0.26499998569488525, "rewards/sql_execution_reward_func/std": 0.09399847686290741, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.029201101928374655, "grad_norm": 0.234559340636943, "kl": 0.1943359375, "learning_rate": 9.122022088101613e-07, "loss": 1.508, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 452.875, "epoch": 0.029269972451790634, "grad_norm": 0.16101509844149076, "kl": 2.09375, "learning_rate": 9.11579801827913e-07, "loss": 0.4445, "reward": 3.965263843536377, "reward_std": 1.2010440826416016, "rewards/accuracy_reward": 0.65625, "rewards/accuracy_reward/std": 0.39949744939804077, "rewards/format_reward_func": 0.8812500238418579, "rewards/format_reward_func/std": 0.09897790849208832, "rewards/ngram_similarity_reward": 0.37743785977363586, "rewards/ngram_similarity_reward/std": 0.24997082352638245, "rewards/sql_execution_reward_func": 0.2366071492433548, "rewards/sql_execution_reward_func/std": 0.14675486087799072, "rewards/xml_reward_func": 0.96875, "rewards/xml_reward_func/std": 0.0883883461356163, "step": 425 }, { "clip_ratio/high_max": 0.0002760143543127924, "clip_ratio/high_mean": 0.0002760143543127924, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002760143543127924, "epoch": 0.02933884297520661, "grad_norm": 0.15993175034847462, "kl": 2.0625, "learning_rate": 9.109554367397697e-07, "loss": 0.4445, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 423.625, "epoch": 0.02940771349862259, "grad_norm": 0.155650362682168, "kl": 4.03125, "learning_rate": 9.103291169269299e-07, "loss": 0.0091, "reward": 4.22676944732666, "reward_std": 0.8171695470809937, "rewards/accuracy_reward": 0.734375, "rewards/accuracy_reward/std": 0.36659735441207886, "rewards/format_reward_func": 0.8916666507720947, "rewards/format_reward_func/std": 0.07918232679367065, "rewards/ngram_similarity_reward": 0.33423522114753723, "rewards/ngram_similarity_reward/std": 0.1955910176038742, "rewards/sql_execution_reward_func": 0.3649999797344208, "rewards/sql_execution_reward_func/std": 0.06824326515197754, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02947658402203857, "grad_norm": 0.15555540522351696, "kl": 4.03125, "learning_rate": 9.097008457811777e-07, "loss": 0.0088, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0005479452083818614, "clip_ratio/low_min": 0.0005479452083818614, "clip_ratio/region_mean": 0.0005479452083818614, "completion_length": 228.125, "epoch": 0.029545454545454545, "grad_norm": 0.16274759223570562, "kl": 0.458984375, "learning_rate": 9.090706267048638e-07, "loss": 0.1819, "reward": 4.669747352600098, "reward_std": 0.865683913230896, "rewards/accuracy_reward": 0.7812550067901611, "rewards/accuracy_reward/std": 0.3115960955619812, "rewards/format_reward_func": 0.7333333492279053, "rewards/format_reward_func/std": 0.11268723756074905, "rewards/ngram_similarity_reward": 0.707602858543396, "rewards/ngram_similarity_reward/std": 0.311005562543869, "rewards/sql_execution_reward_func": 0.3125, "rewards/sql_execution_reward_func/std": 0.12747548520565033, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.029614325068870524, "grad_norm": 0.1635416077109376, "kl": 0.458984375, "learning_rate": 9.084384631108882e-07, "loss": 0.182, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 301.0, "epoch": 0.0296831955922865, "grad_norm": 0.3901365796515787, "kl": 0.01080322265625, "learning_rate": 9.078043584226815e-07, "loss": 0.4262, "reward": 4.796428680419922, "reward_std": 1.1368030309677124, "rewards/accuracy_reward": 0.784375011920929, "rewards/accuracy_reward/std": 0.40376296639442444, "rewards/format_reward_func": 0.75, "rewards/format_reward_func/std": 0.1414213478565216, "rewards/ngram_similarity_reward": 0.7767857313156128, "rewards/ngram_similarity_reward/std": 0.3397552967071533, "rewards/sql_execution_reward_func": 0.3125, "rewards/sql_execution_reward_func/std": 0.12747548520565033, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0004152823821641505, "clip_ratio/low_min": 0.0004152823821641505, "clip_ratio/region_mean": 0.0004152823821641505, "epoch": 0.02975206611570248, "grad_norm": 0.3875857921666726, "kl": 0.01080322265625, "learning_rate": 9.071683160741855e-07, "loss": 0.4266, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 135.375, "epoch": 0.02982093663911846, "grad_norm": 0.11352260595855491, "kl": 2.5625, "learning_rate": 9.065303395098358e-07, "loss": 0.0941, "reward": 5.556249618530273, "reward_std": 0.1971900761127472, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.75, "rewards/format_reward_func/std": 0.09258200973272324, "rewards/ngram_similarity_reward": 1.0, "rewards/ngram_similarity_reward/std": 0.0, "rewards/sql_execution_reward_func": 0.3062499761581421, "rewards/sql_execution_reward_func/std": 0.1237436905503273, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.029889807162534435, "grad_norm": 0.10903394706231109, "kl": 2.421875, "learning_rate": 9.058904321845423e-07, "loss": 0.0938, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 235.0, "epoch": 0.029958677685950414, "grad_norm": 0.15701653385478262, "kl": 0.341796875, "learning_rate": 9.052485975636711e-07, "loss": 0.094, "reward": 5.376609802246094, "reward_std": 0.5441883206367493, "rewards/accuracy_reward": 0.9375, "rewards/accuracy_reward/std": 0.1767766922712326, "rewards/format_reward_func": 0.8541666865348816, "rewards/format_reward_func/std": 0.14685863256454468, "rewards/ngram_similarity_reward": 0.8607954382896423, "rewards/ngram_similarity_reward/std": 0.3055076599121094, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03002754820936639, "grad_norm": 0.15761707173518238, "kl": 0.337890625, "learning_rate": 9.046048391230247e-07, "loss": 0.0938, "step": 436 }, { "clip_ratio/high_max": 0.0001817851298255846, "clip_ratio/high_mean": 0.0001817851298255846, "clip_ratio/low_mean": 0.0003635702596511692, "clip_ratio/low_min": 0.0003635702596511692, "clip_ratio/region_mean": 0.0005453553749248385, "completion_length": 687.625, "epoch": 0.03009641873278237, "grad_norm": 0.17600301531326512, "kl": 4.28125, "learning_rate": 9.039591603488251e-07, "loss": 1.403, "reward": 3.252396583557129, "reward_std": 1.1301604509353638, "rewards/accuracy_reward": 0.5729166269302368, "rewards/accuracy_reward/std": 0.337584525346756, "rewards/format_reward_func": 0.6699404716491699, "rewards/format_reward_func/std": 0.2109680473804474, "rewards/ngram_similarity_reward": 0.13552623987197876, "rewards/ngram_similarity_reward/std": 0.11718530207872391, "rewards/sql_execution_reward_func": 0.3374999761581421, "rewards/sql_execution_reward_func/std": 0.14330288767814636, "rewards/xml_reward_func": 0.8958333730697632, "rewards/xml_reward_func/std": 0.19795581698417664, "step": 437 }, { "clip_ratio/high_max": 0.0001817851298255846, "clip_ratio/high_mean": 0.0001817851298255846, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001817851298255846, "epoch": 0.03016528925619835, "grad_norm": 0.18201685314748683, "kl": 4.1875, "learning_rate": 9.033115647376923e-07, "loss": 1.403, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 218.375, "epoch": 0.030234159779614325, "grad_norm": 0.09442886454456302, "kl": 0.049560546875, "learning_rate": 9.026620557966279e-07, "loss": 0.012, "reward": 3.2165780067443848, "reward_std": 0.5765666365623474, "rewards/accuracy_reward": 0.34375, "rewards/accuracy_reward/std": 0.2651650309562683, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.0, "rewards/ngram_similarity_reward": 0.25271856784820557, "rewards/ngram_similarity_reward/std": 0.16027332842350006, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.030303030303030304, "grad_norm": 0.09292173332694191, "kl": 0.0498046875, "learning_rate": 9.020106370429943e-07, "loss": 0.0126, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 396.5, "epoch": 0.03037190082644628, "grad_norm": 0.17987722320601796, "kl": 2.96875, "learning_rate": 9.013573120044966e-07, "loss": 0.1003, "reward": 3.7829155921936035, "reward_std": 0.9586636424064636, "rewards/accuracy_reward": 0.3333333134651184, "rewards/accuracy_reward/std": 0.2920915186405182, "rewards/format_reward_func": 0.8212499618530273, "rewards/format_reward_func/std": 0.16181448101997375, "rewards/ngram_similarity_reward": 0.5799994468688965, "rewards/ngram_similarity_reward/std": 0.2838853895664215, "rewards/sql_execution_reward_func": 0.42500001192092896, "rewards/sql_execution_reward_func/std": 0.04629100486636162, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0003152585122734308, "clip_ratio/low_min": 0.0003152585122734308, "clip_ratio/region_mean": 0.0003152585122734308, "epoch": 0.03044077134986226, "grad_norm": 0.18205890414922693, "kl": 2.96875, "learning_rate": 9.007020842191634e-07, "loss": 0.1003, "step": 442 }, { "clip_ratio/high_max": 0.00018789929163176566, "clip_ratio/high_mean": 0.00018789929163176566, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018789929163176566, "completion_length": 665.25, "epoch": 0.03050964187327824, "grad_norm": 1.5761544493542408, "kl": 21.75, "learning_rate": 9.00044957235327e-07, "loss": 1.4413, "reward": 2.635112762451172, "reward_std": 1.007145881652832, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.13363061845302582, "rewards/format_reward_func": 0.7053571343421936, "rewards/format_reward_func/std": 0.22638463973999023, "rewards/ngram_similarity_reward": 0.1448371261358261, "rewards/ngram_similarity_reward/std": 0.06181570515036583, "rewards/sql_execution_reward_func": 0.3375000059604645, "rewards/sql_execution_reward_func/std": 0.1620185226202011, "rewards/xml_reward_func": 0.875, "rewards/xml_reward_func/std": 0.3535533845424652, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0005636978312395513, "clip_ratio/low_min": 0.0005636978312395513, "clip_ratio/region_mean": 0.0005636978312395513, "epoch": 0.030578512396694214, "grad_norm": 1.6137842607217183, "kl": 21.75, "learning_rate": 8.993859346116049e-07, "loss": 1.4414, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010615711798891425, "clip_ratio/low_min": 0.0010615711798891425, "clip_ratio/region_mean": 0.0010615711798891425, "completion_length": 471.0, "epoch": 0.030647382920110194, "grad_norm": 2.1867155931411553, "kl": 11.6875, "learning_rate": 8.987250199168808e-07, "loss": 1.9448, "reward": 3.602407932281494, "reward_std": 1.3982555866241455, "rewards/accuracy_reward": 0.6875, "rewards/accuracy_reward/std": 0.4381372928619385, "rewards/format_reward_func": 0.7099727392196655, "rewards/format_reward_func/std": 0.215154230594635, "rewards/ngram_similarity_reward": 0.18942390382289886, "rewards/ngram_similarity_reward/std": 0.11034456640481949, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 0.8770492076873779, "rewards/xml_reward_func/std": 0.3477574288845062, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002123142359778285, "clip_ratio/low_min": 0.002123142359778285, "clip_ratio/region_mean": 0.002123142359778285, "epoch": 0.03071625344352617, "grad_norm": 1.7401185400228159, "kl": 12.125, "learning_rate": 8.980622167302837e-07, "loss": 1.9438, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 269.125, "epoch": 0.03078512396694215, "grad_norm": 0.07662627668342777, "kl": 1.4921875, "learning_rate": 8.973975286411705e-07, "loss": 0.0801, "reward": 3.459291934967041, "reward_std": 0.3128335773944855, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8100000023841858, "rewards/format_reward_func/std": 0.07153642177581787, "rewards/ngram_similarity_reward": 0.5841113328933716, "rewards/ngram_similarity_reward/std": 0.16829191148281097, "rewards/sql_execution_reward_func": 0.2731249928474426, "rewards/sql_execution_reward_func/std": 0.10918325185775757, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03085399449035813, "grad_norm": 0.07668572606431721, "kl": 1.484375, "learning_rate": 8.967309592491052e-07, "loss": 0.0803, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00023218017304316163, "clip_ratio/low_min": 0.00023218017304316163, "clip_ratio/region_mean": 0.00023218017304316163, "completion_length": 538.375, "epoch": 0.030922865013774104, "grad_norm": 0.17406708967718132, "kl": 1.0703125, "learning_rate": 8.960625121638396e-07, "loss": 0.602, "reward": 3.2083206176757812, "reward_std": 1.1854636669158936, "rewards/accuracy_reward": 0.5885416269302368, "rewards/accuracy_reward/std": 0.4157356023788452, "rewards/format_reward_func": 0.6875, "rewards/format_reward_func/std": 0.23566018044948578, "rewards/ngram_similarity_reward": 0.14999164640903473, "rewards/ngram_similarity_reward/std": 0.09473142772912979, "rewards/sql_execution_reward_func": 0.24375000596046448, "rewards/sql_execution_reward_func/std": 0.21286733448505402, "rewards/xml_reward_func": 0.875, "rewards/xml_reward_func/std": 0.2314550280570984, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00023218017304316163, "clip_ratio/low_min": 0.00023218017304316163, "clip_ratio/region_mean": 0.00023218017304316163, "epoch": 0.030991735537190084, "grad_norm": 0.179940224465617, "kl": 1.015625, "learning_rate": 8.953921910052949e-07, "loss": 0.6021, "step": 450 }, { "clip_ratio/high_max": 0.00020012007735203952, "clip_ratio/high_mean": 0.00020012007735203952, "clip_ratio/low_mean": 0.00040024015470407903, "clip_ratio/low_min": 0.00040024015470407903, "clip_ratio/region_mean": 0.0006003602175042033, "completion_length": 624.625, "epoch": 0.03106060606060606, "grad_norm": 0.13713043120870105, "kl": 0.0517578125, "learning_rate": 8.9471999940354e-07, "loss": -0.246, "reward": 2.8860466480255127, "reward_std": 1.09855055809021, "rewards/accuracy_reward": 0.3343749940395355, "rewards/accuracy_reward/std": 0.432587206363678, "rewards/format_reward_func": 0.7858333587646484, "rewards/format_reward_func/std": 0.12880094349384308, "rewards/ngram_similarity_reward": 0.16889220476150513, "rewards/ngram_similarity_reward/std": 0.1909971982240677, "rewards/sql_execution_reward_func": 0.18854166567325592, "rewards/sql_execution_reward_func/std": 0.0634550079703331, "rewards/xml_reward_func": 0.9895833730697632, "rewards/xml_reward_func/std": 0.029462775215506554, "step": 451 }, { "clip_ratio/high_max": 0.00040024015470407903, "clip_ratio/high_mean": 0.00040024015470407903, "clip_ratio/low_mean": 0.00020012007735203952, "clip_ratio/low_min": 0.00020012007735203952, "clip_ratio/region_mean": 0.0006003602175042033, "epoch": 0.03112947658402204, "grad_norm": 0.13746820287967645, "kl": 0.0517578125, "learning_rate": 8.940459409987742e-07, "loss": -0.2461, "step": 452 }, { "clip_ratio/high_max": 0.000425894366344437, "clip_ratio/high_mean": 0.000425894366344437, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000425894366344437, "completion_length": 293.5, "epoch": 0.031198347107438018, "grad_norm": 0.016500731890919024, "kl": 0.0146484375, "learning_rate": 8.933700194413052e-07, "loss": 0.0241, "reward": 4.26320743560791, "reward_std": 0.07146498560905457, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.09213833510875702, "rewards/ngram_similarity_reward/std": 0.0009073466644622386, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.031267217630853994, "grad_norm": 0.016311055342955304, "kl": 0.01458740234375, "learning_rate": 8.926922383915315e-07, "loss": 0.0241, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 359.125, "epoch": 0.03133608815426997, "grad_norm": 0.09934990192168991, "kl": 1.5, "learning_rate": 8.920126015199208e-07, "loss": -0.0364, "reward": 3.4357595443725586, "reward_std": 0.5569614768028259, "rewards/accuracy_reward": 0.5062500238418579, "rewards/accuracy_reward/std": 0.20488819479942322, "rewards/format_reward_func": 0.8416666984558105, "rewards/format_reward_func/std": 0.13303537666797638, "rewards/ngram_similarity_reward": 0.1710619330406189, "rewards/ngram_similarity_reward/std": 0.13310366868972778, "rewards/sql_execution_reward_func": 0.32499998807907104, "rewards/sql_execution_reward_func/std": 0.08017837256193161, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03140495867768595, "grad_norm": 0.09942825590459917, "kl": 1.5703125, "learning_rate": 8.91331112506991e-07, "loss": -0.0359, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.000367579486919567, "clip_ratio/low_min": 0.000367579486919567, "clip_ratio/region_mean": 0.000367579486919567, "completion_length": 1360.25, "epoch": 0.03147382920110193, "grad_norm": 0.28660830737722093, "kl": 5.53125, "learning_rate": 8.906477750432903e-07, "loss": 0.9077, "reward": 2.3858604431152344, "reward_std": 1.147567629814148, "rewards/accuracy_reward": 0.1927083283662796, "rewards/accuracy_reward/std": 0.21700693666934967, "rewards/format_reward_func": 0.6391667127609253, "rewards/format_reward_func/std": 0.2709391713142395, "rewards/ngram_similarity_reward": 0.18931640684604645, "rewards/ngram_similarity_reward/std": 0.21109150350093842, "rewards/sql_execution_reward_func": 0.3210526406764984, "rewards/sql_execution_reward_func/std": 0.33674222230911255, "rewards/xml_reward_func": 0.7562500238418579, "rewards/xml_reward_func/std": 0.3633379340171814, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0001837897434597835, "clip_ratio/low_min": 0.0001837897434597835, "clip_ratio/region_mean": 0.0001837897434597835, "epoch": 0.031542699724517904, "grad_norm": 0.24630558552501766, "kl": 5.4375, "learning_rate": 8.899625928293772e-07, "loss": 0.9074, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 144.75, "epoch": 0.03161157024793389, "grad_norm": 0.47959925511703017, "kl": 0.53515625, "learning_rate": 8.892755695757996e-07, "loss": -0.3243, "reward": 3.8839268684387207, "reward_std": 1.62051260471344, "rewards/accuracy_reward": 0.5406376123428345, "rewards/accuracy_reward/std": 0.4965759217739105, "rewards/format_reward_func": 0.7583333253860474, "rewards/format_reward_func/std": 0.1003960371017456, "rewards/ngram_similarity_reward": 0.5170454382896423, "rewards/ngram_similarity_reward/std": 0.5182259678840637, "rewards/sql_execution_reward_func": 0.26875001192092896, "rewards/sql_execution_reward_func/std": 0.166770800948143, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03168044077134986, "grad_norm": 0.475188812989954, "kl": 0.53125, "learning_rate": 8.88586709003076e-07, "loss": -0.3258, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 971.875, "epoch": 0.03174931129476584, "grad_norm": 0.059532935400584984, "kl": 1.890625, "learning_rate": 8.878960148416747e-07, "loss": 0.3056, "reward": 3.147740125656128, "reward_std": 0.49070096015930176, "rewards/accuracy_reward": 0.3229166567325592, "rewards/accuracy_reward/std": 0.15550418198108673, "rewards/format_reward_func": 0.8498485088348389, "rewards/format_reward_func/std": 0.12955760955810547, "rewards/ngram_similarity_reward": 0.2022055983543396, "rewards/ngram_similarity_reward/std": 0.09625716507434845, "rewards/sql_execution_reward_func": 0.3643749952316284, "rewards/sql_execution_reward_func/std": 0.04761733487248421, "rewards/xml_reward_func": 0.984375, "rewards/xml_reward_func/std": 0.04419417306780815, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00012861736468039453, "clip_ratio/low_min": 0.00012861736468039453, "clip_ratio/region_mean": 0.00012861736468039453, "epoch": 0.031818181818181815, "grad_norm": 0.059406698822868954, "kl": 1.8828125, "learning_rate": 8.872034908319934e-07, "loss": 0.3054, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00016219285316765308, "clip_ratio/low_min": 0.00016219285316765308, "clip_ratio/region_mean": 0.00016219285316765308, "completion_length": 1541.375, "epoch": 0.0318870523415978, "grad_norm": 0.043981303490652324, "kl": 0.765625, "learning_rate": 8.865091407243394e-07, "loss": -0.0308, "reward": 2.638115406036377, "reward_std": 0.5154206156730652, "rewards/accuracy_reward": 0.02187500149011612, "rewards/accuracy_reward/std": 0.008838835172355175, "rewards/format_reward_func": 0.8535497784614563, "rewards/format_reward_func/std": 0.15438537299633026, "rewards/ngram_similarity_reward": 0.4350935220718384, "rewards/ngram_similarity_reward/std": 0.31961312890052795, "rewards/sql_execution_reward_func": 0.09025862067937851, "rewards/sql_execution_reward_func/std": 0.11875604838132858, "rewards/xml_reward_func": 0.9979166984558105, "rewards/xml_reward_func/std": 0.00589255103841424, "step": 463 }, { "clip_ratio/high_max": 0.00016219285316765308, "clip_ratio/high_mean": 0.00016219285316765308, "clip_ratio/low_mean": 8.109642658382654e-05, "clip_ratio/low_min": 8.109642658382654e-05, "clip_ratio/region_mean": 0.0002432892651995644, "epoch": 0.031955922865013774, "grad_norm": 0.04404540254160283, "kl": 0.7578125, "learning_rate": 8.85812968278909e-07, "loss": -0.0308, "step": 464 }, { "clip_ratio/high_max": 0.0006648935959674418, "clip_ratio/high_mean": 0.0006648935959674418, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006648935959674418, "completion_length": 188.0, "epoch": 0.03202479338842975, "grad_norm": 0.22845440499424977, "kl": 0.234375, "learning_rate": 8.851149772657672e-07, "loss": 0.1225, "reward": 3.241877555847168, "reward_std": 0.6978832483291626, "rewards/accuracy_reward": 0.5000019073486328, "rewards/accuracy_reward/std": 0.23145267367362976, "rewards/format_reward_func": 0.737500011920929, "rewards/format_reward_func/std": 0.11877350509166718, "rewards/ngram_similarity_reward": 0.1445825695991516, "rewards/ngram_similarity_reward/std": 0.09484723210334778, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 0.9375, "rewards/xml_reward_func/std": 0.1767766922712326, "step": 465 }, { "clip_ratio/high_max": 0.0006648935959674418, "clip_ratio/high_mean": 0.0006648935959674418, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006648935959674418, "epoch": 0.03209366391184573, "grad_norm": 0.23232890991410818, "kl": 0.23046875, "learning_rate": 8.844151714648274e-07, "loss": 0.123, "step": 466 }, { "clip_ratio/high_max": 0.0002554930979385972, "clip_ratio/high_mean": 0.0002554930979385972, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002554930979385972, "completion_length": 489.25, "epoch": 0.03216253443526171, "grad_norm": 0.2521071165206778, "kl": 1.75, "learning_rate": 8.837135546658307e-07, "loss": 1.0602, "reward": 4.377957344055176, "reward_std": 1.1933263540267944, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.7649999856948853, "rewards/format_reward_func/std": 0.10993504524230957, "rewards/ngram_similarity_reward": 0.3601817488670349, "rewards/ngram_similarity_reward/std": 0.2570459246635437, "rewards/sql_execution_reward_func": 0.32499998807907104, "rewards/sql_execution_reward_func/std": 0.1414213478565216, "rewards/xml_reward_func": 0.9976851940155029, "rewards/xml_reward_func/std": 0.006547281518578529, "step": 467 }, { "clip_ratio/high_max": 0.0002554930979385972, "clip_ratio/high_mean": 0.0002554930979385972, "clip_ratio/low_mean": 0.0002554930979385972, "clip_ratio/low_min": 0.0002554930979385972, "clip_ratio/region_mean": 0.0005109861958771944, "epoch": 0.032231404958677684, "grad_norm": 0.24848097342639155, "kl": 1.6640625, "learning_rate": 8.830101306683258e-07, "loss": 1.0596, "step": 468 }, { "clip_ratio/high_max": 6.624709931202233e-05, "clip_ratio/high_mean": 6.624709931202233e-05, "clip_ratio/low_mean": 0.0005299767944961786, "clip_ratio/low_min": 0.0005299767944961786, "clip_ratio/region_mean": 0.0005962239229120314, "completion_length": 1886.875, "epoch": 0.03230027548209367, "grad_norm": 0.22337701627754475, "kl": 2.359375, "learning_rate": 8.823049032816478e-07, "loss": 0.2351, "reward": 1.8340365886688232, "reward_std": 0.7838248610496521, "rewards/accuracy_reward": 0.00937500037252903, "rewards/accuracy_reward/std": 0.012938730418682098, "rewards/format_reward_func": 0.6956372261047363, "rewards/format_reward_func/std": 0.2166592925786972, "rewards/ngram_similarity_reward": 0.17884422838687897, "rewards/ngram_similarity_reward/std": 0.26237162947654724, "rewards/sql_execution_reward_func": 0.011607143096625805, "rewards/sql_execution_reward_func/std": 0.03282995894551277, "rewards/xml_reward_func": 0.8397759199142456, "rewards/xml_reward_func/std": 0.21221239864826202, "step": 469 }, { "clip_ratio/high_max": 6.624709931202233e-05, "clip_ratio/high_mean": 6.624709931202233e-05, "clip_ratio/low_mean": 0.00033123549656011164, "clip_ratio/low_min": 0.00033123549656011164, "clip_ratio/region_mean": 0.00039748259587213397, "epoch": 0.03236914600550964, "grad_norm": 0.1860690773264672, "kl": 2.21875, "learning_rate": 8.815978763248977e-07, "loss": 0.235, "step": 470 }, { "clip_ratio/high_max": 0.00014355441089719534, "clip_ratio/high_mean": 0.00014355441089719534, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00014355441089719534, "completion_length": 870.75, "epoch": 0.03243801652892562, "grad_norm": 0.3496713653044142, "kl": 3.296875, "learning_rate": 8.808890536269229e-07, "loss": 0.5749, "reward": 2.5565342903137207, "reward_std": 0.7250954508781433, "rewards/accuracy_reward": 0.1875, "rewards/accuracy_reward/std": 0.1157275140285492, "rewards/format_reward_func": 0.7739912271499634, "rewards/format_reward_func/std": 0.223030224442482, "rewards/ngram_similarity_reward": 0.180467426776886, "rewards/ngram_similarity_reward/std": 0.14401423931121826, "rewards/sql_execution_reward_func": 0.2593750059604645, "rewards/sql_execution_reward_func/std": 0.11947616189718246, "rewards/xml_reward_func": 0.8774670958518982, "rewards/xml_reward_func/std": 0.29928141832351685, "step": 471 }, { "clip_ratio/high_max": 0.0002871088217943907, "clip_ratio/high_mean": 0.0002871088217943907, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002871088217943907, "epoch": 0.032506887052341595, "grad_norm": 0.302669281058022, "kl": 3.125, "learning_rate": 8.801784390262943e-07, "loss": 0.5748, "step": 472 }, { "clip_ratio/high_max": 0.00013401232718024403, "clip_ratio/high_mean": 0.00013401232718024403, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00013401232718024403, "completion_length": 932.75, "epoch": 0.03257575757575758, "grad_norm": 0.20051178113659238, "kl": 1.3125, "learning_rate": 8.794660363712876e-07, "loss": 1.1936, "reward": 3.4594569206237793, "reward_std": 1.2624660730361938, "rewards/accuracy_reward": 0.65625, "rewards/accuracy_reward/std": 0.48065248131752014, "rewards/format_reward_func": 0.7908333539962769, "rewards/format_reward_func/std": 0.20088493824005127, "rewards/ngram_similarity_reward": 0.14491569995880127, "rewards/ngram_similarity_reward/std": 0.0916602686047554, "rewards/sql_execution_reward_func": 0.20125000178813934, "rewards/sql_execution_reward_func/std": 0.1639196127653122, "rewards/xml_reward_func": 0.9375, "rewards/xml_reward_func/std": 0.1767766922712326, "step": 473 }, { "clip_ratio/high_max": 0.00013401232718024403, "clip_ratio/high_mean": 0.00013401232718024403, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00013401232718024403, "epoch": 0.03264462809917355, "grad_norm": 0.19896784964196623, "kl": 1.3125, "learning_rate": 8.787518495198611e-07, "loss": 1.1933, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00032819167245179415, "clip_ratio/low_min": 0.00032819167245179415, "clip_ratio/region_mean": 0.00032819167245179415, "completion_length": 380.875, "epoch": 0.03271349862258953, "grad_norm": 0.9445374703960155, "kl": 42.5, "learning_rate": 8.780358823396352e-07, "loss": 0.0487, "reward": 3.501600503921509, "reward_std": 0.6235167980194092, "rewards/accuracy_reward": 0.4375, "rewards/accuracy_reward/std": 0.3471825420856476, "rewards/format_reward_func": 0.815000057220459, "rewards/format_reward_func/std": 0.10184020549058914, "rewards/ngram_similarity_reward": 0.2924559712409973, "rewards/ngram_similarity_reward/std": 0.08771280944347382, "rewards/sql_execution_reward_func": 0.37291666865348816, "rewards/sql_execution_reward_func/std": 0.04792313277721405, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03278236914600551, "grad_norm": 0.9379768332032067, "kl": 42.5, "learning_rate": 8.773181387078719e-07, "loss": 0.0486, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 336.125, "epoch": 0.03285123966942149, "grad_norm": 0.044099538683761125, "kl": 0.035400390625, "learning_rate": 8.765986225114532e-07, "loss": 0.1032, "reward": 4.431956768035889, "reward_std": 0.16714437305927277, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.6916666626930237, "rewards/format_reward_func/std": 0.117851123213768, "rewards/ngram_similarity_reward": 0.27963799238204956, "rewards/ngram_similarity_reward/std": 0.05438057333230972, "rewards/sql_execution_reward_func": 0.36250001192092896, "rewards/sql_execution_reward_func/std": 0.0353553369641304, "rewards/xml_reward_func": 0.9583333730697632, "rewards/xml_reward_func/std": 0.117851123213768, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.032920110192837464, "grad_norm": 0.044005377605110994, "kl": 0.03466796875, "learning_rate": 8.758773376468604e-07, "loss": 0.103, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 214.125, "epoch": 0.032988980716253447, "grad_norm": 0.14777897928850506, "kl": 0.55078125, "learning_rate": 8.751542880201526e-07, "loss": 0.0091, "reward": 5.001431465148926, "reward_std": 0.6070734262466431, "rewards/accuracy_reward": 0.9600079655647278, "rewards/accuracy_reward/std": 0.11311451345682144, "rewards/format_reward_func": 0.7583333849906921, "rewards/format_reward_func/std": 0.1003960371017456, "rewards/ngram_similarity_reward": 0.6612216234207153, "rewards/ngram_similarity_reward/std": 0.32034412026405334, "rewards/sql_execution_reward_func": 0.33125001192092896, "rewards/sql_execution_reward_func/std": 0.0530330091714859, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03305785123966942, "grad_norm": 0.1477362353682078, "kl": 0.55078125, "learning_rate": 8.744294775469463e-07, "loss": 0.0089, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00012592872371897101, "clip_ratio/low_min": 0.00012592872371897101, "clip_ratio/region_mean": 0.00012592872371897101, "completion_length": 992.625, "epoch": 0.0331267217630854, "grad_norm": 0.12155352965868312, "kl": 0.609375, "learning_rate": 8.737029101523929e-07, "loss": 0.5843, "reward": 2.856083393096924, "reward_std": 0.9655802845954895, "rewards/accuracy_reward": 0.28437501192092896, "rewards/accuracy_reward/std": 0.33619123697280884, "rewards/format_reward_func": 0.8411905169487, "rewards/format_reward_func/std": 0.1464454084634781, "rewards/ngram_similarity_reward": 0.15676309168338776, "rewards/ngram_similarity_reward/std": 0.16697706282138824, "rewards/sql_execution_reward_func": 0.23778408765792847, "rewards/sql_execution_reward_func/std": 0.14479832351207733, "rewards/xml_reward_func": 0.9732142686843872, "rewards/xml_reward_func/std": 0.07576145231723785, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00012592872371897101, "clip_ratio/low_min": 0.00012592872371897101, "clip_ratio/region_mean": 0.00012592872371897101, "epoch": 0.033195592286501374, "grad_norm": 0.12261200290191361, "kl": 0.609375, "learning_rate": 8.729745897711594e-07, "loss": 0.5842, "step": 482 }, { "clip_ratio/high_max": 0.00016775708354543895, "clip_ratio/high_mean": 0.00016775708354543895, "clip_ratio/low_mean": 8.387854177271947e-05, "clip_ratio/low_min": 8.387854177271947e-05, "clip_ratio/region_mean": 0.0002516356180422008, "completion_length": 1490.25, "epoch": 0.03326446280991736, "grad_norm": 0.07365152316005921, "kl": 1.078125, "learning_rate": 8.722445203474052e-07, "loss": 0.3494, "reward": 2.502516746520996, "reward_std": 0.8060210347175598, "rewards/accuracy_reward": 0.2281745970249176, "rewards/accuracy_reward/std": 0.19133639335632324, "rewards/format_reward_func": 0.7302182912826538, "rewards/format_reward_func/std": 0.20680543780326843, "rewards/ngram_similarity_reward": 0.04706794023513794, "rewards/ngram_similarity_reward/std": 0.03163953870534897, "rewards/sql_execution_reward_func": 0.3356249928474426, "rewards/sql_execution_reward_func/std": 0.12624911963939667, "rewards/xml_reward_func": 0.9097222089767456, "rewards/xml_reward_func/std": 0.16782008111476898, "step": 483 }, { "clip_ratio/high_max": 0.0002516356180422008, "clip_ratio/high_mean": 0.0002516356180422008, "clip_ratio/low_mean": 0.0002516356180422008, "clip_ratio/low_min": 0.0002516356180422008, "clip_ratio/region_mean": 0.0005032712360844016, "epoch": 0.03333333333333333, "grad_norm": 0.0759953205350708, "kl": 1.1796875, "learning_rate": 8.715127058347614e-07, "loss": 0.3498, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 417.0, "epoch": 0.03340220385674931, "grad_norm": 0.298270071805956, "kl": 16.75, "learning_rate": 8.707791501963101e-07, "loss": 0.0208, "reward": 2.9200642108917236, "reward_std": 0.12474224716424942, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7944444417953491, "rewards/format_reward_func/std": 0.09558138251304626, "rewards/ngram_similarity_reward": 0.1837465763092041, "rewards/ngram_similarity_reward/std": 0.03896722197532654, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03347107438016529, "grad_norm": 0.33645759508013623, "kl": 17.5, "learning_rate": 8.700438574045617e-07, "loss": 0.0224, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 868.875, "epoch": 0.03353994490358127, "grad_norm": 0.24942233306121164, "kl": 4.75, "learning_rate": 8.693068314414344e-07, "loss": 0.7333, "reward": 2.623732328414917, "reward_std": 0.7771828174591064, "rewards/accuracy_reward": 0.26344072818756104, "rewards/accuracy_reward/std": 0.11169245094060898, "rewards/format_reward_func": 0.743678867816925, "rewards/format_reward_func/std": 0.16568545997142792, "rewards/ngram_similarity_reward": 0.11924709379673004, "rewards/ngram_similarity_reward/std": 0.1196996420621872, "rewards/sql_execution_reward_func": 0.29374998807907104, "rewards/sql_execution_reward_func/std": 0.15221577882766724, "rewards/xml_reward_func": 0.8805513381958008, "rewards/xml_reward_func/std": 0.20993514358997345, "step": 487 }, { "clip_ratio/high_max": 0.0002877283841371536, "clip_ratio/high_mean": 0.0002877283841371536, "clip_ratio/low_mean": 0.0001438641920685768, "clip_ratio/low_min": 0.0001438641920685768, "clip_ratio/region_mean": 0.00043159257620573044, "epoch": 0.03360881542699724, "grad_norm": 0.249492934750294, "kl": 4.75, "learning_rate": 8.68568076298232e-07, "loss": 0.7337, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 169.25, "epoch": 0.033677685950413226, "grad_norm": 0.10738632553684634, "kl": 0.021240234375, "learning_rate": 8.678275959756228e-07, "loss": -0.0069, "reward": 5.2584991455078125, "reward_std": 0.42539364099502563, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.7556662559509277, "rewards/ngram_similarity_reward/std": 0.29582479596138, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 489 }, { "clip_ratio/high_max": 0.0007385524222627282, "clip_ratio/high_mean": 0.0007385524222627282, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007385524222627282, "epoch": 0.0337465564738292, "grad_norm": 0.1071846645750683, "kl": 0.0211181640625, "learning_rate": 8.670853944836176e-07, "loss": -0.0075, "step": 490 }, { "clip_ratio/high_max": 0.00011130899656563997, "clip_ratio/high_mean": 0.00011130899656563997, "clip_ratio/low_mean": 0.0006678539793938398, "clip_ratio/low_min": 0.0006678539793938398, "clip_ratio/region_mean": 0.0007791629759594798, "completion_length": 1123.0, "epoch": 0.03381542699724518, "grad_norm": 0.061036200616448946, "kl": 1.0390625, "learning_rate": 8.663414758415478e-07, "loss": 0.2328, "reward": 2.2063238620758057, "reward_std": 0.37843915820121765, "rewards/accuracy_reward": 0.03125, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/format_reward_func": 0.8272435665130615, "rewards/format_reward_func/std": 0.22850307822227478, "rewards/ngram_similarity_reward": 0.0, "rewards/ngram_similarity_reward/std": 0.0, "rewards/sql_execution_reward_func": 0.40732550621032715, "rewards/sql_execution_reward_func/std": 0.11117535829544067, "rewards/xml_reward_func": 0.909254789352417, "rewards/xml_reward_func/std": 0.1924353539943695, "step": 491 }, { "clip_ratio/high_max": 0.00011130899656563997, "clip_ratio/high_mean": 0.00011130899656563997, "clip_ratio/low_mean": 0.0007791629759594798, "clip_ratio/low_min": 0.0007791629759594798, "clip_ratio/region_mean": 0.0008904719725251198, "epoch": 0.033884297520661154, "grad_norm": 0.06028727462543812, "kl": 1.046875, "learning_rate": 8.65595844078044e-07, "loss": 0.2329, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00034716195659711957, "clip_ratio/low_min": 0.00034716195659711957, "clip_ratio/region_mean": 0.00034716195659711957, "completion_length": 720.125, "epoch": 0.03395316804407714, "grad_norm": 0.6914494721238268, "kl": 3.140625, "learning_rate": 8.648485032310144e-07, "loss": 1.9231, "reward": 3.6308975219726562, "reward_std": 1.5161113739013672, "rewards/accuracy_reward": 0.7162460088729858, "rewards/accuracy_reward/std": 0.35727736353874207, "rewards/format_reward_func": 0.6583333015441895, "rewards/format_reward_func/std": 0.2237842082977295, "rewards/ngram_similarity_reward": 0.2669927477836609, "rewards/ngram_similarity_reward/std": 0.36931562423706055, "rewards/sql_execution_reward_func": 0.3062499761581421, "rewards/sql_execution_reward_func/std": 0.1237436905503273, "rewards/xml_reward_func": 0.8333333730697632, "rewards/xml_reward_func/std": 0.35634833574295044, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010414858115836978, "clip_ratio/low_min": 0.0010414858115836978, "clip_ratio/region_mean": 0.0010414858115836978, "epoch": 0.03402203856749311, "grad_norm": 0.4645940422351772, "kl": 3.4375, "learning_rate": 8.640994573476223e-07, "loss": 1.9223, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 226.125, "epoch": 0.03409090909090909, "grad_norm": 0.13832286184627293, "kl": 0.1845703125, "learning_rate": 8.633487104842642e-07, "loss": 0.174, "reward": 3.1244874000549316, "reward_std": 0.4548882246017456, "rewards/accuracy_reward": 0.3645833134651184, "rewards/accuracy_reward/std": 0.1473139226436615, "rewards/format_reward_func": 0.75, "rewards/format_reward_func/std": 0.17728103697299957, "rewards/ngram_similarity_reward": 0.26354721188545227, "rewards/ngram_similarity_reward/std": 0.19815123081207275, "rewards/sql_execution_reward_func": 0.3125, "rewards/sql_execution_reward_func/std": 0.12747548520565033, "rewards/xml_reward_func": 0.9375, "rewards/xml_reward_func/std": 0.1767766922712326, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0005527915782295167, "clip_ratio/low_min": 0.0005527915782295167, "clip_ratio/region_mean": 0.0005527915782295167, "epoch": 0.03415977961432507, "grad_norm": 0.13409644123112052, "kl": 0.1806640625, "learning_rate": 8.625962667065487e-07, "loss": 0.174, "step": 496 }, { "clip_ratio/high_max": 0.0003460806328803301, "clip_ratio/high_mean": 0.0003460806328803301, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003460806328803301, "completion_length": 722.375, "epoch": 0.03422865013774105, "grad_norm": 0.13969321352422093, "kl": 1.46875, "learning_rate": 8.618421300892737e-07, "loss": 0.7455, "reward": 3.2855281829833984, "reward_std": 1.1689748764038086, "rewards/accuracy_reward": 0.40287595987319946, "rewards/accuracy_reward/std": 0.3485274314880371, "rewards/format_reward_func": 0.8666666746139526, "rewards/format_reward_func/std": 0.11818736791610718, "rewards/ngram_similarity_reward": 0.2319340705871582, "rewards/ngram_similarity_reward/std": 0.2539976239204407, "rewards/sql_execution_reward_func": 0.29124999046325684, "rewards/sql_execution_reward_func/std": 0.13108748197555542, "rewards/xml_reward_func": 0.9739583134651184, "rewards/xml_reward_func/std": 0.05866192281246185, "step": 497 }, { "clip_ratio/high_max": 0.0005191209493204951, "clip_ratio/high_mean": 0.0005191209493204951, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005191209493204951, "epoch": 0.03429752066115702, "grad_norm": 0.13723837142871917, "kl": 1.46875, "learning_rate": 8.610863047164045e-07, "loss": 0.7448, "step": 498 }, { "clip_ratio/high_max": 0.00019015022553503513, "clip_ratio/high_mean": 0.00019015022553503513, "clip_ratio/low_mean": 0.0017113519133999944, "clip_ratio/low_min": 0.0017113519133999944, "clip_ratio/region_mean": 0.0019015021389350295, "completion_length": 657.375, "epoch": 0.034366391184573006, "grad_norm": 1.4939897812267893, "kl": 3.953125, "learning_rate": 8.603287946810513e-07, "loss": 1.815, "reward": 3.853968858718872, "reward_std": 1.845828652381897, "rewards/accuracy_reward": 0.5985852479934692, "rewards/accuracy_reward/std": 0.43767330050468445, "rewards/format_reward_func": 0.716442346572876, "rewards/format_reward_func/std": 0.22365957498550415, "rewards/ngram_similarity_reward": 0.49325019121170044, "rewards/ngram_similarity_reward/std": 0.3717215657234192, "rewards/sql_execution_reward_func": 0.328125, "rewards/sql_execution_reward_func/std": 0.13458767533302307, "rewards/xml_reward_func": 0.8723558187484741, "rewards/xml_reward_func/std": 0.34127816557884216, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001331051578745246, "clip_ratio/low_min": 0.001331051578745246, "clip_ratio/region_mean": 0.001331051578745246, "epoch": 0.03443526170798898, "grad_norm": 1.4769869443160453, "kl": 4.0625, "learning_rate": 8.595696040854483e-07, "loss": 1.8142, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 254.25, "epoch": 0.03450413223140496, "grad_norm": 0.1060018939988467, "kl": 0.478515625, "learning_rate": 8.588087370409302e-07, "loss": 0.1146, "reward": 2.9963579177856445, "reward_std": 0.2760652005672455, "rewards/accuracy_reward": 0.21900270879268646, "rewards/accuracy_reward/std": 0.08849108219146729, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.10690449178218842, "rewards/ngram_similarity_reward": 0.2701515257358551, "rewards/ngram_similarity_reward/std": 0.12649184465408325, "rewards/sql_execution_reward_func": 0.3531249761581421, "rewards/sql_execution_reward_func/std": 0.008838837035000324, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03457300275482093, "grad_norm": 0.10512254158200064, "kl": 0.50390625, "learning_rate": 8.580461976679099e-07, "loss": 0.1146, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00019696670642588288, "clip_ratio/low_min": 0.00019696670642588288, "clip_ratio/region_mean": 0.00019696670642588288, "completion_length": 634.625, "epoch": 0.034641873278236916, "grad_norm": 1.011839344099846, "kl": 4.125, "learning_rate": 8.572819900958576e-07, "loss": 2.4371, "reward": 3.9859938621520996, "reward_std": 1.5824122428894043, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.7583333253860474, "rewards/format_reward_func/std": 0.2362067997455597, "rewards/ngram_similarity_reward": 0.21010708808898926, "rewards/ngram_similarity_reward/std": 0.27943435311317444, "rewards/sql_execution_reward_func": 0.2874999940395355, "rewards/sql_execution_reward_func/std": 0.12747548520565033, "rewards/xml_reward_func": 0.875, "rewards/xml_reward_func/std": 0.3535533845424652, "step": 503 }, { "clip_ratio/high_max": 0.00019696670642588288, "clip_ratio/high_mean": 0.00019696670642588288, "clip_ratio/low_mean": 0.0011818002676591277, "clip_ratio/low_min": 0.0011818002676591277, "clip_ratio/region_mean": 0.0013787669595330954, "epoch": 0.03471074380165289, "grad_norm": 0.5493168724041478, "kl": 4.59375, "learning_rate": 8.565161184632766e-07, "loss": 2.4357, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 170.125, "epoch": 0.03477961432506887, "grad_norm": 0.11946702388195814, "kl": 2.96875, "learning_rate": 8.557485869176825e-07, "loss": -0.0566, "reward": 4.787670612335205, "reward_std": 0.3516136407852173, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8083333969116211, "rewards/format_reward_func/std": 0.0235702246427536, "rewards/ngram_similarity_reward": 0.41539162397384644, "rewards/ngram_similarity_reward/std": 0.21670576930046082, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03484848484848485, "grad_norm": 0.12052158294202074, "kl": 2.9375, "learning_rate": 8.549793996155795e-07, "loss": -0.0566, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00011502846609801054, "clip_ratio/low_min": 0.00011502846609801054, "clip_ratio/region_mean": 0.00011502846609801054, "completion_length": 2173.375, "epoch": 0.03491735537190083, "grad_norm": 0.06746261887304561, "kl": 3.015625, "learning_rate": 8.542085607224388e-07, "loss": 0.0561, "reward": 1.9623353481292725, "reward_std": 0.22443898022174835, "rewards/accuracy_reward": 0.01875000074505806, "rewards/accuracy_reward/std": 0.022160131484270096, "rewards/format_reward_func": 0.8582721948623657, "rewards/format_reward_func/std": 0.09882131218910217, "rewards/ngram_similarity_reward": 0.07245595753192902, "rewards/ngram_similarity_reward/std": 0.06096646934747696, "rewards/sql_execution_reward_func": 0.02083333395421505, "rewards/sql_execution_reward_func/std": 0.0589255727827549, "rewards/xml_reward_func": 0.9370457530021667, "rewards/xml_reward_func/std": 0.09396952390670776, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 5.751423304900527e-05, "clip_ratio/low_min": 5.751423304900527e-05, "clip_ratio/region_mean": 5.751423304900527e-05, "epoch": 0.0349862258953168, "grad_norm": 0.06600042123591136, "kl": 3.0, "learning_rate": 8.534360744126753e-07, "loss": 0.056, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0003660322108771652, "clip_ratio/low_min": 0.0003660322108771652, "clip_ratio/region_mean": 0.0003660322108771652, "completion_length": 341.5, "epoch": 0.035055096418732785, "grad_norm": 0.15126914297584249, "kl": 1.140625, "learning_rate": 8.526619448696261e-07, "loss": 0.3211, "reward": 4.349348545074463, "reward_std": 1.0124865770339966, "rewards/accuracy_reward": 0.6889104843139648, "rewards/accuracy_reward/std": 0.42489340901374817, "rewards/format_reward_func": 0.8100000023841858, "rewards/format_reward_func/std": 0.12359688431024551, "rewards/ngram_similarity_reward": 0.4602891206741333, "rewards/ngram_similarity_reward/std": 0.32551151514053345, "rewards/sql_execution_reward_func": 0.4749999940395355, "rewards/sql_execution_reward_func/std": 0.23754696547985077, "rewards/xml_reward_func": 0.99609375, "rewards/xml_reward_func/std": 0.011048543266952038, "step": 509 }, { "clip_ratio/high_max": 0.0003660322108771652, "clip_ratio/high_mean": 0.0003660322108771652, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003660322108771652, "epoch": 0.03512396694214876, "grad_norm": 0.1516614215476673, "kl": 1.1484375, "learning_rate": 8.518861762855258e-07, "loss": 0.3215, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 137.375, "epoch": 0.03519283746556474, "grad_norm": 0.004197897868267989, "kl": 0.1328125, "learning_rate": 8.511087728614862e-07, "loss": 0.0003, "reward": 5.649999618530273, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.0, "rewards/ngram_similarity_reward": 1.0, "rewards/ngram_similarity_reward/std": 0.0, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03526170798898071, "grad_norm": 0.0045103495135037025, "kl": 0.1416015625, "learning_rate": 8.503297388074719e-07, "loss": 0.0003, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 246.0, "epoch": 0.035330578512396696, "grad_norm": 0.09903817518641593, "kl": 0.0908203125, "learning_rate": 8.495490783422785e-07, "loss": 0.1358, "reward": 5.004542350769043, "reward_std": 0.399242103099823, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7166666984558105, "rewards/format_reward_func/std": 0.11683660745620728, "rewards/ngram_similarity_reward": 0.6974728107452393, "rewards/ngram_similarity_reward/std": 0.16351094841957092, "rewards/sql_execution_reward_func": 0.26249998807907104, "rewards/sql_execution_reward_func/std": 0.16201850771903992, "rewards/xml_reward_func": 0.9791666269302368, "rewards/xml_reward_func/std": 0.0589255727827549, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03539944903581267, "grad_norm": 0.09998565778289477, "kl": 0.08935546875, "learning_rate": 8.487667956935087e-07, "loss": 0.1362, "step": 514 }, { "clip_ratio/high_max": 0.00014452954928856343, "clip_ratio/high_mean": 0.00014452954928856343, "clip_ratio/low_mean": 0.00014452954928856343, "clip_ratio/low_min": 0.00014452954928856343, "clip_ratio/region_mean": 0.00028905909857712686, "completion_length": 864.875, "epoch": 0.03546831955922865, "grad_norm": 0.07581140479020315, "kl": 1.0703125, "learning_rate": 8.479828950975505e-07, "loss": 0.3251, "reward": 2.7569894790649414, "reward_std": 0.5242893099784851, "rewards/accuracy_reward": 0.218753844499588, "rewards/accuracy_reward/std": 0.08838990330696106, "rewards/format_reward_func": 0.8576388955116272, "rewards/format_reward_func/std": 0.11988888680934906, "rewards/ngram_similarity_reward": 0.1658812165260315, "rewards/ngram_similarity_reward/std": 0.0985482782125473, "rewards/sql_execution_reward_func": 0.26250001788139343, "rewards/sql_execution_reward_func/std": 0.1642080694437027, "rewards/xml_reward_func": 0.9505208730697632, "rewards/xml_reward_func/std": 0.11676096171140671, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00014452954928856343, "clip_ratio/low_min": 0.00014452954928856343, "clip_ratio/region_mean": 0.00014452954928856343, "epoch": 0.03553719008264463, "grad_norm": 0.07622536155956287, "kl": 1.1171875, "learning_rate": 8.471973807995534e-07, "loss": 0.3251, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00015103459008969367, "clip_ratio/low_min": 0.00015103459008969367, "clip_ratio/region_mean": 0.00015103459008969367, "completion_length": 827.625, "epoch": 0.035606060606060606, "grad_norm": 0.9658185674098715, "kl": 10.1875, "learning_rate": 8.464102570534061e-07, "loss": 1.3818, "reward": 2.975598096847534, "reward_std": 1.282923936843872, "rewards/accuracy_reward": 0.28437501192092896, "rewards/accuracy_reward/std": 0.30849215388298035, "rewards/format_reward_func": 0.7583333849906921, "rewards/format_reward_func/std": 0.23076823353767395, "rewards/ngram_similarity_reward": 0.33942651748657227, "rewards/ngram_similarity_reward/std": 0.279974102973938, "rewards/sql_execution_reward_func": 0.2800000011920929, "rewards/sql_execution_reward_func/std": 0.139591246843338, "rewards/xml_reward_func": 0.859375, "rewards/xml_reward_func/std": 0.3499840497970581, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00015103459008969367, "clip_ratio/low_min": 0.00015103459008969367, "clip_ratio/region_mean": 0.00015103459008969367, "epoch": 0.03567493112947658, "grad_norm": 0.8657050728944141, "kl": 9.75, "learning_rate": 8.456215281217132e-07, "loss": 1.3814, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 252.25, "epoch": 0.035743801652892565, "grad_norm": 0.05587859561545771, "kl": 0.031982421875, "learning_rate": 8.448311982757712e-07, "loss": -0.0388, "reward": 3.0235886573791504, "reward_std": 0.24513524770736694, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7916666269302368, "rewards/format_reward_func/std": 0.10947203636169434, "rewards/ngram_similarity_reward": 0.2504480481147766, "rewards/ngram_similarity_reward/std": 0.15530943870544434, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03581267217630854, "grad_norm": 0.055374677794700025, "kl": 0.032958984375, "learning_rate": 8.440392717955475e-07, "loss": -0.0387, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 347.75, "epoch": 0.03588154269972452, "grad_norm": 0.12179095938879295, "kl": 0.2431640625, "learning_rate": 8.432457529696548e-07, "loss": 0.1361, "reward": 4.198591709136963, "reward_std": 0.8613433241844177, "rewards/accuracy_reward": 0.671875, "rewards/accuracy_reward/std": 0.27497971057891846, "rewards/format_reward_func": 0.8375000357627869, "rewards/format_reward_func/std": 0.041547439992427826, "rewards/ngram_similarity_reward": 0.44072771072387695, "rewards/ngram_similarity_reward/std": 0.2666963040828705, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.07288689911365509, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.000359453639248386, "clip_ratio/low_min": 0.000359453639248386, "clip_ratio/region_mean": 0.000359453639248386, "epoch": 0.03595041322314049, "grad_norm": 0.1222859788200284, "kl": 0.251953125, "learning_rate": 8.424506460953297e-07, "loss": 0.1362, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0006021677982062101, "clip_ratio/low_min": 0.0006021677982062101, "clip_ratio/region_mean": 0.0006021677982062101, "completion_length": 622.75, "epoch": 0.036019283746556476, "grad_norm": 0.42776986347419566, "kl": 4.8125, "learning_rate": 8.416539554784089e-07, "loss": 2.043, "reward": 3.664046049118042, "reward_std": 1.369542121887207, "rewards/accuracy_reward": 0.7083333134651184, "rewards/accuracy_reward/std": 0.41785547137260437, "rewards/format_reward_func": 0.6797619462013245, "rewards/format_reward_func/std": 0.20002835988998413, "rewards/ngram_similarity_reward": 0.21591176092624664, "rewards/ngram_similarity_reward/std": 0.15187130868434906, "rewards/sql_execution_reward_func": 0.3062499761581421, "rewards/sql_execution_reward_func/std": 0.1237436905503273, "rewards/xml_reward_func": 0.9375, "rewards/xml_reward_func/std": 0.1767766922712326, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03608815426997245, "grad_norm": 0.3256870042361319, "kl": 4.59375, "learning_rate": 8.408556854333048e-07, "loss": 2.0425, "step": 524 }, { "clip_ratio/high_max": 0.00012091898679500446, "clip_ratio/high_mean": 0.00012091898679500446, "clip_ratio/low_mean": 0.00012091898679500446, "clip_ratio/low_min": 0.00012091898679500446, "clip_ratio/region_mean": 0.00024183797359000891, "completion_length": 1033.75, "epoch": 0.03615702479338843, "grad_norm": 0.03923700917164751, "kl": 0.05517578125, "learning_rate": 8.400558402829841e-07, "loss": 0.117, "reward": 2.409860610961914, "reward_std": 0.4755810499191284, "rewards/accuracy_reward": 0.09270833432674408, "rewards/accuracy_reward/std": 0.1560574173927307, "rewards/format_reward_func": 0.8516572117805481, "rewards/format_reward_func/std": 0.17481385171413422, "rewards/ngram_similarity_reward": 0.2092457115650177, "rewards/ngram_similarity_reward/std": 0.10114887356758118, "rewards/sql_execution_reward_func": 0.09583333134651184, "rewards/sql_execution_reward_func/std": 0.09708039462566376, "rewards/xml_reward_func": 0.9630848169326782, "rewards/xml_reward_func/std": 0.07109083235263824, "step": 525 }, { "clip_ratio/high_max": 0.00024183797359000891, "clip_ratio/high_mean": 0.00024183797359000891, "clip_ratio/low_mean": 0.00012091898679500446, "clip_ratio/low_min": 0.00012091898679500446, "clip_ratio/region_mean": 0.00036275695310905576, "epoch": 0.03622589531680441, "grad_norm": 0.03904797912934992, "kl": 0.05712890625, "learning_rate": 8.392544243589427e-07, "loss": 0.1171, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0002002002001972869, "clip_ratio/low_min": 0.0002002002001972869, "clip_ratio/region_mean": 0.0002002002001972869, "completion_length": 624.375, "epoch": 0.036294765840220386, "grad_norm": 0.20011042334658902, "kl": 3.046875, "learning_rate": 8.38451442001183e-07, "loss": 1.0886, "reward": 3.5610218048095703, "reward_std": 1.0559192895889282, "rewards/accuracy_reward": 0.6625000238418579, "rewards/accuracy_reward/std": 0.4711308777332306, "rewards/format_reward_func": 0.800694465637207, "rewards/format_reward_func/std": 0.10815256834030151, "rewards/ngram_similarity_reward": 0.13466274738311768, "rewards/ngram_similarity_reward/std": 0.07716767489910126, "rewards/sql_execution_reward_func": 0.24375000596046448, "rewards/sql_execution_reward_func/std": 0.1590990275144577, "rewards/xml_reward_func": 0.9895833730697632, "rewards/xml_reward_func/std": 0.029462775215506554, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03636363636363636, "grad_norm": 0.20013663385008285, "kl": 3.046875, "learning_rate": 8.376468975581906e-07, "loss": 1.0881, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 197.875, "epoch": 0.036432506887052345, "grad_norm": 0.23975493268125161, "kl": 5.46875, "learning_rate": 8.368407953869103e-07, "loss": 0.256, "reward": 4.151838302612305, "reward_std": 0.809045135974884, "rewards/accuracy_reward": 0.7244318127632141, "rewards/accuracy_reward/std": 0.3021073043346405, "rewards/format_reward_func": 0.815000057220459, "rewards/format_reward_func/std": 0.04242641106247902, "rewards/ngram_similarity_reward": 0.34614986181259155, "rewards/ngram_similarity_reward/std": 0.3169597089290619, "rewards/sql_execution_reward_func": 0.3687499761581421, "rewards/sql_execution_reward_func/std": 0.0530330128967762, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03650137741046832, "grad_norm": 0.23456611409810638, "kl": 5.21875, "learning_rate": 8.360331398527225e-07, "loss": 0.2551, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 378.125, "epoch": 0.036570247933884296, "grad_norm": 0.0740527440516298, "kl": 0.71484375, "learning_rate": 8.352239353294194e-07, "loss": 0.0303, "reward": 2.961812973022461, "reward_std": 0.4316392242908478, "rewards/accuracy_reward": 0.21875, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/format_reward_func": 0.7983333468437195, "rewards/format_reward_func/std": 0.09166882932186127, "rewards/ngram_similarity_reward": 0.2902364730834961, "rewards/ngram_similarity_reward/std": 0.07914653420448303, "rewards/sql_execution_reward_func": 0.2906249761581421, "rewards/sql_execution_reward_func/std": 0.1295166164636612, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03663911845730027, "grad_norm": 0.07407885353791115, "kl": 0.71484375, "learning_rate": 8.344131861991828e-07, "loss": 0.0304, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 679.625, "epoch": 0.036707988980716255, "grad_norm": 0.1996562989526218, "kl": 0.384765625, "learning_rate": 8.33600896852558e-07, "loss": 1.4366, "reward": 3.243713855743408, "reward_std": 1.0239101648330688, "rewards/accuracy_reward": 0.5, "rewards/accuracy_reward/std": 0.26726123690605164, "rewards/format_reward_func": 0.6986842155456543, "rewards/format_reward_func/std": 0.15418575704097748, "rewards/ngram_similarity_reward": 0.21971289813518524, "rewards/ngram_similarity_reward/std": 0.13297516107559204, "rewards/sql_execution_reward_func": 0.28125, "rewards/sql_execution_reward_func/std": 0.13346347212791443, "rewards/xml_reward_func": 0.9342105388641357, "rewards/xml_reward_func/std": 0.1860807240009308, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03677685950413223, "grad_norm": 0.1984524519068192, "kl": 0.408203125, "learning_rate": 8.327870716884315e-07, "loss": 1.4367, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.000697836687322706, "clip_ratio/low_min": 0.000697836687322706, "clip_ratio/region_mean": 0.000697836687322706, "completion_length": 179.125, "epoch": 0.03684573002754821, "grad_norm": 0.1626208109397389, "kl": 0.0654296875, "learning_rate": 8.319717151140072e-07, "loss": 0.1337, "reward": 5.468749523162842, "reward_std": 0.4374743402004242, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7250000238418579, "rewards/format_reward_func/std": 0.14880476891994476, "rewards/ngram_similarity_reward": 1.0, "rewards/ngram_similarity_reward/std": 0.0, "rewards/sql_execution_reward_func": 0.3062499761581421, "rewards/sql_execution_reward_func/std": 0.1237436905503273, "rewards/xml_reward_func": 0.9375, "rewards/xml_reward_func/std": 0.1767766922712326, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.000697836687322706, "clip_ratio/low_min": 0.000697836687322706, "clip_ratio/region_mean": 0.000697836687322706, "epoch": 0.03691460055096419, "grad_norm": 0.16113232647765954, "kl": 0.0654296875, "learning_rate": 8.31154831544782e-07, "loss": 0.1336, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.000281928398180753, "clip_ratio/low_min": 0.000281928398180753, "clip_ratio/region_mean": 0.000281928398180753, "completion_length": 443.375, "epoch": 0.036983471074380166, "grad_norm": 0.34526910353895734, "kl": 2.359375, "learning_rate": 8.303364254045224e-07, "loss": 0.8876, "reward": 3.7214462757110596, "reward_std": 1.1934529542922974, "rewards/accuracy_reward": 0.59375, "rewards/accuracy_reward/std": 0.4419417679309845, "rewards/format_reward_func": 0.8533333539962769, "rewards/format_reward_func/std": 0.12966378033161163, "rewards/ngram_similarity_reward": 0.3162420988082886, "rewards/ngram_similarity_reward/std": 0.30942532420158386, "rewards/sql_execution_reward_func": 0.22499999403953552, "rewards/sql_execution_reward_func/std": 0.11649647355079651, "rewards/xml_reward_func": 0.981249988079071, "rewards/xml_reward_func/std": 0.053033001720905304, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03705234159779614, "grad_norm": 0.3474709383119147, "kl": 2.328125, "learning_rate": 8.295165011252396e-07, "loss": 0.8879, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 287.0, "epoch": 0.037121212121212124, "grad_norm": 0.1723384330101528, "kl": 4.34375, "learning_rate": 8.28695063147167e-07, "loss": 0.0874, "reward": 3.177549123764038, "reward_std": 0.816467821598053, "rewards/accuracy_reward": 0.34687501192092896, "rewards/accuracy_reward/std": 0.29290342330932617, "rewards/format_reward_func": 0.8125, "rewards/format_reward_func/std": 0.11259914934635162, "rewards/ngram_similarity_reward": 0.20586606860160828, "rewards/ngram_similarity_reward/std": 0.17539238929748535, "rewards/sql_execution_reward_func": 0.36250001192092896, "rewards/sql_execution_reward_func/std": 0.0353553369641304, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0371900826446281, "grad_norm": 0.17386915927871024, "kl": 4.625, "learning_rate": 8.278721159187346e-07, "loss": 0.0885, "step": 540 }, { "clip_ratio/high_max": 0.00010312467929907143, "clip_ratio/high_mean": 0.00010312467929907143, "clip_ratio/low_mean": 0.00010312467929907143, "clip_ratio/low_min": 0.00010312467929907143, "clip_ratio/region_mean": 0.00020624935859814286, "completion_length": 1212.125, "epoch": 0.037258953168044076, "grad_norm": 0.08747682350530044, "kl": 0.98046875, "learning_rate": 8.270476638965461e-07, "loss": 0.5653, "reward": 2.031001091003418, "reward_std": 0.7635068297386169, "rewards/accuracy_reward": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7987500429153442, "rewards/format_reward_func/std": 0.27570366859436035, "rewards/ngram_similarity_reward": 0.09261190891265869, "rewards/ngram_similarity_reward/std": 0.05515960603952408, "rewards/sql_execution_reward_func": 0.21833333373069763, "rewards/sql_execution_reward_func/std": 0.1563674956560135, "rewards/xml_reward_func": 0.875, "rewards/xml_reward_func/std": 0.3535533845424652, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00010312467929907143, "clip_ratio/low_min": 0.00010312467929907143, "clip_ratio/region_mean": 0.00010312467929907143, "epoch": 0.03732782369146005, "grad_norm": 0.11424736871266997, "kl": 1.1484375, "learning_rate": 8.262217115453542e-07, "loss": 0.5655, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 259.375, "epoch": 0.037396694214876035, "grad_norm": 0.25920456245328444, "kl": 2.40625, "learning_rate": 8.253942633380361e-07, "loss": 0.3152, "reward": 3.6845531463623047, "reward_std": 1.1428842544555664, "rewards/accuracy_reward": 0.6145879626274109, "rewards/accuracy_reward/std": 0.37250640988349915, "rewards/format_reward_func": 0.824999988079071, "rewards/format_reward_func/std": 0.12817399203777313, "rewards/ngram_similarity_reward": 0.21191813051700592, "rewards/ngram_similarity_reward/std": 0.2853183150291443, "rewards/sql_execution_reward_func": 0.3125, "rewards/sql_execution_reward_func/std": 0.12747548520565033, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0004819277091883123, "clip_ratio/low_min": 0.0004819277091883123, "clip_ratio/region_mean": 0.0004819277091883123, "epoch": 0.03746556473829201, "grad_norm": 0.24780682592719808, "kl": 2.390625, "learning_rate": 8.245653237555705e-07, "loss": 0.3151, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 539.0, "epoch": 0.03753443526170799, "grad_norm": 0.43083917615353523, "kl": 15.625, "learning_rate": 8.237348972870114e-07, "loss": 1.0641, "reward": 3.42832612991333, "reward_std": 1.198496699333191, "rewards/accuracy_reward": 0.565625011920929, "rewards/accuracy_reward/std": 0.47301042079925537, "rewards/format_reward_func": 0.7388889193534851, "rewards/format_reward_func/std": 0.10406211018562317, "rewards/ngram_similarity_reward": 0.15592098236083984, "rewards/ngram_similarity_reward/std": 0.11342120915651321, "rewards/sql_execution_reward_func": 0.33125001192092896, "rewards/sql_execution_reward_func/std": 0.13611313700675964, "rewards/xml_reward_func": 0.9930555820465088, "rewards/xml_reward_func/std": 0.01964186504483223, "step": 545 }, { "clip_ratio/high_max": 0.00023191094805952162, "clip_ratio/high_mean": 0.00023191094805952162, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023191094805952162, "epoch": 0.03760330578512397, "grad_norm": 0.4126431936250881, "kl": 14.75, "learning_rate": 8.229029884294662e-07, "loss": 1.0621, "step": 546 }, { "clip_ratio/high_max": 7.491758879041299e-05, "clip_ratio/high_mean": 7.491758879041299e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 7.491758879041299e-05, "completion_length": 1668.5, "epoch": 0.037672176308539945, "grad_norm": 0.092248329804871, "kl": 5.4375, "learning_rate": 8.220696016880687e-07, "loss": 0.1013, "reward": 2.179778575897217, "reward_std": 0.3563680648803711, "rewards/accuracy_reward": 0.04062500223517418, "rewards/accuracy_reward/std": 0.05334774777293205, "rewards/format_reward_func": 0.8722347617149353, "rewards/format_reward_func/std": 0.10876157134771347, "rewards/ngram_similarity_reward": 0.048928096890449524, "rewards/ngram_similarity_reward/std": 0.04654516279697418, "rewards/sql_execution_reward_func": 0.17165178060531616, "rewards/sql_execution_reward_func/std": 0.20175421237945557, "rewards/xml_reward_func": 0.981249988079071, "rewards/xml_reward_func/std": 0.053033001720905304, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03774104683195592, "grad_norm": 0.08974590096139444, "kl": 5.46875, "learning_rate": 8.212347415759572e-07, "loss": 0.1014, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 119.875, "epoch": 0.037809917355371904, "grad_norm": 0.09104206270165878, "kl": 0.038330078125, "learning_rate": 8.203984126142485e-07, "loss": 0.0252, "reward": 5.349999904632568, "reward_std": 0.3207133114337921, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.0, "rewards/ngram_similarity_reward": 0.800000011920929, "rewards/ngram_similarity_reward/std": 0.21380898356437683, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03787878787878788, "grad_norm": 0.09126678401160868, "kl": 0.0390625, "learning_rate": 8.195606193320136e-07, "loss": 0.0255, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 278.125, "epoch": 0.037947658402203856, "grad_norm": 0.7087304677766189, "kl": 2.078125, "learning_rate": 8.187213662662538e-07, "loss": 1.6295, "reward": 4.7109375, "reward_std": 1.2951792478561401, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.7464286088943481, "rewards/format_reward_func/std": 0.1460360586643219, "rewards/ngram_similarity_reward": 0.6232638955116272, "rewards/ngram_similarity_reward/std": 0.3120201528072357, "rewards/sql_execution_reward_func": 0.32499998807907104, "rewards/sql_execution_reward_func/std": 0.05345224216580391, "rewards/xml_reward_func": 0.9546130895614624, "rewards/xml_reward_func/std": 0.08931051194667816, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03801652892561983, "grad_norm": 0.7052960436060486, "kl": 2.0625, "learning_rate": 8.178806579618753e-07, "loss": 1.6285, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 231.125, "epoch": 0.038085399449035814, "grad_norm": 0.23426776293439436, "kl": 0.9296875, "learning_rate": 8.170384989716657e-07, "loss": 0.2868, "reward": 4.127418518066406, "reward_std": 0.6703744530677795, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.8333333730697632, "rewards/format_reward_func/std": 0.08728715032339096, "rewards/ngram_similarity_reward": 0.09605669975280762, "rewards/ngram_similarity_reward/std": 0.10043389350175858, "rewards/sql_execution_reward_func": 0.4000000059604645, "rewards/sql_execution_reward_func/std": 0.046291008591651917, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03815426997245179, "grad_norm": 0.23490280330389954, "kl": 0.9296875, "learning_rate": 8.161948938562677e-07, "loss": 0.2878, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00038037277408875525, "clip_ratio/low_min": 0.00038037277408875525, "clip_ratio/region_mean": 0.00038037277408875525, "completion_length": 328.625, "epoch": 0.038223140495867766, "grad_norm": 0.1869892212216449, "kl": 0.1220703125, "learning_rate": 8.153498471841564e-07, "loss": 0.2028, "reward": 3.6708385944366455, "reward_std": 0.9364548921585083, "rewards/accuracy_reward": 0.6197916269302368, "rewards/accuracy_reward/std": 0.31531471014022827, "rewards/format_reward_func": 0.7920833826065063, "rewards/format_reward_func/std": 0.1258360892534256, "rewards/ngram_similarity_reward": 0.23444783687591553, "rewards/ngram_similarity_reward/std": 0.2701125144958496, "rewards/sql_execution_reward_func": 0.2875000238418579, "rewards/sql_execution_reward_func/std": 0.07905694097280502, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 555 }, { "clip_ratio/high_max": 0.00038037277408875525, "clip_ratio/high_mean": 0.00038037277408875525, "clip_ratio/low_mean": 0.00038037277408875525, "clip_ratio/low_min": 0.00038037277408875525, "clip_ratio/region_mean": 0.0007607455481775105, "epoch": 0.03829201101928375, "grad_norm": 0.18013099102595304, "kl": 0.1142578125, "learning_rate": 8.145033635316128e-07, "loss": 0.2027, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 290.375, "epoch": 0.038360881542699725, "grad_norm": 0.1530389835085894, "kl": 4.375, "learning_rate": 8.136554474827002e-07, "loss": 0.0036, "reward": 3.3330183029174805, "reward_std": 0.435871958732605, "rewards/accuracy_reward": 0.4211128354072571, "rewards/accuracy_reward/std": 0.23390574753284454, "rewards/format_reward_func": 0.8416666984558105, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.20775079727172852, "rewards/ngram_similarity_reward/std": 0.11111582815647125, "rewards/sql_execution_reward_func": 0.3375000059604645, "rewards/sql_execution_reward_func/std": 0.058248236775398254, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0384297520661157, "grad_norm": 0.18532118736368758, "kl": 5.1875, "learning_rate": 8.128061036292386e-07, "loss": 0.0053, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00035398229374550283, "clip_ratio/low_min": 0.00035398229374550283, "clip_ratio/region_mean": 0.00035398229374550283, "completion_length": 353.125, "epoch": 0.038498622589531684, "grad_norm": 0.20581512084418274, "kl": 4.9375, "learning_rate": 8.119553365707802e-07, "loss": 0.2912, "reward": 3.4174551963806152, "reward_std": 1.074726939201355, "rewards/accuracy_reward": 0.40937501192092896, "rewards/accuracy_reward/std": 0.49604466557502747, "rewards/format_reward_func": 0.8895833492279053, "rewards/format_reward_func/std": 0.10725192725658417, "rewards/ngram_similarity_reward": 0.17274795472621918, "rewards/ngram_similarity_reward/std": 0.22068369388580322, "rewards/sql_execution_reward_func": 0.44999998807907104, "rewards/sql_execution_reward_func/std": 0.11649647355079651, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03856749311294766, "grad_norm": 0.20708621218992865, "kl": 5.0, "learning_rate": 8.111031509145847e-07, "loss": 0.2912, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 785.125, "epoch": 0.038636363636363635, "grad_norm": 0.0883764443297059, "kl": 2.3125, "learning_rate": 8.102495512755938e-07, "loss": 0.6029, "reward": 3.527216911315918, "reward_std": 0.9214572310447693, "rewards/accuracy_reward": 0.6283653974533081, "rewards/accuracy_reward/std": 0.416299045085907, "rewards/format_reward_func": 0.8541666865348816, "rewards/format_reward_func/std": 0.11675167083740234, "rewards/ngram_similarity_reward": 0.11296285688877106, "rewards/ngram_similarity_reward/std": 0.11786568909883499, "rewards/sql_execution_reward_func": 0.26249998807907104, "rewards/sql_execution_reward_func/std": 0.13562026619911194, "rewards/xml_reward_func": 0.984375, "rewards/xml_reward_func/std": 0.04419417306780815, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03870523415977961, "grad_norm": 0.08780474716054082, "kl": 2.28125, "learning_rate": 8.093945422764069e-07, "loss": 0.6027, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00017602535081095994, "clip_ratio/low_min": 0.00017602535081095994, "clip_ratio/region_mean": 0.00017602535081095994, "completion_length": 710.125, "epoch": 0.038774104683195594, "grad_norm": 0.7394793916918758, "kl": 2.65625, "learning_rate": 8.085381285472554e-07, "loss": 0.5796, "reward": 2.3697643280029297, "reward_std": 0.6918754577636719, "rewards/accuracy_reward": 0.09375, "rewards/accuracy_reward/std": 0.12938730418682098, "rewards/format_reward_func": 0.7727857828140259, "rewards/format_reward_func/std": 0.25197675824165344, "rewards/ngram_similarity_reward": 0.060902394354343414, "rewards/ngram_similarity_reward/std": 0.04078385978937149, "rewards/sql_execution_reward_func": 0.4624999761581421, "rewards/sql_execution_reward_func/std": 0.09910313785076141, "rewards/xml_reward_func": 0.8556250333786011, "rewards/xml_reward_func/std": 0.31954696774482727, "step": 563 }, { "clip_ratio/high_max": 0.0005280760233290493, "clip_ratio/high_mean": 0.0005280760233290493, "clip_ratio/low_mean": 0.0005280760233290493, "clip_ratio/low_min": 0.0005280760233290493, "clip_ratio/region_mean": 0.0010561520466580987, "epoch": 0.03884297520661157, "grad_norm": 0.7235996737011193, "kl": 2.6875, "learning_rate": 8.076803147259775e-07, "loss": 0.5788, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 291.875, "epoch": 0.038911845730027546, "grad_norm": 0.21965239342570095, "kl": 5.875, "learning_rate": 8.068211054579943e-07, "loss": 0.2515, "reward": 4.551966667175293, "reward_std": 0.6681945323944092, "rewards/accuracy_reward": 0.90625, "rewards/accuracy_reward/std": 0.2651650309562683, "rewards/format_reward_func": 0.8400000333786011, "rewards/format_reward_func/std": 0.07708992809057236, "rewards/ngram_similarity_reward": 0.4267277717590332, "rewards/ngram_similarity_reward/std": 0.119071826338768, "rewards/sql_execution_reward_func": 0.3218749761581421, "rewards/sql_execution_reward_func/std": 0.07954951375722885, "rewards/xml_reward_func": 0.9375, "rewards/xml_reward_func/std": 0.1157275140285492, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03898071625344353, "grad_norm": 0.2153161512066258, "kl": 5.875, "learning_rate": 8.059605053962833e-07, "loss": 0.2512, "step": 566 }, { "clip_ratio/high_max": 0.0001239618140971288, "clip_ratio/high_mean": 0.0001239618140971288, "clip_ratio/low_mean": 0.00037188545684330165, "clip_ratio/low_min": 0.00037188545684330165, "clip_ratio/region_mean": 0.0004958472563885152, "completion_length": 1008.375, "epoch": 0.039049586776859505, "grad_norm": 0.29834311989728324, "kl": 5.40625, "learning_rate": 8.050985192013539e-07, "loss": 1.2414, "reward": 2.707050323486328, "reward_std": 1.5301672220230103, "rewards/accuracy_reward": 0.27744871377944946, "rewards/accuracy_reward/std": 0.3695339560508728, "rewards/format_reward_func": 0.7799999713897705, "rewards/format_reward_func/std": 0.23454108834266663, "rewards/ngram_similarity_reward": 0.18722222745418549, "rewards/ngram_similarity_reward/std": 0.23101875185966492, "rewards/sql_execution_reward_func": 0.3031249940395355, "rewards/sql_execution_reward_func/std": 0.1701456904411316, "rewards/xml_reward_func": 0.7881944179534912, "rewards/xml_reward_func/std": 0.3922579288482666, "step": 567 }, { "clip_ratio/high_max": 0.00037188545684330165, "clip_ratio/high_mean": 0.00037188545684330165, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037188545684330165, "epoch": 0.03911845730027548, "grad_norm": 0.4558379222216218, "kl": 5.40625, "learning_rate": 8.04235151541222e-07, "loss": 1.2412, "step": 568 }, { "clip_ratio/high_max": 0.0003694126207847148, "clip_ratio/high_mean": 0.0003694126207847148, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003694126207847148, "completion_length": 338.375, "epoch": 0.03918732782369146, "grad_norm": 0.12161952414458432, "kl": 0.1845703125, "learning_rate": 8.033704070913847e-07, "loss": 0.0429, "reward": 3.9850709438323975, "reward_std": 0.6736142039299011, "rewards/accuracy_reward": 0.8125, "rewards/accuracy_reward/std": 0.3471825420856476, "rewards/format_reward_func": 0.7583333849906921, "rewards/format_reward_func/std": 0.1003960371017456, "rewards/ngram_similarity_reward": 0.1636584997177124, "rewards/ngram_similarity_reward/std": 0.044332172721624374, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 569 }, { "clip_ratio/high_max": 0.0003694126207847148, "clip_ratio/high_mean": 0.0003694126207847148, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003694126207847148, "epoch": 0.03925619834710744, "grad_norm": 0.12242485029504785, "kl": 0.169921875, "learning_rate": 8.025042905347949e-07, "loss": 0.0422, "step": 570 }, { "clip_ratio/high_max": 0.000334057112922892, "clip_ratio/high_mean": 0.000334057112922892, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000334057112922892, "completion_length": 748.375, "epoch": 0.039325068870523415, "grad_norm": 0.0467905559824139, "kl": 0.6484375, "learning_rate": 8.01636806561836e-07, "loss": 0.0786, "reward": 3.2583115100860596, "reward_std": 0.3917481005191803, "rewards/accuracy_reward": 0.375, "rewards/accuracy_reward/std": 0.18898223340511322, "rewards/format_reward_func": 0.845634937286377, "rewards/format_reward_func/std": 0.11625833064317703, "rewards/ngram_similarity_reward": 0.23157604038715363, "rewards/ngram_similarity_reward/std": 0.06183230131864548, "rewards/sql_execution_reward_func": 0.3153125047683716, "rewards/sql_execution_reward_func/std": 0.15440228581428528, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 571 }, { "clip_ratio/high_max": 0.000334057112922892, "clip_ratio/high_mean": 0.000334057112922892, "clip_ratio/low_mean": 0.000167028556461446, "clip_ratio/low_min": 0.000167028556461446, "clip_ratio/region_mean": 0.0005010857130400836, "epoch": 0.03939393939393939, "grad_norm": 0.04644181465747904, "kl": 0.62109375, "learning_rate": 8.00767959870297e-07, "loss": 0.0785, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 299.75, "epoch": 0.039462809917355374, "grad_norm": 0.039840351236114444, "kl": 0.421875, "learning_rate": 7.998977551653457e-07, "loss": -0.0497, "reward": 2.9225974082946777, "reward_std": 0.3171372413635254, "rewards/accuracy_reward": 0.22187499701976776, "rewards/accuracy_reward/std": 0.07954951375722885, "rewards/format_reward_func": 0.7833333015441895, "rewards/format_reward_func/std": 0.077664315700531, "rewards/ngram_similarity_reward": 0.22617603838443756, "rewards/ngram_similarity_reward/std": 0.13135920464992523, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03953168044077135, "grad_norm": 0.040504742791517834, "kl": 0.423828125, "learning_rate": 7.990261971595048e-07, "loss": -0.0497, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 149.75, "epoch": 0.039600550964187325, "grad_norm": 0.10315354488785546, "kl": 2.890625, "learning_rate": 7.981532905726257e-07, "loss": 0.0381, "reward": 4.870651721954346, "reward_std": 0.253084272146225, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.4929347634315491, "rewards/ngram_similarity_reward/std": 0.2148241400718689, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.000834724516607821, "clip_ratio/low_min": 0.000834724516607821, "clip_ratio/region_mean": 0.000834724516607821, "epoch": 0.03966942148760331, "grad_norm": 0.10458282284995678, "kl": 2.859375, "learning_rate": 7.972790401318627e-07, "loss": 0.0381, "step": 576 }, { "clip_ratio/high_max": 0.0001936108455993235, "clip_ratio/high_mean": 0.0001936108455993235, "clip_ratio/low_mean": 0.000387221691198647, "clip_ratio/low_min": 0.000387221691198647, "clip_ratio/region_mean": 0.0005808325367979705, "completion_length": 645.625, "epoch": 0.039738292011019284, "grad_norm": 1.4604315777296912, "kl": 5.34375, "learning_rate": 7.964034505716476e-07, "loss": 0.2958, "reward": 2.420628070831299, "reward_std": 0.8254527449607849, "rewards/accuracy_reward": 0.0625, "rewards/accuracy_reward/std": 0.1157275140285492, "rewards/format_reward_func": 0.7883620858192444, "rewards/format_reward_func/std": 0.29293787479400635, "rewards/ngram_similarity_reward": 0.15412554144859314, "rewards/ngram_similarity_reward/std": 0.1824289858341217, "rewards/sql_execution_reward_func": 0.4000000059604645, "rewards/sql_execution_reward_func/std": 0.17320509254932404, "rewards/xml_reward_func": 0.8760775923728943, "rewards/xml_reward_func/std": 0.350505530834198, "step": 577 }, { "clip_ratio/high_max": 0.000387221691198647, "clip_ratio/high_mean": 0.000387221691198647, "clip_ratio/low_mean": 0.0005808325367979705, "clip_ratio/low_min": 0.0005808325367979705, "clip_ratio/region_mean": 0.0009680542279966176, "epoch": 0.03980716253443526, "grad_norm": 1.4197157466514159, "kl": 5.25, "learning_rate": 7.955265266336642e-07, "loss": 0.2946, "step": 578 }, { "clip_ratio/high_max": 0.0003859513672068715, "clip_ratio/high_mean": 0.0003859513672068715, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003859513672068715, "completion_length": 323.875, "epoch": 0.03987603305785124, "grad_norm": 0.1635929897165733, "kl": 0.83203125, "learning_rate": 7.946482730668225e-07, "loss": -0.1731, "reward": 4.111170291900635, "reward_std": 1.0955185890197754, "rewards/accuracy_reward": 0.690625011920929, "rewards/accuracy_reward/std": 0.43258723616600037, "rewards/format_reward_func": 0.8416666984558105, "rewards/format_reward_func/std": 0.12817399203777313, "rewards/ngram_similarity_reward": 0.4255022704601288, "rewards/ngram_similarity_reward/std": 0.2761766016483307, "rewards/sql_execution_reward_func": 0.28125, "rewards/sql_execution_reward_func/std": 0.0883883461356163, "rewards/xml_reward_func": 0.96875, "rewards/xml_reward_func/std": 0.0883883461356163, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03994490358126722, "grad_norm": 0.16680163146880975, "kl": 0.80859375, "learning_rate": 7.93768694627233e-07, "loss": -0.1729, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 120.375, "epoch": 0.040013774104683195, "grad_norm": 0.23856696852939727, "kl": 7.4375, "learning_rate": 7.928877960781808e-07, "loss": 0.0149, "reward": 5.649999618530273, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.0, "rewards/ngram_similarity_reward": 1.0, "rewards/ngram_similarity_reward/std": 0.0, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04008264462809917, "grad_norm": 0.2748703534941183, "kl": 8.4375, "learning_rate": 7.920055821901002e-07, "loss": 0.0169, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 515.125, "epoch": 0.04015151515151515, "grad_norm": 0.07225250547598683, "kl": 1.1484375, "learning_rate": 7.911220577405484e-07, "loss": 0.0522, "reward": 2.8089075088500977, "reward_std": 0.4512503743171692, "rewards/accuracy_reward": 0.19943180680274963, "rewards/accuracy_reward/std": 0.1236124336719513, "rewards/format_reward_func": 0.8479166626930237, "rewards/format_reward_func/std": 0.07737637311220169, "rewards/ngram_similarity_reward": 0.12554514408111572, "rewards/ngram_similarity_reward/std": 0.2079796940088272, "rewards/sql_execution_reward_func": 0.3946428894996643, "rewards/sql_execution_reward_func/std": 0.14854739606380463, "rewards/xml_reward_func": 0.9791666269302368, "rewards/xml_reward_func/std": 0.0589255727827549, "step": 583 }, { "clip_ratio/high_max": 0.00024265954561997205, "clip_ratio/high_mean": 0.00024265954561997205, "clip_ratio/low_mean": 0.0004853190912399441, "clip_ratio/low_min": 0.0004853190912399441, "clip_ratio/region_mean": 0.0007279786514118314, "epoch": 0.04022038567493113, "grad_norm": 0.07269874846061254, "kl": 1.2109375, "learning_rate": 7.902372275141801e-07, "loss": 0.052, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 174.5, "epoch": 0.040289256198347105, "grad_norm": 0.2544075241474745, "kl": 6.53125, "learning_rate": 7.893510963027209e-07, "loss": -0.019, "reward": 4.53587532043457, "reward_std": 0.42482882738113403, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7291666865348816, "rewards/format_reward_func/std": 0.08807914704084396, "rewards/ngram_similarity_reward": 0.3107227683067322, "rewards/ngram_similarity_reward/std": 0.2595166265964508, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 0.984375, "rewards/xml_reward_func/std": 0.04419417306780815, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04035812672176309, "grad_norm": 0.26497647260978097, "kl": 6.90625, "learning_rate": 7.884636689049422e-07, "loss": -0.0183, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0015313936164602637, "clip_ratio/low_min": 0.0015313936164602637, "clip_ratio/region_mean": 0.0015313936164602637, "completion_length": 489.75, "epoch": 0.040426997245179064, "grad_norm": 1.1936302014967914, "kl": 9.75, "learning_rate": 7.875749501266346e-07, "loss": 1.5393, "reward": 3.3483405113220215, "reward_std": 1.1700046062469482, "rewards/accuracy_reward": 0.48288217186927795, "rewards/accuracy_reward/std": 0.195113867521286, "rewards/format_reward_func": 0.6844444274902344, "rewards/format_reward_func/std": 0.21038976311683655, "rewards/ngram_similarity_reward": 0.3149580955505371, "rewards/ngram_similarity_reward/std": 0.24804110825061798, "rewards/sql_execution_reward_func": 0.3687499761581421, "rewards/sql_execution_reward_func/std": 0.03720119222998619, "rewards/xml_reward_func": 0.8569444417953491, "rewards/xml_reward_func/std": 0.3312867283821106, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0010209290776401758, "clip_ratio/low_min": 0.0010209290776401758, "clip_ratio/region_mean": 0.0010209290776401758, "epoch": 0.04049586776859504, "grad_norm": 0.8923393388918435, "kl": 8.6875, "learning_rate": 7.866849447805819e-07, "loss": 1.5372, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00036416607326827943, "clip_ratio/low_min": 0.00036416607326827943, "clip_ratio/region_mean": 0.00036416607326827943, "completion_length": 343.25, "epoch": 0.04056473829201102, "grad_norm": 0.17305113779816747, "kl": 4.78125, "learning_rate": 7.857936576865356e-07, "loss": 0.0377, "reward": 3.5778913497924805, "reward_std": 0.5442790389060974, "rewards/accuracy_reward": 0.6145833134651184, "rewards/accuracy_reward/std": 0.23961569368839264, "rewards/format_reward_func": 0.7683333158493042, "rewards/format_reward_func/std": 0.0852074921131134, "rewards/ngram_similarity_reward": 0.16401100158691406, "rewards/ngram_similarity_reward/std": 0.062167733907699585, "rewards/sql_execution_reward_func": 0.3343750238418579, "rewards/sql_execution_reward_func/std": 0.05499593913555145, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.040633608815427, "grad_norm": 0.15357390106894395, "kl": 4.21875, "learning_rate": 7.849010936711881e-07, "loss": 0.0366, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 369.375, "epoch": 0.040702479338842974, "grad_norm": 0.036317833062382725, "kl": 2.828125, "learning_rate": 7.840072575681468e-07, "loss": -0.0033, "reward": 2.9934024810791016, "reward_std": 0.17151764035224915, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.737500011920929, "rewards/format_reward_func/std": 0.09161254018545151, "rewards/ngram_similarity_reward": 0.25810176134109497, "rewards/ngram_similarity_reward/std": 0.12038852274417877, "rewards/sql_execution_reward_func": 0.3687500059604645, "rewards/sql_execution_reward_func/std": 0.03720119222998619, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04077134986225895, "grad_norm": 0.03622480636991398, "kl": 2.65625, "learning_rate": 7.831121542179086e-07, "loss": -0.0037, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 249.5, "epoch": 0.04084022038567493, "grad_norm": 0.04042859027259439, "kl": 0.26953125, "learning_rate": 7.822157884678321e-07, "loss": 0.0086, "reward": 2.920473575592041, "reward_std": 0.2365255504846573, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8166667222976685, "rewards/format_reward_func/std": 0.030860668048262596, "rewards/ngram_similarity_reward": 0.19420452415943146, "rewards/ngram_similarity_reward/std": 0.15137001872062683, "rewards/sql_execution_reward_func": 0.3125, "rewards/sql_execution_reward_func/std": 0.06943650543689728, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04090909090909091, "grad_norm": 0.04076273493960334, "kl": 0.28515625, "learning_rate": 7.813181651721131e-07, "loss": 0.0085, "step": 594 }, { "clip_ratio/high_max": 0.000181620052899234, "clip_ratio/high_mean": 0.000181620052899234, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000181620052899234, "completion_length": 1376.5, "epoch": 0.040977961432506885, "grad_norm": 0.059384006791151536, "kl": 2.5, "learning_rate": 7.804192891917571e-07, "loss": 0.1682, "reward": 2.6323142051696777, "reward_std": 0.5579981803894043, "rewards/accuracy_reward": 0.15625, "rewards/accuracy_reward/std": 0.18600596487522125, "rewards/format_reward_func": 0.8673872947692871, "rewards/format_reward_func/std": 0.13816282153129578, "rewards/ngram_similarity_reward": 0.17250150442123413, "rewards/ngram_similarity_reward/std": 0.12208812683820724, "rewards/sql_execution_reward_func": 0.19883927702903748, "rewards/sql_execution_reward_func/std": 0.2808779180049896, "rewards/xml_reward_func": 0.9948354363441467, "rewards/xml_reward_func/std": 0.010646226815879345, "step": 595 }, { "clip_ratio/high_max": 9.0810026449617e-05, "clip_ratio/high_mean": 9.0810026449617e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 9.0810026449617e-05, "epoch": 0.04104683195592287, "grad_norm": 0.05930556726144004, "kl": 2.46875, "learning_rate": 7.795191653945538e-07, "loss": 0.1682, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 311.0, "epoch": 0.04111570247933884, "grad_norm": 0.2569282486855635, "kl": 0.265625, "learning_rate": 7.7861779865505e-07, "loss": 0.6964, "reward": 4.19815731048584, "reward_std": 1.1684526205062866, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.7750000357627869, "rewards/format_reward_func/std": 0.15507294237613678, "rewards/ngram_similarity_reward": 0.32377153635025024, "rewards/ngram_similarity_reward/std": 0.18488426506519318, "rewards/sql_execution_reward_func": 0.25, "rewards/sql_execution_reward_func/std": 0.12535662949085236, "rewards/xml_reward_func": 0.9375, "rewards/xml_reward_func/std": 0.1767766922712326, "step": 597 }, { "clip_ratio/high_max": 0.0004019292537122965, "clip_ratio/high_mean": 0.0004019292537122965, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004019292537122965, "epoch": 0.04118457300275482, "grad_norm": 0.25574097887989317, "kl": 0.265625, "learning_rate": 7.777151938545235e-07, "loss": 0.6963, "step": 598 }, { "clip_ratio/high_max": 0.00039478877442888916, "clip_ratio/high_mean": 0.00039478877442888916, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00039478877442888916, "completion_length": 316.625, "epoch": 0.0412534435261708, "grad_norm": 0.07861031124751577, "kl": 0.28515625, "learning_rate": 7.768113558809575e-07, "loss": -0.0132, "reward": 4.149958610534668, "reward_std": 0.6019772887229919, "rewards/accuracy_reward": 0.90625, "rewards/accuracy_reward/std": 0.2651650309562683, "rewards/format_reward_func": 0.815000057220459, "rewards/format_reward_func/std": 0.04242641106247902, "rewards/ngram_similarity_reward": 0.10247259587049484, "rewards/ngram_similarity_reward/std": 0.0816013291478157, "rewards/sql_execution_reward_func": 0.3687499761581421, "rewards/sql_execution_reward_func/std": 0.0530330128967762, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04132231404958678, "grad_norm": 0.07773283965752727, "kl": 0.28515625, "learning_rate": 7.759062896290121e-07, "loss": -0.0133, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00045055191731080413, "clip_ratio/low_min": 0.00045055191731080413, "clip_ratio/region_mean": 0.00045055191731080413, "completion_length": 554.875, "epoch": 0.041391184573002754, "grad_norm": 0.19088205091491833, "kl": 0.67578125, "learning_rate": 7.75e-07, "loss": 0.4145, "reward": 3.8869781494140625, "reward_std": 0.9392510056495667, "rewards/accuracy_reward": 0.596875011920929, "rewards/accuracy_reward/std": 0.43720653653144836, "rewards/format_reward_func": 0.8202381134033203, "rewards/format_reward_func/std": 0.1035098284482956, "rewards/ngram_similarity_reward": 0.35610052943229675, "rewards/ngram_similarity_reward/std": 0.1873786300420761, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.12535664439201355, "rewards/xml_reward_func": 0.9888392686843872, "rewards/xml_reward_func/std": 0.031567275524139404, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0009011038346216083, "clip_ratio/low_min": 0.0009011038346216083, "clip_ratio/region_mean": 0.0009011038346216083, "epoch": 0.04146005509641873, "grad_norm": 0.191333774547367, "kl": 0.609375, "learning_rate": 7.740924919018585e-07, "loss": 0.4146, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 230.75, "epoch": 0.04152892561983471, "grad_norm": 0.060935979899073525, "kl": 0.34375, "learning_rate": 7.73183770249124e-07, "loss": -0.0495, "reward": 4.505663871765137, "reward_std": 0.3834214210510254, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7916666269302368, "rewards/format_reward_func/std": 0.0235702246427536, "rewards/ngram_similarity_reward": 0.23849815130233765, "rewards/ngram_similarity_reward/std": 0.254610151052475, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04159779614325069, "grad_norm": 0.061290767139584854, "kl": 0.37109375, "learning_rate": 7.72273839962904e-07, "loss": -0.0495, "step": 604 }, { "clip_ratio/high_max": 0.0004526935226749629, "clip_ratio/high_mean": 0.0004526935226749629, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004526935226749629, "completion_length": 276.125, "epoch": 0.041666666666666664, "grad_norm": 0.07303563588271421, "kl": 0.1748046875, "learning_rate": 7.713627059708518e-07, "loss": 0.0475, "reward": 3.1847071647644043, "reward_std": 0.32308873534202576, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8583333492279053, "rewards/format_reward_func/std": 0.12567278742790222, "rewards/ngram_similarity_reward": 0.3467492163181305, "rewards/ngram_similarity_reward/std": 0.23189790546894073, "rewards/sql_execution_reward_func": 0.3062500059604645, "rewards/sql_execution_reward_func/std": 0.09038607776165009, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 605 }, { "clip_ratio/high_max": 0.0004526935226749629, "clip_ratio/high_mean": 0.0004526935226749629, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004526935226749629, "epoch": 0.04173553719008265, "grad_norm": 0.07392461839787251, "kl": 0.1767578125, "learning_rate": 7.704503732071391e-07, "loss": 0.0475, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 200.5, "epoch": 0.04180440771349862, "grad_norm": 0.1452723980060268, "kl": 0.0419921875, "learning_rate": 7.695368466124296e-07, "loss": -0.1092, "reward": 4.22036600112915, "reward_std": 0.8271040916442871, "rewards/accuracy_reward": 0.824999988079071, "rewards/accuracy_reward/std": 0.32513734698295593, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.0, "rewards/ngram_similarity_reward": 0.2802438735961914, "rewards/ngram_similarity_reward/std": 0.19033463299274445, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0418732782369146, "grad_norm": 0.14633890975312405, "kl": 0.044921875, "learning_rate": 7.686221311338521e-07, "loss": -0.1095, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 235.375, "epoch": 0.04194214876033058, "grad_norm": 0.17312483877801002, "kl": 3.15625, "learning_rate": 7.677062317249734e-07, "loss": 0.0036, "reward": 4.160343170166016, "reward_std": 0.7345706820487976, "rewards/accuracy_reward": 0.8125, "rewards/accuracy_reward/std": 0.3471825420856476, "rewards/format_reward_func": 0.8083333969116211, "rewards/format_reward_func/std": 0.0235702246427536, "rewards/ngram_similarity_reward": 0.24717304110527039, "rewards/ngram_similarity_reward/std": 0.09695756435394287, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04201101928374656, "grad_norm": 0.17671382323266793, "kl": 3.34375, "learning_rate": 7.667891533457718e-07, "loss": 0.0042, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 428.625, "epoch": 0.042079889807162534, "grad_norm": 0.30589501862889157, "kl": 1.140625, "learning_rate": 7.658709009626109e-07, "loss": 0.9043, "reward": 3.89839506149292, "reward_std": 1.2737040519714355, "rewards/accuracy_reward": 0.78125, "rewards/accuracy_reward/std": 0.41052016615867615, "rewards/format_reward_func": 0.7393518686294556, "rewards/format_reward_func/std": 0.11641548573970795, "rewards/ngram_similarity_reward": 0.21744853258132935, "rewards/ngram_similarity_reward/std": 0.27416422963142395, "rewards/sql_execution_reward_func": 0.2750000059604645, "rewards/sql_execution_reward_func/std": 0.13093073666095734, "rewards/xml_reward_func": 0.9953703880310059, "rewards/xml_reward_func/std": 0.013094563037157059, "step": 611 }, { "clip_ratio/high_max": 0.0002916302182711661, "clip_ratio/high_mean": 0.0002916302182711661, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002916302182711661, "epoch": 0.04214876033057851, "grad_norm": 0.3048432731826006, "kl": 1.125, "learning_rate": 7.649514795482109e-07, "loss": 0.9037, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 270.625, "epoch": 0.04221763085399449, "grad_norm": 0.01401723069151059, "kl": 0.021240234375, "learning_rate": 7.640308940816239e-07, "loss": 0.0128, "reward": 2.7812459468841553, "reward_std": 0.07501618564128876, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.10416390746831894, "rewards/ngram_similarity_reward/std": 0.003795438213273883, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04228650137741047, "grad_norm": 0.014162654784354401, "kl": 0.021240234375, "learning_rate": 7.631091495482049e-07, "loss": 0.0128, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 158.375, "epoch": 0.042355371900826444, "grad_norm": 0.14121702781071976, "kl": 4.34375, "learning_rate": 7.621862509395866e-07, "loss": -0.0211, "reward": 4.361529350280762, "reward_std": 0.16976961493492126, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.07918232679367065, "rewards/ngram_similarity_reward": 0.14935302734375, "rewards/ngram_similarity_reward/std": 0.08160539716482162, "rewards/sql_execution_reward_func": 0.36250001192092896, "rewards/sql_execution_reward_func/std": 0.023145508021116257, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04242424242424243, "grad_norm": 0.1264255616885701, "kl": 3.84375, "learning_rate": 7.612622032536507e-07, "loss": -0.0224, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 292.125, "epoch": 0.0424931129476584, "grad_norm": 0.15693554373045962, "kl": 1.2890625, "learning_rate": 7.603370114945023e-07, "loss": 0.1567, "reward": 3.6939597129821777, "reward_std": 0.8184418678283691, "rewards/accuracy_reward": 0.59375, "rewards/accuracy_reward/std": 0.35197150707244873, "rewards/format_reward_func": 0.7900000214576721, "rewards/format_reward_func/std": 0.12359688431024551, "rewards/ngram_similarity_reward": 0.2693065106868744, "rewards/ngram_similarity_reward/std": 0.15174898505210876, "rewards/sql_execution_reward_func": 0.3125, "rewards/sql_execution_reward_func/std": 0.08345229923725128, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04256198347107438, "grad_norm": 0.15394901020725377, "kl": 1.2890625, "learning_rate": 7.594106806724416e-07, "loss": 0.1571, "step": 618 }, { "clip_ratio/high_max": 0.00040600894135423005, "clip_ratio/high_mean": 0.00040600894135423005, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00040600894135423005, "completion_length": 307.875, "epoch": 0.04263085399449036, "grad_norm": 0.1257144486147223, "kl": 0.9921875, "learning_rate": 7.584832158039378e-07, "loss": -0.188, "reward": 3.027996063232422, "reward_std": 0.6877099871635437, "rewards/accuracy_reward": 0.38644149899482727, "rewards/accuracy_reward/std": 0.2835290729999542, "rewards/format_reward_func": 0.8104166984558105, "rewards/format_reward_func/std": 0.1466389298439026, "rewards/ngram_similarity_reward": 0.0988452136516571, "rewards/ngram_similarity_reward/std": 0.01865200325846672, "rewards/sql_execution_reward_func": 0.2964285612106323, "rewards/sql_execution_reward_func/std": 0.13363061845302582, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00040600894135423005, "clip_ratio/low_min": 0.00040600894135423005, "clip_ratio/region_mean": 0.00040600894135423005, "epoch": 0.04269972451790634, "grad_norm": 0.1318862925675979, "kl": 1.015625, "learning_rate": 7.575546219116008e-07, "loss": -0.1882, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 221.375, "epoch": 0.04276859504132231, "grad_norm": 0.28525131049464764, "kl": 1.0546875, "learning_rate": 7.566249040241553e-07, "loss": -0.0472, "reward": 3.728358507156372, "reward_std": 1.2283371686935425, "rewards/accuracy_reward": 0.440972238779068, "rewards/accuracy_reward/std": 0.36078664660453796, "rewards/format_reward_func": 0.8416666984558105, "rewards/format_reward_func/std": 0.16690458357334137, "rewards/ngram_similarity_reward": 0.41983160376548767, "rewards/ngram_similarity_reward/std": 0.36205607652664185, "rewards/sql_execution_reward_func": 0.375, "rewards/sql_execution_reward_func/std": 0.03779644891619682, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04283746556473829, "grad_norm": 0.2859816609063531, "kl": 1.0625, "learning_rate": 7.556940671764124e-07, "loss": -0.0472, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 287.0, "epoch": 0.04290633608815427, "grad_norm": 0.07528471395532506, "kl": 3.859375, "learning_rate": 7.547621164092432e-07, "loss": -0.0139, "reward": 2.990143299102783, "reward_std": 0.16613133251667023, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8233333826065063, "rewards/format_reward_func/std": 0.04549551010131836, "rewards/ngram_similarity_reward": 0.2112065553665161, "rewards/ngram_similarity_reward/std": 0.10290427505970001, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.08017837256193161, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04297520661157025, "grad_norm": 0.07527313216832703, "kl": 3.890625, "learning_rate": 7.538290567695508e-07, "loss": -0.0138, "step": 624 }, { "clip_ratio/high_max": 0.00015304560656659305, "clip_ratio/high_mean": 0.00015304560656659305, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015304560656659305, "completion_length": 816.75, "epoch": 0.043044077134986224, "grad_norm": 0.172774060588899, "kl": 3.59375, "learning_rate": 7.528948933102438e-07, "loss": 0.7266, "reward": 3.065478563308716, "reward_std": 1.1093891859054565, "rewards/accuracy_reward": 0.44301408529281616, "rewards/accuracy_reward/std": 0.44696804881095886, "rewards/format_reward_func": 0.7830886244773865, "rewards/format_reward_func/std": 0.12240727245807648, "rewards/ngram_similarity_reward": 0.1538245528936386, "rewards/ngram_similarity_reward/std": 0.15196800231933594, "rewards/sql_execution_reward_func": 0.18437500298023224, "rewards/sql_execution_reward_func/std": 0.13688987493515015, "rewards/xml_reward_func": 0.981249988079071, "rewards/xml_reward_func/std": 0.053033001720905304, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.043112947658402206, "grad_norm": 0.17813919543350726, "kl": 3.6875, "learning_rate": 7.51959631090208e-07, "loss": 0.7267, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 497.875, "epoch": 0.04318181818181818, "grad_norm": 0.08256684161870487, "kl": 0.50390625, "learning_rate": 7.510232751742795e-07, "loss": 0.141, "reward": 4.064952850341797, "reward_std": 0.5764957070350647, "rewards/accuracy_reward": 0.8125, "rewards/accuracy_reward/std": 0.3471825420856476, "rewards/format_reward_func": 0.8458333015441895, "rewards/format_reward_func/std": 0.08152806013822556, "rewards/ngram_similarity_reward": 0.20441317558288574, "rewards/ngram_similarity_reward/std": 0.05904560908675194, "rewards/sql_execution_reward_func": 0.2875000238418579, "rewards/sql_execution_reward_func/std": 0.13024701178073883, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 627 }, { "clip_ratio/high_max": 0.0005021340912207961, "clip_ratio/high_mean": 0.0005021340912207961, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005021340912207961, "epoch": 0.04325068870523416, "grad_norm": 0.08284690146168781, "kl": 0.5390625, "learning_rate": 7.500858306332172e-07, "loss": 0.1409, "step": 628 }, { "clip_ratio/high_max": 0.0004599815874826163, "clip_ratio/high_mean": 0.0004599815874826163, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004599815874826163, "completion_length": 271.75, "epoch": 0.04331955922865014, "grad_norm": 0.060565814778915546, "kl": 3.765625, "learning_rate": 7.49147302543676e-07, "loss": 0.0666, "reward": 2.8690505027770996, "reward_std": 0.11449865251779556, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8191666603088379, "rewards/format_reward_func/std": 0.06584615260362625, "rewards/ngram_similarity_reward": 0.14367249608039856, "rewards/ngram_similarity_reward/std": 0.058363787829875946, "rewards/sql_execution_reward_func": 0.3343749940395355, "rewards/sql_execution_reward_func/std": 0.08756375312805176, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 629 }, { "clip_ratio/high_max": 0.0004599815874826163, "clip_ratio/high_mean": 0.0004599815874826163, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004599815874826163, "epoch": 0.04338842975206612, "grad_norm": 0.05690966349586049, "kl": 3.671875, "learning_rate": 7.482076959881777e-07, "loss": 0.0664, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 317.625, "epoch": 0.04345730027548209, "grad_norm": 0.18927536711143, "kl": 0.6015625, "learning_rate": 7.472670160550848e-07, "loss": 0.1725, "reward": 2.7031009197235107, "reward_std": 0.19560188055038452, "rewards/accuracy_reward": 0.2425239533185959, "rewards/accuracy_reward/std": 0.09799450635910034, "rewards/format_reward_func": 0.8166667222976685, "rewards/format_reward_func/std": 0.11126971989870071, "rewards/ngram_similarity_reward": 0.1023130714893341, "rewards/ngram_similarity_reward/std": 0.15619555115699768, "rewards/sql_execution_reward_func": 0.26875001192092896, "rewards/sql_execution_reward_func/std": 0.12799972295761108, "rewards/xml_reward_func": 0.9791666269302368, "rewards/xml_reward_func/std": 0.0589255727827549, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00039354583714157343, "clip_ratio/low_min": 0.00039354583714157343, "clip_ratio/region_mean": 0.00039354583714157343, "epoch": 0.04352617079889807, "grad_norm": 0.14030456256114607, "kl": 0.61328125, "learning_rate": 7.46325267838573e-07, "loss": 0.1723, "step": 632 }, { "clip_ratio/high_max": 0.00018839487165678293, "clip_ratio/high_mean": 0.00018839487165678293, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018839487165678293, "completion_length": 663.5, "epoch": 0.04359504132231405, "grad_norm": 0.03817103123899515, "kl": 1.8046875, "learning_rate": 7.453824564386025e-07, "loss": 0.0033, "reward": 2.6070008277893066, "reward_std": 0.28321510553359985, "rewards/accuracy_reward": 0.19999998807907104, "rewards/accuracy_reward/std": 0.09354143589735031, "rewards/format_reward_func": 0.8316667079925537, "rewards/format_reward_func/std": 0.11665985733270645, "rewards/ngram_similarity_reward": 0.1016116663813591, "rewards/ngram_similarity_reward/std": 0.07182673364877701, "rewards/sql_execution_reward_func": 0.24375000596046448, "rewards/sql_execution_reward_func/std": 0.17410485446453094, "rewards/xml_reward_func": 0.9791666269302368, "rewards/xml_reward_func/std": 0.0589255727827549, "step": 633 }, { "clip_ratio/high_max": 0.00018839487165678293, "clip_ratio/high_mean": 0.00018839487165678293, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018839487165678293, "epoch": 0.04366391184573003, "grad_norm": 0.03811150057206544, "kl": 1.84375, "learning_rate": 7.444385869608921e-07, "loss": 0.0034, "step": 634 }, { "clip_ratio/high_max": 0.00028563267551362514, "clip_ratio/high_mean": 0.00028563267551362514, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00028563267551362514, "completion_length": 437.625, "epoch": 0.043732782369146, "grad_norm": 0.11729292962056061, "kl": 5.1875, "learning_rate": 7.434936645168892e-07, "loss": 0.0736, "reward": 3.795628547668457, "reward_std": 0.6909461617469788, "rewards/accuracy_reward": 0.71875, "rewards/accuracy_reward/std": 0.30435824394226074, "rewards/format_reward_func": 0.8291667103767395, "rewards/format_reward_func/std": 0.041547439992427826, "rewards/ngram_similarity_reward": 0.14847443997859955, "rewards/ngram_similarity_reward/std": 0.11178919672966003, "rewards/sql_execution_reward_func": 0.3062500059604645, "rewards/sql_execution_reward_func/std": 0.06781013309955597, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.043801652892561986, "grad_norm": 0.12018134437594369, "kl": 5.0, "learning_rate": 7.425476942237444e-07, "loss": 0.0732, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 201.125, "epoch": 0.04387052341597796, "grad_norm": 0.11540753920185133, "kl": 2.03125, "learning_rate": 7.416006812042827e-07, "loss": 0.0192, "reward": 4.306108474731445, "reward_std": 0.5104184746742249, "rewards/accuracy_reward": 0.9270833134651184, "rewards/accuracy_reward/std": 0.2062394767999649, "rewards/format_reward_func": 0.7749999761581421, "rewards/format_reward_func/std": 0.0707106739282608, "rewards/ngram_similarity_reward": 0.2179613709449768, "rewards/ngram_similarity_reward/std": 0.13747107982635498, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04393939393939394, "grad_norm": 0.11362532180223325, "kl": 1.921875, "learning_rate": 7.406526305869756e-07, "loss": 0.0193, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 478.125, "epoch": 0.04400826446280992, "grad_norm": 0.4797743332069775, "kl": 2.53125, "learning_rate": 7.39703547505914e-07, "loss": 1.3137, "reward": 2.9775619506835938, "reward_std": 0.9150780439376831, "rewards/accuracy_reward": 0.3645833134651184, "rewards/accuracy_reward/std": 0.1473139226436615, "rewards/format_reward_func": 0.7496093511581421, "rewards/format_reward_func/std": 0.14252622425556183, "rewards/ngram_similarity_reward": 0.1706748604774475, "rewards/ngram_similarity_reward/std": 0.24579408764839172, "rewards/sql_execution_reward_func": 0.3062499761581421, "rewards/sql_execution_reward_func/std": 0.1237436905503273, "rewards/xml_reward_func": 0.9365234375, "rewards/xml_reward_func/std": 0.17953883111476898, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0440771349862259, "grad_norm": 0.47960862616824584, "kl": 2.4375, "learning_rate": 7.387534371007797e-07, "loss": 1.313, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0005356186302378774, "clip_ratio/low_min": 0.0005356186302378774, "clip_ratio/region_mean": 0.0005356186302378774, "completion_length": 466.75, "epoch": 0.04414600550964187, "grad_norm": 0.052521982244101706, "kl": 0.03955078125, "learning_rate": 7.37802304516818e-07, "loss": 0.0345, "reward": 4.367038726806641, "reward_std": 0.45011821389198303, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7400000095367432, "rewards/format_reward_func/std": 0.12282391637563705, "rewards/ngram_similarity_reward": 0.20344240963459015, "rewards/ngram_similarity_reward/std": 0.2694307863712311, "rewards/sql_execution_reward_func": 0.3218749761581421, "rewards/sql_execution_reward_func/std": 0.07954951375722885, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0002678093151189387, "clip_ratio/low_min": 0.0002678093151189387, "clip_ratio/region_mean": 0.0002678093151189387, "epoch": 0.04421487603305785, "grad_norm": 0.06471805044293549, "kl": 0.038818359375, "learning_rate": 7.368501549048099e-07, "loss": 0.0344, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 678.125, "epoch": 0.04428374655647383, "grad_norm": 0.039241540222239545, "kl": 1.2734375, "learning_rate": 7.358969934210438e-07, "loss": 0.0482, "reward": 2.9784042835235596, "reward_std": 0.16531403362751007, "rewards/accuracy_reward": 0.21875, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/format_reward_func": 0.7960144877433777, "rewards/format_reward_func/std": 0.1011463925242424, "rewards/ngram_similarity_reward": 0.1831873506307602, "rewards/ngram_similarity_reward/std": 0.1408328115940094, "rewards/sql_execution_reward_func": 0.4701087176799774, "rewards/sql_execution_reward_func/std": 0.31999436020851135, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04435261707988981, "grad_norm": 0.03750378193108055, "kl": 1.2578125, "learning_rate": 7.349428252272878e-07, "loss": 0.0482, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013763271272182465, "clip_ratio/low_min": 0.0013763271272182465, "clip_ratio/region_mean": 0.0013763271272182465, "completion_length": 635.75, "epoch": 0.04442148760330578, "grad_norm": 1.6225528358748025, "kl": 11.75, "learning_rate": 7.33987655490762e-07, "loss": 1.1329, "reward": 2.819016695022583, "reward_std": 1.462122917175293, "rewards/accuracy_reward": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward_func": 0.7666666507720947, "rewards/format_reward_func/std": 0.24944382905960083, "rewards/ngram_similarity_reward": 0.08969175815582275, "rewards/ngram_similarity_reward/std": 0.07617413252592087, "rewards/sql_execution_reward_func": 0.3084375262260437, "rewards/sql_execution_reward_func/std": 0.13051366806030273, "rewards/xml_reward_func": 0.859375, "rewards/xml_reward_func/std": 0.3499840497970581, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002556036226451397, "clip_ratio/low_min": 0.002556036226451397, "clip_ratio/region_mean": 0.002556036226451397, "epoch": 0.044490358126721766, "grad_norm": 1.343169888277821, "kl": 13.0, "learning_rate": 7.330314893841101e-07, "loss": 1.1323, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 1238.25, "epoch": 0.04455922865013774, "grad_norm": 0.06152660537460022, "kl": 1.046875, "learning_rate": 7.320743320853715e-07, "loss": 0.1163, "reward": 2.7123522758483887, "reward_std": 0.6363670229911804, "rewards/accuracy_reward": 0.15416666865348816, "rewards/accuracy_reward/std": 0.2007426768541336, "rewards/format_reward_func": 0.89083331823349, "rewards/format_reward_func/std": 0.12607316672801971, "rewards/ngram_similarity_reward": 0.2824363708496094, "rewards/ngram_similarity_reward/std": 0.2522357702255249, "rewards/sql_execution_reward_func": 0.08953125029802322, "rewards/sql_execution_reward_func/std": 0.10036527365446091, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 647 }, { "clip_ratio/high_max": 0.0001009489205898717, "clip_ratio/high_mean": 0.0001009489205898717, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001009489205898717, "epoch": 0.04462809917355372, "grad_norm": 0.06208605018192824, "kl": 1.0390625, "learning_rate": 7.311161887779533e-07, "loss": 0.1162, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 170.5, "epoch": 0.0446969696969697, "grad_norm": 0.306987873659295, "kl": 1.1328125, "learning_rate": 7.301570646506027e-07, "loss": -0.1296, "reward": 3.3510284423828125, "reward_std": 1.1438119411468506, "rewards/accuracy_reward": 0.503125011920929, "rewards/accuracy_reward/std": 0.5312447547912598, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.1511857807636261, "rewards/ngram_similarity_reward": 0.12568575143814087, "rewards/ngram_similarity_reward/std": 0.11960887908935547, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.044765840220385676, "grad_norm": 0.3056333897960261, "kl": 1.1328125, "learning_rate": 7.291969648973778e-07, "loss": -0.1295, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 159.5, "epoch": 0.04483471074380165, "grad_norm": 0.20799614013396112, "kl": 0.1064453125, "learning_rate": 7.282358947176205e-07, "loss": 0.3147, "reward": 5.44374942779541, "reward_std": 0.4313081204891205, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7000000476837158, "rewards/format_reward_func/std": 0.1511857956647873, "rewards/ngram_similarity_reward": 1.0, "rewards/ngram_similarity_reward/std": 0.0, "rewards/sql_execution_reward_func": 0.3062499761581421, "rewards/sql_execution_reward_func/std": 0.1237436905503273, "rewards/xml_reward_func": 0.9375, "rewards/xml_reward_func/std": 0.1767766922712326, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04490358126721763, "grad_norm": 0.20452137155925684, "kl": 0.11865234375, "learning_rate": 7.27273859315928e-07, "loss": 0.314, "step": 652 }, { "clip_ratio/high_max": 0.0006830600905232131, "clip_ratio/high_mean": 0.0006830600905232131, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006830600905232131, "completion_length": 183.0, "epoch": 0.04497245179063361, "grad_norm": 0.03363813228376703, "kl": 0.1328125, "learning_rate": 7.263108639021242e-07, "loss": -0.0109, "reward": 2.9153006076812744, "reward_std": 0.1223466619849205, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8083333373069763, "rewards/format_reward_func/std": 0.04272466152906418, "rewards/ngram_similarity_reward": 0.15881147980690002, "rewards/ngram_similarity_reward/std": 0.049701008945703506, "rewards/sql_execution_reward_func": 0.3687500059604645, "rewards/sql_execution_reward_func/std": 0.025877464562654495, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0006830600905232131, "clip_ratio/low_min": 0.0006830600905232131, "clip_ratio/region_mean": 0.0006830600905232131, "epoch": 0.04504132231404959, "grad_norm": 0.03558168189740455, "kl": 0.1298828125, "learning_rate": 7.253469136912325e-07, "loss": -0.0109, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 332.375, "epoch": 0.04511019283746556, "grad_norm": 0.2931369949442033, "kl": 3.03125, "learning_rate": 7.243820139034464e-07, "loss": -0.3425, "reward": 4.587669372558594, "reward_std": 1.121956467628479, "rewards/accuracy_reward": 0.625, "rewards/accuracy_reward/std": 0.40089187026023865, "rewards/format_reward_func": 0.8583333492279053, "rewards/format_reward_func/std": 0.117851123213768, "rewards/ngram_similarity_reward": 0.715390682220459, "rewards/ngram_similarity_reward/std": 0.26971229910850525, "rewards/sql_execution_reward_func": 0.40625, "rewards/sql_execution_reward_func/std": 0.04955155774950981, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.045179063360881545, "grad_norm": 0.2962341051267519, "kl": 3.390625, "learning_rate": 7.234161697641017e-07, "loss": -0.3413, "step": 656 }, { "clip_ratio/high_max": 0.0020120723638683558, "clip_ratio/high_mean": 0.0020120723638683558, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020120723638683558, "completion_length": 186.375, "epoch": 0.04524793388429752, "grad_norm": 0.15548676464648967, "kl": 0.55859375, "learning_rate": 7.224493865036488e-07, "loss": 0.0032, "reward": 3.7700612545013428, "reward_std": 0.5478720664978027, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7583333253860474, "rewards/format_reward_func/std": 0.1137387603521347, "rewards/ngram_similarity_reward": 0.7786518931388855, "rewards/ngram_similarity_reward/std": 0.31837329268455505, "rewards/sql_execution_reward_func": 0.34375, "rewards/sql_execution_reward_func/std": 0.06781013309955597, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 657 }, { "clip_ratio/high_max": 0.0006706907879561186, "clip_ratio/high_mean": 0.0006706907879561186, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006706907879561186, "epoch": 0.0453168044077135, "grad_norm": 0.15467778706706628, "kl": 0.55859375, "learning_rate": 7.214816693576234e-07, "loss": 0.0033, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 575.875, "epoch": 0.04538567493112948, "grad_norm": 0.11066462860357534, "kl": 2.84375, "learning_rate": 7.205130235666186e-07, "loss": 0.235, "reward": 3.0134620666503906, "reward_std": 0.5471258163452148, "rewards/accuracy_reward": 0.3247767686843872, "rewards/accuracy_reward/std": 0.21128520369529724, "rewards/format_reward_func": 0.8866666555404663, "rewards/format_reward_func/std": 0.07010196149349213, "rewards/ngram_similarity_reward": 0.1526850461959839, "rewards/ngram_similarity_reward/std": 0.079578697681427, "rewards/sql_execution_reward_func": 0.24821428954601288, "rewards/sql_execution_reward_func/std": 0.1471671313047409, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.045454545454545456, "grad_norm": 0.11356980618568517, "kl": 3.03125, "learning_rate": 7.195434543762566e-07, "loss": 0.2355, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 420.25, "epoch": 0.04552341597796143, "grad_norm": 0.09450323825693126, "kl": 0.427734375, "learning_rate": 7.185729670371604e-07, "loss": 0.1914, "reward": 2.915675163269043, "reward_std": 0.6716057658195496, "rewards/accuracy_reward": 0.2874999940395355, "rewards/accuracy_reward/std": 0.3053101599216461, "rewards/format_reward_func": 0.7716666460037231, "rewards/format_reward_func/std": 0.07523507624864578, "rewards/ngram_similarity_reward": 0.17100578546524048, "rewards/ngram_similarity_reward/std": 0.09604670852422714, "rewards/sql_execution_reward_func": 0.3125, "rewards/sql_execution_reward_func/std": 0.12747548520565033, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04559228650137741, "grad_norm": 0.0986643182406052, "kl": 0.404296875, "learning_rate": 7.176015668049247e-07, "loss": 0.1914, "step": 662 }, { "clip_ratio/high_max": 0.0005643340991809964, "clip_ratio/high_mean": 0.0005643340991809964, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005643340991809964, "completion_length": 221.5, "epoch": 0.04566115702479339, "grad_norm": 0.3556138739124025, "kl": 9.8125, "learning_rate": 7.166292589400883e-07, "loss": -0.1991, "reward": 4.227733612060547, "reward_std": 0.859535813331604, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.10690449178218842, "rewards/ngram_similarity_reward": 0.2309892773628235, "rewards/ngram_similarity_reward/std": 0.18194611370563507, "rewards/sql_execution_reward_func": 0.33125001192092896, "rewards/sql_execution_reward_func/std": 0.0530330091714859, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 663 }, { "clip_ratio/high_max": 0.0005643340991809964, "clip_ratio/high_mean": 0.0005643340991809964, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005643340991809964, "epoch": 0.045730027548209366, "grad_norm": 0.36761416658027507, "kl": 10.0625, "learning_rate": 7.156560487081051e-07, "loss": -0.1985, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 174.625, "epoch": 0.04579889807162534, "grad_norm": 0.08020782807962631, "kl": 1.2265625, "learning_rate": 7.146819413793154e-07, "loss": 0.0115, "reward": 3.2421059608459473, "reward_std": 0.46626555919647217, "rewards/accuracy_reward": 0.4312499761581421, "rewards/accuracy_reward/std": 0.2298097014427185, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.10690449178218842, "rewards/ngram_similarity_reward": 0.15307055413722992, "rewards/ngram_similarity_reward/std": 0.013588431291282177, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.045867768595041325, "grad_norm": 0.08098649316819756, "kl": 1.265625, "learning_rate": 7.137069422289181e-07, "loss": 0.0115, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0006555943982675672, "clip_ratio/low_min": 0.0006555943982675672, "clip_ratio/region_mean": 0.0006555943982675672, "completion_length": 572.0, "epoch": 0.0459366391184573, "grad_norm": 2.840652752854055, "kl": 32.0, "learning_rate": 7.127310565369415e-07, "loss": 2.9391, "reward": 4.671029090881348, "reward_std": 1.8430156707763672, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.7083333730697632, "rewards/format_reward_func/std": 0.21948714554309845, "rewards/ngram_similarity_reward": 0.7001307010650635, "rewards/ngram_similarity_reward/std": 0.36276480555534363, "rewards/sql_execution_reward_func": 0.2874999940395355, "rewards/sql_execution_reward_func/std": 0.12747548520565033, "rewards/xml_reward_func": 0.875, "rewards/xml_reward_func/std": 0.3535533845424652, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0004370629321783781, "clip_ratio/low_min": 0.0004370629321783781, "clip_ratio/region_mean": 0.0004370629321783781, "epoch": 0.04600550964187328, "grad_norm": 2.3743604764296746, "kl": 30.625, "learning_rate": 7.117542895882149e-07, "loss": 2.9399, "step": 668 }, { "clip_ratio/high_max": 0.0004484304809011519, "clip_ratio/high_mean": 0.0004484304809011519, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004484304809011519, "completion_length": 278.75, "epoch": 0.04607438016528926, "grad_norm": 0.2611579427864825, "kl": 0.0299072265625, "learning_rate": 7.107766466723397e-07, "loss": 0.2657, "reward": 4.554493427276611, "reward_std": 0.7957165241241455, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.75, "rewards/format_reward_func/std": 0.1414213478565216, "rewards/ngram_similarity_reward": 0.49049586057662964, "rewards/ngram_similarity_reward/std": 0.2050696462392807, "rewards/sql_execution_reward_func": 0.3187500238418579, "rewards/sql_execution_reward_func/std": 0.13076014816761017, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 669 }, { "clip_ratio/high_max": 0.0004484304809011519, "clip_ratio/high_mean": 0.0004484304809011519, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004484304809011519, "epoch": 0.046143250688705235, "grad_norm": 0.25776535896122355, "kl": 0.0299072265625, "learning_rate": 7.097981330836616e-07, "loss": 0.2661, "step": 670 }, { "clip_ratio/high_max": 0.0007225433364510536, "clip_ratio/high_mean": 0.0007225433364510536, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007225433364510536, "completion_length": 173.0, "epoch": 0.04621212121212121, "grad_norm": 0.08820400129870623, "kl": 0.51171875, "learning_rate": 7.08818754121241e-07, "loss": -0.0337, "reward": 5.621249675750732, "reward_std": 0.21256521344184875, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.815000057220459, "rewards/format_reward_func/std": 0.04242641106247902, "rewards/ngram_similarity_reward": 0.9583333730697632, "rewards/ngram_similarity_reward/std": 0.117851123213768, "rewards/sql_execution_reward_func": 0.3687499761581421, "rewards/sql_execution_reward_func/std": 0.0530330128967762, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 671 }, { "clip_ratio/high_max": 0.0007225433364510536, "clip_ratio/high_mean": 0.0007225433364510536, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007225433364510536, "epoch": 0.04628099173553719, "grad_norm": 0.08874619223338492, "kl": 0.54296875, "learning_rate": 7.078385150888246e-07, "loss": -0.034, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 393.875, "epoch": 0.04634986225895317, "grad_norm": 0.15648883059691143, "kl": 0.057861328125, "learning_rate": 7.068574212948169e-07, "loss": 0.0174, "reward": 3.276618719100952, "reward_std": 0.9315258264541626, "rewards/accuracy_reward": 0.47187501192092896, "rewards/accuracy_reward/std": 0.4482978880405426, "rewards/format_reward_func": 0.8374999761581421, "rewards/format_reward_func/std": 0.11189209669828415, "rewards/ngram_similarity_reward": 0.0983014702796936, "rewards/ngram_similarity_reward/std": 0.056946925818920135, "rewards/sql_execution_reward_func": 0.3687500059604645, "rewards/sql_execution_reward_func/std": 0.03720119222998619, "rewards/xml_reward_func": 0.9791666269302368, "rewards/xml_reward_func/std": 0.0589255727827549, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.046418732782369146, "grad_norm": 0.15457685905420698, "kl": 0.053955078125, "learning_rate": 7.058754780522511e-07, "loss": 0.0171, "step": 674 }, { "clip_ratio/high_max": 0.0002599428116809577, "clip_ratio/high_mean": 0.0002599428116809577, "clip_ratio/low_mean": 0.0008664760389365256, "clip_ratio/low_min": 0.0008664760389365256, "clip_ratio/region_mean": 0.0011264188215136528, "completion_length": 1442.625, "epoch": 0.04648760330578512, "grad_norm": 2.742523410020617, "kl": 21.0, "learning_rate": 7.048926906787609e-07, "loss": 1.2295, "reward": 2.4943957328796387, "reward_std": 1.6710635423660278, "rewards/accuracy_reward": 0.19955357909202576, "rewards/accuracy_reward/std": 0.3480287790298462, "rewards/format_reward_func": 0.65625, "rewards/format_reward_func/std": 0.30380359292030334, "rewards/ngram_similarity_reward": 0.3454069495201111, "rewards/ngram_similarity_reward/std": 0.32367610931396484, "rewards/sql_execution_reward_func": 0.2073863446712494, "rewards/sql_execution_reward_func/std": 0.13573312759399414, "rewards/xml_reward_func": 0.7135416269302368, "rewards/xml_reward_func/std": 0.445122092962265, "step": 675 }, { "clip_ratio/high_max": 0.00017329520778730512, "clip_ratio/high_mean": 0.00017329520778730512, "clip_ratio/low_mean": 0.0002599428116809577, "clip_ratio/low_min": 0.0002599428116809577, "clip_ratio/region_mean": 0.0004332380194682628, "epoch": 0.046556473829201105, "grad_norm": 2.3418666415819263, "kl": 19.5, "learning_rate": 7.039090644965509e-07, "loss": 1.2274, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 259.0, "epoch": 0.04662534435261708, "grad_norm": 0.20119162077206507, "kl": 0.26171875, "learning_rate": 7.029246048323686e-07, "loss": 0.0145, "reward": 3.8009791374206543, "reward_std": 0.9908252358436584, "rewards/accuracy_reward": 0.5051867365837097, "rewards/accuracy_reward/std": 0.4190922975540161, "rewards/format_reward_func": 0.7833333015441895, "rewards/format_reward_func/std": 0.077664315700531, "rewards/ngram_similarity_reward": 0.4631815254688263, "rewards/ngram_similarity_reward/std": 0.18862338364124298, "rewards/sql_execution_reward_func": 0.3125, "rewards/sql_execution_reward_func/std": 0.12747548520565033, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.046694214876033056, "grad_norm": 0.20115252686506624, "kl": 0.287109375, "learning_rate": 7.019393170174745e-07, "loss": 0.0153, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 428.625, "epoch": 0.04676308539944904, "grad_norm": 0.15848875800345022, "kl": 5.59375, "learning_rate": 7.009532063876148e-07, "loss": 1.1829, "reward": 4.283294200897217, "reward_std": 0.8436344265937805, "rewards/accuracy_reward": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward_func": 0.8176470994949341, "rewards/format_reward_func/std": 0.04991341754794121, "rewards/ngram_similarity_reward": 0.27538251876831055, "rewards/ngram_similarity_reward/std": 0.04729562997817993, "rewards/sql_execution_reward_func": 0.3062499761581421, "rewards/sql_execution_reward_func/std": 0.1237436905503273, "rewards/xml_reward_func": 0.9963235259056091, "rewards/xml_reward_func/std": 0.010398639366030693, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.046831955922865015, "grad_norm": 0.15713684960452115, "kl": 5.5, "learning_rate": 6.999662782829908e-07, "loss": 1.1827, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 245.0, "epoch": 0.04690082644628099, "grad_norm": 0.15591397279512167, "kl": 0.392578125, "learning_rate": 6.989785380482312e-07, "loss": 0.2227, "reward": 3.442603349685669, "reward_std": 0.699516773223877, "rewards/accuracy_reward": 0.53125, "rewards/accuracy_reward/std": 0.33905068039894104, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.10690449178218842, "rewards/ngram_similarity_reward": 0.14923548698425293, "rewards/ngram_similarity_reward/std": 0.03827813267707825, "rewards/sql_execution_reward_func": 0.35624998807907104, "rewards/sql_execution_reward_func/std": 0.01767767407000065, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04696969696969697, "grad_norm": 0.1587177305634696, "kl": 0.408203125, "learning_rate": 6.979899910323624e-07, "loss": 0.2225, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 179.75, "epoch": 0.04703856749311295, "grad_norm": 0.047752496805851155, "kl": 1.578125, "learning_rate": 6.970006425887804e-07, "loss": -0.0035, "reward": 2.752098798751831, "reward_std": 0.06916909664869308, "rewards/accuracy_reward": 0.25, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.7875000238418579, "rewards/format_reward_func/std": 0.08345228433609009, "rewards/ngram_similarity_reward": 0.08056584745645523, "rewards/ngram_similarity_reward/std": 0.06320854276418686, "rewards/sql_execution_reward_func": 0.34375, "rewards/sql_execution_reward_func/std": 0.017677662894129753, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 683 }, { "clip_ratio/high_max": 0.0006954103009775281, "clip_ratio/high_mean": 0.0006954103009775281, "clip_ratio/low_mean": 0.0006954103009775281, "clip_ratio/low_min": 0.0006954103009775281, "clip_ratio/region_mean": 0.0013908206019550562, "epoch": 0.047107438016528926, "grad_norm": 0.04469971162834287, "kl": 1.5, "learning_rate": 6.960104980752206e-07, "loss": -0.0037, "step": 684 }, { "clip_ratio/high_max": 0.0005005004932172596, "clip_ratio/high_mean": 0.0005005004932172596, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005005004932172596, "completion_length": 249.75, "epoch": 0.0471763085399449, "grad_norm": 0.41183846361879556, "kl": 16.25, "learning_rate": 6.950195628537299e-07, "loss": 0.0874, "reward": 4.31707239151001, "reward_std": 0.23369000852108002, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.8149999976158142, "rewards/format_reward_func/std": 0.12040935456752777, "rewards/ngram_similarity_reward": 0.16388171911239624, "rewards/ngram_similarity_reward/std": 0.023676924407482147, "rewards/sql_execution_reward_func": 0.2562499940395355, "rewards/sql_execution_reward_func/std": 0.1237436905503273, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.047245179063360884, "grad_norm": 0.43998989388428855, "kl": 17.25, "learning_rate": 6.940278422906372e-07, "loss": 0.0896, "step": 686 }, { "clip_ratio/high_max": 0.0001911315048346296, "clip_ratio/high_mean": 0.0001911315048346296, "clip_ratio/low_mean": 0.0001911315048346296, "clip_ratio/low_min": 0.0001911315048346296, "clip_ratio/region_mean": 0.0003822630096692592, "completion_length": 654.0, "epoch": 0.04731404958677686, "grad_norm": 0.9396519227976793, "kl": 2.234375, "learning_rate": 6.93035341756524e-07, "loss": 0.9194, "reward": 3.018392562866211, "reward_std": 1.1341310739517212, "rewards/accuracy_reward": 0.323271244764328, "rewards/accuracy_reward/std": 0.32856085896492004, "rewards/format_reward_func": 0.7241666316986084, "rewards/format_reward_func/std": 0.2625621557235718, "rewards/ngram_similarity_reward": 0.19567793607711792, "rewards/ngram_similarity_reward/std": 0.20244546234607697, "rewards/sql_execution_reward_func": 0.4749999940395355, "rewards/sql_execution_reward_func/std": 0.22360679507255554, "rewards/xml_reward_func": 0.8791666626930237, "rewards/xml_reward_func/std": 0.3417682647705078, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.047382920110192836, "grad_norm": 0.8945216085046584, "kl": 2.140625, "learning_rate": 6.920420666261961e-07, "loss": 0.9187, "step": 688 }, { "clip_ratio/high_max": 0.00010761946032289416, "clip_ratio/high_mean": 0.00010761946032289416, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00010761946032289416, "completion_length": 1161.5, "epoch": 0.04745179063360882, "grad_norm": 0.07640690307000082, "kl": 0.458984375, "learning_rate": 6.910480222786538e-07, "loss": 0.4954, "reward": 3.102653980255127, "reward_std": 1.108320713043213, "rewards/accuracy_reward": 0.34375, "rewards/accuracy_reward/std": 0.4212545156478882, "rewards/format_reward_func": 0.8675793409347534, "rewards/format_reward_func/std": 0.11478116363286972, "rewards/ngram_similarity_reward": 0.28426268696784973, "rewards/ngram_similarity_reward/std": 0.13241751492023468, "rewards/sql_execution_reward_func": 0.12812501192092896, "rewards/sql_execution_reward_func/std": 0.12495534867048264, "rewards/xml_reward_func": 0.9930555820465088, "rewards/xml_reward_func/std": 0.01964186504483223, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00010761946032289416, "clip_ratio/low_min": 0.00010761946032289416, "clip_ratio/region_mean": 0.00010761946032289416, "epoch": 0.047520661157024795, "grad_norm": 0.0766113676345282, "kl": 0.44140625, "learning_rate": 6.90053214097063e-07, "loss": 0.4955, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0007062146905809641, "clip_ratio/low_min": 0.0007062146905809641, "clip_ratio/region_mean": 0.0007062146905809641, "completion_length": 177.0, "epoch": 0.04758953168044077, "grad_norm": 0.30372036669424135, "kl": 0.0291748046875, "learning_rate": 6.890576474687263e-07, "loss": 0.0221, "reward": 4.19158411026001, "reward_std": 0.9905810356140137, "rewards/accuracy_reward": 0.9062560796737671, "rewards/accuracy_reward/std": 0.2651479244232178, "rewards/format_reward_func": 0.7083333730697632, "rewards/format_reward_func/std": 0.117851123213768, "rewards/ngram_similarity_reward": 0.2971590757369995, "rewards/ngram_similarity_reward/std": 0.35452330112457275, "rewards/sql_execution_reward_func": 0.22499999403953552, "rewards/sql_execution_reward_func/std": 0.18708287179470062, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.047658402203856746, "grad_norm": 0.3049096542583459, "kl": 0.0284423828125, "learning_rate": 6.880613277850536e-07, "loss": 0.022, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0002951158385258168, "clip_ratio/low_min": 0.0002951158385258168, "clip_ratio/region_mean": 0.0002951158385258168, "completion_length": 847.125, "epoch": 0.04772727272727273, "grad_norm": 0.1085620060055488, "kl": 5.5, "learning_rate": 6.870642604415324e-07, "loss": 0.5979, "reward": 3.6076154708862305, "reward_std": 1.0291496515274048, "rewards/accuracy_reward": 0.5625, "rewards/accuracy_reward/std": 0.4955156147480011, "rewards/format_reward_func": 0.8421874642372131, "rewards/format_reward_func/std": 0.12595738470554352, "rewards/ngram_similarity_reward": 0.1332923173904419, "rewards/ngram_similarity_reward/std": 0.16715434193611145, "rewards/sql_execution_reward_func": 0.4513709545135498, "rewards/sql_execution_reward_func/std": 0.5162844061851501, "rewards/xml_reward_func": 0.9891183376312256, "rewards/xml_reward_func/std": 0.02506817691028118, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0002951158385258168, "clip_ratio/low_min": 0.0002951158385258168, "clip_ratio/region_mean": 0.0002951158385258168, "epoch": 0.047796143250688705, "grad_norm": 0.10722733508796434, "kl": 5.25, "learning_rate": 6.860664508377001e-07, "loss": 0.5974, "step": 694 }, { "clip_ratio/high_max": 0.00021408691827673465, "clip_ratio/high_mean": 0.00021408691827673465, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021408691827673465, "completion_length": 583.875, "epoch": 0.04786501377410468, "grad_norm": 0.11843655331683488, "kl": 2.875, "learning_rate": 6.850679043771125e-07, "loss": -0.0518, "reward": 3.487352132797241, "reward_std": 0.7424463629722595, "rewards/accuracy_reward": 0.4229166507720947, "rewards/accuracy_reward/std": 0.364133358001709, "rewards/format_reward_func": 0.8920833468437195, "rewards/format_reward_func/std": 0.07883954793214798, "rewards/ngram_similarity_reward": 0.2892068922519684, "rewards/ngram_similarity_reward/std": 0.16521061956882477, "rewards/sql_execution_reward_func": 0.31562498211860657, "rewards/sql_execution_reward_func/std": 0.10601339489221573, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 695 }, { "clip_ratio/high_max": 0.00021408691827673465, "clip_ratio/high_mean": 0.00021408691827673465, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021408691827673465, "epoch": 0.047933884297520664, "grad_norm": 0.11917585057706222, "kl": 2.859375, "learning_rate": 6.840686264673168e-07, "loss": -0.0519, "step": 696 }, { "clip_ratio/high_max": 0.0009017132688313723, "clip_ratio/high_mean": 0.0009017132688313723, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009017132688313723, "completion_length": 277.25, "epoch": 0.04800275482093664, "grad_norm": 0.09615176865841688, "kl": 0.1240234375, "learning_rate": 6.83068622519821e-07, "loss": 0.0074, "reward": 4.286202430725098, "reward_std": 0.5108522772789001, "rewards/accuracy_reward": 0.8738937973976135, "rewards/accuracy_reward/std": 0.35311999917030334, "rewards/format_reward_func": 0.7875000238418579, "rewards/format_reward_func/std": 0.0353553481400013, "rewards/ngram_similarity_reward": 0.2672762870788574, "rewards/ngram_similarity_reward/std": 0.16546641290187836, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 697 }, { "clip_ratio/high_max": 0.0013525699032470584, "clip_ratio/high_mean": 0.0013525699032470584, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013525699032470584, "epoch": 0.048071625344352616, "grad_norm": 0.09487088475168491, "kl": 0.12890625, "learning_rate": 6.820678979500647e-07, "loss": 0.0073, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 125.625, "epoch": 0.0481404958677686, "grad_norm": 0.0008882939589757284, "kl": 0.046142578125, "learning_rate": 6.8106645817739e-07, "loss": 0.0001, "reward": 5.649999618530273, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward_func": 0.800000011920929, "rewards/format_reward_func/std": 0.0, "rewards/ngram_similarity_reward": 1.0, "rewards/ngram_similarity_reward/std": 0.0, "rewards/sql_execution_reward_func": 0.3499999940395355, "rewards/sql_execution_reward_func/std": 0.0, "rewards/xml_reward_func": 1.0, "rewards/xml_reward_func/std": 0.0, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.048209366391184574, "grad_norm": 0.0008863611753720057, "kl": 0.046875, "learning_rate": 6.800643086250121e-07, "loss": 0.0001, "step": 700 } ], "logging_steps": 1, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }