diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,4763 +10,4763 @@ "is_world_process_zero": true, "log_history": [ { - "completion_length": 242.4375, + "completion_length": 217.71875, "epoch": 0.022988505747126436, - "grad_norm": 3.6728482246398926, - "kl": 19.95627197623253, + "grad_norm": 6.136332988739014, + "kl": 28.61286038160324, "learning_rate": 3.3333333333333335e-07, - "loss": 0.02, - "reward": 25.379602193832397, - "reward_std": 2.151554364711046, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0286, + "reward": 22.77575373649597, + "reward_std": 5.079468797892332, + "rewards/concensus_correctness_reward_func": 17.607625007629395, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.8125, - "rewards/question_recreation_reward_func": 0.6676958426833153, + "rewards/final_correctness_reward_func": 1.375, + "rewards/question_recreation_reward_func": 0.5462847715243697, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.421875, - "rewards/xmlcount_reward_func": 1.1650312468409538, + "rewards/strict_format_reward_func": 0.375, + "rewards/xmlcount_reward_func": 1.1218437552452087, "step": 2 }, { - "completion_length": 259.40625, + "completion_length": 208.28125, "epoch": 0.04597701149425287, - "grad_norm": 5.615799427032471, - "kl": 10.355627089738846, + "grad_norm": 387.49139404296875, + "kl": 262.3089290857315, "learning_rate": 1.0000000000000002e-06, - "loss": 0.0104, - "reward": 25.00343418121338, - "reward_std": 2.216523587703705, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.2623, + "reward": 20.76919150352478, + "reward_std": 6.376348093152046, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.75, - "rewards/question_recreation_reward_func": 0.7419969402253628, + "rewards/final_correctness_reward_func": 1.5, + "rewards/question_recreation_reward_func": 0.5081292064860463, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.296875, - "rewards/xmlcount_reward_func": 0.9020624905824661, + "rewards/strict_format_reward_func": 0.390625, + "rewards/xmlcount_reward_func": 1.182937502861023, "step": 4 }, { - "completion_length": 213.15625, + "completion_length": 191.875, "epoch": 0.06896551724137931, - "grad_norm": 274.1349792480469, - "kl": 107.8401802778244, + "grad_norm": 6.798295021057129, + "kl": 40.90132433176041, "learning_rate": 1.6666666666666667e-06, - "loss": 0.1078, - "reward": 22.522634029388428, - "reward_std": 6.410943031311035, + "loss": 0.0409, + "reward": 22.820843935012817, + "reward_std": 6.094938464462757, "rewards/concensus_correctness_reward_func": 17.5, "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.47138405963778496, + "rewards/final_correctness_reward_func": 1.625, + "rewards/question_recreation_reward_func": 0.44506296049803495, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.34375, - "rewards/xmlcount_reward_func": 1.0825000032782555, + "rewards/strict_format_reward_func": 0.359375, + "rewards/xmlcount_reward_func": 1.1414062529802322, "step": 6 }, { - "completion_length": 270.8125, + "completion_length": 216.59375, "epoch": 0.09195402298850575, - "grad_norm": 7.805183410644531, - "kl": 11.126020669937134, + "grad_norm": 101.00456237792969, + "kl": 94.9591760635376, "learning_rate": 2.3333333333333336e-06, - "loss": 0.0111, - "reward": 24.234136819839478, - "reward_std": 3.5289988070726395, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.095, + "reward": 20.551639437675476, + "reward_std": 7.489186868071556, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.75, - "rewards/question_recreation_reward_func": 0.8078547269105911, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.5682645291090012, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.25, - "rewards/xmlcount_reward_func": 0.8012812491506338, + "rewards/strict_format_reward_func": 0.34375, + "rewards/xmlcount_reward_func": 1.0146250203251839, "step": 8 }, { - "completion_length": 243.90625, + "completion_length": 240.34375, "epoch": 0.11494252873563218, - "grad_norm": 3.9362494945526123, - "kl": 11.742099106311798, + "grad_norm": 11.220611572265625, + "kl": 76.46117895841599, "learning_rate": 3e-06, - "loss": 0.0117, - "reward": 21.274942755699158, - "reward_std": 3.5027630124241114, + "loss": 0.0765, + "reward": 21.56508183479309, + "reward_std": 5.051825501024723, "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.875, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.3125, - "rewards/question_recreation_reward_func": 0.6116619594395161, + "rewards/final_correctness_reward_func": 1.625, + "rewards/question_recreation_reward_func": 0.5634568184614182, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.328125, - "rewards/xmlcount_reward_func": 0.8976562558673322, + "rewards/strict_format_reward_func": 0.375, + "rewards/xmlcount_reward_func": 1.1266250014305115, "step": 10 }, { - "completion_length": 217.96875, + "completion_length": 193.71875, "epoch": 0.13793103448275862, - "grad_norm": 14.653650283813477, - "kl": 7.307641178369522, + "grad_norm": 10.303190231323242, + "kl": 39.27711087465286, "learning_rate": 3.6666666666666666e-06, - "loss": 0.0073, - "reward": 25.698487281799316, - "reward_std": 0.7415948412381113, - "rewards/concensus_correctness_reward_func": 20.0, - "rewards/consensus_reward_func": 2.0, + "loss": 0.0393, + "reward": 23.190983057022095, + "reward_std": 4.599271569401026, + "rewards/concensus_correctness_reward_func": 18.125, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.5625, - "rewards/question_recreation_reward_func": 0.5475498735904694, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.3624201933853328, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.4375, - "rewards/xmlcount_reward_func": 1.1509374976158142, + "rewards/strict_format_reward_func": 0.375, + "rewards/xmlcount_reward_func": 1.078562505543232, "step": 12 }, { - "completion_length": 230.5625, + "completion_length": 213.3125, "epoch": 0.16091954022988506, - "grad_norm": 9.731736183166504, - "kl": 14.61717875301838, + "grad_norm": 9.963440895080566, + "kl": 31.835605800151825, "learning_rate": 4.333333333333334e-06, - "loss": 0.0146, - "reward": 25.046069860458374, - "reward_std": 2.2694194354116917, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0318, + "reward": 24.39402174949646, + "reward_std": 3.7091122791171074, + "rewards/concensus_correctness_reward_func": 18.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.75, - "rewards/question_recreation_reward_func": 0.6386008784174919, + "rewards/question_recreation_reward_func": 0.5110838692635298, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.34375, - "rewards/xmlcount_reward_func": 1.0012187585234642, + "rewards/strict_format_reward_func": 0.375, + "rewards/xmlcount_reward_func": 1.1329374983906746, "step": 14 }, { - "completion_length": 222.375, + "completion_length": 202.3125, "epoch": 0.1839080459770115, - "grad_norm": 15.809603691101074, - "kl": 25.150070399045944, + "grad_norm": 10.270334243774414, + "kl": 37.28559869527817, "learning_rate": 5e-06, - "loss": 0.0252, - "reward": 24.169724464416504, - "reward_std": 2.6020519509911537, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0373, + "reward": 22.03032898902893, + "reward_std": 7.760255961678922, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.3775370594812557, + "rewards/final_correctness_reward_func": 1.6875, + "rewards/question_recreation_reward_func": 0.28717267932370305, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.265625, - "rewards/xmlcount_reward_func": 0.8390624970197678, + "rewards/strict_format_reward_func": 0.375, + "rewards/xmlcount_reward_func": 1.1181562393903732, "step": 16 }, { - "completion_length": 228.84375, + "completion_length": 217.53125, "epoch": 0.20689655172413793, - "grad_norm": 9.422584533691406, - "kl": 20.831802040338516, + "grad_norm": 10.648605346679688, + "kl": 109.65267598628998, "learning_rate": 4.99979021221458e-06, - "loss": 0.0208, - "reward": 24.46161651611328, - "reward_std": 2.3217524215579033, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.1097, + "reward": 19.966842859983444, + "reward_std": 5.144742712378502, + "rewards/concensus_correctness_reward_func": 15.0, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.75, - "rewards/question_recreation_reward_func": 0.6871792562305927, + "rewards/final_correctness_reward_func": 1.3125, + "rewards/question_recreation_reward_func": 0.50059272069484, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.359375, - "rewards/xmlcount_reward_func": 1.0400625094771385, + "rewards/strict_format_reward_func": 0.375, + "rewards/xmlcount_reward_func": 1.0912500023841858, "step": 18 }, { - "completion_length": 196.0, + "completion_length": 193.25, "epoch": 0.22988505747126436, - "grad_norm": 13.899862289428711, - "kl": 49.173040330410004, + "grad_norm": 17.73970603942871, + "kl": 25.79215371608734, "learning_rate": 4.999160884067051e-06, - "loss": 0.0492, - "reward": 20.844685345888138, - "reward_std": 3.637541137635708, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0258, + "reward": 24.090104341506958, + "reward_std": 3.4924329090863466, + "rewards/concensus_correctness_reward_func": 18.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.421060161665082, + "rewards/final_correctness_reward_func": 1.6875, + "rewards/question_recreation_reward_func": 0.3418852910399437, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.8767500147223473, + "rewards/strict_format_reward_func": 0.34375, + "rewards/xmlcount_reward_func": 1.0919687524437904, "step": 20 }, { - "completion_length": 221.40625, + "completion_length": 201.21875, "epoch": 0.25287356321839083, - "grad_norm": 21.375980377197266, - "kl": 32.62057054042816, + "grad_norm": 10.119827270507812, + "kl": 27.866609156131744, "learning_rate": 4.9981121211777e-06, - "loss": 0.0326, - "reward": 16.523541003465652, - "reward_std": 2.1763055473566055, - "rewards/concensus_correctness_reward_func": 11.971250057220459, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0279, + "reward": 18.610281109809875, + "reward_std": 2.490035969763994, + "rewards/concensus_correctness_reward_func": 13.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.5299474075436592, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.47462495043873787, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.25, - "rewards/xmlcount_reward_func": 0.8348437398672104, + "rewards/strict_format_reward_func": 0.34375, + "rewards/xmlcount_reward_func": 1.104406252503395, "step": 22 }, { - "completion_length": 241.46875, + "completion_length": 214.625, "epoch": 0.27586206896551724, - "grad_norm": 23.118202209472656, - "kl": 25.675336122512817, + "grad_norm": 13.692521095275879, + "kl": 50.94436889886856, "learning_rate": 4.9966440995606415e-06, - "loss": 0.0257, - "reward": 23.712936401367188, - "reward_std": 2.5800123661756516, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0509, + "reward": 20.463729798793793, + "reward_std": 4.765589613467455, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.9375, - "rewards/question_recreation_reward_func": 0.4845613092184067, + "rewards/final_correctness_reward_func": 1.25, + "rewards/question_recreation_reward_func": 0.4828234361484647, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.7752499952912331, + "rewards/strict_format_reward_func": 0.28125, + "rewards/xmlcount_reward_func": 1.0121562704443932, "step": 24 }, { - "completion_length": 210.3125, + "completion_length": 202.25, "epoch": 0.2988505747126437, - "grad_norm": 12.081917762756348, - "kl": 30.60751163959503, + "grad_norm": 23.34243392944336, + "kl": 80.25528079271317, "learning_rate": 4.99475706559428e-06, - "loss": 0.0306, - "reward": 25.52945041656494, - "reward_std": 1.3852368742227554, - "rewards/concensus_correctness_reward_func": 20.0, - "rewards/consensus_reward_func": 2.0, + "loss": 0.0803, + "reward": 21.706849575042725, + "reward_std": 7.795199394226074, + "rewards/concensus_correctness_reward_func": 16.355937480926514, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.625, - "rewards/question_recreation_reward_func": 0.7449815347790718, + "rewards/question_recreation_reward_func": 0.5832868814468384, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.28125, - "rewards/xmlcount_reward_func": 0.87821876257658, + "rewards/strict_format_reward_func": 0.375, + "rewards/xmlcount_reward_func": 1.1426249891519547, "step": 26 }, { - "completion_length": 228.03125, + "completion_length": 197.59375, "epoch": 0.3218390804597701, - "grad_norm": 10.75596809387207, - "kl": 25.778181672096252, + "grad_norm": 5.353348731994629, + "kl": 95.19727528095245, "learning_rate": 4.9924513359799555e-06, - "loss": 0.0258, - "reward": 24.685295820236206, - "reward_std": 1.4350444860756397, - "rewards/concensus_correctness_reward_func": 20.0, - "rewards/consensus_reward_func": 2.0, + "loss": 0.0952, + "reward": 21.77587953209877, + "reward_std": 2.390143619850278, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.6307649835944176, + "rewards/final_correctness_reward_func": 1.25, + "rewards/question_recreation_reward_func": 0.45784896763507277, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.7264062408357859, + "rewards/strict_format_reward_func": 0.328125, + "rewards/xmlcount_reward_func": 1.0524062439799309, "step": 28 }, { - "completion_length": 218.0625, + "completion_length": 200.46875, "epoch": 0.3448275862068966, - "grad_norm": 14.352238655090332, - "kl": 40.82291865348816, + "grad_norm": 17.243806838989258, + "kl": 41.76018446683884, "learning_rate": 4.989727297688797e-06, - "loss": 0.0408, - "reward": 23.753162384033203, - "reward_std": 3.8224976174533367, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0418, + "reward": 22.72126531600952, + "reward_std": 5.278627224266529, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.4375, - "rewards/question_recreation_reward_func": 0.519255805760622, + "rewards/final_correctness_reward_func": 1.625, + "rewards/question_recreation_reward_func": 0.561546940356493, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.28125, - "rewards/xmlcount_reward_func": 0.8901562467217445, + "rewards/xmlcount_reward_func": 1.0034687593579292, "step": 30 }, { - "completion_length": 228.0625, + "completion_length": 217.125, "epoch": 0.367816091954023, - "grad_norm": 22.21926498413086, - "kl": 29.966318130493164, + "grad_norm": 8.104058265686035, + "kl": 42.766675651073456, "learning_rate": 4.9865854078967715e-06, - "loss": 0.03, - "reward": 24.414711952209473, - "reward_std": 2.707362122833729, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0428, + "reward": 23.74161386489868, + "reward_std": 4.766659207642078, + "rewards/concensus_correctness_reward_func": 18.125, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.5625, - "rewards/question_recreation_reward_func": 0.4653061218559742, + "rewards/final_correctness_reward_func": 1.875, + "rewards/question_recreation_reward_func": 0.47595758736133575, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.25, - "rewards/xmlcount_reward_func": 0.8244062513113022, + "rewards/strict_format_reward_func": 0.359375, + "rewards/xmlcount_reward_func": 1.093781255185604, "step": 32 }, { - "completion_length": 213.0625, + "completion_length": 200.09375, "epoch": 0.39080459770114945, - "grad_norm": 9.519495010375977, - "kl": 58.684278786182404, + "grad_norm": 71.83502960205078, + "kl": 83.02343189716339, "learning_rate": 4.983026193907962e-06, - "loss": 0.0587, - "reward": 21.83285140991211, - "reward_std": 5.531425788998604, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.083, + "reward": 21.505411505699158, + "reward_std": 6.353981636464596, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.6299141198396683, + "rewards/final_correctness_reward_func": 1.75, + "rewards/question_recreation_reward_func": 0.4588801860809326, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.3125, - "rewards/xmlcount_reward_func": 0.952937513589859, + "rewards/strict_format_reward_func": 0.34375, + "rewards/xmlcount_reward_func": 1.0777812600135803, "step": 34 }, { - "completion_length": 225.8125, + "completion_length": 221.8125, "epoch": 0.41379310344827586, - "grad_norm": 6.312277793884277, - "kl": 15.610382616519928, + "grad_norm": 9.259599685668945, + "kl": 26.05735284090042, "learning_rate": 4.979050253066064e-06, - "loss": 0.0156, - "reward": 25.50584077835083, - "reward_std": 0.7576818238012493, - "rewards/concensus_correctness_reward_func": 20.0, - "rewards/consensus_reward_func": 2.0, + "loss": 0.0261, + "reward": 24.350895881652832, + "reward_std": 3.1849860176444054, + "rewards/concensus_correctness_reward_func": 18.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.625, - "rewards/question_recreation_reward_func": 0.6116535775363445, + "rewards/final_correctness_reward_func": 1.75, + "rewards/question_recreation_reward_func": 0.5265518706291914, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.296875, - "rewards/xmlcount_reward_func": 0.9723125100135803, + "rewards/strict_format_reward_func": 0.375, + "rewards/xmlcount_reward_func": 1.0743437483906746, "step": 36 }, { - "completion_length": 211.375, + "completion_length": 188.78125, "epoch": 0.4367816091954023, - "grad_norm": 7.8186540603637695, - "kl": 26.190697014331818, + "grad_norm": 62.86690902709961, + "kl": 154.00717759132385, "learning_rate": 4.974658252654135e-06, - "loss": 0.0262, - "reward": 23.17328453063965, - "reward_std": 3.480032216757536, - "rewards/concensus_correctness_reward_func": 18.233500003814697, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.154, + "reward": 21.038205862045288, + "reward_std": 8.54198712669313, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.5577222537249327, + "rewards/final_correctness_reward_func": 1.5, + "rewards/question_recreation_reward_func": 0.39492490515112877, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.328125, - "rewards/xmlcount_reward_func": 0.9914375022053719, + "rewards/strict_format_reward_func": 0.28125, + "rewards/xmlcount_reward_func": 0.987031240016222, "step": 38 }, { - "completion_length": 216.5, + "completion_length": 188.8125, "epoch": 0.45977011494252873, - "grad_norm": 16.604644775390625, - "kl": 16.470413744449615, + "grad_norm": 18.53998565673828, + "kl": 53.8519631922245, "learning_rate": 4.96985092978261e-06, - "loss": 0.0165, - "reward": 24.73129940032959, - "reward_std": 2.5517977736890316, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0539, + "reward": 23.21538233757019, + "reward_std": 4.211573759093881, + "rewards/concensus_correctness_reward_func": 18.125, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.5, - "rewards/question_recreation_reward_func": 0.6332056820392609, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.49650686560198665, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.3125, - "rewards/xmlcount_reward_func": 0.973093755543232, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.0470000207424164, "step": 40 }, { - "completion_length": 223.71875, + "completion_length": 184.03125, "epoch": 0.4827586206896552, - "grad_norm": 10.310501098632812, - "kl": 30.0731418132782, + "grad_norm": 10.990641593933105, + "kl": 43.21287852525711, "learning_rate": 4.964629091265583e-06, - "loss": 0.0301, - "reward": 23.894561529159546, - "reward_std": 3.7148748748004436, - "rewards/concensus_correctness_reward_func": 18.125, + "loss": 0.0432, + "reward": 23.595171213150024, + "reward_std": 4.6664072424173355, + "rewards/concensus_correctness_reward_func": 18.230375051498413, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.8125, - "rewards/question_recreation_reward_func": 0.7709678262472153, + "rewards/final_correctness_reward_func": 1.625, + "rewards/question_recreation_reward_func": 0.4962649494409561, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, - "rewards/xmlcount_reward_func": 1.0142187550663948, + "rewards/xmlcount_reward_func": 1.0716562569141388, "step": 42 }, { - "completion_length": 235.75, + "completion_length": 191.96875, "epoch": 0.5057471264367817, - "grad_norm": 9.031312942504883, - "kl": 12.975820362567902, + "grad_norm": 13.650006294250488, + "kl": 78.18545603752136, "learning_rate": 4.958993613485406e-06, - "loss": 0.013, - "reward": 22.29155457019806, - "reward_std": 1.0935805514454842, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 2.0, + "loss": 0.0782, + "reward": 19.51852649450302, + "reward_std": 6.45064403116703, + "rewards/concensus_correctness_reward_func": 15.0, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.49049233738332987, + "rewards/final_correctness_reward_func": 1.25, + "rewards/question_recreation_reward_func": 0.35677624261006713, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.7229375056922436, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 0.9586249962449074, "step": 44 }, { - "completion_length": 249.0, + "completion_length": 217.34375, "epoch": 0.5287356321839081, - "grad_norm": 5.135012149810791, - "kl": 11.88379555940628, + "grad_norm": 5.288812637329102, + "kl": 54.3885834813118, "learning_rate": 4.952945442245598e-06, - "loss": 0.0119, - "reward": 24.714513301849365, - "reward_std": 2.3772747572511435, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0544, + "reward": 21.830896377563477, + "reward_std": 6.645906233577989, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.625, - "rewards/question_recreation_reward_func": 0.6560757644474506, + "rewards/final_correctness_reward_func": 1.8125, + "rewards/question_recreation_reward_func": 0.6821776069700718, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.28125, - "rewards/xmlcount_reward_func": 0.8396875038743019, + "rewards/strict_format_reward_func": 0.359375, + "rewards/xmlcount_reward_func": 1.1018437445163727, "step": 46 }, { - "completion_length": 194.3125, + "completion_length": 189.125, "epoch": 0.5517241379310345, - "grad_norm": 18.41168785095215, - "kl": 21.791800498962402, + "grad_norm": 12.947136878967285, + "kl": 26.700558066368103, "learning_rate": 4.946485592612122e-06, - "loss": 0.0218, - "reward": 25.104676246643066, - "reward_std": 2.292080797255039, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0267, + "reward": 24.079298973083496, + "reward_std": 3.6222429275512695, + "rewards/concensus_correctness_reward_func": 18.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.6875, - "rewards/question_recreation_reward_func": 0.7222071662545204, + "rewards/final_correctness_reward_func": 1.5625, + "rewards/question_recreation_reward_func": 0.49670528806746006, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.359375, - "rewards/xmlcount_reward_func": 1.0230937451124191, + "rewards/strict_format_reward_func": 0.34375, + "rewards/xmlcount_reward_func": 1.051343746483326, "step": 48 }, { - "completion_length": 275.6875, + "completion_length": 224.1875, "epoch": 0.5747126436781609, - "grad_norm": 4.9017438888549805, - "kl": 7.33440038561821, + "grad_norm": 11.20506477355957, + "kl": 43.48121893405914, "learning_rate": 4.939615148743017e-06, - "loss": 0.0073, - "reward": 25.011138439178467, - "reward_std": 0.9395671505481005, - "rewards/concensus_correctness_reward_func": 20.0, - "rewards/consensus_reward_func": 2.0, + "loss": 0.0435, + "reward": 22.49175500869751, + "reward_std": 5.263124026358128, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.5, - "rewards/question_recreation_reward_func": 0.5479508824646473, + "rewards/question_recreation_reward_func": 0.4272235818207264, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.25, - "rewards/xmlcount_reward_func": 0.713187500834465, + "rewards/strict_format_reward_func": 0.328125, + "rewards/xmlcount_reward_func": 0.9864062368869781, "step": 50 }, { - "completion_length": 209.78125, + "completion_length": 196.3125, "epoch": 0.5977011494252874, - "grad_norm": 7.403425216674805, - "kl": 19.087459594011307, + "grad_norm": 6.8515849113464355, + "kl": 27.352444291114807, "learning_rate": 4.932335263706446e-06, - "loss": 0.0191, - "reward": 23.86680769920349, - "reward_std": 3.916817881166935, + "loss": 0.0274, + "reward": 23.868361473083496, + "reward_std": 3.8520730920135975, "rewards/concensus_correctness_reward_func": 18.75, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.5, - "rewards/question_recreation_reward_func": 0.6227765157818794, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.40692434273660183, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.884656261652708, + "rewards/strict_format_reward_func": 0.34375, + "rewards/xmlcount_reward_func": 1.0551875084638596, "step": 52 }, { - "completion_length": 285.03125, + "completion_length": 211.9375, "epoch": 0.6206896551724138, - "grad_norm": 13.203872680664062, - "kl": 32.85868415236473, + "grad_norm": 7.682624340057373, + "kl": 26.0407857298851, "learning_rate": 4.924647159287176e-06, - "loss": 0.0329, - "reward": 20.861812978982925, - "reward_std": 3.7377005480229855, + "loss": 0.026, + "reward": 20.956018179655075, + "reward_std": 4.0980407781898975, "rewards/concensus_correctness_reward_func": 16.25, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.7428129985928535, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.4677678346633911, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.681500006467104, + "rewards/strict_format_reward_func": 0.328125, + "rewards/xmlcount_reward_func": 0.9726249985396862, "step": 54 }, { - "completion_length": 242.0625, + "completion_length": 203.59375, "epoch": 0.6436781609195402, - "grad_norm": 9.020729064941406, - "kl": 16.789406299591064, + "grad_norm": 8.712162017822266, + "kl": 86.18911600112915, "learning_rate": 4.916552125781529e-06, - "loss": 0.0168, - "reward": 24.937540531158447, - "reward_std": 2.31194556504488, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0862, + "reward": 20.785347312688828, + "reward_std": 3.7248302586376667, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.8125, - "rewards/question_recreation_reward_func": 0.6788536943495274, + "rewards/final_correctness_reward_func": 1.375, + "rewards/question_recreation_reward_func": 0.47069100104272366, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.265625, - "rewards/xmlcount_reward_func": 0.8680625036358833, + "rewards/strict_format_reward_func": 0.390625, + "rewards/xmlcount_reward_func": 1.111531250178814, "step": 56 }, { - "completion_length": 221.75, + "completion_length": 187.90625, "epoch": 0.6666666666666666, - "grad_norm": 98.6040267944336, - "kl": 104.94103610515594, + "grad_norm": 93.21127319335938, + "kl": 86.80950963497162, "learning_rate": 4.908051521780824e-06, - "loss": 0.1049, - "reward": 22.072678089141846, - "reward_std": 5.953763127326965, - "rewards/concensus_correctness_reward_func": 17.5, + "loss": 0.0868, + "reward": 19.448628664016724, + "reward_std": 5.2405555211007595, + "rewards/concensus_correctness_reward_func": 15.0, "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.4375, - "rewards/question_recreation_reward_func": 0.4923659525811672, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.3553788308054209, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.7209374979138374, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 0.8901250138878822, "step": 58 }, { - "completion_length": 218.0625, + "completion_length": 209.1875, "epoch": 0.6896551724137931, - "grad_norm": 13.128256797790527, - "kl": 28.192466586828232, + "grad_norm": 13.603555679321289, + "kl": 56.15667283535004, "learning_rate": 4.899146773943374e-06, - "loss": 0.0282, - "reward": 24.061665534973145, - "reward_std": 3.3491099663078785, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0562, + "reward": 22.912687063217163, + "reward_std": 6.153905943036079, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.6875, - "rewards/question_recreation_reward_func": 0.5538214761763811, + "rewards/question_recreation_reward_func": 0.4873118605464697, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.28125, - "rewards/xmlcount_reward_func": 0.9140937626361847, + "rewards/strict_format_reward_func": 0.359375, + "rewards/xmlcount_reward_func": 1.1285000070929527, "step": 60 }, { - "completion_length": 274.84375, + "completion_length": 207.5, "epoch": 0.7126436781609196, - "grad_norm": 11.071130752563477, - "kl": 9.974729806184769, + "grad_norm": 918.2474365234375, + "kl": 493.683313369751, "learning_rate": 4.889839376755041e-06, - "loss": 0.01, - "reward": 24.2986581325531, - "reward_std": 2.6573511250317097, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.4937, + "reward": 22.144460201263428, + "reward_std": 6.169473327696323, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.7163765728473663, + "rewards/final_correctness_reward_func": 1.125, + "rewards/question_recreation_reward_func": 0.3845224855467677, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.7072812579572201, + "rewards/strict_format_reward_func": 0.328125, + "rewards/xmlcount_reward_func": 1.0568125024437904, "step": 62 }, { - "completion_length": 232.78125, + "completion_length": 185.8125, "epoch": 0.735632183908046, - "grad_norm": 7.177722930908203, - "kl": 38.97461545467377, + "grad_norm": 16.803300857543945, + "kl": 61.10732626914978, "learning_rate": 4.88013089227842e-06, - "loss": 0.039, - "reward": 23.83961820602417, - "reward_std": 3.365953803062439, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0611, + "reward": 21.13002872467041, + "reward_std": 6.309571877121925, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.5625, - "rewards/question_recreation_reward_func": 0.5843994542956352, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.36243475414812565, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.8333437442779541, + "rewards/xmlcount_reward_func": 0.908218752592802, "step": 64 }, { - "completion_length": 210.375, + "completion_length": 199.71875, "epoch": 0.7586206896551724, - "grad_norm": 20.464805603027344, - "kl": 44.45508390665054, + "grad_norm": 11.177184104919434, + "kl": 55.20525595545769, "learning_rate": 4.870022949890676e-06, - "loss": 0.0445, - "reward": 19.302052199840546, - "reward_std": 6.476515740156174, - "rewards/concensus_correctness_reward_func": 15.0, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0552, + "reward": 20.911750078201294, + "reward_std": 3.589975042268634, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.5226769391447306, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.43875031918287277, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.6856249999254942, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 1.0355000048875809, "step": 66 }, { - "completion_length": 219.78125, + "completion_length": 186.375, "epoch": 0.7816091954022989, - "grad_norm": 9.038942337036133, - "kl": 13.816026866436005, + "grad_norm": 12.151446342468262, + "kl": 50.589539647102356, "learning_rate": 4.8595172460100914e-06, - "loss": 0.0138, - "reward": 22.137227416038513, - "reward_std": 2.6292380541563034, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0506, + "reward": 20.125344455242157, + "reward_std": 4.952287957072258, + "rewards/concensus_correctness_reward_func": 15.0, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.3125, - "rewards/question_recreation_reward_func": 0.7442276887595654, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.5653753876686096, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.34375, - "rewards/xmlcount_reward_func": 0.986749991774559, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 1.059968739748001, "step": 68 }, { - "completion_length": 216.21875, + "completion_length": 185.34375, "epoch": 0.8045977011494253, - "grad_norm": 14.165794372558594, - "kl": 27.829581797122955, + "grad_norm": 14.14169979095459, + "kl": 51.352187156677246, "learning_rate": 4.8486155438113455e-06, - "loss": 0.0278, - "reward": 23.852848052978516, - "reward_std": 3.9582923725247383, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0514, + "reward": 21.336617946624756, + "reward_std": 7.90215827524662, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.5625, - "rewards/question_recreation_reward_func": 0.6814103033393621, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.38986811926588416, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.780812494456768, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 0.9467500001192093, "step": 70 }, { - "completion_length": 245.5625, + "completion_length": 207.53125, "epoch": 0.8275862068965517, - "grad_norm": 12209.876953125, - "kl": 3094.170797765255, + "grad_norm": 9.242708206176758, + "kl": 875.6028162240982, "learning_rate": 4.837319672929606e-06, - "loss": 3.0942, - "reward": 17.15046799182892, - "reward_std": 0.9417933747172356, + "loss": 0.8756, + "reward": 16.43748128414154, + "reward_std": 5.550746850669384, "rewards/concensus_correctness_reward_func": 12.5, - "rewards/consensus_reward_func": 2.0, + "rewards/consensus_reward_func": 1.5, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.6844676919281483, + "rewards/final_correctness_reward_func": 0.8125, + "rewards/question_recreation_reward_func": 0.4091057376936078, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.7472500130534172, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 0.950250007212162, "step": 72 }, { - "completion_length": 209.71875, + "completion_length": 190.6875, "epoch": 0.8505747126436781, - "grad_norm": 26.67466163635254, - "kl": 1128.5409244894981, + "grad_norm": 8.704197883605957, + "kl": 40.18674683570862, "learning_rate": 4.825631529153466e-06, - "loss": 1.1285, - "reward": 23.494718074798584, - "reward_std": 3.8775581642985344, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0402, + "reward": 24.393526554107666, + "reward_std": 2.5207132399082184, + "rewards/concensus_correctness_reward_func": 19.375, + "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.3125, - "rewards/question_recreation_reward_func": 0.5453430451452732, + "rewards/final_correctness_reward_func": 1.375, + "rewards/question_recreation_reward_func": 0.48490171786397696, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.7931250110268593, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 0.9711250066757202, "step": 74 }, { - "completion_length": 238.6875, + "completion_length": 191.9375, "epoch": 0.8735632183908046, - "grad_norm": 5.364561557769775, - "kl": 28.607876002788544, + "grad_norm": 13.749418258666992, + "kl": 24.448551893234253, "learning_rate": 4.813553074106761e-06, - "loss": 0.0286, - "reward": 24.229939222335815, - "reward_std": 3.5234772451221943, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0244, + "reward": 22.90774178504944, + "reward_std": 5.314309269189835, + "rewards/concensus_correctness_reward_func": 18.125, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.6875, - "rewards/question_recreation_reward_func": 0.7158452719449997, + "rewards/final_correctness_reward_func": 1.3125, + "rewards/question_recreation_reward_func": 0.45577304251492023, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.28125, - "rewards/xmlcount_reward_func": 0.9203437454998493, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 0.9363437741994858, "step": 76 }, { - "completion_length": 216.0, + "completion_length": 188.75, "epoch": 0.896551724137931, - "grad_norm": 8.996578216552734, - "kl": 25.39380121231079, + "grad_norm": 353.81317138671875, + "kl": 243.70828318595886, "learning_rate": 4.8010863349193605e-06, - "loss": 0.0254, - "reward": 22.055159270763397, - "reward_std": 2.3475461043417454, + "loss": 0.2437, + "reward": 21.497364372015, + "reward_std": 2.2528337808325887, "rewards/concensus_correctness_reward_func": 16.875, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.4375, - "rewards/question_recreation_reward_func": 0.6199715062975883, + "rewards/final_correctness_reward_func": 1.3125, + "rewards/question_recreation_reward_func": 0.29967684391885996, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.28125, - "rewards/xmlcount_reward_func": 0.9039375185966492, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 0.8383125029504299, "step": 78 }, { - "completion_length": 185.84375, + "completion_length": 161.875, "epoch": 0.9195402298850575, - "grad_norm": 9.201372146606445, - "kl": 29.816796958446503, + "grad_norm": 13.91150188446045, + "kl": 91.42959940433502, "learning_rate": 4.78823340388695e-06, - "loss": 0.0298, - "reward": 24.574794054031372, - "reward_std": 2.6594865694642067, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0914, + "reward": 20.54653775691986, + "reward_std": 6.705092180520296, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.5, - "rewards/question_recreation_reward_func": 0.6716381013393402, + "rewards/final_correctness_reward_func": 1.5625, + "rewards/question_recreation_reward_func": 0.47453783079981804, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.8719062432646751, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.0251249969005585, "step": 80 }, { - "completion_length": 220.0625, + "completion_length": 169.46875, "epoch": 0.9425287356321839, - "grad_norm": 7.937164783477783, - "kl": 41.743319630622864, + "grad_norm": 17.188884735107422, + "kl": 56.007208824157715, "learning_rate": 4.774996438119876e-06, - "loss": 0.0417, - "reward": 20.94010215997696, - "reward_std": 3.939913149923086, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.875, + "loss": 0.056, + "reward": 22.28032612800598, + "reward_std": 5.097578555345535, + "rewards/concensus_correctness_reward_func": 18.125, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.636320635676384, + "rewards/final_correctness_reward_func": 0.875, + "rewards/question_recreation_reward_func": 0.3877011202275753, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.8819062560796738, + "rewards/strict_format_reward_func": 0.171875, + "rewards/xmlcount_reward_func": 0.9082500115036964, "step": 82 }, { - "completion_length": 233.375, + "completion_length": 179.96875, "epoch": 0.9655172413793104, - "grad_norm": 6.468586444854736, - "kl": 47.76901078224182, + "grad_norm": 49.99221420288086, + "kl": 73.02854263782501, "learning_rate": 4.76137765918113e-06, - "loss": 0.0478, - "reward": 22.270709812641144, - "reward_std": 0.9251220151782036, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 2.0, + "loss": 0.073, + "reward": 18.865897297859192, + "reward_std": 4.865620896220207, + "rewards/concensus_correctness_reward_func": 14.375, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.6278662383556366, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.5054600099101663, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.7365937465801835, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.0010625049471855, "step": 84 }, { - "completion_length": 224.34375, + "completion_length": 208.25, "epoch": 0.9885057471264368, - "grad_norm": 139.11477661132812, - "kl": 91.68885004520416, + "grad_norm": 32.325984954833984, + "kl": 101.60490530729294, "learning_rate": 4.747379352713489e-06, - "loss": 0.0917, - "reward": 22.105460166931152, - "reward_std": 5.359827175736427, - "rewards/concensus_correctness_reward_func": 17.592937469482422, - "rewards/consensus_reward_func": 1.75, + "loss": 0.1016, + "reward": 22.008362293243408, + "reward_std": 6.7211711667478085, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.6534290499985218, + "rewards/final_correctness_reward_func": 1.5625, + "rewards/question_recreation_reward_func": 0.5652999263256788, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.7028437554836273, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.0211875140666962, "step": 86 }, { - "completion_length": 175.53125, + "completion_length": 189.75, "epoch": 1.0114942528735633, - "grad_norm": 91946.9140625, - "kl": 83508.0074467659, + "grad_norm": 9.05334758758545, + "kl": 29.934446334838867, "learning_rate": 4.733003868055923e-06, - "loss": 83.508, - "reward": 23.602487087249756, - "reward_std": 2.6276179775595665, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0299, + "reward": 22.84878945350647, + "reward_std": 4.233507685363293, + "rewards/concensus_correctness_reward_func": 18.125, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.9375, - "rewards/question_recreation_reward_func": 0.5961430482566357, + "rewards/final_correctness_reward_func": 1.25, + "rewards/question_recreation_reward_func": 0.457132913172245, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.09375, - "rewards/xmlcount_reward_func": 0.6625937428325415, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 0.954156257212162, "step": 88 }, { - "completion_length": 185.75, + "completion_length": 169.21875, "epoch": 1.0344827586206897, - "grad_norm": 39.4473876953125, - "kl": 78.68558597564697, + "grad_norm": 11.377214431762695, + "kl": 69.49682199954987, "learning_rate": 4.718253617849306e-06, - "loss": 0.0787, - "reward": 19.62389823794365, - "reward_std": 5.379150470718741, - "rewards/concensus_correctness_reward_func": 15.625, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0695, + "reward": 20.761705189943314, + "reward_std": 2.2006406001746655, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.5167729863896966, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.367174091283232, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.09375, - "rewards/xmlcount_reward_func": 0.638374999165535, + "rewards/strict_format_reward_func": 0.1875, + "rewards/xmlcount_reward_func": 0.89453125, "step": 90 }, { - "completion_length": 220.125, + "completion_length": 196.5625, "epoch": 1.0574712643678161, - "grad_norm": 38.921142578125, - "kl": 66.71249175071716, + "grad_norm": 16.58251190185547, + "kl": 829.0935171246529, "learning_rate": 4.703131077631498e-06, - "loss": 0.0667, - "reward": 21.66197657585144, - "reward_std": 4.126878224313259, + "loss": 0.8291, + "reward": 22.088229417800903, + "reward_std": 6.390710741281509, "rewards/concensus_correctness_reward_func": 17.5, "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.5617266856133938, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.4568858038401231, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.109375, - "rewards/xmlcount_reward_func": 0.6783750038594007, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 0.9594687595963478, "step": 92 }, { - "completion_length": 197.375, + "completion_length": 181.84375, "epoch": 1.0804597701149425, - "grad_norm": 10.641013145446777, - "kl": 190.54250866174698, + "grad_norm": 12.092888832092285, + "kl": 37.34927952289581, "learning_rate": 4.687638785421875e-06, - "loss": 0.1905, - "reward": 20.121814846992493, - "reward_std": 5.043535389006138, - "rewards/concensus_correctness_reward_func": 15.625, + "loss": 0.0373, + "reward": 20.9707533121109, + "reward_std": 3.6189417205750942, + "rewards/concensus_correctness_reward_func": 16.25, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.4635026268661022, + "rewards/final_correctness_reward_func": 1.3125, + "rewards/question_recreation_reward_func": 0.4045972768217325, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.8614374995231628, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 0.9411562457680702, "step": 94 }, { - "completion_length": 192.59375, + "completion_length": 188.75, "epoch": 1.103448275862069, - "grad_norm": 19.671062469482422, - "kl": 190.34782361984253, + "grad_norm": 12.247062683105469, + "kl": 34.96235263347626, "learning_rate": 4.671779341295378e-06, - "loss": 0.1903, - "reward": 20.42128562927246, - "reward_std": 8.864537373185158, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.625, + "loss": 0.035, + "reward": 24.552756786346436, + "reward_std": 2.6889688037335873, + "rewards/concensus_correctness_reward_func": 19.375, + "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.5986606515944004, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.4201321694999933, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.744499996304512, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 1.0701250061392784, "step": 96 }, { - "completion_length": 185.90625, + "completion_length": 172.53125, "epoch": 1.1264367816091954, - "grad_norm": 22.701616287231445, - "kl": 36.03865575790405, + "grad_norm": 39.397090911865234, + "kl": 84.93289303779602, "learning_rate": 4.655555406946135e-06, - "loss": 0.036, - "reward": 20.58166015148163, - "reward_std": 3.9698376022279263, + "loss": 0.0849, + "reward": 20.53111296892166, + "reward_std": 6.4694149270653725, "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.875, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.9375, - "rewards/question_recreation_reward_func": 0.5459100110456347, + "rewards/final_correctness_reward_func": 1.125, + "rewards/question_recreation_reward_func": 0.4347694367170334, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.8013750091195107, + "rewards/strict_format_reward_func": 0.203125, + "rewards/xmlcount_reward_func": 0.8932187631726265, "step": 98 }, { - "completion_length": 212.375, + "completion_length": 206.375, "epoch": 1.1494252873563218, - "grad_norm": 10.533203125, - "kl": 18.79666292667389, + "grad_norm": 12.901283264160156, + "kl": 47.07088303565979, "learning_rate": 4.6389697052407535e-06, - "loss": 0.0188, - "reward": 20.207146495580673, - "reward_std": 4.963557600975037, - "rewards/concensus_correctness_reward_func": 15.625, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0471, + "reward": 21.164125561714172, + "reward_std": 3.427979849278927, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.5889591798186302, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.4073755946010351, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.7275625132024288, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 0.9598750099539757, "step": 100 }, { - "completion_length": 171.96875, + "completion_length": 186.3125, "epoch": 1.1724137931034484, - "grad_norm": 155.06492614746094, - "kl": 68.77212512493134, + "grad_norm": 100.69583892822266, + "kl": 119.86678445339203, "learning_rate": 4.622025019761336e-06, - "loss": 0.0688, - "reward": 22.511569023132324, - "reward_std": 4.047511957585812, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.1199, + "reward": 21.613150119781494, + "reward_std": 6.519247785210609, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.47641284205019474, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.3219627821817994, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.7070312425494194, + "rewards/xmlcount_reward_func": 0.7130624912679195, "step": 102 }, { - "completion_length": 218.9375, + "completion_length": 216.84375, "epoch": 1.1954022988505748, - "grad_norm": 1367.33935546875, - "kl": 721.9652456045151, + "grad_norm": 8.727561950683594, + "kl": 54.74279695749283, "learning_rate": 4.604724194338318e-06, - "loss": 0.722, - "reward": 19.8391655087471, - "reward_std": 3.395339649170637, - "rewards/concensus_correctness_reward_func": 15.0, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0547, + "reward": 18.05968815088272, + "reward_std": 6.371506504714489, + "rewards/concensus_correctness_reward_func": 13.852750062942505, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.6711339727044106, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.47434470243752, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.8399062529206276, + "rewards/strict_format_reward_func": 0.15625, + "rewards/xmlcount_reward_func": 0.8263437375426292, "step": 104 }, { - "completion_length": 168.6875, + "completion_length": 162.4375, "epoch": 1.2183908045977012, - "grad_norm": 40.02738952636719, - "kl": 65.6315426826477, + "grad_norm": 13.948561668395996, + "kl": 45.070194721221924, "learning_rate": 4.587070132573178e-06, - "loss": 0.0656, - "reward": 21.98404335975647, - "reward_std": 5.587805457413197, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0451, + "reward": 23.60681962966919, + "reward_std": 3.8940461352467537, + "rewards/concensus_correctness_reward_func": 18.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.49804367683827877, + "rewards/final_correctness_reward_func": 1.5, + "rewards/question_recreation_reward_func": 0.39322573598474264, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.8610000163316727, + "rewards/strict_format_reward_func": 0.171875, + "rewards/xmlcount_reward_func": 0.9167187362909317, "step": 106 }, { - "completion_length": 218.21875, + "completion_length": 170.34375, "epoch": 1.2413793103448276, - "grad_norm": 9.776103019714355, - "kl": 23.720037281513214, + "grad_norm": 2173.41650390625, + "kl": 2838.58079123497, "learning_rate": 4.569065797351135e-06, - "loss": 0.0237, - "reward": 23.644041299819946, - "reward_std": 3.53853552415967, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 2.8386, + "reward": 21.27997601032257, + "reward_std": 5.610228531062603, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.5052283466793597, + "rewards/final_correctness_reward_func": 1.125, + "rewards/question_recreation_reward_func": 0.41053839586675167, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.265625, - "rewards/xmlcount_reward_func": 0.873187493532896, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 0.931937500834465, "step": 108 }, { - "completion_length": 237.8125, + "completion_length": 174.4375, "epoch": 1.264367816091954, - "grad_norm": 10.668675422668457, - "kl": 34.15976184606552, + "grad_norm": 11.465612411499023, + "kl": 43.83563882112503, "learning_rate": 4.550714210343879e-06, - "loss": 0.0342, - "reward": 17.147173404693604, - "reward_std": 6.411550164222717, - "rewards/concensus_correctness_reward_func": 12.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0438, + "reward": 18.363908290863037, + "reward_std": 6.738456949591637, + "rewards/concensus_correctness_reward_func": 14.47587502002716, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.5912669785320759, + "rewards/final_correctness_reward_func": 0.6875, + "rewards/question_recreation_reward_func": 0.4406272761989385, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.28125, - "rewards/xmlcount_reward_func": 0.899656243622303, + "rewards/strict_format_reward_func": 0.1875, + "rewards/xmlcount_reward_func": 0.8849062547087669, "step": 110 }, { - "completion_length": 260.4375, + "completion_length": 206.59375, "epoch": 1.2873563218390804, - "grad_norm": 10.50880241394043, - "kl": 15.423758924007416, + "grad_norm": 11.142423629760742, + "kl": 60.44405883550644, "learning_rate": 4.53201845150245e-06, - "loss": 0.0154, - "reward": 25.092689752578735, - "reward_std": 0.8879361758008599, - "rewards/concensus_correctness_reward_func": 20.0, - "rewards/consensus_reward_func": 2.0, + "loss": 0.0604, + "reward": 22.190204620361328, + "reward_std": 6.157109126448631, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.4375, - "rewards/question_recreation_reward_func": 0.6859089843928814, + "rewards/final_correctness_reward_func": 1.5, + "rewards/question_recreation_reward_func": 0.4072043038904667, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.7661562599241734, + "rewards/xmlcount_reward_func": 0.8298750072717667, "step": 112 }, { - "completion_length": 208.0, + "completion_length": 164.65625, "epoch": 1.3103448275862069, - "grad_norm": 6.086981296539307, - "kl": 35.96863120794296, + "grad_norm": 8.962136268615723, + "kl": 59.04880118370056, "learning_rate": 4.512981658540321e-06, - "loss": 0.036, - "reward": 19.24820075929165, - "reward_std": 2.0615255469456315, - "rewards/concensus_correctness_reward_func": 14.375, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.059, + "reward": 18.668354719877243, + "reward_std": 7.289912339299917, + "rewards/concensus_correctness_reward_func": 14.47950005531311, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.628575136885047, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.5007608043961227, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.296875, - "rewards/xmlcount_reward_func": 0.9477500170469284, + "rewards/strict_format_reward_func": 0.15625, + "rewards/xmlcount_reward_func": 0.8443437367677689, "step": 114 }, { - "completion_length": 191.0, + "completion_length": 178.78125, "epoch": 1.3333333333333333, - "grad_norm": 10.198894500732422, - "kl": 46.034221172332764, + "grad_norm": 10.085939407348633, + "kl": 59.855812668800354, "learning_rate": 4.493607026406802e-06, - "loss": 0.046, - "reward": 22.146987676620483, - "reward_std": 5.253380537033081, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0599, + "reward": 18.44299042224884, + "reward_std": 8.521994158625603, + "rewards/concensus_correctness_reward_func": 13.787687499076128, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.5550499744713306, + "rewards/final_correctness_reward_func": 1.5, + "rewards/question_recreation_reward_func": 0.4562405124306679, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.7794375196099281, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 0.9178125038743019, "step": 116 }, { - "completion_length": 223.03125, + "completion_length": 224.125, "epoch": 1.3563218390804597, - "grad_norm": 4.740744113922119, - "kl": 69.54069662094116, + "grad_norm": 7.863017559051514, + "kl": 47.65994584560394, "learning_rate": 4.473897806750829e-06, - "loss": 0.0695, - "reward": 19.53721532225609, - "reward_std": 6.454725589603186, - "rewards/concensus_correctness_reward_func": 15.0, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0477, + "reward": 18.359154999256134, + "reward_std": 6.624224863946438, + "rewards/concensus_correctness_reward_func": 13.75, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.6687778476625681, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.5245611686259508, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.7278125137090683, + "rewards/strict_format_reward_func": 0.171875, + "rewards/xmlcount_reward_func": 0.850218765437603, "step": 118 }, { - "completion_length": 225.6875, + "completion_length": 188.625, "epoch": 1.3793103448275863, - "grad_norm": 6.936825275421143, - "kl": 9.571717858314514, + "grad_norm": 20.490253448486328, + "kl": 64.13123416900635, "learning_rate": 4.4538573073752365e-06, - "loss": 0.0096, - "reward": 24.532721519470215, - "reward_std": 3.8307987935841084, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0641, + "reward": 20.88910961151123, + "reward_std": 9.146394729614258, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.6875, - "rewards/question_recreation_reward_func": 0.764752559363842, + "rewards/question_recreation_reward_func": 0.5894533935934305, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.375, - "rewards/xmlcount_reward_func": 1.080468736588955, + "rewards/strict_format_reward_func": 0.328125, + "rewards/xmlcount_reward_func": 1.0965312793850899, "step": 120 }, { - "completion_length": 215.40625, + "completion_length": 189.75, "epoch": 1.4022988505747127, - "grad_norm": 10.60375690460205, - "kl": 30.09870845079422, + "grad_norm": 48.46287536621094, + "kl": 51.21184545755386, "learning_rate": 4.4334888916816096e-06, - "loss": 0.0301, - "reward": 23.651365518569946, - "reward_std": 3.891990765929222, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0512, + "reward": 21.651530265808105, + "reward_std": 6.874642577022314, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.5502720512449741, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.4634991828352213, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.8823437467217445, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 0.9380312561988831, "step": 122 }, { - "completion_length": 260.1875, + "completion_length": 173.625, "epoch": 1.4252873563218391, - "grad_norm": 20.676340103149414, - "kl": 27.543380066752434, + "grad_norm": 21.84930992126465, + "kl": 52.84491562843323, "learning_rate": 4.412795978105807e-06, - "loss": 0.0275, - "reward": 19.061583757400513, - "reward_std": 2.5596910268068314, - "rewards/concensus_correctness_reward_func": 14.478187561035156, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0528, + "reward": 19.799984127283096, + "reward_std": 5.160407304763794, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.5681460797786713, + "rewards/question_recreation_reward_func": 0.3524842488113791, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.8433750048279762, + "rewards/strict_format_reward_func": 0.15625, + "rewards/xmlcount_reward_func": 0.8537499979138374, "step": 124 }, { - "completion_length": 190.71875, + "completion_length": 180.65625, "epoch": 1.4482758620689655, - "grad_norm": 13.795882225036621, - "kl": 54.169844806194305, + "grad_norm": 11.97016716003418, + "kl": 36.401242852211, "learning_rate": 4.391782039544239e-06, - "loss": 0.0542, - "reward": 22.090229511260986, - "reward_std": 6.329751199111342, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0364, + "reward": 21.758437395095825, + "reward_std": 6.2212139666080475, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.3125, - "rewards/question_recreation_reward_func": 0.47563567757606506, + "rewards/final_correctness_reward_func": 1.625, + "rewards/question_recreation_reward_func": 0.3159063975326717, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.8333437512628734, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 1.005031257867813, "step": 126 }, { - "completion_length": 190.71875, + "completion_length": 159.125, "epoch": 1.471264367816092, - "grad_norm": 19.860910415649414, - "kl": 39.326825976371765, + "grad_norm": 51.37221908569336, + "kl": 68.37938299775124, "learning_rate": 4.37045060277101e-06, - "loss": 0.0393, - "reward": 23.239750862121582, - "reward_std": 3.519380111247301, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0684, + "reward": 21.98990821838379, + "reward_std": 6.40240578353405, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.3125, - "rewards/question_recreation_reward_func": 0.3749068835750222, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.25543968845158815, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.7398437447845936, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.0313437581062317, "step": 128 }, { - "completion_length": 196.90625, + "completion_length": 181.375, "epoch": 1.4942528735632183, - "grad_norm": 18.308942794799805, - "kl": 29.91416847705841, + "grad_norm": 102.44271087646484, + "kl": 100.68648618459702, "learning_rate": 4.348805247846027e-06, - "loss": 0.0299, - "reward": 22.609585285186768, - "reward_std": 5.184334993362427, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.1007, + "reward": 22.120901584625244, + "reward_std": 5.688993155956268, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.5567415002733469, + "rewards/question_recreation_reward_func": 0.4187767431139946, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.7403437420725822, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 1.0458749905228615, "step": 130 }, { - "completion_length": 245.78125, + "completion_length": 160.34375, "epoch": 1.5172413793103448, - "grad_norm": 11.742849349975586, - "kl": 49.03591227531433, + "grad_norm": 16.103792190551758, + "kl": 57.822585701942444, "learning_rate": 4.326849607514149e-06, - "loss": 0.049, - "reward": 22.01890254020691, - "reward_std": 6.599127218127251, + "loss": 0.0578, + "reward": 21.831796646118164, + "reward_std": 5.650886729359627, "rewards/concensus_correctness_reward_func": 17.5, "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.5520587565843016, + "rewards/final_correctness_reward_func": 0.9375, + "rewards/question_recreation_reward_func": 0.3665152806788683, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.8418437354266644, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 1.027781255543232, "step": 132 }, { - "completion_length": 206.96875, + "completion_length": 191.0, "epoch": 1.5402298850574714, - "grad_norm": 14.158550262451172, - "kl": 27.23181653022766, + "grad_norm": 9.688576698303223, + "kl": 36.7540967464447, "learning_rate": 4.304587366595506e-06, - "loss": 0.0272, - "reward": 24.12363362312317, - "reward_std": 2.6158539773896337, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0368, + "reward": 22.424193143844604, + "reward_std": 6.237176924943924, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.42544594313949347, + "rewards/final_correctness_reward_func": 1.625, + "rewards/question_recreation_reward_func": 0.3825994962826371, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.8075625076889992, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 0.9322187528014183, "step": 134 }, { - "completion_length": 234.0625, + "completion_length": 181.40625, "epoch": 1.5632183908045976, - "grad_norm": 7.278172016143799, - "kl": 19.96114432811737, + "grad_norm": 7.413599491119385, + "kl": 20.431242883205414, "learning_rate": 4.282022261367074e-06, - "loss": 0.02, - "reward": 24.318565130233765, - "reward_std": 2.369107022881508, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0204, + "reward": 21.82521426677704, + "reward_std": 4.983357530087233, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.4375, - "rewards/question_recreation_reward_func": 0.5467526586726308, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.43421418592333794, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.8030624985694885, + "rewards/xmlcount_reward_func": 0.8597499877214432, "step": 136 }, { - "completion_length": 154.4375, + "completion_length": 164.03125, "epoch": 1.5862068965517242, - "grad_norm": 17.613046646118164, - "kl": 33.912797927856445, + "grad_norm": 12.570961952209473, + "kl": 2985.0614099502563, "learning_rate": 4.259158078935616e-06, - "loss": 0.0339, - "reward": 22.96657919883728, - "reward_std": 3.7736367424950004, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 2.9851, + "reward": 20.842637419700623, + "reward_std": 7.682761624455452, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.875, - "rewards/question_recreation_reward_func": 0.48520387150347233, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.44795008935034275, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.8407500013709068, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 0.8634375110268593, "step": 138 }, { - "completion_length": 200.0625, + "completion_length": 144.53125, "epoch": 1.6091954022988506, - "grad_norm": 9063.2958984375, - "kl": 7356.537230968475, + "grad_norm": 21.883134841918945, + "kl": 50.49017810821533, "learning_rate": 4.235998656602091e-06, - "loss": 7.3566, - "reward": 20.170358031988144, - "reward_std": 3.6353699031169526, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0505, + "reward": 21.287381410598755, + "reward_std": 2.4484502635896206, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.6875, - "rewards/question_recreation_reward_func": 0.4604516150429845, + "rewards/final_correctness_reward_func": 1.125, + "rewards/question_recreation_reward_func": 0.27494448656216264, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.7411562697961926, + "rewards/strict_format_reward_func": 0.171875, + "rewards/xmlcount_reward_func": 0.9030625149607658, "step": 140 }, { - "completion_length": 194.625, + "completion_length": 163.25, "epoch": 1.632183908045977, - "grad_norm": 18.222875595092773, - "kl": 32.64061665534973, + "grad_norm": 12.124130249023438, + "kl": 64.16681325435638, "learning_rate": 4.212547881217637e-06, - "loss": 0.0326, - "reward": 21.903664112091064, - "reward_std": 6.382215045392513, - "rewards/concensus_correctness_reward_func": 17.5, + "loss": 0.0642, + "reward": 22.33631932735443, + "reward_std": 5.196270786225796, + "rewards/concensus_correctness_reward_func": 17.594125032424927, "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.4970079343765974, + "rewards/question_recreation_reward_func": 0.47681939229369164, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.7816562429070473, + "rewards/strict_format_reward_func": 0.28125, + "rewards/xmlcount_reward_func": 1.0466250032186508, "step": 142 }, { - "completion_length": 186.0, + "completion_length": 156.84375, "epoch": 1.6551724137931034, - "grad_norm": 11.164816856384277, - "kl": 44.877442598342896, + "grad_norm": 21.240013122558594, + "kl": 94.7618658542633, "learning_rate": 4.188809688531241e-06, - "loss": 0.0449, - "reward": 22.38468074798584, - "reward_std": 5.0202604830265045, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0948, + "reward": 17.708184868097305, + "reward_std": 6.574467174708843, + "rewards/concensus_correctness_reward_func": 13.85631251335144, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.5470870118588209, + "rewards/final_correctness_reward_func": 0.875, + "rewards/question_recreation_reward_func": 0.38565336912870407, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.109375, - "rewards/xmlcount_reward_func": 0.7282187668606639, + "rewards/strict_format_reward_func": 0.171875, + "rewards/xmlcount_reward_func": 0.7943437658250332, "step": 144 }, { - "completion_length": 199.8125, + "completion_length": 141.25, "epoch": 1.6781609195402298, - "grad_norm": 9.058207511901855, - "kl": 42.31837069988251, + "grad_norm": 10.96385383605957, + "kl": 99.61672973632812, "learning_rate": 4.164788062529203e-06, - "loss": 0.0423, - "reward": 22.997267246246338, - "reward_std": 3.507912177592516, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0996, + "reward": 20.72616219520569, + "reward_std": 6.2193888053298, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.3822357952594757, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.1752871097996831, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.7244062572717667, + "rewards/xmlcount_reward_func": 0.8477500155568123, "step": 146 }, { - "completion_length": 198.125, + "completion_length": 190.46875, "epoch": 1.7011494252873565, - "grad_norm": 43.29230499267578, - "kl": 90.76131892204285, + "grad_norm": 21.411161422729492, + "kl": 31.226326286792755, "learning_rate": 4.140487034766499e-06, - "loss": 0.0908, - "reward": 22.38035273551941, - "reward_std": 5.980464659631252, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0312, + "reward": 23.55053949356079, + "reward_std": 3.8856130689382553, + "rewards/concensus_correctness_reward_func": 18.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.5625, - "rewards/question_recreation_reward_func": 0.6397591419517994, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.3994142282754183, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.7405937537550926, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 0.8542499989271164, "step": 148 }, { - "completion_length": 196.40625, + "completion_length": 153.9375, "epoch": 1.7241379310344827, - "grad_norm": 227.61325073242188, - "kl": 452.9062591791153, + "grad_norm": 36.543697357177734, + "kl": 108.19440698623657, "learning_rate": 4.115910683690167e-06, - "loss": 0.453, - "reward": 20.17731249332428, - "reward_std": 4.529246799604152, - "rewards/concensus_correctness_reward_func": 15.625, - "rewards/consensus_reward_func": 1.75, + "loss": 0.1082, + "reward": 13.552048087120056, + "reward_std": 8.170875867363065, + "rewards/concensus_correctness_reward_func": 10.0, + "rewards/consensus_reward_func": 1.375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.3125, - "rewards/question_recreation_reward_func": 0.5381560958921909, + "rewards/final_correctness_reward_func": 0.6875, + "rewards/question_recreation_reward_func": 0.36789238173514605, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.7954062670469284, + "rewards/strict_format_reward_func": 0.203125, + "rewards/xmlcount_reward_func": 0.9185312539339066, "step": 150 }, { - "completion_length": 163.5625, + "completion_length": 153.0, "epoch": 1.7471264367816093, - "grad_norm": 8.095112800598145, - "kl": 31.721220016479492, + "grad_norm": 172.71754455566406, + "kl": 166.587668299675, "learning_rate": 4.091063133954821e-06, - "loss": 0.0317, - "reward": 22.579798221588135, - "reward_std": 3.9802165254950523, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.1666, + "reward": 18.236694991588593, + "reward_std": 8.885597191751003, + "rewards/concensus_correctness_reward_func": 14.375, + "rewards/consensus_reward_func": 1.4375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.5605173094663769, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.4031638070009649, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.8161562383174896, + "rewards/xmlcount_reward_func": 0.8179062530398369, "step": 152 }, { - "completion_length": 195.875, + "completion_length": 157.65625, "epoch": 1.7701149425287355, - "grad_norm": 20.339494705200195, - "kl": 53.64894300699234, + "grad_norm": 281.0543212890625, + "kl": 108.47319543361664, "learning_rate": 4.065948555730405e-06, - "loss": 0.0536, - "reward": 24.14040780067444, - "reward_std": 3.62350944429636, + "loss": 0.1085, + "reward": 24.01292324066162, + "reward_std": 3.52595267444849, "rewards/concensus_correctness_reward_func": 18.75, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.625, - "rewards/question_recreation_reward_func": 0.5726892165839672, + "rewards/final_correctness_reward_func": 1.75, + "rewards/question_recreation_reward_func": 0.41989195346832275, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.328125, - "rewards/xmlcount_reward_func": 0.9895937517285347, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 0.9680312648415565, "step": 154 }, { - "completion_length": 246.6875, + "completion_length": 166.53125, "epoch": 1.793103448275862, - "grad_norm": 4.499557971954346, - "kl": 20.031489849090576, + "grad_norm": 24.108455657958984, + "kl": 75.26628756523132, "learning_rate": 4.040571164002319e-06, - "loss": 0.02, - "reward": 23.96647310256958, - "reward_std": 3.892509236931801, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0753, + "reward": 20.508435100317, + "reward_std": 3.8080715723335743, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.6921606138348579, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.31456019822508097, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.28125, - "rewards/xmlcount_reward_func": 0.9930624887347221, + "rewards/strict_format_reward_func": 0.109375, + "rewards/xmlcount_reward_func": 0.8345000147819519, "step": 156 }, { - "completion_length": 203.0625, + "completion_length": 161.5, "epoch": 1.8160919540229885, - "grad_norm": 7.854334354400635, - "kl": 34.88800036907196, + "grad_norm": 15.420141220092773, + "kl": 77.9570460319519, "learning_rate": 4.014935217864009e-06, - "loss": 0.0349, - "reward": 20.698371708393097, - "reward_std": 4.964223116636276, + "loss": 0.078, + "reward": 19.84076488018036, + "reward_std": 4.989244751632214, "rewards/concensus_correctness_reward_func": 15.625, - "rewards/consensus_reward_func": 1.8125, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.5, - "rewards/question_recreation_reward_func": 0.5856219911947846, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.26270246179774404, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.265625, - "rewards/xmlcount_reward_func": 0.9096250012516975, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 0.921812504529953, "step": 158 }, { - "completion_length": 211.09375, + "completion_length": 132.5, "epoch": 1.839080459770115, - "grad_norm": 6.14654541015625, - "kl": 23.56060391664505, + "grad_norm": 26.079729080200195, + "kl": 122.96159541606903, "learning_rate": 3.989045019802171e-06, - "loss": 0.0236, - "reward": 23.71225380897522, - "reward_std": 3.689402237534523, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.123, + "reward": 19.464439749717712, + "reward_std": 7.724615082144737, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.3125, - "rewards/question_recreation_reward_func": 0.580534789711237, + "rewards/final_correctness_reward_func": 0.9375, + "rewards/question_recreation_reward_func": 0.23600229807198048, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.265625, - "rewards/xmlcount_reward_func": 0.9285937547683716, + "rewards/strict_format_reward_func": 0.1875, + "rewards/xmlcount_reward_func": 0.9159374982118607, "step": 160 }, { - "completion_length": 213.375, + "completion_length": 169.34375, "epoch": 1.8620689655172413, - "grad_norm": 8.173796653747559, - "kl": 22.394744217395782, + "grad_norm": 7.503109931945801, + "kl": 21.176270961761475, "learning_rate": 3.962904914974656e-06, - "loss": 0.0224, - "reward": 24.021884202957153, - "reward_std": 2.6276159659028053, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0212, + "reward": 24.312889099121094, + "reward_std": 2.2233006693422794, + "rewards/concensus_correctness_reward_func": 19.375, + "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.5625, - "rewards/question_recreation_reward_func": 0.6691654101014137, + "rewards/final_correctness_reward_func": 1.375, + "rewards/question_recreation_reward_func": 0.43667050264775753, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.28125, - "rewards/xmlcount_reward_func": 0.8839687332510948, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 0.9387187510728836, "step": 162 }, { - "completion_length": 189.84375, + "completion_length": 133.65625, "epoch": 1.8850574712643677, - "grad_norm": 7.569718837738037, - "kl": 19.809502065181732, + "grad_norm": 14.409662246704102, + "kl": 71.93514931201935, "learning_rate": 3.936519290481226e-06, - "loss": 0.0198, - "reward": 24.518553733825684, - "reward_std": 2.092976523563266, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0719, + "reward": 22.77951717376709, + "reward_std": 4.03099450096488, + "rewards/concensus_correctness_reward_func": 18.125, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.5153661444783211, + "rewards/final_correctness_reward_func": 1.25, + "rewards/question_recreation_reward_func": 0.2748609227128327, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.328125, - "rewards/xmlcount_reward_func": 0.9875624943524599, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.05153127014637, "step": 164 }, { - "completion_length": 200.875, + "completion_length": 192.5, "epoch": 1.9080459770114944, - "grad_norm": 8.167227745056152, - "kl": 15.953735589981079, + "grad_norm": 212.40203857421875, + "kl": 87.70037099719048, "learning_rate": 3.909892574627267e-06, - "loss": 0.016, - "reward": 23.793687105178833, - "reward_std": 2.9268408566713333, + "loss": 0.0877, + "reward": 23.29596519470215, + "reward_std": 3.7804993018507957, "rewards/concensus_correctness_reward_func": 18.75, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.4375, - "rewards/question_recreation_reward_func": 0.45437459275126457, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.28421579115092754, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.3125, - "rewards/xmlcount_reward_func": 0.9643124863505363, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 0.9492500126361847, "step": 166 }, { - "completion_length": 220.5625, + "completion_length": 154.15625, "epoch": 1.9310344827586206, - "grad_norm": 12.206136703491211, - "kl": 19.527158230543137, + "grad_norm": 11.006453514099121, + "kl": 61.17928698658943, "learning_rate": 3.883029236180577e-06, - "loss": 0.0195, - "reward": 23.204511880874634, - "reward_std": 3.590113900601864, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0612, + "reward": 21.84220790863037, + "reward_std": 4.725912474095821, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.4011056572198868, + "rewards/final_correctness_reward_func": 1.25, + "rewards/question_recreation_reward_func": 0.19008373469114304, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.8346562534570694, + "rewards/strict_format_reward_func": 0.203125, + "rewards/xmlcount_reward_func": 0.9490000084042549, "step": 168 }, { - "completion_length": 206.90625, + "completion_length": 154.875, "epoch": 1.9540229885057472, - "grad_norm": 5.26110315322876, - "kl": 24.140369713306427, + "grad_norm": 12.463375091552734, + "kl": 72.655757188797, "learning_rate": 3.855933783621384e-06, - "loss": 0.0241, - "reward": 23.89325451850891, - "reward_std": 3.347651330754161, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0727, + "reward": 19.947094351053238, + "reward_std": 5.512985646724701, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.4375, - "rewards/question_recreation_reward_func": 0.5525359152816236, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.3686255179345608, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.3125, - "rewards/xmlcount_reward_func": 0.9657187536358833, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 0.9690937623381615, "step": 170 }, { - "completion_length": 227.65625, + "completion_length": 176.0625, "epoch": 1.9770114942528736, - "grad_norm": 8.007431030273438, - "kl": 30.11464422941208, + "grad_norm": 35.6094970703125, + "kl": 87.24336385726929, "learning_rate": 3.828610764385676e-06, - "loss": 0.0301, - "reward": 22.856654167175293, - "reward_std": 5.209500607103109, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0872, + "reward": 20.662386655807495, + "reward_std": 7.662640914320946, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.40990414656698704, + "rewards/final_correctness_reward_func": 1.375, + "rewards/question_recreation_reward_func": 0.245105167850852, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.328125, - "rewards/xmlcount_reward_func": 0.9311250001192093, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 0.94853126257658, "step": 172 }, { - "completion_length": 199.34375, + "completion_length": 180.5625, "epoch": 2.0, - "grad_norm": 20.22154998779297, - "kl": 72.29662752151489, + "grad_norm": 14.9784574508667, + "kl": 20.4464550614357, "learning_rate": 3.8010647641020116e-06, - "loss": 0.0723, - "reward": 20.455689668655396, - "reward_std": 8.041493298485875, - "rewards/concensus_correctness_reward_func": 15.625, - "rewards/consensus_reward_func": 1.5625, + "loss": 0.0204, + "reward": 24.202844858169556, + "reward_std": 2.7329512014985085, + "rewards/concensus_correctness_reward_func": 19.375, + "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.625, - "rewards/question_recreation_reward_func": 0.6080643348395824, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.452063724398613, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.847624996677041, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 0.9851562455296516, "step": 174 }, { - "completion_length": 171.46875, + "completion_length": 149.5625, "epoch": 2.0229885057471266, - "grad_norm": 18.824586868286133, - "kl": 25.367591321468353, + "grad_norm": 18.222515106201172, + "kl": 61.27457481622696, "learning_rate": 3.773300405821908e-06, - "loss": 0.0254, - "reward": 19.021948248147964, - "reward_std": 2.3876964151859283, - "rewards/concensus_correctness_reward_func": 14.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0613, + "reward": 16.03078243136406, + "reward_std": 7.716041281819344, + "rewards/concensus_correctness_reward_func": 11.875, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.5253542587161064, + "rewards/final_correctness_reward_func": 0.9375, + "rewards/question_recreation_reward_func": 0.40621997183188796, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.25, - "rewards/xmlcount_reward_func": 0.9340937435626984, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 0.952687531709671, "step": 176 }, { - "completion_length": 181.59375, + "completion_length": 157.28125, "epoch": 2.045977011494253, - "grad_norm": 14.513740539550781, - "kl": 17.73258411884308, + "grad_norm": 13.05146598815918, + "kl": 806.8720440864563, "learning_rate": 3.7453223492439544e-06, - "loss": 0.0177, - "reward": 22.447916358709335, - "reward_std": 1.1141389552503824, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 2.0, + "loss": 0.8069, + "reward": 17.32739269733429, + "reward_std": 7.335513092577457, + "rewards/concensus_correctness_reward_func": 13.125, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.41913476772606373, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.31611123913899064, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.9350312575697899, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.0581562593579292, "step": 178 }, { - "completion_length": 193.90625, + "completion_length": 159.0, "epoch": 2.0689655172413794, - "grad_norm": 2851.374755859375, - "kl": 1336.6555631756783, + "grad_norm": 10.964728355407715, + "kl": 43.2911182641983, "learning_rate": 3.7171352899317743e-06, - "loss": 1.3367, - "reward": 23.85870909690857, - "reward_std": 3.7402728721499443, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0433, + "reward": 24.185287714004517, + "reward_std": 2.431728109717369, + "rewards/concensus_correctness_reward_func": 19.375, + "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.5625, - "rewards/question_recreation_reward_func": 0.5351466983556747, + "rewards/final_correctness_reward_func": 1.25, + "rewards/question_recreation_reward_func": 0.3182880566455424, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, - "rewards/xmlcount_reward_func": 0.8860624954104424, + "rewards/xmlcount_reward_func": 1.0544999837875366, "step": 180 }, { - "completion_length": 225.3125, + "completion_length": 176.625, "epoch": 2.0919540229885056, - "grad_norm": 20.251195907592773, - "kl": 34.19960170984268, + "grad_norm": 16.8953800201416, + "kl": 56.65613496303558, "learning_rate": 3.6887439585259693e-06, - "loss": 0.0342, - "reward": 23.59685444831848, - "reward_std": 2.5345835275948048, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0567, + "reward": 16.18320071697235, + "reward_std": 6.523620970547199, + "rewards/concensus_correctness_reward_func": 11.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.35769895603880286, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.33913835044950247, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.707906249910593, + "rewards/strict_format_reward_func": 0.28125, + "rewards/xmlcount_reward_func": 1.0003124922513962, "step": 182 }, { - "completion_length": 182.4375, + "completion_length": 164.65625, "epoch": 2.1149425287356323, - "grad_norm": 16.23004150390625, - "kl": 152.37481808662415, + "grad_norm": 9.814878463745117, + "kl": 50.83259391784668, "learning_rate": 3.6601531199501715e-06, - "loss": 0.1524, - "reward": 18.508229851722717, - "reward_std": 8.711337849497795, - "rewards/concensus_correctness_reward_func": 14.474499940872192, - "rewards/consensus_reward_func": 1.4375, + "loss": 0.0508, + "reward": 23.588459730148315, + "reward_std": 2.8070178739726543, + "rewards/concensus_correctness_reward_func": 18.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.3889483865350485, + "rewards/final_correctness_reward_func": 1.3125, + "rewards/question_recreation_reward_func": 0.3976163724437356, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, - "rewards/xmlcount_reward_func": 0.894781269133091, + "rewards/xmlcount_reward_func": 1.0033437460660934, "step": 184 }, { - "completion_length": 216.8125, + "completion_length": 184.0, "epoch": 2.1379310344827585, - "grad_norm": 11.544267654418945, - "kl": 26.041823536157608, + "grad_norm": 15.8318510055542, + "kl": 49.38940763473511, "learning_rate": 3.631367572611348e-06, - "loss": 0.026, - "reward": 22.646936655044556, - "reward_std": 5.107893757522106, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0494, + "reward": 20.76482403278351, + "reward_std": 6.448275949805975, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.4375, - "rewards/question_recreation_reward_func": 0.6064677434042096, + "rewards/question_recreation_reward_func": 0.3974805222824216, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.09375, - "rewards/xmlcount_reward_func": 0.5717187486588955, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 0.836093757301569, "step": 186 }, { - "completion_length": 179.4375, + "completion_length": 172.09375, "epoch": 2.160919540229885, - "grad_norm": 19.773971557617188, - "kl": 50.33586627244949, + "grad_norm": 8.231196403503418, + "kl": 66.35796988010406, "learning_rate": 3.6023921475944795e-06, - "loss": 0.0503, - "reward": 22.32581377029419, - "reward_std": 5.213296957314014, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0664, + "reward": 20.477165818214417, + "reward_std": 8.073923096060753, + "rewards/concensus_correctness_reward_func": 15.726500034332275, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.5947513580322266, + "rewards/final_correctness_reward_func": 1.5625, + "rewards/question_recreation_reward_func": 0.3912282707169652, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.8873125091195107, + "rewards/strict_format_reward_func": 0.28125, + "rewards/xmlcount_reward_func": 0.9531875215470791, "step": 188 }, { - "completion_length": 205.9375, + "completion_length": 173.46875, "epoch": 2.1839080459770113, - "grad_norm": 11.09538745880127, - "kl": 26.162372916936874, + "grad_norm": 5457.1826171875, + "kl": 2268.4811388254166, "learning_rate": 3.573231707851765e-06, - "loss": 0.0262, - "reward": 20.991794228553772, - "reward_std": 2.7222228348255157, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.875, + "loss": 2.2685, + "reward": 22.87942934036255, + "reward_std": 4.034271139651537, + "rewards/concensus_correctness_reward_func": 18.125, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.5258252900093794, + "rewards/final_correctness_reward_func": 1.25, + "rewards/question_recreation_reward_func": 0.42614841647446156, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, - "rewards/xmlcount_reward_func": 0.9034687429666519, + "rewards/xmlcount_reward_func": 1.0157812684774399, "step": 190 }, { - "completion_length": 189.09375, + "completion_length": 150.0625, "epoch": 2.206896551724138, - "grad_norm": 21.403644561767578, - "kl": 62.82874211668968, + "grad_norm": 12.686607360839844, + "kl": 160.8327181339264, "learning_rate": 3.5438911473864633e-06, - "loss": 0.0628, - "reward": 23.615223169326782, - "reward_std": 3.8651506304740906, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.1608, + "reward": 17.430189728736877, + "reward_std": 10.36325491964817, + "rewards/concensus_correctness_reward_func": 13.125, + "rewards/consensus_reward_func": 1.3125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.3125, - "rewards/question_recreation_reward_func": 0.48212913144379854, + "rewards/final_correctness_reward_func": 1.375, + "rewards/question_recreation_reward_func": 0.27978323865681887, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.265625, - "rewards/xmlcount_reward_func": 0.9299687594175339, + "rewards/strict_format_reward_func": 0.28125, + "rewards/xmlcount_reward_func": 1.056656263768673, "step": 192 }, { - "completion_length": 174.15625, + "completion_length": 153.8125, "epoch": 2.2298850574712645, - "grad_norm": 9.891429901123047, - "kl": 17.821874380111694, + "grad_norm": 19.985454559326172, + "kl": 48.01530635356903, "learning_rate": 3.514375390431539e-06, - "loss": 0.0178, - "reward": 24.82721495628357, - "reward_std": 1.294725090265274, - "rewards/concensus_correctness_reward_func": 20.0, - "rewards/consensus_reward_func": 2.0, + "loss": 0.048, + "reward": 21.66079020500183, + "reward_std": 7.7952331602573395, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.4702151194214821, + "rewards/final_correctness_reward_func": 1.375, + "rewards/question_recreation_reward_func": 0.2737586812581867, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.903875008225441, + "rewards/strict_format_reward_func": 0.34375, + "rewards/xmlcount_reward_func": 1.1057812497019768, "step": 194 }, { - "completion_length": 195.875, + "completion_length": 165.9375, "epoch": 2.2528735632183907, - "grad_norm": 18.279281616210938, - "kl": 30.182824730873108, + "grad_norm": 17.583553314208984, + "kl": 38.91847252845764, "learning_rate": 3.484689390623218e-06, - "loss": 0.0302, - "reward": 20.930479794740677, - "reward_std": 3.795420553535223, + "loss": 0.0389, + "reward": 20.729109823703766, + "reward_std": 3.8170019295066595, "rewards/concensus_correctness_reward_func": 16.25, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.5250112041831017, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.3370785207953304, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.25, - "rewards/xmlcount_reward_func": 0.9054687321186066, + "rewards/strict_format_reward_func": 0.28125, + "rewards/xmlcount_reward_func": 0.9857812449336052, "step": 196 }, { - "completion_length": 226.625, + "completion_length": 167.28125, "epoch": 2.2758620689655173, - "grad_norm": 106.86115264892578, - "kl": 81.03305122256279, + "grad_norm": 32.5487174987793, + "kl": 98.10131669044495, "learning_rate": 3.4548381301696298e-06, - "loss": 0.081, - "reward": 23.492977619171143, - "reward_std": 3.8844687193632126, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0981, + "reward": 19.30813992023468, + "reward_std": 4.966625921428204, + "rewards/concensus_correctness_reward_func": 15.0, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.3125, - "rewards/question_recreation_reward_func": 0.43522756081074476, + "rewards/final_correctness_reward_func": 1.125, + "rewards/question_recreation_reward_func": 0.24595272447913885, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.9015000127255917, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 0.984062485396862, "step": 198 }, { - "completion_length": 193.78125, + "completion_length": 165.25, "epoch": 2.2988505747126435, - "grad_norm": 9.324207305908203, - "kl": 14.6187304854393, + "grad_norm": 14.370098114013672, + "kl": 28.574753165245056, "learning_rate": 3.4248266190146307e-06, - "loss": 0.0146, - "reward": 23.276930332183838, - "reward_std": 3.5861197635531425, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0286, + "reward": 22.612446308135986, + "reward_std": 4.956176124513149, + "rewards/concensus_correctness_reward_func": 18.125, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.5100554600358009, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.3004767969250679, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.735625023022294, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 0.9369687438011169, "step": 200 }, { - "completion_length": 233.28125, + "completion_length": 152.1875, "epoch": 2.32183908045977, - "grad_norm": 919.9231567382812, - "kl": 235.79344129562378, + "grad_norm": 10.197628021240234, + "kl": 56.1074800491333, "learning_rate": 3.39465989399699e-06, - "loss": 0.2358, - "reward": 22.141968488693237, - "reward_std": 5.353037633001804, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0561, + "reward": 21.019150972366333, + "reward_std": 6.507973730564117, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.44590575993061066, + "rewards/question_recreation_reward_func": 0.21674463991075754, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.109375, - "rewards/xmlcount_reward_func": 0.6491874903440475, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 1.005531258881092, "step": 202 }, { - "completion_length": 198.21875, + "completion_length": 157.9375, "epoch": 2.344827586206897, - "grad_norm": 8.306560516357422, - "kl": 21.928658962249756, + "grad_norm": 42.659515380859375, + "kl": 65.14888799190521, "learning_rate": 3.3643430180050573e-06, - "loss": 0.0219, - "reward": 21.660332083702087, - "reward_std": 2.1175025794655085, + "loss": 0.0651, + "reward": 21.52014720439911, + "reward_std": 5.992846731096506, "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.9375, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.4375, - "rewards/question_recreation_reward_func": 0.43533212691545486, + "rewards/question_recreation_reward_func": 0.32645976357162, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.7875000052154064, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 0.9749375209212303, "step": 204 }, { - "completion_length": 212.0, + "completion_length": 163.65625, "epoch": 2.367816091954023, - "grad_norm": 23.93052864074707, - "kl": 73.4831235408783, + "grad_norm": 10.59619426727295, + "kl": 34.498946726322174, "learning_rate": 3.333881079127052e-06, - "loss": 0.0735, - "reward": 21.9992618560791, - "reward_std": 5.1488963812589645, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0345, + "reward": 23.468876123428345, + "reward_std": 3.6643351688981056, + "rewards/concensus_correctness_reward_func": 18.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.875, - "rewards/question_recreation_reward_func": 0.3638557894155383, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.26722045708447695, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.6666562631726265, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 1.076656237244606, "step": 206 }, { - "completion_length": 160.15625, + "completion_length": 141.0625, "epoch": 2.3908045977011496, - "grad_norm": 18.52243995666504, - "kl": 84.1786233484745, + "grad_norm": 38.444522857666016, + "kl": 125.50883775949478, "learning_rate": 3.3032791897971313e-06, - "loss": 0.0842, - "reward": 18.600049287080765, - "reward_std": 6.51118154078722, - "rewards/concensus_correctness_reward_func": 14.481687545776367, - "rewards/consensus_reward_func": 1.625, + "loss": 0.1255, + "reward": 17.697890371084213, + "reward_std": 8.042941011488438, + "rewards/concensus_correctness_reward_func": 13.125, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.40370573475956917, + "rewards/final_correctness_reward_func": 1.375, + "rewards/question_recreation_reward_func": 0.2604214930906892, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.8865312486886978, + "rewards/strict_format_reward_func": 0.328125, + "rewards/xmlcount_reward_func": 1.0468437746167183, "step": 208 }, { - "completion_length": 162.53125, + "completion_length": 149.4375, "epoch": 2.413793103448276, - "grad_norm": 14.905916213989258, - "kl": 30.207842111587524, + "grad_norm": 14.667291641235352, + "kl": 210.53725743293762, "learning_rate": 3.272542485937369e-06, - "loss": 0.0302, - "reward": 23.974201440811157, - "reward_std": 2.276708383113146, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.2105, + "reward": 21.500834226608276, + "reward_std": 6.57486379891634, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.3524517808109522, + "rewards/question_recreation_reward_func": 0.2982095896732062, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.8717499822378159, + "rewards/strict_format_reward_func": 0.328125, + "rewards/xmlcount_reward_func": 1.0619999915361404, "step": 210 }, { - "completion_length": 171.09375, + "completion_length": 158.875, "epoch": 2.4367816091954024, - "grad_norm": 54.68446731567383, - "kl": 43.431635558605194, + "grad_norm": 126.99817657470703, + "kl": 48.98594677448273, "learning_rate": 3.2416761260957925e-06, - "loss": 0.0434, - "reward": 24.10509729385376, - "reward_std": 2.500546045601368, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.049, + "reward": 23.650426864624023, + "reward_std": 3.30002686008811, + "rewards/concensus_correctness_reward_func": 18.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.3125, - "rewards/question_recreation_reward_func": 0.3963785795494914, + "rewards/final_correctness_reward_func": 1.25, + "rewards/question_recreation_reward_func": 0.4243647903203964, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.8805937469005585, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.0541875064373016, "step": 212 }, { - "completion_length": 207.0625, + "completion_length": 133.21875, "epoch": 2.4597701149425286, - "grad_norm": 12.507402420043945, - "kl": 28.789629459381104, + "grad_norm": 10.97807788848877, + "kl": 82.30508959293365, "learning_rate": 3.210685290580622e-06, - "loss": 0.0288, - "reward": 18.053790032863617, - "reward_std": 3.7649053037166595, - "rewards/concensus_correctness_reward_func": 13.75, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0823, + "reward": 18.351506382226944, + "reward_std": 6.907758057117462, + "rewards/concensus_correctness_reward_func": 14.375, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.5223215334117413, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.2649126504547894, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.6408437490463257, + "rewards/strict_format_reward_func": 0.15625, + "rewards/xmlcount_reward_func": 0.9303437620401382, "step": 214 }, { - "completion_length": 198.65625, + "completion_length": 137.65625, "epoch": 2.4827586206896552, - "grad_norm": 210.70848083496094, - "kl": 99.02290534973145, + "grad_norm": 17.758981704711914, + "kl": 101.2872142791748, "learning_rate": 3.1795751805908578e-06, - "loss": 0.099, - "reward": 21.982049465179443, - "reward_std": 6.290198154747486, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.1013, + "reward": 18.390512824058533, + "reward_std": 10.643928527832031, + "rewards/concensus_correctness_reward_func": 14.375, + "rewards/consensus_reward_func": 1.4375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.4351743645966053, + "rewards/final_correctness_reward_func": 1.3125, + "rewards/question_recreation_reward_func": 0.18891880835872144, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.75, + "rewards/strict_format_reward_func": 0.140625, + "rewards/xmlcount_reward_func": 0.9359687492251396, "step": 216 }, { - "completion_length": 153.4375, + "completion_length": 151.5, "epoch": 2.5057471264367814, - "grad_norm": 14.735133171081543, - "kl": 60.84754681587219, + "grad_norm": 15.907320976257324, + "kl": 35.986339688301086, "learning_rate": 3.148351017343363e-06, - "loss": 0.0608, - "reward": 23.808183193206787, - "reward_std": 2.8083360344171524, - "rewards/concensus_correctness_reward_func": 19.375, + "loss": 0.036, + "reward": 21.423733592033386, + "reward_std": 2.569384027272463, + "rewards/concensus_correctness_reward_func": 16.875, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.40130839962512255, + "rewards/question_recreation_reward_func": 0.31698313634842634, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.8756249845027924, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 0.9817500188946724, "step": 218 }, { - "completion_length": 217.5625, + "completion_length": 141.34375, "epoch": 2.528735632183908, - "grad_norm": 9.27346420288086, - "kl": 21.48068803548813, + "grad_norm": 6.957406044006348, + "kl": 61.72591406106949, "learning_rate": 3.1170180411965854e-06, - "loss": 0.0215, - "reward": 21.198187857866287, - "reward_std": 2.5451225079596043, + "loss": 0.0617, + "reward": 21.16593289375305, + "reward_std": 6.528529327362776, "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.9375, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.43406323064118624, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.27768219355493784, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.670375008136034, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 0.9194999933242798, "step": 220 }, { - "completion_length": 199.46875, + "completion_length": 172.1875, "epoch": 2.5517241379310347, - "grad_norm": 12.230122566223145, - "kl": 35.57611131668091, + "grad_norm": 13.886698722839355, + "kl": 78.9925445318222, "learning_rate": 3.085581510771067e-06, - "loss": 0.0356, - "reward": 22.403836011886597, - "reward_std": 4.476667793467641, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.079, + "reward": 21.284534811973572, + "reward_std": 5.0575045719742775, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.5822737365961075, + "rewards/question_recreation_reward_func": 0.39415994845330715, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.078125, - "rewards/xmlcount_reward_func": 0.618437496945262, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 0.9059999883174896, "step": 222 }, { - "completion_length": 189.65625, + "completion_length": 165.5625, "epoch": 2.574712643678161, - "grad_norm": 10.987492561340332, - "kl": 37.20429444313049, + "grad_norm": 8.458443641662598, + "kl": 35.35398751497269, "learning_rate": 3.054046702066886e-06, - "loss": 0.0372, - "reward": 19.976765632629395, - "reward_std": 3.6805741861462593, + "loss": 0.0354, + "reward": 20.23376339673996, + "reward_std": 3.693509737495333, "rewards/concensus_correctness_reward_func": 15.625, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.39710916485637426, + "rewards/question_recreation_reward_func": 0.30863827280700207, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.7671562470495701, + "rewards/strict_format_reward_func": 0.28125, + "rewards/xmlcount_reward_func": 1.0188749954104424, "step": 224 }, { - "completion_length": 199.6875, + "completion_length": 149.65625, "epoch": 2.5977011494252875, - "grad_norm": 218.14071655273438, - "kl": 81.93121492862701, + "grad_norm": 28.119159698486328, + "kl": 153.278990149498, "learning_rate": 3.0224189075781886e-06, - "loss": 0.0819, - "reward": 23.334370017051697, - "reward_std": 3.687668576836586, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.1533, + "reward": 19.50961399078369, + "reward_std": 9.268822841346264, + "rewards/concensus_correctness_reward_func": 15.0, + "rewards/consensus_reward_func": 1.5, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.625, - "rewards/question_recreation_reward_func": 0.5196514576673508, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.40252033062279224, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.28125, - "rewards/xmlcount_reward_func": 0.9709687680006027, + "rewards/strict_format_reward_func": 0.34375, + "rewards/xmlcount_reward_func": 1.0758437439799309, "step": 226 }, { - "completion_length": 199.15625, + "completion_length": 162.46875, "epoch": 2.6206896551724137, - "grad_norm": 55.59806823730469, - "kl": 105.73474150896072, + "grad_norm": 11.210749626159668, + "kl": 56.028138279914856, "learning_rate": 2.9907034354049443e-06, - "loss": 0.1057, - "reward": 21.661295890808105, - "reward_std": 5.487567663192749, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.056, + "reward": 22.563928842544556, + "reward_std": 4.793614652007818, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.5, - "rewards/question_recreation_reward_func": 0.4694526642560959, + "rewards/final_correctness_reward_func": 1.6875, + "rewards/question_recreation_reward_func": 0.2866164706647396, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.8949687480926514, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.0429375171661377, "step": 228 }, { - "completion_length": 187.0, + "completion_length": 191.28125, "epoch": 2.6436781609195403, - "grad_norm": 74.4803237915039, - "kl": 62.91032111644745, + "grad_norm": 17.16364860534668, + "kl": 103.48286080360413, "learning_rate": 2.9589056083620902e-06, - "loss": 0.0629, - "reward": 18.562279403209686, - "reward_std": 6.2179053500294685, - "rewards/concensus_correctness_reward_func": 14.375, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.1035, + "reward": 18.024529218673706, + "reward_std": 10.756541930139065, + "rewards/concensus_correctness_reward_func": 13.75, + "rewards/consensus_reward_func": 1.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.3125, - "rewards/question_recreation_reward_func": 0.40909208124503493, + "rewards/question_recreation_reward_func": 0.2632478969171643, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.09375, - "rewards/xmlcount_reward_func": 0.6844374947249889, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.0269062742590904, "step": 230 }, { - "completion_length": 193.84375, + "completion_length": 137.8125, "epoch": 2.6666666666666665, - "grad_norm": 153.6999969482422, - "kl": 58.603620529174805, + "grad_norm": 21.566341400146484, + "kl": 417.1934640407562, "learning_rate": 2.927030763086201e-06, - "loss": 0.0586, - "reward": 22.8560152053833, - "reward_std": 5.135724253952503, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.4172, + "reward": 21.840269207954407, + "reward_std": 5.135549947619438, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.5, - "rewards/question_recreation_reward_func": 0.32535880617797375, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.21667506452649832, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.85878124833107, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 1.0610937401652336, "step": 232 }, { - "completion_length": 187.34375, + "completion_length": 177.28125, "epoch": 2.689655172413793, - "grad_norm": 134.07833862304688, - "kl": 57.13446509838104, + "grad_norm": 7.936868190765381, + "kl": 41.953171730041504, "learning_rate": 2.8950842491398358e-06, - "loss": 0.0571, - "reward": 21.37825781106949, - "reward_std": 2.3470610342919827, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.042, + "reward": 18.911091536283493, + "reward_std": 6.282039914280176, + "rewards/concensus_correctness_reward_func": 14.375, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.5101327486336231, + "rewards/final_correctness_reward_func": 1.125, + "rewards/question_recreation_reward_func": 0.3456540531478822, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.8056250065565109, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.0810624957084656, "step": 234 }, { - "completion_length": 173.5625, + "completion_length": 164.4375, "epoch": 2.7126436781609193, - "grad_norm": 20.201066970825195, - "kl": 93.91959285736084, + "grad_norm": 11.180641174316406, + "kl": 30.85140883922577, "learning_rate": 2.8630714281137263e-06, - "loss": 0.0939, - "reward": 19.030735552310944, - "reward_std": 6.849746987223625, - "rewards/concensus_correctness_reward_func": 15.0, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.0309, + "reward": 21.53785264492035, + "reward_std": 2.2568147983402014, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.34689185908064246, + "rewards/final_correctness_reward_func": 1.125, + "rewards/question_recreation_reward_func": 0.3248528055846691, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.7932187654078007, + "rewards/strict_format_reward_func": 0.28125, + "rewards/xmlcount_reward_func": 0.9942499995231628, "step": 236 }, { - "completion_length": 184.46875, + "completion_length": 169.96875, "epoch": 2.735632183908046, - "grad_norm": 10.252813339233398, - "kl": 32.1734454035759, + "grad_norm": 15.569607734680176, + "kl": 58.80247247219086, "learning_rate": 2.8309976727269335e-06, - "loss": 0.0322, - "reward": 23.37212324142456, - "reward_std": 4.034113742411137, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0588, + "reward": 20.86781644821167, + "reward_std": 6.584708698093891, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.4649669686332345, + "rewards/final_correctness_reward_func": 1.3125, + "rewards/question_recreation_reward_func": 0.3502857000567019, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.7509062476456165, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 1.017531231045723, "step": 238 }, { - "completion_length": 189.09375, + "completion_length": 159.5625, "epoch": 2.7586206896551726, - "grad_norm": 9.11358642578125, - "kl": 51.15155506134033, + "grad_norm": 15.48104190826416, + "kl": 30.833752512931824, "learning_rate": 2.7988683659251475e-06, - "loss": 0.0512, - "reward": 22.844246864318848, - "reward_std": 5.562111519277096, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0308, + "reward": 24.34659719467163, + "reward_std": 2.263719528913498, + "rewards/concensus_correctness_reward_func": 19.375, + "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.5789653360843658, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.278471989557147, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.8590312525629997, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 1.0056250020861626, "step": 240 }, { - "completion_length": 176.0, + "completion_length": 119.28125, "epoch": 2.781609195402299, - "grad_norm": 8.28599739074707, - "kl": 49.46989959478378, + "grad_norm": 22.25459098815918, + "kl": 70.77951443195343, "learning_rate": 2.766688899977266e-06, - "loss": 0.0495, - "reward": 23.284308910369873, - "reward_std": 3.6900559216737747, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0708, + "reward": 19.979850083589554, + "reward_std": 3.62205570936203, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.3125, - "rewards/question_recreation_reward_func": 0.4332156013697386, + "rewards/final_correctness_reward_func": 0.8125, + "rewards/question_recreation_reward_func": 0.20441282144747674, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.7573437467217445, + "rewards/xmlcount_reward_func": 0.9316875115036964, "step": 242 }, { - "completion_length": 168.53125, + "completion_length": 140.40625, "epoch": 2.8045977011494254, - "grad_norm": 11.053635597229004, - "kl": 60.60266923904419, + "grad_norm": 26.626588821411133, + "kl": 64.7616720199585, "learning_rate": 2.7344646755704078e-06, - "loss": 0.0606, - "reward": 23.04017722606659, - "reward_std": 2.431196976453066, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0648, + "reward": 22.196502208709717, + "reward_std": 5.096539253368974, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.39530237205326557, + "rewards/final_correctness_reward_func": 1.375, + "rewards/question_recreation_reward_func": 0.2625648295506835, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.8167500197887421, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.0433125048875809, "step": 244 }, { - "completion_length": 182.1875, + "completion_length": 143.15625, "epoch": 2.8275862068965516, - "grad_norm": 8.843550682067871, - "kl": 21.57262098789215, + "grad_norm": 30.50920867919922, + "kl": 118.79828095436096, "learning_rate": 2.702201100903511e-06, - "loss": 0.0216, - "reward": 21.251714825630188, - "reward_std": 2.5225561521947384, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.1188, + "reward": 19.580127209424973, + "reward_std": 6.377966545522213, + "rewards/concensus_correctness_reward_func": 15.0, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.48502705805003643, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.25469024013727903, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.7666875123977661, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.0910625010728836, "step": 246 }, { - "completion_length": 208.8125, + "completion_length": 116.78125, "epoch": 2.8505747126436782, - "grad_norm": 28.015222549438477, - "kl": 79.42935502529144, + "grad_norm": 42.95075225830078, + "kl": 142.73350584506989, "learning_rate": 2.6699035907796796e-06, - "loss": 0.0794, - "reward": 21.99584984779358, - "reward_std": 5.142332188785076, + "loss": 0.1427, + "reward": 21.428375482559204, + "reward_std": 5.0036335196346045, "rewards/concensus_correctness_reward_func": 17.5, "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.47666190657764673, + "rewards/final_correctness_reward_func": 0.8125, + "rewards/question_recreation_reward_func": 0.22968789568403736, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.8316874951124191, + "rewards/strict_format_reward_func": 0.203125, + "rewards/xmlcount_reward_func": 0.9330625012516975, "step": 248 }, { - "completion_length": 187.5, + "completion_length": 135.1875, "epoch": 2.873563218390805, - "grad_norm": 7.878297328948975, - "kl": 32.58809804916382, + "grad_norm": 20.947589874267578, + "kl": 83.38833355903625, "learning_rate": 2.6375775656974124e-06, - "loss": 0.0326, - "reward": 23.035036325454712, - "reward_std": 3.5780590642243624, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0834, + "reward": 21.061365365982056, + "reward_std": 6.776499420404434, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.34913025982677937, + "rewards/question_recreation_reward_func": 0.18927180115133524, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.8265312686562538, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 1.0127187445759773, "step": 250 }, { - "completion_length": 190.90625, + "completion_length": 142.53125, "epoch": 2.896551724137931, - "grad_norm": 10.366336822509766, - "kl": 53.645822525024414, + "grad_norm": 33.532535552978516, + "kl": 73.43270206451416, "learning_rate": 2.6052284509408805e-06, - "loss": 0.0536, - "reward": 22.475271224975586, - "reward_std": 4.786225765943527, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0734, + "reward": 23.1397967338562, + "reward_std": 3.651892565190792, + "rewards/concensus_correctness_reward_func": 18.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.38364618457853794, - "rewards/soft_format_reward_func": 0.015625, - "rewards/strict_format_reward_func": 0.125, - "rewards/xmlcount_reward_func": 0.7634999975562096, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.2333593014627695, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 0.968937523663044, "step": 252 }, { - "completion_length": 215.6875, + "completion_length": 150.875, "epoch": 2.9195402298850572, - "grad_norm": 18.614078521728516, - "kl": 121.13904702663422, + "grad_norm": 22.1955623626709, + "kl": 65.40331673622131, "learning_rate": 2.5728616756693995e-06, - "loss": 0.1211, - "reward": 20.982293844223022, - "reward_std": 7.388325624167919, - "rewards/concensus_correctness_reward_func": 16.97850000858307, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.0654, + "reward": 21.70615029335022, + "reward_std": 5.454337567090988, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.3657937846146524, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.2664319332689047, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.09375, - "rewards/xmlcount_reward_func": 0.6692500039935112, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 0.9553437530994415, "step": 254 }, { - "completion_length": 186.1875, + "completion_length": 133.40625, "epoch": 2.942528735632184, - "grad_norm": 7.483030796051025, - "kl": 18.084463477134705, + "grad_norm": 22.244630813598633, + "kl": 101.94604635238647, "learning_rate": 2.5404826720062544e-06, - "loss": 0.0181, - "reward": 24.341283321380615, - "reward_std": 2.6867234632372856, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.1019, + "reward": 19.74974286556244, + "reward_std": 6.578873861581087, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.6365958098322153, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.3601799765601754, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.8140624985098839, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 0.9833125099539757, "step": 256 }, { - "completion_length": 170.34375, + "completion_length": 136.25, "epoch": 2.9655172413793105, - "grad_norm": 11.57905101776123, - "kl": 55.95998740196228, + "grad_norm": 27.24321746826172, + "kl": 76.92687654495239, "learning_rate": 2.5080968741270224e-06, - "loss": 0.056, - "reward": 22.71160650253296, - "reward_std": 4.2745377495884895, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0769, + "reward": 21.043765783309937, + "reward_std": 6.806770071387291, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.4460126720368862, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.30339036241639405, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.8593437373638153, + "rewards/strict_format_reward_func": 0.1875, + "rewards/xmlcount_reward_func": 0.9278750121593475, "step": 258 }, { - "completion_length": 176.625, + "completion_length": 141.46875, "epoch": 2.9885057471264367, - "grad_norm": 8.554110527038574, - "kl": 56.15454703569412, + "grad_norm": 13.848796844482422, + "kl": 74.220907330513, "learning_rate": 2.4757097173475574e-06, - "loss": 0.0562, - "reward": 22.856180667877197, - "reward_std": 4.199287883937359, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0742, + "reward": 21.900704383850098, + "reward_std": 6.4284183802083135, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.5938056465238333, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.3097044173628092, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.8717500194907188, + "rewards/xmlcount_reward_func": 0.9503750056028366, "step": 260 }, { - "completion_length": 192.84375, + "completion_length": 129.90625, "epoch": 3.0114942528735633, - "grad_norm": 12.645522117614746, - "kl": 36.66971564292908, + "grad_norm": 13.473543167114258, + "kl": 89.0081570148468, "learning_rate": 2.4433266372117755e-06, - "loss": 0.0367, - "reward": 21.84557729959488, - "reward_std": 1.0724763236939907, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.089, + "reward": 18.135825961828232, + "reward_std": 5.116767309606075, + "rewards/concensus_correctness_reward_func": 14.375, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.44420239329338074, + "rewards/final_correctness_reward_func": 0.875, + "rewards/question_recreation_reward_func": 0.1688570473343134, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.7607500031590462, + "rewards/strict_format_reward_func": 0.15625, + "rewards/xmlcount_reward_func": 0.9357187524437904, "step": 262 }, { - "completion_length": 186.1875, + "completion_length": 137.5, "epoch": 3.0344827586206895, - "grad_norm": 57.19257354736328, - "kl": 47.47080081701279, + "grad_norm": 18.38870620727539, + "kl": 82.03042995929718, "learning_rate": 2.410953068579411e-06, - "loss": 0.0475, - "reward": 18.912437170743942, - "reward_std": 5.258720979094505, + "loss": 0.082, + "reward": 18.922740817070007, + "reward_std": 5.527392536401749, "rewards/concensus_correctness_reward_func": 15.0, - "rewards/consensus_reward_func": 1.75, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.875, - "rewards/question_recreation_reward_func": 0.3893748102709651, + "rewards/question_recreation_reward_func": 0.27911579329520464, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.7418125048279762, + "rewards/strict_format_reward_func": 0.203125, + "rewards/xmlcount_reward_func": 0.8779999986290932, "step": 264 }, { - "completion_length": 183.8125, + "completion_length": 148.03125, "epoch": 3.057471264367816, - "grad_norm": 18.103336334228516, - "kl": 30.6926851272583, + "grad_norm": 11.94968032836914, + "kl": 95.89381909370422, "learning_rate": 2.3785944447138804e-06, - "loss": 0.0307, - "reward": 23.929044246673584, - "reward_std": 2.5999067835509777, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0959, + "reward": 19.564652681350708, + "reward_std": 8.904428139328957, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.5334500391036272, + "rewards/final_correctness_reward_func": 0.8125, + "rewards/question_recreation_reward_func": 0.2944338825182058, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.8643437623977661, + "rewards/xmlcount_reward_func": 1.0514687523245811, "step": 266 }, { - "completion_length": 191.46875, + "completion_length": 178.5, "epoch": 3.0804597701149423, - "grad_norm": 16.64955711364746, - "kl": 82.12755221128464, + "grad_norm": 10.641253471374512, + "kl": 75.12891209125519, "learning_rate": 2.3462561963704132e-06, - "loss": 0.0821, - "reward": 21.25652766227722, - "reward_std": 6.648399628698826, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0751, + "reward": 23.403506755828857, + "reward_std": 3.9334247559309006, + "rewards/concensus_correctness_reward_func": 18.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.8125, - "rewards/question_recreation_reward_func": 0.3932775501161814, + "rewards/final_correctness_reward_func": 1.3125, + "rewards/question_recreation_reward_func": 0.27944495156407356, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.125, - "rewards/xmlcount_reward_func": 0.6757500041276217, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 0.9521875008940697, "step": 268 }, { - "completion_length": 163.3125, + "completion_length": 150.3125, "epoch": 3.103448275862069, - "grad_norm": 13.558385848999023, - "kl": 98.68213987350464, + "grad_norm": 10.425681114196777, + "kl": 72.31718599796295, "learning_rate": 2.3139437508846155e-06, - "loss": 0.0987, - "reward": 18.832740902900696, - "reward_std": 9.425148040056229, - "rewards/concensus_correctness_reward_func": 15.103812456130981, - "rewards/consensus_reward_func": 1.5, + "loss": 0.0723, + "reward": 21.045587062835693, + "reward_std": 5.22771055623889, + "rewards/concensus_correctness_reward_func": 16.967750072479248, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.9375, - "rewards/question_recreation_reward_func": 0.37158520240336657, + "rewards/final_correctness_reward_func": 1.125, + "rewards/question_recreation_reward_func": 0.29927424574270844, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.125, - "rewards/xmlcount_reward_func": 0.7948437668383121, + "rewards/strict_format_reward_func": 0.140625, + "rewards/xmlcount_reward_func": 0.8254375085234642, "step": 270 }, { - "completion_length": 168.5625, + "completion_length": 131.0, "epoch": 3.1264367816091956, - "grad_norm": 93.29242706298828, - "kl": 112.09204983711243, + "grad_norm": 178.2821502685547, + "kl": 94.79714369773865, "learning_rate": 2.2816625312615903e-06, - "loss": 0.1121, - "reward": 20.3865385055542, - "reward_std": 6.957423135638237, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.625, + "loss": 0.0948, + "reward": 21.40879887342453, + "reward_std": 5.045450382865965, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.4614132307469845, + "rewards/final_correctness_reward_func": 1.3125, + "rewards/question_recreation_reward_func": 0.22482990100979805, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.7688749991357327, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.043343760073185, "step": 272 }, { - "completion_length": 176.5, + "completion_length": 151.0, "epoch": 3.1494252873563218, - "grad_norm": 9.282670021057129, - "kl": 44.74986207485199, + "grad_norm": 10.653499603271484, + "kl": 43.49939680099487, "learning_rate": 2.2494179552657977e-06, - "loss": 0.0447, - "reward": 21.661226630210876, - "reward_std": 4.737890161573887, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0435, + "reward": 20.695982962846756, + "reward_std": 3.647803161293268, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.46144552901387215, + "rewards/final_correctness_reward_func": 1.125, + "rewards/question_recreation_reward_func": 0.3186077130958438, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.7466562427580357, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 0.908624991774559, "step": 274 }, { - "completion_length": 202.78125, + "completion_length": 122.03125, "epoch": 3.1724137931034484, - "grad_norm": 9.99281120300293, - "kl": 46.22209244966507, + "grad_norm": 36.16767501831055, + "kl": 80.89895606040955, "learning_rate": 2.2172154345117896e-06, - "loss": 0.0462, - "reward": 22.91976284980774, - "reward_std": 4.811107313260436, - "rewards/concensus_correctness_reward_func": 18.22374999523163, + "loss": 0.0809, + "reward": 20.822656095027924, + "reward_std": 2.5686975345015526, + "rewards/concensus_correctness_reward_func": 16.875, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.4375, - "rewards/question_recreation_reward_func": 0.49207552149891853, + "rewards/final_correctness_reward_func": 0.8125, + "rewards/question_recreation_reward_func": 0.19224974000826478, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.782062504440546, + "rewards/xmlcount_reward_func": 0.9585312381386757, "step": 276 }, { - "completion_length": 197.9375, + "completion_length": 166.6875, "epoch": 3.1954022988505746, - "grad_norm": 7.707649230957031, - "kl": 10.456040561199188, + "grad_norm": 8.423657417297363, + "kl": 30.86855298280716, "learning_rate": 2.185060373555978e-06, - "loss": 0.0105, - "reward": 25.512166500091553, - "reward_std": 1.087227463722229, - "rewards/concensus_correctness_reward_func": 20.0, - "rewards/consensus_reward_func": 2.0, + "loss": 0.0309, + "reward": 23.155113697052002, + "reward_std": 5.171472631394863, + "rewards/concensus_correctness_reward_func": 18.125, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.6875, - "rewards/question_recreation_reward_func": 0.5742604583501816, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.36708282120525837, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.296875, - "rewards/xmlcount_reward_func": 0.9535312503576279, + "rewards/strict_format_reward_func": 0.34375, + "rewards/xmlcount_reward_func": 1.0692812502384186, "step": 278 }, { - "completion_length": 175.84375, + "completion_length": 132.5625, "epoch": 3.218390804597701, - "grad_norm": 9.9420747756958, - "kl": 114.3270354270935, + "grad_norm": 48.93709945678711, + "kl": 90.54386782646179, "learning_rate": 2.1529581689895838e-06, - "loss": 0.1143, - "reward": 20.00388327240944, - "reward_std": 4.212954413145781, - "rewards/concensus_correctness_reward_func": 15.625, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0905, + "reward": 20.849692583084106, + "reward_std": 9.049399882555008, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.9375, - "rewards/question_recreation_reward_func": 0.5183520540595055, + "rewards/final_correctness_reward_func": 1.375, + "rewards/question_recreation_reward_func": 0.297505383961834, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.8761562593281269, + "rewards/strict_format_reward_func": 0.28125, + "rewards/xmlcount_reward_func": 1.0209375023841858, "step": 280 }, { - "completion_length": 161.875, + "completion_length": 126.65625, "epoch": 3.2413793103448274, - "grad_norm": 9.585458755493164, - "kl": 29.945935487747192, + "grad_norm": 27.753114700317383, + "kl": 77.42353284358978, "learning_rate": 2.12091420853293e-06, - "loss": 0.0299, - "reward": 23.840237617492676, - "reward_std": 2.569697428494692, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0774, + "reward": 21.78527307510376, + "reward_std": 6.618948891758919, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.26723772194236517, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.14977327594533563, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.8229999989271164, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.0573749914765358, "step": 282 }, { - "completion_length": 183.6875, + "completion_length": 139.09375, "epoch": 3.264367816091954, - "grad_norm": 9.332887649536133, - "kl": 33.99060362577438, + "grad_norm": 26.160541534423828, + "kl": 39.93494641780853, "learning_rate": 2.0889338701312184e-06, - "loss": 0.034, - "reward": 23.308263301849365, - "reward_std": 4.0854442566633224, + "loss": 0.0399, + "reward": 23.64195466041565, + "reward_std": 3.919191725552082, "rewards/concensus_correctness_reward_func": 18.75, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.40948206186294556, + "rewards/question_recreation_reward_func": 0.2775798523798585, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.8362812511622906, + "rewards/strict_format_reward_func": 0.359375, + "rewards/xmlcount_reward_func": 1.1300000175833702, "step": 284 }, { - "completion_length": 187.75, + "completion_length": 140.0, "epoch": 3.2873563218390807, - "grad_norm": 97.52264404296875, - "kl": 552.707607448101, + "grad_norm": 18.851394653320312, + "kl": 28.231099367141724, "learning_rate": 2.0570225210519433e-06, - "loss": 0.5527, - "reward": 20.05747628211975, - "reward_std": 7.817619055509567, - "rewards/concensus_correctness_reward_func": 15.792999982833862, - "rewards/consensus_reward_func": 1.5625, + "loss": 0.0282, + "reward": 24.422237634658813, + "reward_std": 1.1523373499512672, + "rewards/concensus_correctness_reward_func": 20.0, + "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.4621638711541891, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.13123753014951944, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.8023125156760216, + "rewards/xmlcount_reward_func": 1.0410000085830688, "step": 286 }, { - "completion_length": 179.40625, + "completion_length": 151.9375, "epoch": 3.310344827586207, - "grad_norm": 172.52220153808594, - "kl": 103.46669220924377, + "grad_norm": 17.405364990234375, + "kl": 72.28777301311493, "learning_rate": 2.025185516984108e-06, - "loss": 0.1035, - "reward": 20.110626578330994, - "reward_std": 7.713544711470604, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.625, + "loss": 0.0723, + "reward": 21.54401957988739, + "reward_std": 6.0776440016925335, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.9375, - "rewards/question_recreation_reward_func": 0.5093452921137214, + "rewards/final_correctness_reward_func": 1.375, + "rewards/question_recreation_reward_func": 0.2213010028935969, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.125, - "rewards/xmlcount_reward_func": 0.6637812461704016, + "rewards/strict_format_reward_func": 0.28125, + "rewards/xmlcount_reward_func": 1.1039687544107437, "step": 288 }, { - "completion_length": 179.28125, + "completion_length": 127.09375, "epoch": 3.3333333333333335, - "grad_norm": 9.076400756835938, - "kl": 27.88554298877716, + "grad_norm": 15.093314170837402, + "kl": 125.32668507099152, "learning_rate": 1.993428201139375e-06, - "loss": 0.0279, - "reward": 23.514595985412598, - "reward_std": 3.680981855839491, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.1253, + "reward": 20.57132476568222, + "reward_std": 5.401688393205404, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.3125, - "rewards/question_recreation_reward_func": 0.4855649098753929, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.27507482608780265, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.8727812469005585, + "rewards/xmlcount_reward_func": 1.0150000005960464, "step": 290 }, { - "completion_length": 199.96875, + "completion_length": 136.375, "epoch": 3.3563218390804597, - "grad_norm": 15.214519500732422, - "kl": 31.830171585083008, + "grad_norm": 15.000574111938477, + "kl": 135.7766830921173, "learning_rate": 1.9617559033553128e-06, - "loss": 0.0318, - "reward": 23.432430267333984, - "reward_std": 3.639903999865055, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.1358, + "reward": 19.560622334480286, + "reward_std": 9.298460245132446, + "rewards/concensus_correctness_reward_func": 15.72837495803833, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.4731489778496325, + "rewards/final_correctness_reward_func": 0.9375, + "rewards/question_recreation_reward_func": 0.263341300887987, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.8811562657356262, + "rewards/strict_format_reward_func": 0.140625, + "rewards/xmlcount_reward_func": 0.9282812476158142, "step": 292 }, { - "completion_length": 196.5, + "completion_length": 153.9375, "epoch": 3.3793103448275863, - "grad_norm": 9.507624626159668, - "kl": 258.71369326114655, + "grad_norm": 17.406816482543945, + "kl": 38.764331340789795, "learning_rate": 1.9301739392008923e-06, - "loss": 0.2587, - "reward": 23.30627405643463, - "reward_std": 2.8541714921593666, + "loss": 0.0388, + "reward": 23.508578062057495, + "reward_std": 3.671527288854122, "rewards/concensus_correctness_reward_func": 18.75, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.5153987444937229, + "rewards/question_recreation_reward_func": 0.22364078043028712, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.7596250101923943, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 1.0974375009536743, "step": 294 }, { - "completion_length": 170.875, + "completion_length": 125.09375, "epoch": 3.4022988505747125, - "grad_norm": 19.795188903808594, - "kl": 51.548197507858276, + "grad_norm": 19.07607650756836, + "kl": 171.01091754436493, "learning_rate": 1.8986876090843668e-06, - "loss": 0.0515, - "reward": 19.95792943239212, - "reward_std": 4.975985690951347, - "rewards/concensus_correctness_reward_func": 15.658687498420477, - "rewards/consensus_reward_func": 1.75, + "loss": 0.171, + "reward": 17.35634481906891, + "reward_std": 7.117864057421684, + "rewards/concensus_correctness_reward_func": 13.224312543869019, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.3079293258488178, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.17700115137267858, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.8350625112652779, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.0331562533974648, "step": 296 }, { - "completion_length": 200.125, + "completion_length": 144.90625, "epoch": 3.425287356321839, - "grad_norm": 12.625809669494629, - "kl": 38.17982134222984, + "grad_norm": 10.985239028930664, + "kl": 39.94594478607178, "learning_rate": 1.8673021973637095e-06, - "loss": 0.0382, - "reward": 20.094983398914337, - "reward_std": 5.239616237580776, - "rewards/concensus_correctness_reward_func": 15.625, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0399, + "reward": 20.569520950317383, + "reward_std": 3.812641218304634, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.3979525100439787, + "rewards/final_correctness_reward_func": 0.875, + "rewards/question_recreation_reward_func": 0.2100206082686782, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.775156244635582, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.0938749760389328, "step": 298 }, { - "completion_length": 182.21875, + "completion_length": 144.3125, "epoch": 3.4482758620689653, - "grad_norm": 12.502586364746094, - "kl": 30.84159231185913, + "grad_norm": 13.138213157653809, + "kl": 81.87689316272736, "learning_rate": 1.8360229714597372e-06, - "loss": 0.0308, - "reward": 24.045459270477295, - "reward_std": 2.6335729211568832, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0819, + "reward": 21.468154668807983, + "reward_std": 6.316160887479782, + "rewards/concensus_correctness_reward_func": 16.98743748664856, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.46252188459038734, + "rewards/final_correctness_reward_func": 1.3125, + "rewards/question_recreation_reward_func": 0.24059188552200794, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.754812479019165, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 1.0057500079274178, "step": 300 }, { - "completion_length": 203.78125, + "completion_length": 175.4375, "epoch": 3.471264367816092, - "grad_norm": 11.00143051147461, - "kl": 42.34978526830673, + "grad_norm": 6.7021684646606445, + "kl": 90.10174667835236, "learning_rate": 1.8048551809720752e-06, - "loss": 0.0424, - "reward": 21.035732984542847, - "reward_std": 5.30287379771471, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.0901, + "reward": 19.826006770133972, + "reward_std": 7.8187506049871445, + "rewards/concensus_correctness_reward_func": 15.0, + "rewards/consensus_reward_func": 1.5, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.504763625562191, + "rewards/final_correctness_reward_func": 1.5625, + "rewards/question_recreation_reward_func": 0.3413503775373101, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.078125, - "rewards/xmlcount_reward_func": 0.6403437368571758, + "rewards/strict_format_reward_func": 0.328125, + "rewards/xmlcount_reward_func": 1.0940312519669533, "step": 302 }, { - "completion_length": 178.46875, + "completion_length": 152.40625, "epoch": 3.4942528735632186, - "grad_norm": 9.749627113342285, - "kl": 42.00779151916504, + "grad_norm": 33.33560562133789, + "kl": 62.13021218776703, "learning_rate": 1.7738040567981168e-06, - "loss": 0.042, - "reward": 21.43708509206772, - "reward_std": 2.6289036720991135, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0621, + "reward": 20.411924451589584, + "reward_std": 3.7083070650696754, + "rewards/concensus_correctness_reward_func": 15.726187467575073, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.4815221428871155, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.3766123193781823, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.25, - "rewards/xmlcount_reward_func": 0.893062524497509, + "rewards/strict_format_reward_func": 0.28125, + "rewards/xmlcount_reward_func": 1.027875006198883, "step": 304 }, { - "completion_length": 181.125, + "completion_length": 150.90625, "epoch": 3.5172413793103448, - "grad_norm": 19.066579818725586, - "kl": 38.558876395225525, + "grad_norm": 12.614079475402832, + "kl": 79.89652490615845, "learning_rate": 1.7428748102551237e-06, - "loss": 0.0386, - "reward": 19.881126523017883, - "reward_std": 4.959422640502453, - "rewards/concensus_correctness_reward_func": 15.625, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0799, + "reward": 21.557888984680176, + "reward_std": 6.523874972015619, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.42356355325318873, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.23220114409923553, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.8325624950230122, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.0600625053048134, "step": 306 }, { - "completion_length": 189.90625, + "completion_length": 112.0625, "epoch": 3.5402298850574714, - "grad_norm": 10.441559791564941, - "kl": 64.12149155139923, + "grad_norm": 190.45326232910156, + "kl": 375.51640033721924, "learning_rate": 1.7120726322056042e-06, - "loss": 0.0641, - "reward": 21.424246072769165, - "reward_std": 7.577428184449673, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.3755, + "reward": 19.517458319664, + "reward_std": 6.71420843526721, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.46799618005752563, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.19858358323108405, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.9093750193715096, + "rewards/xmlcount_reward_func": 0.8970000073313713, "step": 308 }, { - "completion_length": 172.59375, + "completion_length": 112.96875, "epoch": 3.5632183908045976, - "grad_norm": 16.696434020996094, - "kl": 53.76651054620743, + "grad_norm": 18.347244262695312, + "kl": 84.32885503768921, "learning_rate": 1.6814026921861337e-06, - "loss": 0.0538, - "reward": 21.452552676200867, - "reward_std": 5.002565193921328, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0843, + "reward": 20.728150367736816, + "reward_std": 8.046710327267647, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.3705523097887635, + "rewards/final_correctness_reward_func": 0.8125, + "rewards/question_recreation_reward_func": 0.1669005190487951, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.125, - "rewards/xmlcount_reward_func": 0.7069999948143959, + "rewards/strict_format_reward_func": 0.203125, + "rewards/xmlcount_reward_func": 0.9831250011920929, "step": 310 }, { - "completion_length": 161.4375, + "completion_length": 145.125, "epoch": 3.586206896551724, - "grad_norm": 13.803705215454102, - "kl": 35.13280367851257, + "grad_norm": 18.334503173828125, + "kl": 80.64542424678802, "learning_rate": 1.6508701375397488e-06, - "loss": 0.0351, - "reward": 19.40959343314171, - "reward_std": 4.873311936855316, - "rewards/concensus_correctness_reward_func": 15.625, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0806, + "reward": 18.398963272571564, + "reward_std": 5.594931587576866, + "rewards/concensus_correctness_reward_func": 14.375, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.75, - "rewards/question_recreation_reward_func": 0.3399684764444828, + "rewards/final_correctness_reward_func": 0.875, + "rewards/question_recreation_reward_func": 0.15936967357993126, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.125, - "rewards/xmlcount_reward_func": 0.7571249902248383, + "rewards/strict_format_reward_func": 0.28125, + "rewards/xmlcount_reward_func": 1.0208437368273735, "step": 312 }, { - "completion_length": 139.8125, + "completion_length": 125.40625, "epoch": 3.609195402298851, - "grad_norm": 16.385456085205078, - "kl": 82.14977234601974, + "grad_norm": 11.989151954650879, + "kl": 34.92421197891235, "learning_rate": 1.6204800925520685e-06, - "loss": 0.0821, - "reward": 20.941880345344543, - "reward_std": 6.481127966195345, - "rewards/concensus_correctness_reward_func": 16.933249950408936, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.0349, + "reward": 18.767382442951202, + "reward_std": 2.5582587234675884, + "rewards/concensus_correctness_reward_func": 14.375, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.33141161873936653, + "rewards/final_correctness_reward_func": 0.875, + "rewards/question_recreation_reward_func": 0.2550387536175549, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.8334687426686287, + "rewards/strict_format_reward_func": 0.328125, + "rewards/xmlcount_reward_func": 1.1217187494039536, "step": 314 }, { - "completion_length": 176.71875, + "completion_length": 141.71875, "epoch": 3.632183908045977, - "grad_norm": 12.406164169311523, - "kl": 54.94031184911728, + "grad_norm": 15.615893363952637, + "kl": 32.43759173154831, "learning_rate": 1.5902376575912815e-06, - "loss": 0.0549, - "reward": 21.382453322410583, - "reward_std": 5.417651690542698, - "rewards/concensus_correctness_reward_func": 16.98243749141693, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.0324, + "reward": 23.463390111923218, + "reward_std": 3.6536942794919014, + "rewards/concensus_correctness_reward_func": 18.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.370641166344285, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.3226394699886441, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.8731250017881393, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.0313749983906746, "step": 316 }, { - "completion_length": 165.09375, + "completion_length": 149.875, "epoch": 3.655172413793103, - "grad_norm": 13.028090476989746, - "kl": 61.327383518218994, + "grad_norm": 72.64096069335938, + "kl": 134.15749621391296, "learning_rate": 1.5601479082521526e-06, - "loss": 0.0613, - "reward": 22.481309413909912, - "reward_std": 5.194659873843193, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.1342, + "reward": 20.07876455783844, + "reward_std": 7.464760262519121, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.4422153886407614, + "rewards/final_correctness_reward_func": 1.25, + "rewards/question_recreation_reward_func": 0.34892111737281084, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.8203437700867653, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 1.042343758046627, "step": 318 }, { - "completion_length": 169.6875, + "completion_length": 123.21875, "epoch": 3.67816091954023, - "grad_norm": 9.631131172180176, - "kl": 60.2094167470932, + "grad_norm": 14.932175636291504, + "kl": 55.49729108810425, "learning_rate": 1.530215894504184e-06, - "loss": 0.0602, - "reward": 16.31907683610916, - "reward_std": 8.370707906782627, - "rewards/concensus_correctness_reward_func": 12.5, - "rewards/consensus_reward_func": 1.5, + "loss": 0.0555, + "reward": 20.455664724111557, + "reward_std": 2.6702157128602266, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.9375, - "rewards/question_recreation_reward_func": 0.4498578463681042, + "rewards/final_correctness_reward_func": 0.875, + "rewards/question_recreation_reward_func": 0.16097779455594718, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.7754687443375587, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 1.0446875095367432, "step": 320 }, { - "completion_length": 183.15625, + "completion_length": 131.34375, "epoch": 3.7011494252873565, - "grad_norm": 7.016822338104248, - "kl": 25.357021510601044, + "grad_norm": 18.339860916137695, + "kl": 96.92925429344177, "learning_rate": 1.5004466398440776e-06, - "loss": 0.0254, - "reward": 21.540445297956467, - "reward_std": 2.395818665623665, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0969, + "reward": 18.75596570968628, + "reward_std": 7.257143005728722, + "rewards/concensus_correctness_reward_func": 14.375, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.544382631778717, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.380465236492455, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.8866874985396862, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.0473750233650208, "step": 322 }, { - "completion_length": 162.71875, + "completion_length": 121.28125, "epoch": 3.7241379310344827, - "grad_norm": 10.263161659240723, - "kl": 45.643818497657776, + "grad_norm": 11.443888664245605, + "kl": 106.47103905677795, "learning_rate": 1.4708451404526409e-06, - "loss": 0.0456, - "reward": 23.100224018096924, - "reward_std": 3.733182780444622, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.1065, + "reward": 20.368716716766357, + "reward_std": 7.225958582013845, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.30684876535087824, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.15318524139001966, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.8715000078082085, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 1.028031274676323, "step": 324 }, { - "completion_length": 187.46875, + "completion_length": 144.53125, "epoch": 3.7471264367816093, - "grad_norm": 9.616602897644043, - "kl": 46.42721700668335, + "grad_norm": 27.537342071533203, + "kl": 19626.373552560806, "learning_rate": 1.4414163643562755e-06, - "loss": 0.0464, - "reward": 21.407015800476074, - "reward_std": 6.583139583468437, + "loss": 19.6264, + "reward": 21.31447982788086, + "reward_std": 6.2893945425748825, "rewards/concensus_correctness_reward_func": 16.875, "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.5297352517955005, + "rewards/final_correctness_reward_func": 1.125, + "rewards/question_recreation_reward_func": 0.33676091488450766, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.8772812411189079, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 1.0558437705039978, "step": 326 }, { - "completion_length": 159.84375, + "completion_length": 116.9375, "epoch": 3.7701149425287355, - "grad_norm": 9.695280075073242, - "kl": 54.68646156787872, + "grad_norm": 14.474410057067871, + "kl": 95.75877487659454, "learning_rate": 1.4121652505931922e-06, - "loss": 0.0547, - "reward": 21.12120521068573, - "reward_std": 6.953412558883429, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.0958, + "reward": 22.162693977355957, + "reward_std": 4.914735484868288, + "rewards/concensus_correctness_reward_func": 18.125, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.3141742091393098, + "rewards/final_correctness_reward_func": 0.9375, + "rewards/question_recreation_reward_func": 0.18288125097751617, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.8695312589406967, + "rewards/strict_format_reward_func": 0.15625, + "rewards/xmlcount_reward_func": 0.9485624805092812, "step": 328 }, { - "completion_length": 182.5, + "completion_length": 139.9375, "epoch": 3.793103448275862, - "grad_norm": 77.05713653564453, - "kl": 66.39413911104202, + "grad_norm": 15.218499183654785, + "kl": 126.23055791854858, "learning_rate": 1.3830967083844944e-06, - "loss": 0.0664, - "reward": 18.682109713554382, - "reward_std": 7.464542143046856, - "rewards/concensus_correctness_reward_func": 14.375, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.1262, + "reward": 15.836580842733383, + "reward_std": 8.247560195624828, + "rewards/concensus_correctness_reward_func": 11.875, + "rewards/consensus_reward_func": 1.4375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.46720364317297935, + "rewards/final_correctness_reward_func": 0.875, + "rewards/question_recreation_reward_func": 0.24011184414848685, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.9180312529206276, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.11209374666214, "step": 330 }, { - "completion_length": 173.03125, + "completion_length": 128.75, "epoch": 3.8160919540229887, - "grad_norm": 13.693964958190918, - "kl": 40.74326169490814, + "grad_norm": 149.83717346191406, + "kl": 177.77284121513367, "learning_rate": 1.3542156163102582e-06, - "loss": 0.0407, - "reward": 20.098479121923447, - "reward_std": 4.755125887691975, - "rewards/concensus_correctness_reward_func": 15.625, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.1778, + "reward": 20.440475046634674, + "reward_std": 3.718158222734928, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.4097289126366377, + "rewards/final_correctness_reward_func": 0.75, + "rewards/question_recreation_reward_func": 0.16710016853176057, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.9074999988079071, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.1015000119805336, "step": 332 }, { - "completion_length": 168.96875, + "completion_length": 116.84375, "epoch": 3.839080459770115, - "grad_norm": 15.231707572937012, - "kl": 51.47773230075836, + "grad_norm": 704.8313598632812, + "kl": 332.14776253700256, "learning_rate": 1.3255268214907612e-06, - "loss": 0.0515, - "reward": 20.67968761920929, - "reward_std": 5.34325535595417, + "loss": 0.3321, + "reward": 20.384458541870117, + "reward_std": 6.741727106273174, "rewards/concensus_correctness_reward_func": 16.25, "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.4380625803023577, + "rewards/question_recreation_reward_func": 0.242927310988307, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.25, - "rewards/xmlcount_reward_func": 0.991625003516674, + "rewards/strict_format_reward_func": 0.203125, + "rewards/xmlcount_reward_func": 0.9384062439203262, "step": 334 }, { - "completion_length": 197.5625, + "completion_length": 133.4375, "epoch": 3.862068965517241, - "grad_norm": 13.312628746032715, - "kl": 58.13399809598923, + "grad_norm": 14.612515449523926, + "kl": 72.21894669532776, "learning_rate": 1.2970351387729875e-06, - "loss": 0.0581, - "reward": 21.960068106651306, - "reward_std": 5.207610227167606, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0722, + "reward": 20.78598228096962, + "reward_std": 3.471321040764451, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.5149743631482124, + "rewards/final_correctness_reward_func": 1.125, + "rewards/question_recreation_reward_func": 0.3254510872066021, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.820093747228384, + "rewards/strict_format_reward_func": 0.203125, + "rewards/xmlcount_reward_func": 1.0074062645435333, "step": 336 }, { - "completion_length": 218.15625, + "completion_length": 165.25, "epoch": 3.8850574712643677, - "grad_norm": 17.09858512878418, - "kl": 112.53218483924866, + "grad_norm": 16.66224479675293, + "kl": 90.46698069572449, "learning_rate": 1.2687453499225547e-06, - "loss": 0.1125, - "reward": 22.161534070968628, - "reward_std": 6.54509231634438, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0905, + "reward": 21.790729999542236, + "reward_std": 6.630698459222913, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.3125, - "rewards/question_recreation_reward_func": 0.5422215182334185, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.36476135230623186, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.8224375061690807, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 1.113468736410141, "step": 338 }, { - "completion_length": 173.53125, + "completion_length": 132.75, "epoch": 3.9080459770114944, - "grad_norm": 11.476337432861328, - "kl": 53.14116382598877, + "grad_norm": 17.16057777404785, + "kl": 134.69714224338531, "learning_rate": 1.2406622028211846e-06, - "loss": 0.0531, - "reward": 20.298256993293762, - "reward_std": 6.7289321180433035, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.625, + "loss": 0.1347, + "reward": 20.075629115104675, + "reward_std": 7.9574392437934875, + "rewards/concensus_correctness_reward_func": 15.729125022888184, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.4000070048496127, + "rewards/final_correctness_reward_func": 1.25, + "rewards/question_recreation_reward_func": 0.16547290747985244, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.8045000061392784, + "rewards/strict_format_reward_func": 0.28125, + "rewards/xmlcount_reward_func": 1.0872812718153, "step": 340 }, { - "completion_length": 157.21875, + "completion_length": 130.3125, "epoch": 3.9310344827586206, - "grad_norm": 39.46800231933594, - "kl": 152.46045088768005, + "grad_norm": 31.908777236938477, + "kl": 69.3057963848114, "learning_rate": 1.2127904106698665e-06, - "loss": 0.1525, - "reward": 20.698113322257996, - "reward_std": 5.3872697204351425, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.0693, + "reward": 23.20823621749878, + "reward_std": 3.9500268027186394, + "rewards/concensus_correctness_reward_func": 18.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.875, - "rewards/question_recreation_reward_func": 0.27645699493587017, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.22267405223101377, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.8435312733054161, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 1.04806250333786, "step": 342 }, { - "completion_length": 205.6875, + "completion_length": 121.5, "epoch": 3.954022988505747, - "grad_norm": 11.775999069213867, - "kl": 50.60084939002991, + "grad_norm": 46.012351989746094, + "kl": 47.74831974506378, "learning_rate": 1.1851346511978427e-06, - "loss": 0.0506, - "reward": 17.64255118370056, - "reward_std": 3.6402768082916737, - "rewards/concensus_correctness_reward_func": 13.75, - "rewards/consensus_reward_func": 1.5625, + "loss": 0.0477, + "reward": 19.695841670036316, + "reward_std": 5.129831820726395, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.39211308769881725, + "rewards/final_correctness_reward_func": 0.9375, + "rewards/question_recreation_reward_func": 0.20331068965606391, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.7816874906420708, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 0.9612812548875809, "step": 344 }, { - "completion_length": 194.90625, + "completion_length": 144.03125, "epoch": 3.9770114942528734, - "grad_norm": 8.629315376281738, - "kl": 28.059679329395294, + "grad_norm": 21.179075241088867, + "kl": 595.102085351944, "learning_rate": 1.1576995658775405e-06, - "loss": 0.0281, - "reward": 21.033021599054337, - "reward_std": 2.2377058416604996, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.5951, + "reward": 21.625678062438965, + "reward_std": 3.8382995799183846, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.37070887722074986, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.19124048668891191, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.109375, - "rewards/xmlcount_reward_func": 0.677937492262572, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 0.965687520802021, "step": 346 }, { - "completion_length": 143.75, + "completion_length": 136.28125, "epoch": 4.0, - "grad_norm": 22.469331741333008, - "kl": 52.183703541755676, + "grad_norm": 33.860721588134766, + "kl": 68.66555762290955, "learning_rate": 1.130489759145593e-06, - "loss": 0.0522, - "reward": 21.233357906341553, - "reward_std": 6.187660021707416, - "rewards/concensus_correctness_reward_func": 17.5, + "loss": 0.0687, + "reward": 22.03096055984497, + "reward_std": 5.8004016280174255, + "rewards/concensus_correctness_reward_func": 17.601875066757202, "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.75, - "rewards/question_recreation_reward_func": 0.32048257533460855, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.28989812172949314, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.109375, - "rewards/xmlcount_reward_func": 0.8034999966621399, + "rewards/strict_format_reward_func": 0.171875, + "rewards/xmlcount_reward_func": 1.0298125073313713, "step": 348 }, { - "completion_length": 162.40625, + "completion_length": 140.3125, "epoch": 4.022988505747127, - "grad_norm": 9.240470886230469, - "kl": 51.57277071475983, + "grad_norm": 26.64327049255371, + "kl": 103.91235780715942, "learning_rate": 1.103509797630077e-06, - "loss": 0.0516, - "reward": 23.145742893218994, - "reward_std": 4.092843152582645, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.1039, + "reward": 22.265674114227295, + "reward_std": 6.0553875751793385, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.4375, - "rewards/question_recreation_reward_func": 0.603023823350668, + "rewards/final_correctness_reward_func": 1.25, + "rewards/question_recreation_reward_func": 0.43179943040013313, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.9333437606692314, + "rewards/strict_format_reward_func": 0.28125, + "rewards/xmlcount_reward_func": 1.0526249930262566, "step": 350 }, { - "completion_length": 160.53125, + "completion_length": 129.53125, "epoch": 4.045977011494253, - "grad_norm": 28.5865421295166, - "kl": 117.43298935890198, + "grad_norm": 22.627286911010742, + "kl": 91.40363669395447, "learning_rate": 1.0767642093840933e-06, - "loss": 0.1174, - "reward": 18.149566650390625, - "reward_std": 8.407156670466065, - "rewards/concensus_correctness_reward_func": 14.375, - "rewards/consensus_reward_func": 1.4375, + "loss": 0.0914, + "reward": 21.621731519699097, + "reward_std": 5.305510371923447, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.3501292185392231, + "rewards/final_correctness_reward_func": 0.875, + "rewards/question_recreation_reward_func": 0.2187627856619656, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.8306874968111515, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 1.0279687568545341, "step": 352 }, { - "completion_length": 163.84375, + "completion_length": 121.0625, "epoch": 4.068965517241379, - "grad_norm": 9.856067657470703, - "kl": 72.87647998332977, + "grad_norm": 16.18296241760254, + "kl": 121.42755484580994, "learning_rate": 1.0502574831258259e-06, - "loss": 0.0729, - "reward": 19.996199250221252, - "reward_std": 7.888490553945303, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.625, + "loss": 0.1214, + "reward": 19.38966202735901, + "reward_std": 8.952121656388044, + "rewards/concensus_correctness_reward_func": 15.731687545776367, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.8125, - "rewards/question_recreation_reward_func": 0.4048872464336455, + "rewards/question_recreation_reward_func": 0.12350557802710682, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.125, - "rewards/xmlcount_reward_func": 0.7788124978542328, + "rewards/strict_format_reward_func": 0.203125, + "rewards/xmlcount_reward_func": 0.9563437476754189, "step": 354 }, { - "completion_length": 175.59375, + "completion_length": 156.0, "epoch": 4.091954022988506, - "grad_norm": 11.96252155303955, - "kl": 28.961570084095, + "grad_norm": 15.39124584197998, + "kl": 141.8922426700592, "learning_rate": 1.0239940674851943e-06, - "loss": 0.029, - "reward": 21.967456459999084, - "reward_std": 1.3135593235492706, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.875, + "loss": 0.1419, + "reward": 18.50457113981247, + "reward_std": 6.410012923181057, + "rewards/concensus_correctness_reward_func": 14.375, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.42789400555193424, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.24694595206528902, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.9301875159144402, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 0.9138749986886978, "step": 356 }, { - "completion_length": 156.59375, + "completion_length": 119.09375, "epoch": 4.114942528735632, - "grad_norm": 14.181438446044922, - "kl": 118.37537503242493, + "grad_norm": 18.891246795654297, + "kl": 84.58924698829651, "learning_rate": 9.979783702572413e-07, - "loss": 0.1184, - "reward": 18.47257351875305, - "reward_std": 8.245882511138916, - "rewards/concensus_correctness_reward_func": 15.0, - "rewards/consensus_reward_func": 1.5, + "loss": 0.0846, + "reward": 20.688037633895874, + "reward_std": 6.380873154848814, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.875, - "rewards/question_recreation_reward_func": 0.26726103108376265, + "rewards/final_correctness_reward_func": 0.75, + "rewards/question_recreation_reward_func": 0.18250620504841208, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.09375, - "rewards/xmlcount_reward_func": 0.7365625100210309, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 0.9742812514305115, "step": 358 }, { - "completion_length": 177.8125, + "completion_length": 128.53125, "epoch": 4.137931034482759, - "grad_norm": 5.571948528289795, - "kl": 78.34945034980774, + "grad_norm": 38.27794647216797, + "kl": 85.2854516506195, "learning_rate": 9.722147576623745e-07, - "loss": 0.0783, - "reward": 20.085896134376526, - "reward_std": 8.850405864417553, - "rewards/concensus_correctness_reward_func": 15.625, - "rewards/consensus_reward_func": 1.5625, + "loss": 0.0853, + "reward": 21.262572765350342, + "reward_std": 7.635666236281395, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.4375, - "rewards/question_recreation_reward_func": 0.459115045145154, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.3215103065595031, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.8142812512814999, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 0.9723125174641609, "step": 360 }, { - "completion_length": 195.3125, + "completion_length": 113.53125, "epoch": 4.160919540229885, - "grad_norm": 12.62401008605957, - "kl": 28.07889825105667, + "grad_norm": 20.43463706970215, + "kl": 118.72606635093689, "learning_rate": 9.467075536135787e-07, - "loss": 0.0281, - "reward": 23.037957191467285, - "reward_std": 3.8332573771476746, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.1187, + "reward": 19.4775869846344, + "reward_std": 9.424187138676643, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.5625, - "rewards/question_recreation_reward_func": 0.548206765204668, + "rewards/final_correctness_reward_func": 0.9375, + "rewards/question_recreation_reward_func": 0.21149345487356186, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.817875012755394, + "rewards/xmlcount_reward_func": 0.9692187756299973, "step": 362 }, { - "completion_length": 184.25, + "completion_length": 131.3125, "epoch": 4.183908045977011, - "grad_norm": 10.506850242614746, - "kl": 167.22120356559753, + "grad_norm": 21.494186401367188, + "kl": 104.14683055877686, "learning_rate": 9.214610389907327e-07, - "loss": 0.1672, - "reward": 22.55263876914978, - "reward_std": 4.951677683740854, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.1041, + "reward": 21.906970500946045, + "reward_std": 5.987719387747347, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.5021073431707919, + "rewards/final_correctness_reward_func": 1.125, + "rewards/question_recreation_reward_func": 0.2897198654245585, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.125, - "rewards/xmlcount_reward_func": 0.8005312383174896, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 1.0078750178217888, "step": 364 }, { - "completion_length": 166.9375, + "completion_length": 102.78125, "epoch": 4.206896551724138, - "grad_norm": 18.67622947692871, - "kl": 43.30710709095001, + "grad_norm": 22.733078002929688, + "kl": 145.2549729347229, "learning_rate": 8.964794509221508e-07, - "loss": 0.0433, - "reward": 22.270903825759888, - "reward_std": 3.7740740440785885, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.1453, + "reward": 18.452977180480957, + "reward_std": 7.889417111873627, + "rewards/concensus_correctness_reward_func": 15.0, + "rewards/consensus_reward_func": 1.5, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.4675916964188218, + "rewards/final_correctness_reward_func": 0.625, + "rewards/question_recreation_reward_func": 0.22016490530222654, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.09375, - "rewards/xmlcount_reward_func": 0.7720625102519989, + "rewards/strict_format_reward_func": 0.109375, + "rewards/xmlcount_reward_func": 0.9984375014901161, "step": 366 }, { - "completion_length": 177.90625, + "completion_length": 95.84375, "epoch": 4.2298850574712645, - "grad_norm": 19.65778350830078, - "kl": 37.249677419662476, + "grad_norm": 43.83097839355469, + "kl": 125.42819404602051, "learning_rate": 8.71766982073462e-07, - "loss": 0.0372, - "reward": 22.386229991912842, - "reward_std": 5.353573054075241, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.1254, + "reward": 20.140291690826416, + "reward_std": 5.750214818865061, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.43757360614836216, + "rewards/final_correctness_reward_func": 0.875, + "rewards/question_recreation_reward_func": 0.20991674123797566, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.7455312609672546, + "rewards/strict_format_reward_func": 0.171875, + "rewards/xmlcount_reward_func": 1.0085000097751617, "step": 368 }, { - "completion_length": 173.78125, + "completion_length": 125.84375, "epoch": 4.252873563218391, - "grad_norm": 25.49388313293457, - "kl": 54.230771601200104, + "grad_norm": 17.15430450439453, + "kl": 116.69944763183594, "learning_rate": 8.473277799439569e-07, - "loss": 0.0542, - "reward": 22.22030282020569, - "reward_std": 5.079367756843567, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.1167, + "reward": 15.78281980752945, + "reward_std": 8.055908218026161, + "rewards/concensus_correctness_reward_func": 11.875, + "rewards/consensus_reward_func": 1.4375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.9375, - "rewards/question_recreation_reward_func": 0.37289655953645706, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.1661949255503714, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.8005312457680702, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.038500003516674, "step": 370 }, { - "completion_length": 157.09375, + "completion_length": 111.40625, "epoch": 4.275862068965517, - "grad_norm": 9.549386978149414, - "kl": 68.57221043109894, + "grad_norm": 23.146883010864258, + "kl": 102.09614729881287, "learning_rate": 8.231659461705092e-07, - "loss": 0.0686, - "reward": 20.300042867660522, - "reward_std": 7.8168960288167, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.625, + "loss": 0.1021, + "reward": 18.875545740127563, + "reward_std": 9.19566060602665, + "rewards/concensus_correctness_reward_func": 15.0, + "rewards/consensus_reward_func": 1.5, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.36873061303049326, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.17595180263742805, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.125, - "rewards/xmlcount_reward_func": 0.8063124865293503, + "rewards/strict_format_reward_func": 0.171875, + "rewards/xmlcount_reward_func": 0.9652187526226044, "step": 372 }, { - "completion_length": 170.96875, + "completion_length": 112.21875, "epoch": 4.2988505747126435, - "grad_norm": 9.102907180786133, - "kl": 24.85323876142502, + "grad_norm": 18.5485897064209, + "kl": 90.76153373718262, "learning_rate": 7.992855358391968e-07, - "loss": 0.0249, - "reward": 22.86936044692993, - "reward_std": 3.569862723350525, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0908, + "reward": 20.26989507675171, + "reward_std": 7.945419058203697, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.9375, - "rewards/question_recreation_reward_func": 0.37939172238111496, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.16742587252520025, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.125, - "rewards/xmlcount_reward_func": 0.8024687469005585, + "rewards/strict_format_reward_func": 0.203125, + "rewards/xmlcount_reward_func": 0.9618437588214874, "step": 374 }, { - "completion_length": 163.9375, + "completion_length": 146.65625, "epoch": 4.32183908045977, - "grad_norm": 49.91806411743164, - "kl": 67.00268840789795, + "grad_norm": 38.370872497558594, + "kl": 99.53798991441727, "learning_rate": 7.756905568047393e-07, - "loss": 0.067, - "reward": 22.527931451797485, - "reward_std": 5.407323345541954, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0995, + "reward": 20.136839866638184, + "reward_std": 8.976733930408955, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.47015031054615974, + "rewards/final_correctness_reward_func": 1.3125, + "rewards/question_recreation_reward_func": 0.3954646416241303, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.8859062492847443, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 0.9757500067353249, "step": 376 }, { - "completion_length": 164.46875, + "completion_length": 114.59375, "epoch": 4.344827586206897, - "grad_norm": 12.641098976135254, - "kl": 87.50417518615723, + "grad_norm": 17.07652473449707, + "kl": 119.00145494937897, "learning_rate": 7.523849690178567e-07, - "loss": 0.0875, - "reward": 18.379699647426605, - "reward_std": 7.63149930536747, - "rewards/concensus_correctness_reward_func": 14.375, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.119, + "reward": 19.602388501167297, + "reward_std": 7.5823104083538055, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.9375, - "rewards/question_recreation_reward_func": 0.3399496730417013, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.15445114602334797, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.8678750023245811, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 1.0104374960064888, "step": 378 }, { - "completion_length": 171.1875, + "completion_length": 149.71875, "epoch": 4.3678160919540225, - "grad_norm": 27.454458236694336, - "kl": 39.09812366962433, + "grad_norm": 14.417007446289062, + "kl": 82.07163846492767, "learning_rate": 7.293726838606674e-07, - "loss": 0.0391, - "reward": 23.741392850875854, - "reward_std": 4.101856108754873, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0821, + "reward": 21.27260172367096, + "reward_std": 6.685646221041679, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.4375, - "rewards/question_recreation_reward_func": 0.43945574946701527, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.29119573533535004, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.28125, - "rewards/xmlcount_reward_func": 0.958187498152256, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.1220312416553497, "step": 380 }, { - "completion_length": 172.6875, + "completion_length": 124.1875, "epoch": 4.390804597701149, - "grad_norm": 1103.589111328125, - "kl": 568.7607877254486, + "grad_norm": 26.25701332092285, + "kl": 39.73806154727936, "learning_rate": 7.066575634902437e-07, - "loss": 0.5688, - "reward": 22.193560361862183, - "reward_std": 4.97435961291194, - "rewards/concensus_correctness_reward_func": 18.2193124294281, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0397, + "reward": 23.57156252861023, + "reward_std": 3.5544759307522327, + "rewards/concensus_correctness_reward_func": 18.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.9375, - "rewards/question_recreation_reward_func": 0.2128102807328105, + "rewards/final_correctness_reward_func": 1.375, + "rewards/question_recreation_reward_func": 0.16040656017139554, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.8395624905824661, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 1.0986562669277191, "step": 382 }, { - "completion_length": 176.9375, + "completion_length": 137.9375, "epoch": 4.413793103448276, - "grad_norm": 21.578908920288086, - "kl": 23.982467830181122, + "grad_norm": 1926.822998046875, + "kl": 615.4363899230957, "learning_rate": 6.842434201904255e-07, - "loss": 0.024, - "reward": 23.7997624874115, - "reward_std": 2.548955924808979, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.6154, + "reward": 21.181058883666992, + "reward_std": 7.612653136253357, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.3328249244950712, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.22899635648354888, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.8106875196099281, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.030187502503395, "step": 384 }, { - "completion_length": 165.0625, + "completion_length": 112.4375, "epoch": 4.436781609195402, - "grad_norm": 115.91436767578125, - "kl": 116.0389906167984, + "grad_norm": 38.67100524902344, + "kl": 161.00672054290771, "learning_rate": 6.621340157319998e-07, - "loss": 0.116, - "reward": 16.608315885066986, - "reward_std": 5.418231450021267, - "rewards/concensus_correctness_reward_func": 12.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.161, + "reward": 15.25207805633545, + "reward_std": 8.813117686659098, + "rewards/concensus_correctness_reward_func": 11.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.8125, - "rewards/question_recreation_reward_func": 0.4955969895236194, + "rewards/question_recreation_reward_func": 0.2442654382903129, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.893968753516674, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 1.0703125, "step": 386 }, { - "completion_length": 163.9375, + "completion_length": 125.6875, "epoch": 4.459770114942529, - "grad_norm": 12.824539184570312, - "kl": 71.45607995986938, + "grad_norm": 12.633645057678223, + "kl": 102.32421875, "learning_rate": 6.403330607413643e-07, - "loss": 0.0715, - "reward": 16.53843978047371, - "reward_std": 9.202060237526894, - "rewards/concensus_correctness_reward_func": 12.5, - "rewards/consensus_reward_func": 1.4375, + "loss": 0.1023, + "reward": 21.432005882263184, + "reward_std": 6.578659221529961, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.35509633366018534, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.3526309859007597, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.8864687420427799, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.0637500025331974, "step": 388 }, { - "completion_length": 167.15625, + "completion_length": 134.1875, "epoch": 4.482758620689655, - "grad_norm": 10.867217063903809, - "kl": 94.47299039363861, + "grad_norm": 12.725988388061523, + "kl": 67.27107751369476, "learning_rate": 6.188442140777742e-07, - "loss": 0.0945, - "reward": 21.2570698261261, - "reward_std": 5.77236545458436, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.0673, + "reward": 23.248368740081787, + "reward_std": 4.938363384455442, + "rewards/concensus_correctness_reward_func": 18.125, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.5190070755779743, + "rewards/final_correctness_reward_func": 1.5625, + "rewards/question_recreation_reward_func": 0.3604938354110345, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.8005625158548355, + "rewards/strict_format_reward_func": 0.328125, + "rewards/xmlcount_reward_func": 1.059749998152256, "step": 390 }, { - "completion_length": 200.875, + "completion_length": 125.6875, "epoch": 4.505747126436781, - "grad_norm": 135.62298583984375, - "kl": 158.16409891843796, + "grad_norm": 23.900285720825195, + "kl": 62.917752742767334, "learning_rate": 5.976710822192722e-07, - "loss": 0.1582, - "reward": 16.592130422592163, - "reward_std": 9.380578614771366, - "rewards/concensus_correctness_reward_func": 12.5, - "rewards/consensus_reward_func": 1.5, + "loss": 0.0629, + "reward": 19.901626229286194, + "reward_std": 4.901079066097736, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.5194427520036697, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.19631414907053113, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.8070624954998493, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 1.049062505364418, "step": 392 }, { - "completion_length": 193.625, + "completion_length": 128.28125, "epoch": 4.528735632183908, - "grad_norm": 7.12425422668457, - "kl": 69.65883839130402, + "grad_norm": 82.37921905517578, + "kl": 118.36538791656494, "learning_rate": 5.768172186574123e-07, - "loss": 0.0697, - "reward": 20.724651217460632, - "reward_std": 6.739639110863209, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.625, + "loss": 0.1184, + "reward": 21.2057546377182, + "reward_std": 5.502791330218315, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.47199482936412096, + "rewards/final_correctness_reward_func": 1.125, + "rewards/question_recreation_reward_func": 0.20303629772388376, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.8151562474668026, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 1.0652187690138817, "step": 394 }, { - "completion_length": 170.375, + "completion_length": 134.9375, "epoch": 4.551724137931035, - "grad_norm": 9.802570343017578, - "kl": 34.828455328941345, + "grad_norm": 165.75790405273438, + "kl": 237.50116205215454, "learning_rate": 5.562861233008774e-07, - "loss": 0.0348, - "reward": 20.60237318277359, - "reward_std": 4.024098441004753, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.875, + "loss": 0.2375, + "reward": 17.77926468849182, + "reward_std": 8.009113162755966, + "rewards/concensus_correctness_reward_func": 13.75, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.33712316676974297, + "rewards/question_recreation_reward_func": 0.14635806158185005, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.9215000197291374, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 1.023531250655651, "step": 396 }, { - "completion_length": 146.6875, + "completion_length": 107.8125, "epoch": 4.574712643678161, - "grad_norm": 11.826367378234863, - "kl": 60.1534264087677, + "grad_norm": 21.01422882080078, + "kl": 75.713538646698, "learning_rate": 5.360812418880884e-07, - "loss": 0.0602, - "reward": 18.763304233551025, - "reward_std": 5.386138536036015, - "rewards/concensus_correctness_reward_func": 15.0, + "loss": 0.0757, + "reward": 19.815197676420212, + "reward_std": 4.934360761195421, + "rewards/concensus_correctness_reward_func": 15.625, "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.8125, - "rewards/question_recreation_reward_func": 0.34374198131263256, - "rewards/soft_format_reward_func": 0.015625, - "rewards/strict_format_reward_func": 0.109375, - "rewards/xmlcount_reward_func": 0.7320625111460686, + "rewards/final_correctness_reward_func": 0.9375, + "rewards/question_recreation_reward_func": 0.1576030757278204, + "rewards/soft_format_reward_func": 0.0, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 1.0950937792658806, "step": 398 }, { - "completion_length": 189.34375, + "completion_length": 125.96875, "epoch": 4.597701149425287, - "grad_norm": 20.871566772460938, - "kl": 70.3400661945343, + "grad_norm": 29.98430824279785, + "kl": 62.14144778251648, "learning_rate": 5.162059654089083e-07, - "loss": 0.0703, - "reward": 16.688934713602066, - "reward_std": 5.230218760669231, - "rewards/concensus_correctness_reward_func": 12.5, - "rewards/consensus_reward_func": 1.625, + "loss": 0.0621, + "reward": 19.579623699188232, + "reward_std": 4.942569011822343, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.3955598399043083, + "rewards/final_correctness_reward_func": 0.6875, + "rewards/question_recreation_reward_func": 0.18943578330799937, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.8871250078082085, + "rewards/xmlcount_reward_func": 1.0464375019073486, "step": 400 }, { - "completion_length": 183.4375, + "completion_length": 138.625, "epoch": 4.620689655172414, - "grad_norm": 43.35209274291992, - "kl": 56.33893322944641, + "grad_norm": 21.13219451904297, + "kl": 135.5740466117859, "learning_rate": 4.966636295355254e-07, - "loss": 0.0563, - "reward": 21.119993686676025, - "reward_std": 6.2554392367601395, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.1356, + "reward": 17.9394511282444, + "reward_std": 6.878748886287212, + "rewards/concensus_correctness_reward_func": 13.75, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.44374341890215874, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.25160733237862587, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.7231249921023846, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.0472187474370003, "step": 402 }, { - "completion_length": 179.53125, + "completion_length": 126.59375, "epoch": 4.64367816091954, - "grad_norm": 91.99315643310547, - "kl": 114.41612958908081, + "grad_norm": 25.998849868774414, + "kl": 98.98210060596466, "learning_rate": 4.774575140626317e-07, - "loss": 0.1144, - "reward": 21.931533813476562, - "reward_std": 6.460190966725349, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.099, + "reward": 21.07881009578705, + "reward_std": 5.628018751740456, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.3939396534115076, + "rewards/final_correctness_reward_func": 0.9375, + "rewards/question_recreation_reward_func": 0.13943516847211868, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.8500937484204769, + "rewards/strict_format_reward_func": 0.328125, + "rewards/xmlcount_reward_func": 1.1112499982118607, "step": 404 }, { - "completion_length": 145.625, + "completion_length": 139.65625, "epoch": 4.666666666666667, - "grad_norm": 14.64321231842041, - "kl": 69.08966267108917, + "grad_norm": 21.359472274780273, + "kl": 20.361542522907257, "learning_rate": 4.5859084235697236e-07, - "loss": 0.0691, - "reward": 16.92772352695465, - "reward_std": 7.506361450999975, - "rewards/concensus_correctness_reward_func": 13.125, - "rewards/consensus_reward_func": 1.5625, + "loss": 0.0204, + "reward": 24.41035485267639, + "reward_std": 2.2219148576259613, + "rewards/concensus_correctness_reward_func": 19.375, + "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.75, - "rewards/question_recreation_reward_func": 0.27394221909344196, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.25607367418706417, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 1.0131562575697899, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 1.0917812511324883, "step": 406 }, { - "completion_length": 186.53125, + "completion_length": 121.71875, "epoch": 4.689655172413794, - "grad_norm": 9.075035095214844, - "kl": 34.11286425590515, + "grad_norm": 17.450536727905273, + "kl": 115.7200825214386, "learning_rate": 4.400667808163689e-07, - "loss": 0.0341, - "reward": 22.611249685287476, - "reward_std": 5.186097111087292, + "loss": 0.1157, + "reward": 22.75440287590027, + "reward_std": 3.669588077813387, "rewards/concensus_correctness_reward_func": 18.125, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.44590615667402744, + "rewards/final_correctness_reward_func": 1.25, + "rewards/question_recreation_reward_func": 0.26980905747041106, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.109375, - "rewards/xmlcount_reward_func": 0.7434687539935112, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 1.062718741595745, "step": 408 }, { - "completion_length": 160.09375, + "completion_length": 129.65625, "epoch": 4.712643678160919, - "grad_norm": 9.786542892456055, - "kl": 55.63039183616638, + "grad_norm": 18.0162410736084, + "kl": 110.19675368070602, "learning_rate": 4.2188843833829874e-07, - "loss": 0.0556, - "reward": 23.08666706085205, - "reward_std": 3.848712369799614, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.1102, + "reward": 20.91892409324646, + "reward_std": 6.636302459985018, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.34063556510955095, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.1743930634111166, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.109375, - "rewards/xmlcount_reward_func": 0.761656254529953, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.1351562589406967, "step": 410 }, { - "completion_length": 166.78125, + "completion_length": 120.09375, "epoch": 4.735632183908046, - "grad_norm": 17.81321907043457, - "kl": 57.45062065124512, + "grad_norm": 13.146157264709473, + "kl": 89.86308097839355, "learning_rate": 4.040588657981301e-07, - "loss": 0.0575, - "reward": 21.688234210014343, - "reward_std": 5.342890881001949, - "rewards/concensus_correctness_reward_func": 17.602312445640564, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0899, + "reward": 20.99919891357422, + "reward_std": 7.47471896559, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.8125, - "rewards/question_recreation_reward_func": 0.33742179349064827, + "rewards/final_correctness_reward_func": 0.9375, + "rewards/question_recreation_reward_func": 0.1742929391330108, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.9828750044107437, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.0592812448740005, "step": 412 }, { - "completion_length": 208.25, + "completion_length": 129.34375, "epoch": 4.758620689655173, - "grad_norm": 24.221139907836914, - "kl": 51.84546101093292, + "grad_norm": 25.927690505981445, + "kl": 60.52328372001648, "learning_rate": 3.8658105553709356e-07, - "loss": 0.0518, - "reward": 19.71150231361389, - "reward_std": 7.963156888261437, - "rewards/concensus_correctness_reward_func": 15.625, - "rewards/consensus_reward_func": 1.5625, + "loss": 0.0605, + "reward": 22.087587118148804, + "reward_std": 6.2227701507508755, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.4718458019196987, + "rewards/final_correctness_reward_func": 1.375, + "rewards/question_recreation_reward_func": 0.17239922611042857, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.6459062481299043, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 1.040187507867813, "step": 414 }, { - "completion_length": 183.21875, + "completion_length": 126.65625, "epoch": 4.781609195402299, - "grad_norm": 905.8897094726562, - "kl": 595.3316441774368, + "grad_norm": 11.378458976745605, + "kl": 61.37336719036102, "learning_rate": 3.6945794086007706e-07, - "loss": 0.5953, - "reward": 19.660025596618652, - "reward_std": 9.491994187235832, - "rewards/concensus_correctness_reward_func": 15.625, - "rewards/consensus_reward_func": 1.5625, + "loss": 0.0614, + "reward": 21.451255559921265, + "reward_std": 5.728153295814991, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.4235252868384123, + "rewards/final_correctness_reward_func": 1.3125, + "rewards/question_recreation_reward_func": 0.2243809485808015, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.8458750173449516, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.0862500071525574, "step": 416 }, { - "completion_length": 201.0625, + "completion_length": 143.65625, "epoch": 4.804597701149425, - "grad_norm": 126.9385757446289, - "kl": 127.00714457035065, + "grad_norm": 18.405658721923828, + "kl": 121.18818688392639, "learning_rate": 3.5269239554332565e-07, - "loss": 0.127, - "reward": 18.65916258096695, - "reward_std": 6.685267778113484, - "rewards/concensus_correctness_reward_func": 14.375, - "rewards/consensus_reward_func": 1.5625, + "loss": 0.1212, + "reward": 16.58649867773056, + "reward_std": 9.566900184378028, + "rewards/concensus_correctness_reward_func": 12.5, + "rewards/consensus_reward_func": 1.4375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.38425685139372945, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.17484262073412538, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.8842812404036522, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 1.099156230688095, "step": 418 }, { - "completion_length": 174.1875, + "completion_length": 116.21875, "epoch": 4.827586206896552, - "grad_norm": 15.973464012145996, - "kl": 46.142689526081085, + "grad_norm": 26.87528419494629, + "kl": 125.18714022636414, "learning_rate": 3.362872333521389e-07, - "loss": 0.0461, - "reward": 21.212018370628357, - "reward_std": 2.308755062520504, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.1252, + "reward": 14.849340498447418, + "reward_std": 6.007201753556728, + "rewards/concensus_correctness_reward_func": 11.25, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.9375, - "rewards/question_recreation_reward_func": 0.3597995792515576, + "rewards/final_correctness_reward_func": 0.625, + "rewards/question_recreation_reward_func": 0.21268434170633554, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.8678437657654285, + "rewards/strict_format_reward_func": 0.203125, + "rewards/xmlcount_reward_func": 0.9960312619805336, "step": 420 }, { - "completion_length": 195.8125, + "completion_length": 172.71875, "epoch": 4.850574712643678, - "grad_norm": 67.40209197998047, - "kl": 62.72301685810089, + "grad_norm": 16.9029598236084, + "kl": 46.86613857746124, "learning_rate": 3.2024520756863244e-07, - "loss": 0.0627, - "reward": 24.481628894805908, - "reward_std": 2.4964847043156624, - "rewards/concensus_correctness_reward_func": 19.375, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.0469, + "reward": 23.888757944107056, + "reward_std": 3.7069457806646824, + "rewards/concensus_correctness_reward_func": 18.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.5625, - "rewards/question_recreation_reward_func": 0.6032539438456297, + "rewards/final_correctness_reward_func": 1.375, + "rewards/question_recreation_reward_func": 0.4653831487521529, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.8002499938011169, + "rewards/strict_format_reward_func": 0.34375, + "rewards/xmlcount_reward_func": 1.0796250104904175, "step": 422 }, { - "completion_length": 201.96875, + "completion_length": 137.9375, "epoch": 4.873563218390805, - "grad_norm": 10.356889724731445, - "kl": 46.652981638908386, + "grad_norm": 12.020703315734863, + "kl": 70.63363230228424, "learning_rate": 3.0456901052965726e-07, - "loss": 0.0467, - "reward": 20.538446724414825, - "reward_std": 3.546284096315503, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0706, + "reward": 16.673233807086945, + "reward_std": 5.42130708694458, + "rewards/concensus_correctness_reward_func": 12.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.4592282325029373, + "rewards/final_correctness_reward_func": 0.875, + "rewards/question_recreation_reward_func": 0.24420273024588823, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.109375, - "rewards/xmlcount_reward_func": 0.6573437629267573, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.038406252861023, "step": 424 }, { - "completion_length": 161.21875, + "completion_length": 150.25, "epoch": 4.896551724137931, - "grad_norm": 35.18503952026367, - "kl": 41.05984675884247, + "grad_norm": 12.723780632019043, + "kl": 115.53416061401367, "learning_rate": 2.892612731749414e-07, - "loss": 0.0411, - "reward": 21.839521050453186, - "reward_std": 5.293122284114361, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.1155, + "reward": 18.69682240486145, + "reward_std": 9.031071446835995, + "rewards/concensus_correctness_reward_func": 14.375, + "rewards/consensus_reward_func": 1.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.38152066472684965, + "rewards/question_recreation_reward_func": 0.29175951750949025, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.8017500042915344, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.0456874892115593, "step": 426 }, { - "completion_length": 148.125, + "completion_length": 131.5, "epoch": 4.919540229885057, - "grad_norm": 29.832962036132812, - "kl": 92.53181219100952, + "grad_norm": 19.461505889892578, + "kl": 93.16180157661438, "learning_rate": 2.743245646055398e-07, - "loss": 0.0925, - "reward": 21.141194343566895, - "reward_std": 6.592543721199036, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.0932, + "reward": 19.40146565437317, + "reward_std": 7.110916767269373, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.35803786665201187, + "rewards/final_correctness_reward_func": 0.875, + "rewards/question_recreation_reward_func": 0.22246535867452621, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.8925312459468842, + "rewards/strict_format_reward_func": 0.203125, + "rewards/xmlcount_reward_func": 0.9133749976754189, "step": 428 }, { - "completion_length": 193.5, + "completion_length": 110.09375, "epoch": 4.942528735632184, - "grad_norm": 11.490717887878418, - "kl": 38.97284358739853, + "grad_norm": 18.025880813598633, + "kl": 80.58686113357544, "learning_rate": 2.5976139165266367e-07, - "loss": 0.039, - "reward": 19.15629443526268, - "reward_std": 5.6061121597886086, - "rewards/concensus_correctness_reward_func": 15.0, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0806, + "reward": 18.25068172812462, + "reward_std": 6.362433955073357, + "rewards/concensus_correctness_reward_func": 14.375, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.4336379412561655, + "rewards/final_correctness_reward_func": 0.8125, + "rewards/question_recreation_reward_func": 0.15502561768516898, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.76953125, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 1.0019062459468842, "step": 430 }, { - "completion_length": 145.71875, + "completion_length": 118.78125, "epoch": 4.9655172413793105, - "grad_norm": 52.01123046875, - "kl": 103.57953584194183, - "learning_rate": 2.455741984569543e-07, - "loss": 0.1036, - "reward": 20.611141204833984, - "reward_std": 7.954390831291676, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.625, + "grad_norm": 22.976699829101562, + "kl": 61.087424635887146, + "learning_rate": 2.455741984569543e-07, + "loss": 0.0611, + "reward": 23.41362738609314, + "reward_std": 4.022454604506493, + "rewards/concensus_correctness_reward_func": 18.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.4390781279653311, + "rewards/final_correctness_reward_func": 1.125, + "rewards/question_recreation_reward_func": 0.24547128193080425, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.890812486410141, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 1.1056562215089798, "step": 432 }, { - "completion_length": 173.46875, + "completion_length": 130.125, "epoch": 4.988505747126437, - "grad_norm": 132.83074951171875, - "kl": 79.27593314647675, + "grad_norm": 50.32063293457031, + "kl": 105.44022130966187, "learning_rate": 2.3176536605828443e-07, - "loss": 0.0793, - "reward": 21.848167657852173, - "reward_std": 6.353322416543961, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.1054, + "reward": 21.320362329483032, + "reward_std": 6.560661925934255, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.3558868574909866, + "rewards/final_correctness_reward_func": 1.25, + "rewards/question_recreation_reward_func": 0.18233091849833727, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.8829062432050705, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.0599062368273735, "step": 434 }, { - "completion_length": 197.40625, + "completion_length": 130.65625, "epoch": 5.011494252873563, - "grad_norm": 9.71432113647461, - "kl": 106.79390096664429, + "grad_norm": 12.47871208190918, + "kl": 91.17067384719849, "learning_rate": 2.1833721199614992e-07, - "loss": 0.1068, - "reward": 20.26347541809082, - "reward_std": 6.886662557721138, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.625, + "loss": 0.0912, + "reward": 21.257283926010132, + "reward_std": 7.132457211613655, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0, - "rewards/question_recreation_reward_func": 0.4362252885475755, + "rewards/final_correctness_reward_func": 1.125, + "rewards/question_recreation_reward_func": 0.22012797370553017, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.7959999963641167, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.0527812391519547, "step": 436 }, { - "completion_length": 146.9375, + "completion_length": 132.65625, "epoch": 5.0344827586206895, - "grad_norm": 329.2025146484375, - "kl": 257.8830449581146, + "grad_norm": 26.860790252685547, + "kl": 117.30861330032349, "learning_rate": 2.0529198992071202e-07, - "loss": 0.2579, - "reward": 20.592611074447632, - "reward_std": 6.199930422008038, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.1173, + "reward": 19.48007071018219, + "reward_std": 9.120491355657578, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.9375, - "rewards/question_recreation_reward_func": 0.24620466213673353, + "rewards/final_correctness_reward_func": 0.875, + "rewards/question_recreation_reward_func": 0.1381647251546383, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.078125, - "rewards/xmlcount_reward_func": 0.7682812437415123, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 1.029406264424324, "step": 438 }, { - "completion_length": 150.6875, + "completion_length": 130.90625, "epoch": 5.057471264367816, - "grad_norm": 10.997002601623535, - "kl": 45.32725369930267, + "grad_norm": 15.348209381103516, + "kl": 65.81521391868591, "learning_rate": 1.926318892145712e-07, - "loss": 0.0453, - "reward": 22.53796148300171, - "reward_std": 4.791982728987932, - "rewards/concensus_correctness_reward_func": 18.22825002670288, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0658, + "reward": 23.504045248031616, + "reward_std": 3.6847030222415924, + "rewards/concensus_correctness_reward_func": 18.75, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.38596174120903015, + "rewards/final_correctness_reward_func": 1.3125, + "rewards/question_recreation_reward_func": 0.287420212989673, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.09375, - "rewards/xmlcount_reward_func": 0.8299999907612801, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 1.0291249975562096, "step": 440 }, { - "completion_length": 160.3125, + "completion_length": 150.96875, "epoch": 5.080459770114943, - "grad_norm": 18.565425872802734, - "kl": 56.26052141189575, + "grad_norm": 14.655074119567871, + "kl": 49.68517744541168, "learning_rate": 1.803590346253195e-07, - "loss": 0.0563, - "reward": 22.09588599205017, - "reward_std": 4.934977009892464, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0497, + "reward": 23.41924262046814, + "reward_std": 3.675075862556696, + "rewards/concensus_correctness_reward_func": 18.81418752670288, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.9375, - "rewards/question_recreation_reward_func": 0.40779225900769234, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.25833615753799677, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.109375, - "rewards/xmlcount_reward_func": 0.7037187442183495, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 1.0342187359929085, "step": 442 }, { - "completion_length": 194.40625, + "completion_length": 141.0625, "epoch": 5.103448275862069, - "grad_norm": 17.656816482543945, - "kl": 32.488772720098495, + "grad_norm": 11.407206535339355, + "kl": 85.15399491786957, "learning_rate": 1.6847548590894435e-07, - "loss": 0.0325, - "reward": 23.357986211776733, - "reward_std": 3.876391106052324, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0852, + "reward": 19.179333806037903, + "reward_std": 9.59556694328785, + "rewards/concensus_correctness_reward_func": 15.0, + "rewards/consensus_reward_func": 1.5, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.25, - "rewards/question_recreation_reward_func": 0.46751748491078615, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.21233397038304247, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.8123437613248825, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 1.0295000076293945, "step": 444 }, { - "completion_length": 165.21875, + "completion_length": 98.09375, "epoch": 5.126436781609195, - "grad_norm": 15.807307243347168, - "kl": 40.4843356013298, + "grad_norm": 18.514759063720703, + "kl": 122.31811785697937, "learning_rate": 1.5698323748414123e-07, - "loss": 0.0405, - "reward": 22.616024494171143, - "reward_std": 5.236422821879387, + "loss": 0.1223, + "reward": 22.264724016189575, + "reward_std": 4.781804423779249, "rewards/concensus_correctness_reward_func": 18.125, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.41318054450675845, + "rewards/final_correctness_reward_func": 0.875, + "rewards/question_recreation_reward_func": 0.21634901268407702, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.9059687554836273, + "rewards/strict_format_reward_func": 0.203125, + "rewards/xmlcount_reward_func": 1.0327499881386757, "step": 446 }, { - "completion_length": 181.0625, + "completion_length": 139.9375, "epoch": 5.149425287356322, - "grad_norm": 9.996232986450195, - "kl": 111.35121786594391, + "grad_norm": 10.200178146362305, + "kl": 72.45452046394348, "learning_rate": 1.458842180975864e-07, - "loss": 0.1114, - "reward": 21.350199937820435, - "reward_std": 5.136128455400467, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0725, + "reward": 20.68981796503067, + "reward_std": 6.211782366037369, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.75, - "rewards/question_recreation_reward_func": 0.4653875392396003, + "rewards/final_correctness_reward_func": 1.25, + "rewards/question_recreation_reward_func": 0.18588060373440385, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.109375, - "rewards/xmlcount_reward_func": 0.775437481701374, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.082062490284443, "step": 448 }, { - "completion_length": 147.8125, + "completion_length": 120.5625, "epoch": 5.172413793103448, - "grad_norm": 11.51822280883789, - "kl": 69.07674360275269, + "grad_norm": 385.22235107421875, + "kl": 469.7169188261032, "learning_rate": 1.3518029050023862e-07, - "loss": 0.0691, - "reward": 22.672448873519897, - "reward_std": 4.780227959156036, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.4697, + "reward": 19.709235787391663, + "reward_std": 7.903780490159988, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.3125, - "rewards/question_recreation_reward_func": 0.3030429712962359, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.22186111006885767, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.9319062456488609, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 1.0655000060796738, "step": 450 }, { - "completion_length": 179.78125, + "completion_length": 134.0, "epoch": 5.195402298850575, - "grad_norm": 10.419228553771973, - "kl": 27.214569330215454, + "grad_norm": 16.46508026123047, + "kl": 28.220962524414062, "learning_rate": 1.2487325113471034e-07, - "loss": 0.0272, - "reward": 21.86761999130249, - "reward_std": 5.047114223241806, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0282, + "reward": 24.16447949409485, + "reward_std": 2.5588029325008392, + "rewards/concensus_correctness_reward_func": 19.375, + "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.75, - "rewards/question_recreation_reward_func": 0.3694635406136513, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.18973011506022885, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.125, - "rewards/xmlcount_reward_func": 0.6856562532484531, + "rewards/strict_format_reward_func": 0.328125, + "rewards/xmlcount_reward_func": 1.1466249972581863, "step": 452 }, { - "completion_length": 143.5625, + "completion_length": 136.65625, "epoch": 5.218390804597701, - "grad_norm": 10.993687629699707, - "kl": 77.15027022361755, + "grad_norm": 23.267534255981445, + "kl": 74.44675999879837, "learning_rate": 1.1496482983377189e-07, - "loss": 0.0772, - "reward": 20.245051860809326, - "reward_std": 7.807565614581108, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.625, + "loss": 0.0744, + "reward": 21.695457220077515, + "reward_std": 5.087169401347637, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.2522393921390176, + "rewards/final_correctness_reward_func": 0.9375, + "rewards/question_recreation_reward_func": 0.2265821574255824, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.852187491953373, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 1.0469999983906746, "step": 454 }, { - "completion_length": 145.09375, + "completion_length": 129.46875, "epoch": 5.241379310344827, - "grad_norm": 17.713504791259766, - "kl": 3376.6523065567017, + "grad_norm": 16.105613708496094, + "kl": 48.78885495662689, "learning_rate": 1.054566895300324e-07, - "loss": 3.3767, - "reward": 19.23095941543579, - "reward_std": 5.082733313553035, + "loss": 0.0488, + "reward": 20.086702406406403, + "reward_std": 4.998106710612774, "rewards/concensus_correctness_reward_func": 15.625, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.625, - "rewards/question_recreation_reward_func": 0.2782093891873956, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.26207710430026054, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.09375, - "rewards/xmlcount_reward_func": 0.7964999973773956, + "rewards/strict_format_reward_func": 0.234375, + "rewards/xmlcount_reward_func": 0.9652499854564667, "step": 456 }, { - "completion_length": 144.96875, + "completion_length": 124.65625, "epoch": 5.264367816091954, - "grad_norm": 26.128984451293945, - "kl": 134.21160078048706, + "grad_norm": 57.834388732910156, + "kl": 112.06564545631409, "learning_rate": 9.635042597685024e-08, - "loss": 0.1342, - "reward": 18.887654423713684, - "reward_std": 10.685045599937439, - "rewards/concensus_correctness_reward_func": 15.0, - "rewards/consensus_reward_func": 1.5, + "loss": 0.1121, + "reward": 20.626364588737488, + "reward_std": 7.760148838162422, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.35568562778644264, + "rewards/question_recreation_reward_func": 0.20420803502202034, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.125, - "rewards/xmlcount_reward_func": 0.7819687500596046, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 1.1096562594175339, "step": 458 }, { - "completion_length": 179.40625, + "completion_length": 133.59375, "epoch": 5.287356321839081, - "grad_norm": 18.492712020874023, - "kl": 51.0592702627182, + "grad_norm": 14.596514701843262, + "kl": 61.3770055770874, "learning_rate": 8.764756748051661e-08, - "loss": 0.0511, - "reward": 18.93462473154068, - "reward_std": 6.361149214208126, - "rewards/concensus_correctness_reward_func": 15.0, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0614, + "reward": 20.813291430473328, + "reward_std": 3.9197427481412888, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.9375, - "rewards/question_recreation_reward_func": 0.3611871786415577, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.20688464026898146, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.125, - "rewards/xmlcount_reward_func": 0.7609375044703484, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.0595312491059303, "step": 460 }, { - "completion_length": 191.65625, + "completion_length": 153.71875, "epoch": 5.310344827586207, - "grad_norm": 16.52113151550293, - "kl": 48.12223827838898, + "grad_norm": 14.557358741760254, + "kl": 42.23032999038696, "learning_rate": 7.934957464376059e-08, - "loss": 0.0481, - "reward": 19.188302785158157, - "reward_std": 6.477444354444742, - "rewards/concensus_correctness_reward_func": 15.0, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0422, + "reward": 20.021111965179443, + "reward_std": 4.680503658950329, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.3933965237811208, + "rewards/final_correctness_reward_func": 0.9375, + "rewards/question_recreation_reward_func": 0.31595551781356335, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.7480312436819077, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.0332812517881393, "step": 462 }, { - "completion_length": 190.125, + "completion_length": 138.75, "epoch": 5.333333333333333, - "grad_norm": 18.778305053710938, - "kl": 42.69025939702988, + "grad_norm": 19.221540451049805, + "kl": 102.05794584751129, "learning_rate": 7.145784012061424e-08, - "loss": 0.0427, - "reward": 22.864402532577515, - "reward_std": 5.030285768210888, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.1021, + "reward": 20.00137209892273, + "reward_std": 5.093230836093426, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.42924601025879383, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.24368458753451705, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.25, - "rewards/xmlcount_reward_func": 0.8726562336087227, + "rewards/strict_format_reward_func": 0.28125, + "rewards/xmlcount_reward_func": 1.1014375165104866, "step": 464 }, { - "completion_length": 195.78125, + "completion_length": 155.0625, "epoch": 5.35632183908046, - "grad_norm": 8.402714729309082, - "kl": 43.919244050979614, + "grad_norm": 17.81580924987793, + "kl": 108.44038569927216, "learning_rate": 6.397368838268497e-08, - "loss": 0.0439, - "reward": 19.633685633540154, - "reward_std": 5.113476112484932, - "rewards/concensus_correctness_reward_func": 15.625, + "loss": 0.1084, + "reward": 18.840138375759125, + "reward_std": 6.484973901882768, + "rewards/concensus_correctness_reward_func": 14.375, "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.9375, - "rewards/question_recreation_reward_func": 0.5150612574070692, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.3892635339871049, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.125, - "rewards/xmlcount_reward_func": 0.7436250150203705, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.028999999165535, "step": 466 }, { - "completion_length": 172.34375, + "completion_length": 131.875, "epoch": 5.379310344827586, - "grad_norm": 58.60877990722656, - "kl": 57.48680925369263, + "grad_norm": 17.676359176635742, + "kl": 72.73703694343567, "learning_rate": 5.6898375496867444e-08, - "loss": 0.0575, - "reward": 21.326478004455566, - "reward_std": 6.531008146703243, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.0727, + "reward": 19.984642565250397, + "reward_std": 4.865904374048114, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.4396964996121824, + "rewards/final_correctness_reward_func": 0.9375, + "rewards/question_recreation_reward_func": 0.25639283121563494, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.8086562640964985, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.0563750192523003, "step": 468 }, { - "completion_length": 155.21875, + "completion_length": 145.46875, "epoch": 5.402298850574713, - "grad_norm": 16.785945892333984, - "kl": 61.20238280296326, + "grad_norm": 39.83194351196289, + "kl": 88.0123770236969, "learning_rate": 5.023308891453915e-08, - "loss": 0.0612, - "reward": 20.168509244918823, - "reward_std": 7.872298255562782, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.625, + "loss": 0.088, + "reward": 20.496811151504517, + "reward_std": 10.01418024301529, + "rewards/concensus_correctness_reward_func": 15.625, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.36497852858155966, + "rewards/final_correctness_reward_func": 1.625, + "rewards/question_recreation_reward_func": 0.28781138255726546, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.7097812443971634, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 1.0840000212192535, "step": 470 }, { - "completion_length": 168.78125, + "completion_length": 138.1875, "epoch": 5.425287356321839, - "grad_norm": 11.309849739074707, - "kl": 31.31029975414276, + "grad_norm": 15.042767524719238, + "kl": 215.25583505630493, "learning_rate": 4.397894727226931e-08, - "loss": 0.0313, - "reward": 23.706650376319885, - "reward_std": 2.7320817410945892, - "rewards/concensus_correctness_reward_func": 18.75, - "rewards/consensus_reward_func": 1.875, + "loss": 0.2153, + "reward": 22.74295973777771, + "reward_std": 5.855514466762543, + "rewards/concensus_correctness_reward_func": 17.5, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.574181092903018, + "rewards/final_correctness_reward_func": 1.5, + "rewards/question_recreation_reward_func": 0.36470989137887955, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.21875, - "rewards/xmlcount_reward_func": 0.9137187451124191, + "rewards/strict_format_reward_func": 0.4375, + "rewards/xmlcount_reward_func": 1.190750002861023, "step": 472 }, { - "completion_length": 182.125, + "completion_length": 145.0, "epoch": 5.448275862068965, - "grad_norm": 64.38263702392578, - "kl": 81.1543778181076, + "grad_norm": 16.94098472595215, + "kl": 62.88593465089798, "learning_rate": 3.813700020407707e-08, - "loss": 0.0812, - "reward": 19.333679407835007, - "reward_std": 4.686830898746848, - "rewards/concensus_correctness_reward_func": 15.625, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0629, + "reward": 20.874918460845947, + "reward_std": 6.477827824652195, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.75, - "rewards/question_recreation_reward_func": 0.4604299336206168, + "rewards/final_correctness_reward_func": 0.9375, + "rewards/question_recreation_reward_func": 0.24057472869753838, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.046875, - "rewards/xmlcount_reward_func": 0.6388750001788139, + "rewards/strict_format_reward_func": 0.15625, + "rewards/xmlcount_reward_func": 0.9780937507748604, "step": 474 }, { - "completion_length": 156.71875, + "completion_length": 121.8125, "epoch": 5.471264367816092, - "grad_norm": 32.13385009765625, - "kl": 36.907963037490845, + "grad_norm": 10.821012496948242, + "kl": 78.28713607788086, "learning_rate": 3.270822816527325e-08, - "loss": 0.0369, - "reward": 20.6470085978508, - "reward_std": 3.4543293081223965, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0783, + "reward": 19.257447212934494, + "reward_std": 5.446491427719593, + "rewards/concensus_correctness_reward_func": 15.0, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.4850401012226939, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.33919742330908775, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.8182187415659428, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 1.0120000168681145, "step": 476 }, { - "completion_length": 172.875, + "completion_length": 143.21875, "epoch": 5.494252873563219, - "grad_norm": 20.709497451782227, - "kl": 103.22667372226715, + "grad_norm": 14.565610885620117, + "kl": 143.7932162284851, "learning_rate": 2.7693542267908934e-08, - "loss": 0.1032, - "reward": 20.31205987930298, - "reward_std": 7.81353186070919, - "rewards/concensus_correctness_reward_func": 16.350937485694885, - "rewards/consensus_reward_func": 1.625, + "loss": 0.1438, + "reward": 21.52308416366577, + "reward_std": 2.3479395247995853, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.3800596173387021, + "rewards/final_correctness_reward_func": 1.1875, + "rewards/question_recreation_reward_func": 0.13636508909985423, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.109375, - "rewards/xmlcount_reward_func": 0.7216875031590462, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 1.0742187649011612, "step": 478 }, { - "completion_length": 175.03125, + "completion_length": 142.90625, "epoch": 5.517241379310345, - "grad_norm": 10.926924705505371, - "kl": 44.22956895828247, + "grad_norm": 16.690366744995117, + "kl": 46.38571923971176, "learning_rate": 2.309378412786306e-08, - "loss": 0.0442, - "reward": 19.644951462745667, - "reward_std": 5.116902289912105, - "rewards/concensus_correctness_reward_func": 15.625, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0464, + "reward": 23.897355318069458, + "reward_std": 3.4060968551784754, + "rewards/concensus_correctness_reward_func": 18.838500022888184, + "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.875, - "rewards/question_recreation_reward_func": 0.3831079350784421, + "rewards/final_correctness_reward_func": 1.4375, + "rewards/question_recreation_reward_func": 0.3518860088661313, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.15625, - "rewards/xmlcount_reward_func": 0.793093740940094, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 1.081968754529953, "step": 480 }, { - "completion_length": 149.75, + "completion_length": 108.84375, "epoch": 5.540229885057471, - "grad_norm": 17.194047927856445, - "kl": 39.70325767993927, + "grad_norm": 38.889862060546875, + "kl": 99.59591209888458, "learning_rate": 1.890972572359456e-08, - "loss": 0.0397, - "reward": 20.700520366430283, - "reward_std": 3.7537781968712807, - "rewards/concensus_correctness_reward_func": 16.25, - "rewards/consensus_reward_func": 1.875, + "loss": 0.0996, + "reward": 18.321453154087067, + "reward_std": 6.6690956354141235, + "rewards/concensus_correctness_reward_func": 14.375, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.125, - "rewards/question_recreation_reward_func": 0.37242657132446766, + "rewards/final_correctness_reward_func": 0.8125, + "rewards/question_recreation_reward_func": 0.28167139552533627, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.9062187597155571, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 0.9460312351584435, "step": 482 }, { - "completion_length": 200.84375, + "completion_length": 137.625, "epoch": 5.563218390804598, - "grad_norm": 10.78227424621582, - "kl": 49.69452089071274, + "grad_norm": 20.607763290405273, + "kl": 59.86354583501816, "learning_rate": 1.5142069266580462e-08, - "loss": 0.0497, - "reward": 21.014417052268982, - "reward_std": 6.770496159791946, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.0599, + "reward": 19.49050533771515, + "reward_std": 3.994945455342531, + "rewards/concensus_correctness_reward_func": 15.0, + "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.4338231720030308, + "rewards/final_correctness_reward_func": 1.125, + "rewards/question_recreation_reward_func": 0.2776927927043289, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.7837187573313713, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.0721875131130219, "step": 484 }, { - "completion_length": 160.8125, + "completion_length": 127.21875, "epoch": 5.586206896551724, - "grad_norm": 11.099568367004395, - "kl": 37.877389550209045, + "grad_norm": 11.22186279296875, + "kl": 68.88167703151703, "learning_rate": 1.1791447083465136e-08, - "loss": 0.0379, - "reward": 22.65419840812683, - "reward_std": 3.8396519105881453, + "loss": 0.0689, + "reward": 22.44924807548523, + "reward_std": 4.978069052100182, "rewards/concensus_correctness_reward_func": 18.125, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.3718857765197754, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.17171666212379932, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.9229375049471855, + "rewards/strict_format_reward_func": 0.265625, + "rewards/xmlcount_reward_func": 1.011906273663044, "step": 486 }, { - "completion_length": 198.6875, + "completion_length": 139.53125, "epoch": 5.609195402298851, - "grad_norm": 72.9337387084961, - "kl": 99.32822668552399, + "grad_norm": 16.95719337463379, + "kl": 108.64093208312988, "learning_rate": 8.858421509933823e-09, - "loss": 0.0993, - "reward": 20.514360070228577, - "reward_std": 9.309086836874485, + "loss": 0.1086, + "reward": 19.835273265838623, + "reward_std": 7.707376234233379, "rewards/concensus_correctness_reward_func": 15.625, "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.6875, - "rewards/question_recreation_reward_func": 0.6069850958883762, + "rewards/final_correctness_reward_func": 1.125, + "rewards/question_recreation_reward_func": 0.2812110260128975, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.1875, - "rewards/xmlcount_reward_func": 0.8448749929666519, + "rewards/strict_format_reward_func": 0.21875, + "rewards/xmlcount_reward_func": 1.022812508046627, "step": 488 }, { - "completion_length": 204.8125, + "completion_length": 119.75, "epoch": 5.6321839080459775, - "grad_norm": 7.40086030960083, - "kl": 33.82424318790436, + "grad_norm": 20.902118682861328, + "kl": 125.06465935707092, "learning_rate": 6.343484796338395e-09, - "loss": 0.0338, - "reward": 21.98166251182556, - "reward_std": 5.526210054755211, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.1251, + "reward": 20.91891586780548, + "reward_std": 6.550846293568611, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.375, - "rewards/question_recreation_reward_func": 0.4931934615597129, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.18219725834205747, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.7228437624871731, + "rewards/strict_format_reward_func": 0.1875, + "rewards/xmlcount_reward_func": 0.9867187440395355, "step": 490 }, { - "completion_length": 168.21875, + "completion_length": 154.40625, "epoch": 5.655172413793103, - "grad_norm": 13.843487739562988, - "kl": 47.17466878890991, + "grad_norm": 27.19748878479004, + "kl": 86.48812162876129, "learning_rate": 4.247059025082323e-09, - "loss": 0.0472, - "reward": 22.345729112625122, - "reward_std": 5.064793806523085, - "rewards/concensus_correctness_reward_func": 18.125, - "rewards/consensus_reward_func": 1.8125, + "loss": 0.0865, + "reward": 21.32628470659256, + "reward_std": 4.982774421572685, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.30669773556292057, + "rewards/final_correctness_reward_func": 1.375, + "rewards/question_recreation_reward_func": 0.14512856677174568, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.203125, - "rewards/xmlcount_reward_func": 0.8359062448143959, + "rewards/strict_format_reward_func": 0.25, + "rewards/xmlcount_reward_func": 0.9936562329530716, "step": 492 }, { - "completion_length": 188.96875, + "completion_length": 148.21875, "epoch": 5.67816091954023, - "grad_norm": 19.06829071044922, - "kl": 81.50695556402206, + "grad_norm": 12.3695650100708, + "kl": 75.50230717658997, "learning_rate": 2.5694960397806834e-09, - "loss": 0.0815, - "reward": 21.258598804473877, - "reward_std": 6.474185291677713, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.6875, + "loss": 0.0755, + "reward": 20.540077567100525, + "reward_std": 6.837883442640305, + "rewards/concensus_correctness_reward_func": 16.25, + "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.0625, - "rewards/question_recreation_reward_func": 0.41812983760610223, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.3742345217615366, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.234375, - "rewards/xmlcount_reward_func": 0.9810937717556953, + "rewards/strict_format_reward_func": 0.28125, + "rewards/xmlcount_reward_func": 1.0095937475562096, "step": 494 }, { - "completion_length": 161.78125, + "completion_length": 119.4375, "epoch": 5.7011494252873565, - "grad_norm": 13.5101957321167, - "kl": 35.58966064453125, + "grad_norm": 17.152254104614258, + "kl": 307.7507516145706, "learning_rate": 1.3110773862126669e-09, - "loss": 0.0356, - "reward": 20.9284570813179, - "reward_std": 2.426313806325197, - "rewards/concensus_correctness_reward_func": 16.875, - "rewards/consensus_reward_func": 1.9375, + "loss": 0.3078, + "reward": 17.329355597496033, + "reward_std": 7.924280449748039, + "rewards/concensus_correctness_reward_func": 13.125, + "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 0.875, - "rewards/question_recreation_reward_func": 0.3215510742738843, + "rewards/final_correctness_reward_func": 1.0625, + "rewards/question_recreation_reward_func": 0.18438696302473545, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.7787812501192093, + "rewards/strict_format_reward_func": 0.3125, + "rewards/xmlcount_reward_func": 1.082468755543232, "step": 496 }, { - "completion_length": 172.125, + "completion_length": 127.46875, "epoch": 5.724137931034483, - "grad_norm": 12.734374046325684, - "kl": 50.94114375114441, + "grad_norm": 16.53656768798828, + "kl": 53.12660336494446, "learning_rate": 4.720142650685433e-10, - "loss": 0.0509, - "reward": 21.75509023666382, - "reward_std": 6.119863275438547, - "rewards/concensus_correctness_reward_func": 17.5, - "rewards/consensus_reward_func": 1.75, + "loss": 0.0531, + "reward": 20.859828174114227, + "reward_std": 3.923635706305504, + "rewards/concensus_correctness_reward_func": 16.875, + "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, - "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.32749607879668474, + "rewards/final_correctness_reward_func": 1.0, + "rewards/question_recreation_reward_func": 0.11185920069692656, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.171875, - "rewards/xmlcount_reward_func": 0.8182187341153622, + "rewards/strict_format_reward_func": 0.203125, + "rewards/xmlcount_reward_func": 0.9823437482118607, "step": 498 }, { - "completion_length": 165.0625, + "completion_length": 125.21875, "epoch": 5.747126436781609, - "grad_norm": 13.139098167419434, - "kl": 88.8033949136734, + "grad_norm": 17.960521697998047, + "kl": 131.03695118427277, "learning_rate": 5.2447496503016395e-11, - "loss": 0.0888, - "reward": 19.815386295318604, - "reward_std": 7.5219815745949745, - "rewards/concensus_correctness_reward_func": 15.625, - "rewards/consensus_reward_func": 1.5625, + "loss": 0.131, + "reward": 18.075900197029114, + "reward_std": 10.134012915194035, + "rewards/concensus_correctness_reward_func": 13.851125001907349, + "rewards/consensus_reward_func": 1.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.1875, - "rewards/question_recreation_reward_func": 0.45269847102463245, + "rewards/question_recreation_reward_func": 0.24402516055852175, "rewards/soft_format_reward_func": 0.0, - "rewards/strict_format_reward_func": 0.140625, - "rewards/xmlcount_reward_func": 0.8470624908804893, + "rewards/strict_format_reward_func": 0.296875, + "rewards/xmlcount_reward_func": 1.1213749945163727, "step": 500 }, { "epoch": 5.747126436781609, "step": 500, "total_flos": 0.0, - "train_loss": 0.46315413924492893, - "train_runtime": 5316.8417, - "train_samples_per_second": 1.505, - "train_steps_per_second": 0.094 + "train_loss": 0.209929760530591, + "train_runtime": 4628.2767, + "train_samples_per_second": 1.729, + "train_steps_per_second": 0.108 } ], "logging_steps": 2,