diff --git "a/checkpoint-400/trainer_state.json" "b/checkpoint-400/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-400/trainer_state.json" @@ -0,0 +1,6034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0408163265306123, + "eval_steps": 500, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 294.625, + "epoch": 0.00510204081632653, + "grad_norm": 0.8185210227966309, + "kl": 0.0, + "learning_rate": 8.474576271186442e-08, + "loss": -0.0, + "reward": 2.322195529937744, + "reward_std": 3.603352189064026, + "rewards/_soft_format_reward_func": 0.6937499940395355, + "rewards/_strict_format_reward_func": 1.5, + "rewards/_xml_count_reward_func": -0.6404999978840351, + "rewards/check_answer": 0.7689455151557922, + "step": 1 + }, + { + "completion_length": 290.625, + "epoch": 0.01020408163265306, + "grad_norm": 2.1359550952911377, + "kl": 0.0, + "learning_rate": 1.6949152542372883e-07, + "loss": -0.0, + "reward": 1.21162611246109, + "reward_std": 1.5170034170150757, + "rewards/_soft_format_reward_func": -0.30000001192092896, + "rewards/_strict_format_reward_func": 0.9375, + "rewards/_xml_count_reward_func": -0.29725000262260437, + "rewards/check_answer": 0.8713762287516147, + "step": 2 + }, + { + "completion_length": 609.4375, + "epoch": 0.015306122448979591, + "grad_norm": 0.19122786819934845, + "kl": 0.0015784860006533563, + "learning_rate": 2.5423728813559323e-07, + "loss": 0.0001, + "reward": 50.498586282134056, + "reward_std": 64.99645301699638, + "rewards/_soft_format_reward_func": 1.1125000044703484, + "rewards/_strict_format_reward_func": 1.6875, + "rewards/_xml_count_reward_func": -2.054562598466873, + "rewards/check_answer": 49.753145925700665, + "step": 3 + }, + { + "completion_length": 376.9375, + "epoch": 0.02040816326530612, + "grad_norm": 0.9175134897232056, + "kl": 0.002859612286556512, + "learning_rate": 3.3898305084745766e-07, + "loss": 0.0001, + "reward": 1.4672349244356155, + "reward_std": 3.2955686151981354, + "rewards/_soft_format_reward_func": 0.11249999701976776, + "rewards/_strict_format_reward_func": 0.9375, + "rewards/_xml_count_reward_func": -0.5276250019669533, + "rewards/check_answer": 0.9448598623275757, + "step": 4 + }, + { + "completion_length": 330.25, + "epoch": 0.025510204081632654, + "grad_norm": 1.0718908309936523, + "kl": 0.0023224337492138147, + "learning_rate": 4.2372881355932204e-07, + "loss": 0.0001, + "reward": -1.1594912707805634, + "reward_std": 1.1251797080039978, + "rewards/_soft_format_reward_func": -1.3875000029802322, + "rewards/_strict_format_reward_func": 0.375, + "rewards/_xml_count_reward_func": -0.14700000081211329, + "rewards/check_answer": 8.726535270398017e-06, + "step": 5 + }, + { + "completion_length": 262.3125, + "epoch": 0.030612244897959183, + "grad_norm": 0.42630404233932495, + "kl": 0.005721980705857277, + "learning_rate": 5.084745762711865e-07, + "loss": 0.0002, + "reward": 0.7644375264644623, + "reward_std": 1.8905271589756012, + "rewards/_soft_format_reward_func": 0.1875, + "rewards/_strict_format_reward_func": 0.9375, + "rewards/_xml_count_reward_func": -0.3605625070631504, + "rewards/check_answer": 0.0, + "step": 6 + }, + { + "completion_length": 346.4375, + "epoch": 0.03571428571428571, + "grad_norm": 0.9312261343002319, + "kl": 0.0050744940230735835, + "learning_rate": 5.93220338983051e-07, + "loss": 0.0002, + "reward": -1.572375014424324, + "reward_std": 0.5705349743366241, + "rewards/_soft_format_reward_func": -1.1875, + "rewards/_strict_format_reward_func": 0.0, + "rewards/_xml_count_reward_func": -0.38487499207258224, + "rewards/check_answer": 0.0, + "step": 7 + }, + { + "completion_length": 224.1875, + "epoch": 0.04081632653061224, + "grad_norm": 1.3169041872024536, + "kl": 0.0011199476284673437, + "learning_rate": 6.779661016949153e-07, + "loss": 0.0, + "reward": -0.09437501430511475, + "reward_std": 0.8001106679439545, + "rewards/_soft_format_reward_func": -0.5625, + "rewards/_strict_format_reward_func": 0.75, + "rewards/_xml_count_reward_func": -0.28187501011416316, + "rewards/check_answer": 0.0, + "step": 8 + }, + { + "completion_length": 288.5, + "epoch": 0.04591836734693878, + "grad_norm": 0.32011842727661133, + "kl": 0.00206242610784102, + "learning_rate": 7.627118644067798e-07, + "loss": 0.0001, + "reward": 0.9716752767562866, + "reward_std": 1.5422732569277287, + "rewards/_soft_format_reward_func": -0.5, + "rewards/_strict_format_reward_func": 1.125, + "rewards/_xml_count_reward_func": -0.6016249805688858, + "rewards/check_answer": 0.9483002722263336, + "step": 9 + }, + { + "completion_length": 428.5625, + "epoch": 0.05102040816326531, + "grad_norm": 1.1417263746261597, + "kl": 0.0029192589954618597, + "learning_rate": 8.474576271186441e-07, + "loss": 0.0001, + "reward": -0.809456929564476, + "reward_std": 1.531682014465332, + "rewards/_soft_format_reward_func": -1.199999988079071, + "rewards/_strict_format_reward_func": 0.375, + "rewards/_xml_count_reward_func": -0.0820000022649765, + "rewards/check_answer": 0.09754307568073273, + "step": 10 + }, + { + "completion_length": 407.625, + "epoch": 0.05612244897959184, + "grad_norm": 0.32301968336105347, + "kl": 0.0022870840039104223, + "learning_rate": 9.322033898305086e-07, + "loss": 0.0001, + "reward": 1.1774811148643494, + "reward_std": 1.3392982184886932, + "rewards/_soft_format_reward_func": 0.05624997615814209, + "rewards/_strict_format_reward_func": 1.5, + "rewards/_xml_count_reward_func": -0.42518749833106995, + "rewards/check_answer": 0.04641865938901901, + "step": 11 + }, + { + "completion_length": 298.6875, + "epoch": 0.061224489795918366, + "grad_norm": 0.37453481554985046, + "kl": 0.003994872371947622, + "learning_rate": 1.016949152542373e-06, + "loss": 0.0002, + "reward": 0.19297951459884644, + "reward_std": 1.9170226454734802, + "rewards/_soft_format_reward_func": -0.75, + "rewards/_strict_format_reward_func": 0.9375, + "rewards/_xml_count_reward_func": -0.414187490940094, + "rewards/check_answer": 0.4196670353412628, + "step": 12 + }, + { + "completion_length": 590.1875, + "epoch": 0.0663265306122449, + "grad_norm": 0.661213219165802, + "kl": 0.00343362707644701, + "learning_rate": 1.1016949152542374e-06, + "loss": 0.0002, + "reward": 1.760912761092186, + "reward_std": 3.0866269270627527, + "rewards/_soft_format_reward_func": -0.5687500089406967, + "rewards/_strict_format_reward_func": 1.3125, + "rewards/_xml_count_reward_func": -0.22106249630451202, + "rewards/check_answer": 1.2382252807728946, + "step": 13 + }, + { + "completion_length": 342.5, + "epoch": 0.07142857142857142, + "grad_norm": 0.40530863404273987, + "kl": 0.004755240981467068, + "learning_rate": 1.186440677966102e-06, + "loss": 0.0002, + "reward": 1.6868359446525574, + "reward_std": 2.877818286418915, + "rewards/_soft_format_reward_func": -0.8250000029802322, + "rewards/_strict_format_reward_func": 0.9375, + "rewards/_xml_count_reward_func": -0.011312499642372131, + "rewards/check_answer": 1.5856482982635498, + "step": 14 + }, + { + "completion_length": 338.9375, + "epoch": 0.07653061224489796, + "grad_norm": 4.245687961578369, + "kl": 0.06816600821912289, + "learning_rate": 1.2711864406779662e-06, + "loss": 0.0027, + "reward": 3.4960225969552994, + "reward_std": 4.042192316614091, + "rewards/_soft_format_reward_func": 0.25, + "rewards/_strict_format_reward_func": 1.125, + "rewards/_xml_count_reward_func": -0.5118750035762787, + "rewards/check_answer": 2.6328976154327393, + "step": 15 + }, + { + "completion_length": 461.25, + "epoch": 0.08163265306122448, + "grad_norm": 0.6524187922477722, + "kl": 0.003892036440447555, + "learning_rate": 1.3559322033898307e-06, + "loss": 0.0002, + "reward": -0.08632761240005493, + "reward_std": 1.6764180362224579, + "rewards/_soft_format_reward_func": -0.2874999940395355, + "rewards/_strict_format_reward_func": 0.9375, + "rewards/_xml_count_reward_func": -1.1476250290870667, + "rewards/check_answer": 0.41129739210009575, + "step": 16 + }, + { + "completion_length": 300.3125, + "epoch": 0.08673469387755102, + "grad_norm": 0.2593589723110199, + "kl": 0.0008242716470050482, + "learning_rate": 1.4406779661016951e-06, + "loss": 0.0, + "reward": -1.24406249076128, + "reward_std": 0.6096278727054596, + "rewards/_soft_format_reward_func": -1.1312500014901161, + "rewards/_strict_format_reward_func": 0.0, + "rewards/_xml_count_reward_func": -0.11281250417232513, + "rewards/check_answer": 0.0, + "step": 17 + }, + { + "completion_length": 412.625, + "epoch": 0.09183673469387756, + "grad_norm": 0.929692804813385, + "kl": 0.005735803686548024, + "learning_rate": 1.5254237288135596e-06, + "loss": 0.0002, + "reward": 0.31456413865089417, + "reward_std": 2.1592386066913605, + "rewards/_soft_format_reward_func": 0.25, + "rewards/_strict_format_reward_func": 0.75, + "rewards/_xml_count_reward_func": -0.9343749992549419, + "rewards/check_answer": 0.24893911182880402, + "step": 18 + }, + { + "completion_length": 470.0625, + "epoch": 0.09693877551020408, + "grad_norm": 0.3417404890060425, + "kl": 0.00579645624384284, + "learning_rate": 1.6101694915254237e-06, + "loss": 0.0002, + "reward": 0.22844918817281723, + "reward_std": 1.6300125122070312, + "rewards/_soft_format_reward_func": -0.1875, + "rewards/_strict_format_reward_func": 0.9375, + "rewards/_xml_count_reward_func": -1.1502499729394913, + "rewards/check_answer": 0.62869917973876, + "step": 19 + }, + { + "completion_length": 380.625, + "epoch": 0.10204081632653061, + "grad_norm": 267.041259765625, + "kl": 0.9507867273296142, + "learning_rate": 1.6949152542372882e-06, + "loss": 0.038, + "reward": -0.4774259477853775, + "reward_std": 1.764455109834671, + "rewards/_soft_format_reward_func": -0.6875, + "rewards/_strict_format_reward_func": 0.1875, + "rewards/_xml_count_reward_func": -0.46918751299381256, + "rewards/check_answer": 0.49176159501075745, + "step": 20 + }, + { + "completion_length": 414.75, + "epoch": 0.10714285714285714, + "grad_norm": 0.3368138372898102, + "kl": 0.003109428856987506, + "learning_rate": 1.7796610169491526e-06, + "loss": 0.0001, + "reward": -0.7655205726623535, + "reward_std": 1.0618179142475128, + "rewards/_soft_format_reward_func": -0.6875, + "rewards/_strict_format_reward_func": 0.5625, + "rewards/_xml_count_reward_func": -0.7773125171661377, + "rewards/check_answer": 0.1367919147014618, + "step": 21 + }, + { + "completion_length": 671.0, + "epoch": 0.11224489795918367, + "grad_norm": 0.9535510540008545, + "kl": 0.003949811041820794, + "learning_rate": 1.8644067796610171e-06, + "loss": 0.0002, + "reward": 0.8916858434677124, + "reward_std": 0.37709038180764765, + "rewards/_soft_format_reward_func": -0.4312499910593033, + "rewards/_strict_format_reward_func": 1.5, + "rewards/_xml_count_reward_func": -0.35756251215934753, + "rewards/check_answer": 0.18049834482371807, + "step": 22 + }, + { + "completion_length": 683.625, + "epoch": 0.11734693877551021, + "grad_norm": 1.632488489151001, + "kl": 0.03495925866445759, + "learning_rate": 1.9491525423728816e-06, + "loss": 0.0014, + "reward": 0.82306969165802, + "reward_std": 1.9999099373817444, + "rewards/_soft_format_reward_func": -0.5499999970197678, + "rewards/_strict_format_reward_func": 1.3125, + "rewards/_xml_count_reward_func": -0.26243748515844345, + "rewards/check_answer": 0.3230072185397148, + "step": 23 + }, + { + "completion_length": 381.0625, + "epoch": 0.12244897959183673, + "grad_norm": 0.2804034352302551, + "kl": 0.012196791227324866, + "learning_rate": 2.033898305084746e-06, + "loss": 0.0005, + "reward": -0.9751249700784683, + "reward_std": 1.2838140726089478, + "rewards/_soft_format_reward_func": -1.125, + "rewards/_strict_format_reward_func": 0.375, + "rewards/_xml_count_reward_func": -0.2251249998807907, + "rewards/check_answer": 0.0, + "step": 24 + }, + { + "completion_length": 473.0625, + "epoch": 0.12755102040816327, + "grad_norm": 0.4001588523387909, + "kl": 0.002974389062728733, + "learning_rate": 2.11864406779661e-06, + "loss": 0.0001, + "reward": 2.849132001399994, + "reward_std": 1.8594820201396942, + "rewards/_soft_format_reward_func": 0.6500000059604645, + "rewards/_strict_format_reward_func": 1.875, + "rewards/_xml_count_reward_func": -0.8167499899864197, + "rewards/check_answer": 1.1408820822834969, + "step": 25 + }, + { + "completion_length": 522.0, + "epoch": 0.1326530612244898, + "grad_norm": 0.2913358211517334, + "kl": 0.004198311798973009, + "learning_rate": 2.203389830508475e-06, + "loss": 0.0002, + "reward": 0.5667000897228718, + "reward_std": 1.8786026984453201, + "rewards/_soft_format_reward_func": 0.08750000596046448, + "rewards/_strict_format_reward_func": 0.75, + "rewards/_xml_count_reward_func": -0.484437495470047, + "rewards/check_answer": 0.21363750100135803, + "step": 26 + }, + { + "completion_length": 324.3125, + "epoch": 0.1377551020408163, + "grad_norm": 0.377543568611145, + "kl": 0.005379673675633967, + "learning_rate": 2.288135593220339e-06, + "loss": 0.0002, + "reward": 1.7930006384849548, + "reward_std": 2.12799359485507, + "rewards/_soft_format_reward_func": 0.375, + "rewards/_strict_format_reward_func": 1.5, + "rewards/_xml_count_reward_func": -0.7957500219345093, + "rewards/check_answer": 0.7137506082653999, + "step": 27 + }, + { + "completion_length": 372.5, + "epoch": 0.14285714285714285, + "grad_norm": 0.37043654918670654, + "kl": 0.003904464378138073, + "learning_rate": 2.372881355932204e-06, + "loss": 0.0002, + "reward": 0.6625258177518845, + "reward_std": 2.1411255598068237, + "rewards/_soft_format_reward_func": 0.1875, + "rewards/_strict_format_reward_func": 0.9375, + "rewards/_xml_count_reward_func": -0.8833125308156013, + "rewards/check_answer": 0.4208383299410343, + "step": 28 + }, + { + "completion_length": 414.5625, + "epoch": 0.14795918367346939, + "grad_norm": 0.37689441442489624, + "kl": 0.0018855973307836393, + "learning_rate": 2.457627118644068e-06, + "loss": 0.0001, + "reward": -0.34431251883506775, + "reward_std": 0.6518816608004272, + "rewards/_soft_format_reward_func": -0.6875, + "rewards/_strict_format_reward_func": 0.5625, + "rewards/_xml_count_reward_func": -0.21931251138448715, + "rewards/check_answer": 6.078471059822732e-15, + "step": 29 + }, + { + "completion_length": 471.0625, + "epoch": 0.15306122448979592, + "grad_norm": 0.32698705792427063, + "kl": 0.0054136388207552955, + "learning_rate": 2.5423728813559323e-06, + "loss": 0.0002, + "reward": 1.2012260109186172, + "reward_std": 1.9618901312351227, + "rewards/_soft_format_reward_func": 0.3812499940395355, + "rewards/_strict_format_reward_func": 1.125, + "rewards/_xml_count_reward_func": -1.0806874781847, + "rewards/check_answer": 0.7756634612169364, + "step": 30 + }, + { + "completion_length": 543.1875, + "epoch": 0.15816326530612246, + "grad_norm": 0.6773600578308105, + "kl": 0.009602147212717682, + "learning_rate": 2.627118644067797e-06, + "loss": 0.0004, + "reward": 16.712120667099953, + "reward_std": 31.081469893455505, + "rewards/_soft_format_reward_func": 0.375, + "rewards/_strict_format_reward_func": 1.5, + "rewards/_xml_count_reward_func": -1.4680624902248383, + "rewards/check_answer": 16.305183589458466, + "step": 31 + }, + { + "completion_length": 405.5, + "epoch": 0.16326530612244897, + "grad_norm": 2.430391311645508, + "kl": 0.016679312742780894, + "learning_rate": 2.7118644067796613e-06, + "loss": 0.0007, + "reward": -0.05791878700256348, + "reward_std": 0.9276261329650879, + "rewards/_soft_format_reward_func": -0.824999988079071, + "rewards/_strict_format_reward_func": 0.9375, + "rewards/_xml_count_reward_func": -0.3491249978542328, + "rewards/check_answer": 0.17870615608990192, + "step": 32 + }, + { + "completion_length": 367.6875, + "epoch": 0.1683673469387755, + "grad_norm": 0.8208937644958496, + "kl": 0.008234784821979702, + "learning_rate": 2.7966101694915256e-06, + "loss": 0.0003, + "reward": 6.702535092830658, + "reward_std": 4.863340765237808, + "rewards/_soft_format_reward_func": 1.5562500059604645, + "rewards/_strict_format_reward_func": 2.0625, + "rewards/_xml_count_reward_func": -1.0955625176429749, + "rewards/check_answer": 4.179348034758277, + "step": 33 + }, + { + "completion_length": 670.8125, + "epoch": 0.17346938775510204, + "grad_norm": 0.17907316982746124, + "kl": 0.0034322862866247306, + "learning_rate": 2.8813559322033903e-06, + "loss": 0.0001, + "reward": 2.961810827255249, + "reward_std": 7.308291792869568, + "rewards/_soft_format_reward_func": -0.8562500029802322, + "rewards/_strict_format_reward_func": 1.125, + "rewards/_xml_count_reward_func": -0.43849998712539673, + "rewards/check_answer": 3.131560802459717, + "step": 34 + }, + { + "completion_length": 387.125, + "epoch": 0.17857142857142858, + "grad_norm": 0.18726573884487152, + "kl": 0.002935138749307953, + "learning_rate": 2.9661016949152545e-06, + "loss": 0.0001, + "reward": 1.8329545259475708, + "reward_std": 3.412140369415283, + "rewards/_soft_format_reward_func": -0.3375000059604645, + "rewards/_strict_format_reward_func": 0.9375, + "rewards/_xml_count_reward_func": -0.8870000243186951, + "rewards/check_answer": 2.1199543476104736, + "step": 35 + }, + { + "completion_length": 739.5, + "epoch": 0.1836734693877551, + "grad_norm": 0.32407084107398987, + "kl": 0.0060507280577439815, + "learning_rate": 3.0508474576271192e-06, + "loss": 0.0002, + "reward": 0.4319094121456146, + "reward_std": 2.4602610170841217, + "rewards/_soft_format_reward_func": -0.13124999403953552, + "rewards/_strict_format_reward_func": 0.9375, + "rewards/_xml_count_reward_func": -0.9989374801516533, + "rewards/check_answer": 0.6245969533920288, + "step": 36 + }, + { + "completion_length": 401.8125, + "epoch": 0.18877551020408162, + "grad_norm": 0.4821971654891968, + "kl": 0.010924356349278241, + "learning_rate": 3.135593220338983e-06, + "loss": 0.0004, + "reward": 1.8411645293235779, + "reward_std": 3.823741763830185, + "rewards/_soft_format_reward_func": 0.25, + "rewards/_strict_format_reward_func": 1.125, + "rewards/_xml_count_reward_func": -0.885937524959445, + "rewards/check_answer": 1.3521020412445068, + "step": 37 + }, + { + "completion_length": 368.375, + "epoch": 0.19387755102040816, + "grad_norm": 0.6958189606666565, + "kl": 0.01290791796054691, + "learning_rate": 3.2203389830508473e-06, + "loss": 0.0005, + "reward": 1.7745112180709839, + "reward_std": 2.5693418979644775, + "rewards/_soft_format_reward_func": 1.1875, + "rewards/_strict_format_reward_func": 1.125, + "rewards/_xml_count_reward_func": -1.0533749610185623, + "rewards/check_answer": 0.5153862088918686, + "step": 38 + }, + { + "completion_length": 826.125, + "epoch": 0.1989795918367347, + "grad_norm": 0.3377620279788971, + "kl": 0.006919818581081927, + "learning_rate": 3.305084745762712e-06, + "loss": 0.0003, + "reward": 35.50946241617203, + "reward_std": 28.118246495723724, + "rewards/_soft_format_reward_func": 1.4312500059604645, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -1.6590625643730164, + "rewards/check_answer": 32.92477425445395, + "step": 39 + }, + { + "completion_length": 334.375, + "epoch": 0.20408163265306123, + "grad_norm": 1.3999114036560059, + "kl": 0.011267256457358599, + "learning_rate": 3.3898305084745763e-06, + "loss": 0.0005, + "reward": 1.117626965045929, + "reward_std": 2.5586954951286316, + "rewards/_soft_format_reward_func": 0.4375, + "rewards/_strict_format_reward_func": 1.125, + "rewards/_xml_count_reward_func": -0.8491249866783619, + "rewards/check_answer": 0.40425196290016174, + "step": 40 + }, + { + "completion_length": 390.125, + "epoch": 0.20918367346938777, + "grad_norm": 1.1846420764923096, + "kl": 0.01798212551511824, + "learning_rate": 3.474576271186441e-06, + "loss": 0.0007, + "reward": 4.644592732191086, + "reward_std": 1.390872847288847, + "rewards/_soft_format_reward_func": 1.793749988079071, + "rewards/_strict_format_reward_func": 2.4375, + "rewards/_xml_count_reward_func": -1.6293749511241913, + "rewards/check_answer": 2.0427176877856255, + "step": 41 + }, + { + "completion_length": 476.1875, + "epoch": 0.21428571428571427, + "grad_norm": 0.6164702773094177, + "kl": 0.047966267447918653, + "learning_rate": 3.5593220338983053e-06, + "loss": 0.0019, + "reward": 2.8454742431640625, + "reward_std": 2.382638132199645, + "rewards/_soft_format_reward_func": 1.425000011920929, + "rewards/_strict_format_reward_func": 2.0625, + "rewards/_xml_count_reward_func": -1.6521874964237213, + "rewards/check_answer": 1.0101617649197578, + "step": 42 + }, + { + "completion_length": 325.375, + "epoch": 0.2193877551020408, + "grad_norm": 1.3243409395217896, + "kl": 0.01102221303153783, + "learning_rate": 3.6440677966101695e-06, + "loss": 0.0004, + "reward": 4.189052075147629, + "reward_std": 3.7774379551410675, + "rewards/_soft_format_reward_func": 1.262499988079071, + "rewards/_strict_format_reward_func": 1.6875, + "rewards/_xml_count_reward_func": -0.871749997138977, + "rewards/check_answer": 2.1108021295513026, + "step": 43 + }, + { + "completion_length": 483.4375, + "epoch": 0.22448979591836735, + "grad_norm": 2.6776297092437744, + "kl": 0.024570247973315418, + "learning_rate": 3.7288135593220342e-06, + "loss": 0.001, + "reward": 3.3171424567699432, + "reward_std": 1.9457294531166553, + "rewards/_soft_format_reward_func": 0.9500000029802322, + "rewards/_strict_format_reward_func": 2.4375, + "rewards/_xml_count_reward_func": -0.710875004529953, + "rewards/check_answer": 0.6405174862593412, + "step": 44 + }, + { + "completion_length": 401.875, + "epoch": 0.22959183673469388, + "grad_norm": 1.8040707111358643, + "kl": 0.021586093585938215, + "learning_rate": 3.8135593220338985e-06, + "loss": 0.0009, + "reward": 3.3045076727867126, + "reward_std": 1.9318298771977425, + "rewards/_soft_format_reward_func": 1.300000011920929, + "rewards/_strict_format_reward_func": 2.25, + "rewards/_xml_count_reward_func": -1.3513749986886978, + "rewards/check_answer": 1.1058825515210629, + "step": 45 + }, + { + "completion_length": 442.1875, + "epoch": 0.23469387755102042, + "grad_norm": 0.28188657760620117, + "kl": 0.01355510693974793, + "learning_rate": 3.898305084745763e-06, + "loss": 0.0005, + "reward": 5.497533082962036, + "reward_std": 5.00154435634613, + "rewards/_soft_format_reward_func": 1.2312500029802322, + "rewards/_strict_format_reward_func": 2.0625, + "rewards/_xml_count_reward_func": -1.8056249618530273, + "rewards/check_answer": 4.009408250451088, + "step": 46 + }, + { + "completion_length": 372.875, + "epoch": 0.23979591836734693, + "grad_norm": 0.5053686499595642, + "kl": 0.010700618498958647, + "learning_rate": 3.9830508474576275e-06, + "loss": 0.0004, + "reward": 22.94883681833744, + "reward_std": 10.753062516450882, + "rewards/_soft_format_reward_func": 1.375, + "rewards/_strict_format_reward_func": 2.25, + "rewards/_xml_count_reward_func": -1.2283124923706055, + "rewards/check_answer": 20.55214899405837, + "step": 47 + }, + { + "completion_length": 394.5, + "epoch": 0.24489795918367346, + "grad_norm": 7.864255428314209, + "kl": 0.010628446761984378, + "learning_rate": 4.067796610169492e-06, + "loss": 0.0004, + "reward": 4.394192218780518, + "reward_std": 1.785563262179494, + "rewards/_soft_format_reward_func": 1.5187499970197678, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -1.218437522649765, + "rewards/check_answer": 1.4688798922579736, + "step": 48 + }, + { + "completion_length": 725.375, + "epoch": 0.25, + "grad_norm": 0.9599105715751648, + "kl": 0.012451534566935152, + "learning_rate": 4.152542372881356e-06, + "loss": 0.0005, + "reward": 2.9797146916389465, + "reward_std": 1.3605367243289948, + "rewards/_soft_format_reward_func": 1.0562499985098839, + "rewards/_strict_format_reward_func": 2.25, + "rewards/_xml_count_reward_func": -0.7503125071525574, + "rewards/check_answer": 0.42377725534606725, + "step": 49 + }, + { + "completion_length": 718.1875, + "epoch": 0.25510204081632654, + "grad_norm": 0.4486681818962097, + "kl": 0.013971900450997055, + "learning_rate": 4.23728813559322e-06, + "loss": 0.0006, + "reward": 51.57847714424133, + "reward_std": 17.521905459463596, + "rewards/_soft_format_reward_func": 1.59375, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -1.9316873885691166, + "rewards/check_answer": 49.29141854145564, + "step": 50 + }, + { + "completion_length": 378.375, + "epoch": 0.2602040816326531, + "grad_norm": 1.6755878925323486, + "kl": 0.015725657111033797, + "learning_rate": 4.322033898305085e-06, + "loss": 0.0006, + "reward": 4.97844672203064, + "reward_std": 1.7446883618831635, + "rewards/_soft_format_reward_func": 1.875, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -1.4906874597072601, + "rewards/check_answer": 1.9691341519355774, + "step": 51 + }, + { + "completion_length": 314.125, + "epoch": 0.2653061224489796, + "grad_norm": 0.6237319707870483, + "kl": 0.02619828935712576, + "learning_rate": 4.40677966101695e-06, + "loss": 0.001, + "reward": 6.345100581645966, + "reward_std": 2.3465639874339104, + "rewards/_soft_format_reward_func": 1.875, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -1.2206874787807465, + "rewards/check_answer": 3.065787836909294, + "step": 52 + }, + { + "completion_length": 451.4375, + "epoch": 0.27040816326530615, + "grad_norm": 3.745959758758545, + "kl": 0.0600818342063576, + "learning_rate": 4.491525423728814e-06, + "loss": 0.0024, + "reward": 5.12294328212738, + "reward_std": 1.3810148686170578, + "rewards/_soft_format_reward_func": 1.6499999910593033, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -1.5819375328719616, + "rewards/check_answer": 2.4298809214815265, + "step": 53 + }, + { + "completion_length": 522.8125, + "epoch": 0.2755102040816326, + "grad_norm": 1.2545146942138672, + "kl": 0.014883615309372544, + "learning_rate": 4.576271186440678e-06, + "loss": 0.0006, + "reward": 3.6991968154907227, + "reward_std": 2.190860778093338, + "rewards/_soft_format_reward_func": 1.5125000029802322, + "rewards/_strict_format_reward_func": 2.4375, + "rewards/_xml_count_reward_func": -1.3406250104308128, + "rewards/check_answer": 1.0898219845694257, + "step": 54 + }, + { + "completion_length": 410.6875, + "epoch": 0.28061224489795916, + "grad_norm": 16.85138702392578, + "kl": 0.01683287846390158, + "learning_rate": 4.6610169491525425e-06, + "loss": 0.0007, + "reward": 3.6215168833732605, + "reward_std": 2.4318079613149166, + "rewards/_soft_format_reward_func": 1.6312499940395355, + "rewards/_strict_format_reward_func": 2.0625, + "rewards/_xml_count_reward_func": -1.533000037074089, + "rewards/check_answer": 1.460766777396202, + "step": 55 + }, + { + "completion_length": 371.125, + "epoch": 0.2857142857142857, + "grad_norm": 15.180438041687012, + "kl": 0.02064416464418173, + "learning_rate": 4.745762711864408e-06, + "loss": 0.0008, + "reward": 4.6238145381212234, + "reward_std": 2.060830157250166, + "rewards/_soft_format_reward_func": 1.5062499940395355, + "rewards/_strict_format_reward_func": 2.0625, + "rewards/_xml_count_reward_func": -1.0030625015497208, + "rewards/check_answer": 2.0581270148977637, + "step": 56 + }, + { + "completion_length": 428.125, + "epoch": 0.29081632653061223, + "grad_norm": 0.3310839533805847, + "kl": 0.018375703308265656, + "learning_rate": 4.830508474576272e-06, + "loss": 0.0007, + "reward": 2.3490172177553177, + "reward_std": 1.6269982382655144, + "rewards/_soft_format_reward_func": 1.3687500059604645, + "rewards/_strict_format_reward_func": 1.875, + "rewards/_xml_count_reward_func": -1.6375625133514404, + "rewards/check_answer": 0.742829842492938, + "step": 57 + }, + { + "completion_length": 404.25, + "epoch": 0.29591836734693877, + "grad_norm": 0.34096795320510864, + "kl": 0.010851162485778332, + "learning_rate": 4.915254237288136e-06, + "loss": 0.0004, + "reward": 3.4022790044546127, + "reward_std": 1.133506953716278, + "rewards/_soft_format_reward_func": 1.100000023841858, + "rewards/_strict_format_reward_func": 2.4375, + "rewards/_xml_count_reward_func": -1.3487499952316284, + "rewards/check_answer": 1.2135289967991412, + "step": 58 + }, + { + "completion_length": 316.5, + "epoch": 0.3010204081632653, + "grad_norm": 3.706521511077881, + "kl": 0.016447525937110186, + "learning_rate": 5e-06, + "loss": 0.0007, + "reward": 4.640098571777344, + "reward_std": 2.5934594720602036, + "rewards/_soft_format_reward_func": 1.675000011920929, + "rewards/_strict_format_reward_func": 2.25, + "rewards/_xml_count_reward_func": -1.1507499814033508, + "rewards/check_answer": 1.8658485412597656, + "step": 59 + }, + { + "completion_length": 467.625, + "epoch": 0.30612244897959184, + "grad_norm": 0.40216565132141113, + "kl": 0.016874468652531505, + "learning_rate": 4.999955914361218e-06, + "loss": 0.0007, + "reward": 3.062375247478485, + "reward_std": 2.366086855530739, + "rewards/_soft_format_reward_func": 1.2937500029802322, + "rewards/_strict_format_reward_func": 2.0625, + "rewards/_xml_count_reward_func": -1.407749943435192, + "rewards/check_answer": 1.1138752102851868, + "step": 60 + }, + { + "completion_length": 596.0, + "epoch": 0.3112244897959184, + "grad_norm": 0.38576120138168335, + "kl": 0.01672750909347087, + "learning_rate": 4.999823658999708e-06, + "loss": 0.0007, + "reward": 4.684498995542526, + "reward_std": 0.9209771901369095, + "rewards/_soft_format_reward_func": 1.7750000059604645, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -1.5980625189840794, + "rewards/check_answer": 1.5075614899396896, + "step": 61 + }, + { + "completion_length": 298.25, + "epoch": 0.3163265306122449, + "grad_norm": 21.191478729248047, + "kl": 0.06260064756497741, + "learning_rate": 4.999603238579919e-06, + "loss": 0.0025, + "reward": 5.105346739292145, + "reward_std": 0.8463521376252174, + "rewards/_soft_format_reward_func": 1.918749988079071, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -1.2289375066757202, + "rewards/check_answer": 1.6030341610312462, + "step": 62 + }, + { + "completion_length": 545.1875, + "epoch": 0.32142857142857145, + "grad_norm": 0.6183257102966309, + "kl": 0.010101811029016972, + "learning_rate": 4.999294660875751e-06, + "loss": 0.0004, + "reward": 4.376120448112488, + "reward_std": 1.4407944083213806, + "rewards/_soft_format_reward_func": 1.75, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -1.8306874781847, + "rewards/check_answer": 1.6443080008029938, + "step": 63 + }, + { + "completion_length": 491.875, + "epoch": 0.32653061224489793, + "grad_norm": 0.27649980783462524, + "kl": 0.0261327491607517, + "learning_rate": 4.998897936770281e-06, + "loss": 0.001, + "reward": 2.1283541917800903, + "reward_std": 1.4720253944396973, + "rewards/_soft_format_reward_func": 1.2000000178813934, + "rewards/_strict_format_reward_func": 1.875, + "rewards/_xml_count_reward_func": -1.7337499856948853, + "rewards/check_answer": 0.7871042089536786, + "step": 64 + }, + { + "completion_length": 419.625, + "epoch": 0.33163265306122447, + "grad_norm": 1.0943312644958496, + "kl": 0.029257855378091335, + "learning_rate": 4.998413080255376e-06, + "loss": 0.0012, + "reward": 3.823546826839447, + "reward_std": 1.6460879147052765, + "rewards/_soft_format_reward_func": 1.3500000089406967, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -1.0743749737739563, + "rewards/check_answer": 0.9229219295084476, + "step": 65 + }, + { + "completion_length": 857.75, + "epoch": 0.336734693877551, + "grad_norm": 0.32977959513664246, + "kl": 0.03638110449537635, + "learning_rate": 4.997840108431203e-06, + "loss": 0.0015, + "reward": 3.149389237165451, + "reward_std": 0.9396539479494095, + "rewards/_soft_format_reward_func": 1.3999999910593033, + "rewards/_strict_format_reward_func": 2.4375, + "rewards/_xml_count_reward_func": -1.5449376106262207, + "rewards/check_answer": 0.8568266952133854, + "step": 66 + }, + { + "completion_length": 538.4375, + "epoch": 0.34183673469387754, + "grad_norm": 1.0524252653121948, + "kl": 0.022415873361751437, + "learning_rate": 4.997179041505628e-06, + "loss": 0.0009, + "reward": 5.703039646148682, + "reward_std": 2.7801234126091003, + "rewards/_soft_format_reward_func": 1.875, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -2.2055625319480896, + "rewards/check_answer": 3.408602237701416, + "step": 67 + }, + { + "completion_length": 315.5625, + "epoch": 0.3469387755102041, + "grad_norm": 0.3651297986507416, + "kl": 0.028624295257031918, + "learning_rate": 4.996429902793494e-06, + "loss": 0.0011, + "reward": 6.553413987159729, + "reward_std": 0.9208076875656843, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -1.3125624656677246, + "rewards/check_answer": 3.1159763261675835, + "step": 68 + }, + { + "completion_length": 325.3125, + "epoch": 0.3520408163265306, + "grad_norm": 0.36629951000213623, + "kl": 0.00954329816158861, + "learning_rate": 4.995592718715809e-06, + "loss": 0.0004, + "reward": 4.5701053738594055, + "reward_std": 2.1177507576649077, + "rewards/_soft_format_reward_func": 1.6875, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -1.0841874927282333, + "rewards/check_answer": 1.3417928112321533, + "step": 69 + }, + { + "completion_length": 379.25, + "epoch": 0.35714285714285715, + "grad_norm": 1.3463882207870483, + "kl": 0.01597937708720565, + "learning_rate": 4.9946675187988104e-06, + "loss": 0.0006, + "reward": 11.42964482307434, + "reward_std": 8.272757766768336, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -1.378687471151352, + "rewards/check_answer": 8.05833314359188, + "step": 70 + }, + { + "completion_length": 569.875, + "epoch": 0.3622448979591837, + "grad_norm": 0.7441303730010986, + "kl": 0.014439634280279279, + "learning_rate": 4.99365433567292e-06, + "loss": 0.0006, + "reward": 3.382221519947052, + "reward_std": 1.6084937080740929, + "rewards/_soft_format_reward_func": 1.649999976158142, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -2.3630625009536743, + "rewards/check_answer": 1.4702840596437454, + "step": 71 + }, + { + "completion_length": 495.8125, + "epoch": 0.3673469387755102, + "grad_norm": 0.42513689398765564, + "kl": 0.011408616206608713, + "learning_rate": 4.992553205071599e-06, + "loss": 0.0005, + "reward": 11.998928546905518, + "reward_std": 6.857654731720686, + "rewards/_soft_format_reward_func": 1.793749988079071, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -2.1945624947547913, + "rewards/check_answer": 9.77474146336317, + "step": 72 + }, + { + "completion_length": 342.8125, + "epoch": 0.37244897959183676, + "grad_norm": 0.427044153213501, + "kl": 0.018415149534121156, + "learning_rate": 4.991364165830082e-06, + "loss": 0.0007, + "reward": 5.027638792991638, + "reward_std": 2.4753660559654236, + "rewards/_soft_format_reward_func": 1.6500000059604645, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -0.9973750114440918, + "rewards/check_answer": 1.562513753771782, + "step": 73 + }, + { + "completion_length": 382.0625, + "epoch": 0.37755102040816324, + "grad_norm": 0.4066709280014038, + "kl": 0.011005707492586225, + "learning_rate": 4.990087259884016e-06, + "loss": 0.0004, + "reward": 5.17357063293457, + "reward_std": 3.7941238209605217, + "rewards/_soft_format_reward_func": 1.793749988079071, + "rewards/_strict_format_reward_func": 2.4375, + "rewards/_xml_count_reward_func": -1.3651875406503677, + "rewards/check_answer": 2.3075080066919327, + "step": 74 + }, + { + "completion_length": 495.0, + "epoch": 0.3826530612244898, + "grad_norm": 2.2613179683685303, + "kl": 0.01245829276740551, + "learning_rate": 4.988722532267969e-06, + "loss": 0.0005, + "reward": 2.6375007927417755, + "reward_std": 3.019617021083832, + "rewards/_soft_format_reward_func": 1.581250011920929, + "rewards/_strict_format_reward_func": 2.25, + "rewards/_xml_count_reward_func": -2.2434374690055847, + "rewards/check_answer": 1.049688383936882, + "step": 75 + }, + { + "completion_length": 513.625, + "epoch": 0.3877551020408163, + "grad_norm": 4.0517377853393555, + "kl": 0.023214824497699738, + "learning_rate": 4.987270031113855e-06, + "loss": 0.0009, + "reward": 3.0669388473033905, + "reward_std": 3.475656270980835, + "rewards/_soft_format_reward_func": 1.5625, + "rewards/_strict_format_reward_func": 1.875, + "rewards/_xml_count_reward_func": -2.354249984025955, + "rewards/check_answer": 1.9836888760328293, + "step": 76 + }, + { + "completion_length": 391.5625, + "epoch": 0.39285714285714285, + "grad_norm": 0.8595183491706848, + "kl": 0.022897689836099744, + "learning_rate": 4.985729807649224e-06, + "loss": 0.0009, + "reward": 3.036043405532837, + "reward_std": 0.7024286016821861, + "rewards/_soft_format_reward_func": 1.875, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -1.823375016450882, + "rewards/check_answer": 0.35941845644265413, + "step": 77 + }, + { + "completion_length": 417.125, + "epoch": 0.3979591836734694, + "grad_norm": 0.30788978934288025, + "kl": 0.016252433881163597, + "learning_rate": 4.984101916195467e-06, + "loss": 0.0007, + "reward": 5.285515308380127, + "reward_std": 1.8254318535327911, + "rewards/_soft_format_reward_func": 1.6875, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -1.55799999833107, + "rewards/check_answer": 2.5310151875019073, + "step": 78 + }, + { + "completion_length": 446.8125, + "epoch": 0.4030612244897959, + "grad_norm": 0.3968336582183838, + "kl": 0.01630049163941294, + "learning_rate": 4.9823864141658905e-06, + "loss": 0.0006, + "reward": 537.7010273933411, + "reward_std": 489.2284908122383, + "rewards/_soft_format_reward_func": 1.4500000029802322, + "rewards/_strict_format_reward_func": 2.25, + "rewards/_xml_count_reward_func": -1.3780625015497208, + "rewards/check_answer": 535.379130016081, + "step": 79 + }, + { + "completion_length": 316.0, + "epoch": 0.40816326530612246, + "grad_norm": 1.0200837850570679, + "kl": 0.018812671769410372, + "learning_rate": 4.980583362063697e-06, + "loss": 0.0008, + "reward": 5.375112950801849, + "reward_std": 1.6739845629781485, + "rewards/_soft_format_reward_func": 1.762499988079071, + "rewards/_strict_format_reward_func": 2.4375, + "rewards/_xml_count_reward_func": -1.0319999903440475, + "rewards/check_answer": 2.207113090902567, + "step": 80 + }, + { + "completion_length": 320.4375, + "epoch": 0.413265306122449, + "grad_norm": 0.42193111777305603, + "kl": 0.022067378275096416, + "learning_rate": 4.978692823479849e-06, + "loss": 0.0009, + "reward": 6.265072703361511, + "reward_std": 1.9197494611144066, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -1.0499375462532043, + "rewards/check_answer": 2.5650103390216827, + "step": 81 + }, + { + "completion_length": 374.0, + "epoch": 0.41836734693877553, + "grad_norm": 0.6050392389297485, + "kl": 0.025496677961200476, + "learning_rate": 4.976714865090827e-06, + "loss": 0.001, + "reward": 2.5642066597938538, + "reward_std": 1.6043110489845276, + "rewards/_soft_format_reward_func": 1.3250000029802322, + "rewards/_strict_format_reward_func": 1.875, + "rewards/_xml_count_reward_func": -0.9316874668002129, + "rewards/check_answer": 0.29589414165820926, + "step": 82 + }, + { + "completion_length": 722.0, + "epoch": 0.42346938775510207, + "grad_norm": 3.9317643642425537, + "kl": 0.02777448482811451, + "learning_rate": 4.97464955665628e-06, + "loss": 0.0011, + "reward": 4.469326972961426, + "reward_std": 2.0532086789608, + "rewards/_soft_format_reward_func": 1.5750000029802322, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -1.1702499650418758, + "rewards/check_answer": 1.4395770141854882, + "step": 83 + }, + { + "completion_length": 393.75, + "epoch": 0.42857142857142855, + "grad_norm": 0.34535089135169983, + "kl": 0.030066173058003187, + "learning_rate": 4.972496971016559e-06, + "loss": 0.0012, + "reward": 3.5937094688415527, + "reward_std": 1.5248013995587826, + "rewards/_soft_format_reward_func": 1.21875, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -0.9721875041723251, + "rewards/check_answer": 0.7221467904746532, + "step": 84 + }, + { + "completion_length": 478.0, + "epoch": 0.4336734693877551, + "grad_norm": 1.0733822584152222, + "kl": 0.01771931373514235, + "learning_rate": 4.970257184090156e-06, + "loss": 0.0007, + "reward": 3.772432804107666, + "reward_std": 1.725758384913206, + "rewards/_soft_format_reward_func": 1.4937500357627869, + "rewards/_strict_format_reward_func": 2.4375, + "rewards/_xml_count_reward_func": -0.9836250096559525, + "rewards/check_answer": 0.8248078285250813, + "step": 85 + }, + { + "completion_length": 534.125, + "epoch": 0.4387755102040816, + "grad_norm": 0.649592936038971, + "kl": 0.01605043071322143, + "learning_rate": 4.96793027487102e-06, + "loss": 0.0006, + "reward": 3.0387662947177887, + "reward_std": 1.816498763859272, + "rewards/_soft_format_reward_func": 1.4749999940395355, + "rewards/_strict_format_reward_func": 1.875, + "rewards/_xml_count_reward_func": -1.094249963760376, + "rewards/check_answer": 0.7830162237514742, + "step": 86 + }, + { + "completion_length": 672.1875, + "epoch": 0.44387755102040816, + "grad_norm": 136.27960205078125, + "kl": 0.24366865053889342, + "learning_rate": 4.9655163254257755e-06, + "loss": 0.0098, + "reward": 7.172706127166748, + "reward_std": 2.587234117090702, + "rewards/_soft_format_reward_func": 1.7000000029802322, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.9572500288486481, + "rewards/check_answer": 3.4299561521038413, + "step": 87 + }, + { + "completion_length": 694.6875, + "epoch": 0.4489795918367347, + "grad_norm": 0.23488010466098785, + "kl": 0.01088630617596209, + "learning_rate": 4.963015420890825e-06, + "loss": 0.0004, + "reward": 7.021105051040649, + "reward_std": 1.0360710583627224, + "rewards/_soft_format_reward_func": 1.5499999821186066, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -1.3521250039339066, + "rewards/check_answer": 3.82323000067845, + "step": 88 + }, + { + "completion_length": 498.125, + "epoch": 0.45408163265306123, + "grad_norm": 0.4348633289337158, + "kl": 0.018164563458412886, + "learning_rate": 4.960427649469346e-06, + "loss": 0.0007, + "reward": 3.7136881351470947, + "reward_std": 2.085390269756317, + "rewards/_soft_format_reward_func": 1.4624999910593033, + "rewards/_strict_format_reward_func": 2.4375, + "rewards/_xml_count_reward_func": -1.48831257969141, + "rewards/check_answer": 1.302000543102622, + "step": 89 + }, + { + "completion_length": 616.875, + "epoch": 0.45918367346938777, + "grad_norm": 1.5561532974243164, + "kl": 0.015912600560113788, + "learning_rate": 4.957753102428184e-06, + "loss": 0.0006, + "reward": 8.220839619636536, + "reward_std": 3.1606629248708487, + "rewards/_soft_format_reward_func": 1.693750023841858, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -1.0871875137090683, + "rewards/check_answer": 4.80177686011848, + "step": 90 + }, + { + "completion_length": 351.3125, + "epoch": 0.4642857142857143, + "grad_norm": 1.2018077373504639, + "kl": 0.06633210554718971, + "learning_rate": 4.954991874094633e-06, + "loss": 0.0027, + "reward": 2.7519712522625923, + "reward_std": 1.0931570180691779, + "rewards/_soft_format_reward_func": 1.550000011920929, + "rewards/_strict_format_reward_func": 1.875, + "rewards/_xml_count_reward_func": -1.0370625257492065, + "rewards/check_answer": 0.36403384804725647, + "step": 91 + }, + { + "completion_length": 366.6875, + "epoch": 0.46938775510204084, + "grad_norm": 0.42523127794265747, + "kl": 0.013800489017739892, + "learning_rate": 4.952144061853103e-06, + "loss": 0.0006, + "reward": 6.534438908100128, + "reward_std": 3.0693205446004868, + "rewards/_soft_format_reward_func": 1.787500023841858, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -1.4720624350011349, + "rewards/check_answer": 3.4065012373030186, + "step": 92 + }, + { + "completion_length": 398.6875, + "epoch": 0.4744897959183674, + "grad_norm": 0.22197820246219635, + "kl": 0.010188436252065003, + "learning_rate": 4.949209766141691e-06, + "loss": 0.0004, + "reward": 4.542325556278229, + "reward_std": 1.159326359629631, + "rewards/_soft_format_reward_func": 1.918749988079071, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -1.7997499704360962, + "rewards/check_answer": 1.6108255833387375, + "step": 93 + }, + { + "completion_length": 500.5, + "epoch": 0.47959183673469385, + "grad_norm": 0.33497729897499084, + "kl": 0.055587747134268284, + "learning_rate": 4.946189090448639e-06, + "loss": 0.0022, + "reward": 4.837452530860901, + "reward_std": 1.9922088906168938, + "rewards/_soft_format_reward_func": 1.6124999970197678, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -1.0624999813735485, + "rewards/check_answer": 1.6624527087719798, + "step": 94 + }, + { + "completion_length": 355.75, + "epoch": 0.4846938775510204, + "grad_norm": 0.31012672185897827, + "kl": 0.0232304020319134, + "learning_rate": 4.94308214130868e-06, + "loss": 0.0009, + "reward": 8.497554540634155, + "reward_std": 3.9802782740443945, + "rewards/_soft_format_reward_func": 1.5750000029802322, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -0.8483749888837337, + "rewards/check_answer": 5.145929626800353, + "step": 95 + }, + { + "completion_length": 373.875, + "epoch": 0.4897959183673469, + "grad_norm": 0.22870931029319763, + "kl": 0.012970933690667152, + "learning_rate": 4.939889028299284e-06, + "loss": 0.0005, + "reward": 5.1682655811309814, + "reward_std": 1.6821982599794865, + "rewards/_soft_format_reward_func": 1.6187499910593033, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -1.3219375610351562, + "rewards/check_answer": 2.0589530132710934, + "step": 96 + }, + { + "completion_length": 432.1875, + "epoch": 0.49489795918367346, + "grad_norm": 0.7341601252555847, + "kl": 0.017009504605084658, + "learning_rate": 4.936609864036793e-06, + "loss": 0.0007, + "reward": 5.641974210739136, + "reward_std": 1.353297283872962, + "rewards/_soft_format_reward_func": 1.90625, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -1.4101874977350235, + "rewards/check_answer": 2.1459116395562887, + "step": 97 + }, + { + "completion_length": 363.875, + "epoch": 0.5, + "grad_norm": 1.3306468725204468, + "kl": 0.01631148369051516, + "learning_rate": 4.933244764172448e-06, + "loss": 0.0007, + "reward": 7.02120740711689, + "reward_std": 3.146376432850957, + "rewards/_soft_format_reward_func": 1.625, + "rewards/_strict_format_reward_func": 2.0625, + "rewards/_xml_count_reward_func": -0.9313750192523003, + "rewards/check_answer": 4.26508229970932, + "step": 98 + }, + { + "completion_length": 725.25, + "epoch": 0.5051020408163265, + "grad_norm": 2.575485944747925, + "kl": 0.011819152743555605, + "learning_rate": 4.92979384738831e-06, + "loss": 0.0005, + "reward": 3.9428590536117554, + "reward_std": 1.018309948965907, + "rewards/_soft_format_reward_func": 1.3749999701976776, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -0.8431875072419643, + "rewards/check_answer": 0.5985467173159122, + "step": 99 + }, + { + "completion_length": 356.8125, + "epoch": 0.5102040816326531, + "grad_norm": 0.3401452302932739, + "kl": 0.02015103050507605, + "learning_rate": 4.926257235393077e-06, + "loss": 0.0008, + "reward": 9.113705039024353, + "reward_std": 5.51456093788147, + "rewards/_soft_format_reward_func": 1.7125000059604645, + "rewards/_strict_format_reward_func": 2.25, + "rewards/_xml_count_reward_func": -1.025812529027462, + "rewards/check_answer": 6.177018105983734, + "step": 100 + }, + { + "completion_length": 426.4375, + "epoch": 0.5153061224489796, + "grad_norm": 0.2861900329589844, + "kl": 0.008014739083591849, + "learning_rate": 4.922635052917786e-06, + "loss": 0.0003, + "reward": 4.119562268257141, + "reward_std": 0.6518542803823948, + "rewards/_soft_format_reward_func": 1.7937500178813934, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -1.2730625346302986, + "rewards/check_answer": 0.5988747701048851, + "step": 101 + }, + { + "completion_length": 375.75, + "epoch": 0.5204081632653061, + "grad_norm": 0.37096062302589417, + "kl": 0.015424605691805482, + "learning_rate": 4.918927427711422e-06, + "loss": 0.0006, + "reward": 5.0053569078445435, + "reward_std": 1.3419223129749298, + "rewards/_soft_format_reward_func": 1.7750000059604645, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -1.221687525510788, + "rewards/check_answer": 1.639544501900673, + "step": 102 + }, + { + "completion_length": 786.4375, + "epoch": 0.5255102040816326, + "grad_norm": 0.21075111627578735, + "kl": 0.010193536480073817, + "learning_rate": 4.915134490536403e-06, + "loss": 0.0004, + "reward": 7.238903224468231, + "reward_std": 1.5646906150504947, + "rewards/_soft_format_reward_func": 1.4750000089406967, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.8509999960660934, + "rewards/check_answer": 3.614903382266789, + "step": 103 + }, + { + "completion_length": 440.1875, + "epoch": 0.5306122448979592, + "grad_norm": 0.43856188654899597, + "kl": 0.023974559269845486, + "learning_rate": 4.911256375163977e-06, + "loss": 0.001, + "reward": 8.465415358543396, + "reward_std": 1.3520334959030151, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -1.5724374949932098, + "rewards/check_answer": 5.03785252571106, + "step": 104 + }, + { + "completion_length": 744.0625, + "epoch": 0.5357142857142857, + "grad_norm": 0.19438262283802032, + "kl": 0.011386665981262922, + "learning_rate": 4.907293218369499e-06, + "loss": 0.0005, + "reward": 10.853159785270691, + "reward_std": 1.569423645734787, + "rewards/_soft_format_reward_func": 1.7000000029802322, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -1.059749960899353, + "rewards/check_answer": 7.212910346628632, + "step": 105 + }, + { + "completion_length": 395.9375, + "epoch": 0.5408163265306123, + "grad_norm": 1.1634198427200317, + "kl": 0.03389626881107688, + "learning_rate": 4.903245159927607e-06, + "loss": 0.0014, + "reward": 5.962677836418152, + "reward_std": 1.3103387877345085, + "rewards/_soft_format_reward_func": 1.71875, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -1.1786874681711197, + "rewards/check_answer": 2.7976152896881104, + "step": 106 + }, + { + "completion_length": 463.9375, + "epoch": 0.5459183673469388, + "grad_norm": 0.3182987570762634, + "kl": 0.014578778180293739, + "learning_rate": 4.899112342607296e-06, + "loss": 0.0006, + "reward": 4.763711512088776, + "reward_std": 1.6892382949590683, + "rewards/_soft_format_reward_func": 1.6749999970197678, + "rewards/_strict_format_reward_func": 2.25, + "rewards/_xml_count_reward_func": -1.426062524318695, + "rewards/check_answer": 2.264773984483327, + "step": 107 + }, + { + "completion_length": 642.125, + "epoch": 0.5510204081632653, + "grad_norm": 0.3488450348377228, + "kl": 0.008207228442188352, + "learning_rate": 4.894894912166878e-06, + "loss": 0.0003, + "reward": 5.196447372436523, + "reward_std": 0.5015577161684632, + "rewards/_soft_format_reward_func": 1.6625000089406967, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.6586249768733978, + "rewards/check_answer": 1.1925724297761917, + "step": 108 + }, + { + "completion_length": 458.75, + "epoch": 0.5561224489795918, + "grad_norm": 0.38955190777778625, + "kl": 0.021245236741378903, + "learning_rate": 4.890593017348846e-06, + "loss": 0.0008, + "reward": 3.580578923225403, + "reward_std": 1.7503393739461899, + "rewards/_soft_format_reward_func": 1.71875, + "rewards/_strict_format_reward_func": 2.4375, + "rewards/_xml_count_reward_func": -1.1833125054836273, + "rewards/check_answer": 0.6076414063572884, + "step": 109 + }, + { + "completion_length": 421.0625, + "epoch": 0.5612244897959183, + "grad_norm": 1.2958167791366577, + "kl": 0.029316942440345883, + "learning_rate": 4.8862068098746246e-06, + "loss": 0.0012, + "reward": 4.520155489444733, + "reward_std": 1.4188951924443245, + "rewards/_soft_format_reward_func": 1.899999976158142, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -1.3512499928474426, + "rewards/check_answer": 1.1589055806398392, + "step": 110 + }, + { + "completion_length": 386.8125, + "epoch": 0.5663265306122449, + "grad_norm": 0.3301832377910614, + "kl": 0.02814935683272779, + "learning_rate": 4.88173644443922e-06, + "loss": 0.0011, + "reward": 5.029857754707336, + "reward_std": 0.8326428681612015, + "rewards/_soft_format_reward_func": 1.7000000178813934, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -1.3943124823272228, + "rewards/check_answer": 1.7241703867912292, + "step": 111 + }, + { + "completion_length": 678.25, + "epoch": 0.5714285714285714, + "grad_norm": 0.2823163568973541, + "kl": 0.017542321234941483, + "learning_rate": 4.877182078705766e-06, + "loss": 0.0007, + "reward": 4.434657633304596, + "reward_std": 1.1893670558929443, + "rewards/_soft_format_reward_func": 1.5125000029802322, + "rewards/_strict_format_reward_func": 2.4375, + "rewards/_xml_count_reward_func": -0.6100625060498714, + "rewards/check_answer": 1.0947200736713183, + "step": 112 + }, + { + "completion_length": 284.3125, + "epoch": 0.576530612244898, + "grad_norm": 0.36900410056114197, + "kl": 0.021452047862112522, + "learning_rate": 4.872543873299959e-06, + "loss": 0.0009, + "reward": 7.099963963031769, + "reward_std": 3.859396807849407, + "rewards/_soft_format_reward_func": 1.7249999940395355, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -0.8818749785423279, + "rewards/check_answer": 3.4443390518426895, + "step": 113 + }, + { + "completion_length": 422.75, + "epoch": 0.5816326530612245, + "grad_norm": 0.29888710379600525, + "kl": 0.028071329463273287, + "learning_rate": 4.8678219918043984e-06, + "loss": 0.0011, + "reward": 5.36677348613739, + "reward_std": 1.9548562616109848, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -1.3808125406503677, + "rewards/check_answer": 1.9975859224796295, + "step": 114 + }, + { + "completion_length": 397.0, + "epoch": 0.5867346938775511, + "grad_norm": 0.25467562675476074, + "kl": 0.020328150130808353, + "learning_rate": 4.863016600752813e-06, + "loss": 0.0008, + "reward": 5.861165881156921, + "reward_std": 2.294215776026249, + "rewards/_soft_format_reward_func": 1.862500011920929, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -1.3174375146627426, + "rewards/check_answer": 2.503603458404541, + "step": 115 + }, + { + "completion_length": 339.0, + "epoch": 0.5918367346938775, + "grad_norm": 0.4237631559371948, + "kl": 0.02294452185742557, + "learning_rate": 4.8581278696241924e-06, + "loss": 0.0009, + "reward": 7.219836235046387, + "reward_std": 2.048996329307556, + "rewards/_soft_format_reward_func": 1.7750000059604645, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -1.098249975591898, + "rewards/check_answer": 3.543086051940918, + "step": 116 + }, + { + "completion_length": 436.75, + "epoch": 0.5969387755102041, + "grad_norm": 3.5356175899505615, + "kl": 0.016916394233703613, + "learning_rate": 4.853155970836802e-06, + "loss": 0.0007, + "reward": 6.2292903661727905, + "reward_std": 0.982710599899292, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -1.5365625321865082, + "rewards/check_answer": 2.7658529579639435, + "step": 117 + }, + { + "completion_length": 471.375, + "epoch": 0.6020408163265306, + "grad_norm": 0.39139553904533386, + "kl": 0.013290104689076543, + "learning_rate": 4.8481010797421106e-06, + "loss": 0.0005, + "reward": 8.786529064178467, + "reward_std": 3.616459548473358, + "rewards/_soft_format_reward_func": 1.7000000029802322, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -1.0753125101327896, + "rewards/check_answer": 5.1618416756391525, + "step": 118 + }, + { + "completion_length": 353.6875, + "epoch": 0.6071428571428571, + "grad_norm": 0.28417208790779114, + "kl": 0.018675302737392485, + "learning_rate": 4.842963374618598e-06, + "loss": 0.0007, + "reward": 3.775757908821106, + "reward_std": 2.131088227033615, + "rewards/_soft_format_reward_func": 1.550000011920929, + "rewards/_strict_format_reward_func": 2.25, + "rewards/_xml_count_reward_func": -0.9227500185370445, + "rewards/check_answer": 0.8985078185796738, + "step": 119 + }, + { + "completion_length": 475.625, + "epoch": 0.6122448979591837, + "grad_norm": 0.4370657205581665, + "kl": 0.022615838330239058, + "learning_rate": 4.837743036665477e-06, + "loss": 0.0009, + "reward": 5.187110543251038, + "reward_std": 2.9855866506695747, + "rewards/_soft_format_reward_func": 1.7125000059604645, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -1.5276250094175339, + "rewards/check_answer": 2.3772354535758495, + "step": 120 + }, + { + "completion_length": 528.3125, + "epoch": 0.6173469387755102, + "grad_norm": 0.2892380654811859, + "kl": 0.014278996677603573, + "learning_rate": 4.832440249996292e-06, + "loss": 0.0006, + "reward": 25.238910496234894, + "reward_std": 10.534590110182762, + "rewards/_soft_format_reward_func": 1.59375, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -0.9437499940395355, + "rewards/check_answer": 21.963908864553076, + "step": 121 + }, + { + "completion_length": 365.875, + "epoch": 0.6224489795918368, + "grad_norm": 0.28936606645584106, + "kl": 0.02362329768948257, + "learning_rate": 4.827055201632435e-06, + "loss": 0.0009, + "reward": 5.751935660839081, + "reward_std": 1.4266124442219734, + "rewards/_soft_format_reward_func": 1.899999976158142, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -1.2576874569058418, + "rewards/check_answer": 2.297123208642006, + "step": 122 + }, + { + "completion_length": 389.1875, + "epoch": 0.6275510204081632, + "grad_norm": 0.3011477589607239, + "kl": 0.01705414243042469, + "learning_rate": 4.821588081496541e-06, + "loss": 0.0007, + "reward": 5.534918427467346, + "reward_std": 1.3727312982082367, + "rewards/_soft_format_reward_func": 1.7000000029802322, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -1.0863125324249268, + "rewards/check_answer": 1.921230850275606, + "step": 123 + }, + { + "completion_length": 464.25, + "epoch": 0.6326530612244898, + "grad_norm": 0.4716140925884247, + "kl": 0.06439857231453061, + "learning_rate": 4.816039082405799e-06, + "loss": 0.0026, + "reward": 3.45487904548645, + "reward_std": 1.4920125305652618, + "rewards/_soft_format_reward_func": 1.4187499731779099, + "rewards/_strict_format_reward_func": 2.4375, + "rewards/_xml_count_reward_func": -0.788750022649765, + "rewards/check_answer": 0.3873790400090229, + "step": 124 + }, + { + "completion_length": 458.8125, + "epoch": 0.6377551020408163, + "grad_norm": 0.2910076975822449, + "kl": 0.02810146939009428, + "learning_rate": 4.810408400065145e-06, + "loss": 0.0011, + "reward": 4.422984063625336, + "reward_std": 0.6079902481287718, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -1.777937427163124, + "rewards/check_answer": 1.2009213715791702, + "step": 125 + }, + { + "completion_length": 418.6875, + "epoch": 0.6428571428571429, + "grad_norm": 0.5814986228942871, + "kl": 0.022671347483992577, + "learning_rate": 4.804696233060359e-06, + "loss": 0.0009, + "reward": 5.096649527549744, + "reward_std": 1.7881849110126495, + "rewards/_soft_format_reward_func": 1.800000011920929, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -0.7446874678134918, + "rewards/check_answer": 1.41633702814579, + "step": 126 + }, + { + "completion_length": 360.5625, + "epoch": 0.6479591836734694, + "grad_norm": 0.36383190751075745, + "kl": 0.015463740332052112, + "learning_rate": 4.798902782851067e-06, + "loss": 0.0006, + "reward": 19.118314266204834, + "reward_std": 3.782061517238617, + "rewards/_soft_format_reward_func": 1.90625, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.36531252786517143, + "rewards/check_answer": 14.577375411987305, + "step": 127 + }, + { + "completion_length": 265.625, + "epoch": 0.6530612244897959, + "grad_norm": 0.5543303489685059, + "kl": 0.023673945106565952, + "learning_rate": 4.793028253763633e-06, + "loss": 0.0009, + "reward": 6.470711827278137, + "reward_std": 1.689875815063715, + "rewards/_soft_format_reward_func": 1.8875000178813934, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.6451250202953815, + "rewards/check_answer": 2.22833688557148, + "step": 128 + }, + { + "completion_length": 580.5, + "epoch": 0.6581632653061225, + "grad_norm": 0.29141587018966675, + "kl": 0.04182523349300027, + "learning_rate": 4.7870728529839495e-06, + "loss": 0.0017, + "reward": 6.982442021369934, + "reward_std": 3.280642829835415, + "rewards/_soft_format_reward_func": 1.737500011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.9067500084638596, + "rewards/check_answer": 3.1516919434070587, + "step": 129 + }, + { + "completion_length": 387.25, + "epoch": 0.6632653061224489, + "grad_norm": 0.2995375394821167, + "kl": 0.024216266116127372, + "learning_rate": 4.781036790550134e-06, + "loss": 0.001, + "reward": 190.0288714170456, + "reward_std": 97.05669444426894, + "rewards/_soft_format_reward_func": 1.84375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -0.8299999982118607, + "rewards/check_answer": 186.2026235461235, + "step": 130 + }, + { + "completion_length": 385.5, + "epoch": 0.6683673469387755, + "grad_norm": 0.551918625831604, + "kl": 0.022381589747965336, + "learning_rate": 4.774920279345121e-06, + "loss": 0.0009, + "reward": 6.143893599510193, + "reward_std": 0.5779884234070778, + "rewards/_soft_format_reward_func": 1.662500023841858, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.37799999862909317, + "rewards/check_answer": 1.8593935797107406, + "step": 131 + }, + { + "completion_length": 432.0, + "epoch": 0.673469387755102, + "grad_norm": 1.2904788255691528, + "kl": 0.03829192137345672, + "learning_rate": 4.768723535089156e-06, + "loss": 0.0015, + "reward": 5.190044522285461, + "reward_std": 0.9881309866905212, + "rewards/_soft_format_reward_func": 1.7750000357627869, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.3799375146627426, + "rewards/check_answer": 0.7949821203947067, + "step": 132 + }, + { + "completion_length": 410.8125, + "epoch": 0.6785714285714286, + "grad_norm": 0.3133583068847656, + "kl": 0.028204144444316626, + "learning_rate": 4.762446776332179e-06, + "loss": 0.0011, + "reward": 6.298715114593506, + "reward_std": 0.5748582370579243, + "rewards/_soft_format_reward_func": 1.831250011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.12068751454353333, + "rewards/check_answer": 1.346777692437172, + "step": 133 + }, + { + "completion_length": 393.0, + "epoch": 0.6836734693877551, + "grad_norm": 0.5568116903305054, + "kl": 0.030879591591656208, + "learning_rate": 4.756090224446127e-06, + "loss": 0.0012, + "reward": 7.258803009986877, + "reward_std": 1.964128702878952, + "rewards/_soft_format_reward_func": 1.90625, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.4805000275373459, + "rewards/check_answer": 2.83305324614048, + "step": 134 + }, + { + "completion_length": 386.125, + "epoch": 0.6887755102040817, + "grad_norm": 1.3968427181243896, + "kl": 0.04453878756612539, + "learning_rate": 4.74965410361712e-06, + "loss": 0.0018, + "reward": 5.945132851600647, + "reward_std": 1.0247465334832668, + "rewards/_soft_format_reward_func": 1.550000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.4618750140070915, + "rewards/check_answer": 1.857007990591228, + "step": 135 + }, + { + "completion_length": 396.25, + "epoch": 0.6938775510204082, + "grad_norm": 0.28240787982940674, + "kl": 0.03747776383534074, + "learning_rate": 4.7431386408375554e-06, + "loss": 0.0015, + "reward": 5.879474759101868, + "reward_std": 0.6595749771222472, + "rewards/_soft_format_reward_func": 1.8875000178813934, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.8629999682307243, + "rewards/check_answer": 1.8549747318029404, + "step": 136 + }, + { + "completion_length": 606.0, + "epoch": 0.6989795918367347, + "grad_norm": 0.3750913441181183, + "kl": 0.020721438224427402, + "learning_rate": 4.736544065898105e-06, + "loss": 0.0008, + "reward": 4.356168568134308, + "reward_std": 1.2325635273009539, + "rewards/_soft_format_reward_func": 1.7187499701976776, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -1.5434374436736107, + "rewards/check_answer": 1.180856066319393, + "step": 137 + }, + { + "completion_length": 354.6875, + "epoch": 0.7040816326530612, + "grad_norm": 0.392123281955719, + "kl": 0.03661597007885575, + "learning_rate": 4.729870611379609e-06, + "loss": 0.0015, + "reward": 7.636894226074219, + "reward_std": 1.705483302474022, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.396000012755394, + "rewards/check_answer": 3.107894539833069, + "step": 138 + }, + { + "completion_length": 440.375, + "epoch": 0.7091836734693877, + "grad_norm": 0.3271612226963043, + "kl": 0.025694155134260654, + "learning_rate": 4.72311851264487e-06, + "loss": 0.001, + "reward": 14.577420234680176, + "reward_std": 2.7595168482512236, + "rewards/_soft_format_reward_func": 1.6062500029802322, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.1934374887496233, + "rewards/check_answer": 9.777732692658901, + "step": 139 + }, + { + "completion_length": 932.3125, + "epoch": 0.7142857142857143, + "grad_norm": 0.24686619639396667, + "kl": 0.017432109219953418, + "learning_rate": 4.716288007830357e-06, + "loss": 0.0007, + "reward": 4.8316367864608765, + "reward_std": 1.2224921584129333, + "rewards/_soft_format_reward_func": 1.2874999940395355, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.5127499997615814, + "rewards/check_answer": 1.0568868009577272, + "step": 140 + }, + { + "completion_length": 439.5625, + "epoch": 0.7193877551020408, + "grad_norm": 0.9186896681785583, + "kl": 0.03715210082009435, + "learning_rate": 4.709379337837804e-06, + "loss": 0.0015, + "reward": 5.025137662887573, + "reward_std": 1.3306160643696785, + "rewards/_soft_format_reward_func": 1.9249999523162842, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -1.1871249936521053, + "rewards/check_answer": 1.287262663245201, + "step": 141 + }, + { + "completion_length": 705.9375, + "epoch": 0.7244897959183674, + "grad_norm": 0.36484846472740173, + "kl": 0.025597061030566692, + "learning_rate": 4.702392746325716e-06, + "loss": 0.001, + "reward": 6.963650107383728, + "reward_std": 1.9291575253009796, + "rewards/_soft_format_reward_func": 1.7187499701976776, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.598125021904707, + "rewards/check_answer": 2.8430251479148865, + "step": 142 + }, + { + "completion_length": 686.375, + "epoch": 0.7295918367346939, + "grad_norm": 0.3817428946495056, + "kl": 0.03906362532870844, + "learning_rate": 4.695328479700772e-06, + "loss": 0.0016, + "reward": 8.219464182853699, + "reward_std": 1.2078434526920319, + "rewards/_soft_format_reward_func": 1.6250000149011612, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.06687499396502972, + "rewards/check_answer": 3.6613386233802885, + "step": 143 + }, + { + "completion_length": 475.5625, + "epoch": 0.7346938775510204, + "grad_norm": 0.3780043125152588, + "kl": 0.0657902080565691, + "learning_rate": 4.688186787109136e-06, + "loss": 0.0026, + "reward": 5.480698108673096, + "reward_std": 1.339848518371582, + "rewards/_soft_format_reward_func": 1.5249999910593033, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.08481250144541264, + "rewards/check_answer": 1.0583859297075833, + "step": 144 + }, + { + "completion_length": 327.5625, + "epoch": 0.7397959183673469, + "grad_norm": 0.6541346311569214, + "kl": 0.03836058126762509, + "learning_rate": 4.680967920427674e-06, + "loss": 0.0015, + "reward": 7.11588191986084, + "reward_std": 0.7152784764766693, + "rewards/_soft_format_reward_func": 1.943750023841858, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.11156249791383743, + "rewards/check_answer": 2.060569554567337, + "step": 145 + }, + { + "completion_length": 280.1875, + "epoch": 0.7448979591836735, + "grad_norm": 0.3629293441772461, + "kl": 0.062112570740282536, + "learning_rate": 4.673672134255065e-06, + "loss": 0.0025, + "reward": 15.467483878135681, + "reward_std": 2.8585988879203796, + "rewards/_soft_format_reward_func": 1.7562499940395355, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.2293124981224537, + "rewards/check_answer": 10.481920555233955, + "step": 146 + }, + { + "completion_length": 474.25, + "epoch": 0.75, + "grad_norm": 0.48978060483932495, + "kl": 0.04026471124961972, + "learning_rate": 4.666299685902823e-06, + "loss": 0.0016, + "reward": 7.622194051742554, + "reward_std": 1.0535718128085136, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.36156249418854713, + "rewards/check_answer": 2.33563169836998, + "step": 147 + }, + { + "completion_length": 466.375, + "epoch": 0.7551020408163265, + "grad_norm": 0.30378639698028564, + "kl": 0.047573494259268045, + "learning_rate": 4.658850835386225e-06, + "loss": 0.0019, + "reward": 17.233438849449158, + "reward_std": 3.6260106414556503, + "rewards/_soft_format_reward_func": 1.7562499642372131, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.3236249964684248, + "rewards/check_answer": 12.153564229607582, + "step": 148 + }, + { + "completion_length": 509.625, + "epoch": 0.7602040816326531, + "grad_norm": 0.3636853098869324, + "kl": 0.06674754060804844, + "learning_rate": 4.651325845415136e-06, + "loss": 0.0027, + "reward": 5.467587828636169, + "reward_std": 1.2641362864524126, + "rewards/_soft_format_reward_func": 1.600000023841858, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.049625005573034286, + "rewards/check_answer": 1.0054628625512123, + "step": 149 + }, + { + "completion_length": 357.9375, + "epoch": 0.7653061224489796, + "grad_norm": 0.5053626894950867, + "kl": 0.051199947483837605, + "learning_rate": 4.6437249813847495e-06, + "loss": 0.002, + "reward": 11.167248606681824, + "reward_std": 4.9293607994914055, + "rewards/_soft_format_reward_func": 1.5125000476837158, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.2926249988377094, + "rewards/check_answer": 6.362123340368271, + "step": 150 + }, + { + "completion_length": 378.875, + "epoch": 0.7704081632653061, + "grad_norm": 0.35232314467430115, + "kl": 0.0667138583958149, + "learning_rate": 4.636048511366222e-06, + "loss": 0.0027, + "reward": 6.404476523399353, + "reward_std": 0.6765038818120956, + "rewards/_soft_format_reward_func": 1.7750000357627869, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.002562493085861206, + "rewards/check_answer": 1.6320391297340393, + "step": 151 + }, + { + "completion_length": 269.75, + "epoch": 0.7755102040816326, + "grad_norm": 0.34677934646606445, + "kl": 0.09044361300766468, + "learning_rate": 4.62829670609722e-06, + "loss": 0.0036, + "reward": 14.928263902664185, + "reward_std": 3.045043110847473, + "rewards/_soft_format_reward_func": 1.7750000357627869, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.2966874986886978, + "rewards/check_answer": 9.856576025485992, + "step": 152 + }, + { + "completion_length": 545.4375, + "epoch": 0.7806122448979592, + "grad_norm": 0.36773860454559326, + "kl": 0.02953512966632843, + "learning_rate": 4.620469838972374e-06, + "loss": 0.0012, + "reward": 7.423403382301331, + "reward_std": 2.156498059630394, + "rewards/_soft_format_reward_func": 1.506250038743019, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.20743750035762787, + "rewards/check_answer": 2.8972157298582033, + "step": 153 + }, + { + "completion_length": 594.625, + "epoch": 0.7857142857142857, + "grad_norm": 0.2333289235830307, + "kl": 0.025991217233240604, + "learning_rate": 4.612568186033633e-06, + "loss": 0.001, + "reward": 12.17746376991272, + "reward_std": 6.597691237926483, + "rewards/_soft_format_reward_func": 1.75, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -0.12825000286102295, + "rewards/check_answer": 7.74321323633194, + "step": 154 + }, + { + "completion_length": 852.8125, + "epoch": 0.7908163265306123, + "grad_norm": 0.2395699918270111, + "kl": 0.03715619241120294, + "learning_rate": 4.604592025960531e-06, + "loss": 0.0015, + "reward": 24.29518300294876, + "reward_std": 9.197689248248935, + "rewards/_soft_format_reward_func": 1.5500000268220901, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.3271874934434891, + "rewards/check_answer": 19.417993735411528, + "step": 155 + }, + { + "completion_length": 437.0625, + "epoch": 0.7959183673469388, + "grad_norm": 0.3087191879749298, + "kl": 0.05684735253453255, + "learning_rate": 4.596541640060358e-06, + "loss": 0.0023, + "reward": 9.101592540740967, + "reward_std": 3.38905222248286, + "rewards/_soft_format_reward_func": 1.675000011920929, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.3520000036805868, + "rewards/check_answer": 4.262092791497707, + "step": 156 + }, + { + "completion_length": 303.1875, + "epoch": 0.8010204081632653, + "grad_norm": 0.44123315811157227, + "kl": 0.06263354513794184, + "learning_rate": 4.5884173122582376e-06, + "loss": 0.0025, + "reward": 11.551234245300293, + "reward_std": 1.7726613469421864, + "rewards/_soft_format_reward_func": 1.7750000357627869, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.44756248593330383, + "rewards/check_answer": 6.328671932220459, + "step": 157 + }, + { + "completion_length": 546.875, + "epoch": 0.8061224489795918, + "grad_norm": 0.3234706521034241, + "kl": 0.05165292927995324, + "learning_rate": 4.580219329087114e-06, + "loss": 0.0021, + "reward": 5.810450196266174, + "reward_std": 0.6545056030154228, + "rewards/_soft_format_reward_func": 1.400000050663948, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.2633124962449074, + "rewards/check_answer": 1.1471376624685945, + "step": 158 + }, + { + "completion_length": 791.625, + "epoch": 0.8112244897959183, + "grad_norm": 0.28645241260528564, + "kl": 0.04510463122278452, + "learning_rate": 4.5719479796776466e-06, + "loss": 0.0018, + "reward": 7.505560338497162, + "reward_std": 1.395757220685482, + "rewards/_soft_format_reward_func": 1.5500000268220901, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.1236875019967556, + "rewards/check_answer": 3.079248049936723, + "step": 159 + }, + { + "completion_length": 492.25, + "epoch": 0.8163265306122449, + "grad_norm": 0.37338629364967346, + "kl": 0.052091196179389954, + "learning_rate": 4.563603555748015e-06, + "loss": 0.0021, + "reward": 7.5245548486709595, + "reward_std": 1.7396526504307985, + "rewards/_soft_format_reward_func": 1.768750011920929, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.12831250205636024, + "rewards/check_answer": 2.8149924129247665, + "step": 160 + }, + { + "completion_length": 309.375, + "epoch": 0.8214285714285714, + "grad_norm": 0.3315311670303345, + "kl": 0.089169105514884, + "learning_rate": 4.555186351593625e-06, + "loss": 0.0036, + "reward": 7.606752276420593, + "reward_std": 1.0673310905694962, + "rewards/_soft_format_reward_func": 1.7750000655651093, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.348874993622303, + "rewards/check_answer": 2.4828773885965347, + "step": 161 + }, + { + "completion_length": 343.6875, + "epoch": 0.826530612244898, + "grad_norm": 0.38737377524375916, + "kl": 0.0651077888906002, + "learning_rate": 4.546696664076734e-06, + "loss": 0.0026, + "reward": 9.475844025611877, + "reward_std": 1.092118889093399, + "rewards/_soft_format_reward_func": 1.7750000357627869, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.3891875073313713, + "rewards/check_answer": 4.311656326055527, + "step": 162 + }, + { + "completion_length": 462.875, + "epoch": 0.8316326530612245, + "grad_norm": 0.40601518750190735, + "kl": 0.04493711423128843, + "learning_rate": 4.538134792615982e-06, + "loss": 0.0018, + "reward": 7.263669729232788, + "reward_std": 2.5759617229923606, + "rewards/_soft_format_reward_func": 1.6062500476837158, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.0736250039190054, + "rewards/check_answer": 2.5837948471307755, + "step": 163 + }, + { + "completion_length": 553.0625, + "epoch": 0.8367346938775511, + "grad_norm": 0.4406701624393463, + "kl": 0.0672515663318336, + "learning_rate": 4.529501039175824e-06, + "loss": 0.0027, + "reward": 7.2119529247283936, + "reward_std": 1.1174802966415882, + "rewards/_soft_format_reward_func": 1.6812500357627869, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.39887499809265137, + "rewards/check_answer": 2.1318282186985016, + "step": 164 + }, + { + "completion_length": 396.375, + "epoch": 0.8418367346938775, + "grad_norm": 0.625912070274353, + "kl": 0.06786507740616798, + "learning_rate": 4.5207957082558904e-06, + "loss": 0.0027, + "reward": 21.559359431266785, + "reward_std": 6.981817122315988, + "rewards/_soft_format_reward_func": 1.7750000357627869, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.39493750035762787, + "rewards/check_answer": 16.389422226697206, + "step": 165 + }, + { + "completion_length": 341.75, + "epoch": 0.8469387755102041, + "grad_norm": 1.1860520839691162, + "kl": 0.08321556635200977, + "learning_rate": 4.51201910688024e-06, + "loss": 0.0033, + "reward": 7.035177946090698, + "reward_std": 1.8561390489339828, + "rewards/_soft_format_reward_func": 1.7187500596046448, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.34062499552965164, + "rewards/check_answer": 1.9758030250668526, + "step": 166 + }, + { + "completion_length": 437.25, + "epoch": 0.8520408163265306, + "grad_norm": 0.34259557723999023, + "kl": 0.0418678093701601, + "learning_rate": 4.503171544586535e-06, + "loss": 0.0016, + "reward": 10.691133677959442, + "reward_std": 3.948306621365191, + "rewards/_soft_format_reward_func": 1.6062500029802322, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.362125001847744, + "rewards/check_answer": 5.722758798219729, + "step": 167 + }, + { + "completion_length": 454.4375, + "epoch": 0.8571428571428571, + "grad_norm": 0.41577261686325073, + "kl": 0.03299556393176317, + "learning_rate": 4.494253333415125e-06, + "loss": 0.0013, + "reward": 6.682947874069214, + "reward_std": 1.3458359614014626, + "rewards/_soft_format_reward_func": 1.6250000298023224, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.35218749567866325, + "rewards/check_answer": 1.7057603895664215, + "step": 168 + }, + { + "completion_length": 443.1875, + "epoch": 0.8622448979591837, + "grad_norm": 0.608323335647583, + "kl": 0.1554764355532825, + "learning_rate": 4.485264787898037e-06, + "loss": 0.0062, + "reward": 24.002484679222107, + "reward_std": 5.166570171713829, + "rewards/_soft_format_reward_func": 1.850000023841858, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.47462499141693115, + "rewards/check_answer": 18.677860498428345, + "step": 169 + }, + { + "completion_length": 529.0, + "epoch": 0.8673469387755102, + "grad_norm": 0.2710249722003937, + "kl": 0.03932965733110905, + "learning_rate": 4.476206225047889e-06, + "loss": 0.0016, + "reward": 5.216221511363983, + "reward_std": 0.8351084915921092, + "rewards/_soft_format_reward_func": 1.6062500476837158, + "rewards/_strict_format_reward_func": 2.4375, + "rewards/_xml_count_reward_func": 0.4116249978542328, + "rewards/check_answer": 0.760846458375454, + "step": 170 + }, + { + "completion_length": 480.5, + "epoch": 0.8724489795918368, + "grad_norm": 0.46896764636039734, + "kl": 0.03708732454106212, + "learning_rate": 4.467077964346705e-06, + "loss": 0.0015, + "reward": 32.737399101257324, + "reward_std": 12.719534158706665, + "rewards/_soft_format_reward_func": 1.8125, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.36537499353289604, + "rewards/check_answer": 27.55952274799347, + "step": 171 + }, + { + "completion_length": 564.625, + "epoch": 0.8775510204081632, + "grad_norm": 0.48409298062324524, + "kl": 0.04766590194776654, + "learning_rate": 4.457880327734647e-06, + "loss": 0.0019, + "reward": 7.888592720031738, + "reward_std": 1.3809154629707336, + "rewards/_soft_format_reward_func": 1.7375000417232513, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.42656249552965164, + "rewards/check_answer": 2.7245304584503174, + "step": 172 + }, + { + "completion_length": 342.5, + "epoch": 0.8826530612244898, + "grad_norm": 0.3246302008628845, + "kl": 0.06821039691567421, + "learning_rate": 4.448613639598664e-06, + "loss": 0.0027, + "reward": 7.661576509475708, + "reward_std": 0.627835601568222, + "rewards/_soft_format_reward_func": 1.7375000417232513, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.4233749955892563, + "rewards/check_answer": 2.5007017850875854, + "step": 173 + }, + { + "completion_length": 531.0625, + "epoch": 0.8877551020408163, + "grad_norm": 0.5014583468437195, + "kl": 0.08092385483905673, + "learning_rate": 4.43927822676105e-06, + "loss": 0.0034, + "reward": 8.200488924980164, + "reward_std": 0.6848534047603607, + "rewards/_soft_format_reward_func": 1.3250000327825546, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.2559374962002039, + "rewards/check_answer": 3.61955141882936, + "step": 174 + }, + { + "completion_length": 587.25, + "epoch": 0.8928571428571429, + "grad_norm": 0.5414162874221802, + "kl": 0.049171761609613895, + "learning_rate": 4.429874418467914e-06, + "loss": 0.002, + "reward": 15.98888349533081, + "reward_std": 2.731475718319416, + "rewards/_soft_format_reward_func": 1.8687500357627869, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.6382499784231186, + "rewards/check_answer": 11.758384495973587, + "step": 175 + }, + { + "completion_length": 373.0625, + "epoch": 0.8979591836734694, + "grad_norm": 0.27149567008018494, + "kl": 0.06143064517527819, + "learning_rate": 4.4204025463775715e-06, + "loss": 0.0025, + "reward": 6.5129474401474, + "reward_std": 0.8889782577753067, + "rewards/_soft_format_reward_func": 1.6625000834465027, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.2829375173896551, + "rewards/check_answer": 1.5675101578235626, + "step": 176 + }, + { + "completion_length": 540.375, + "epoch": 0.9030612244897959, + "grad_norm": 0.2515435218811035, + "kl": 0.04105467605404556, + "learning_rate": 4.410862944548848e-06, + "loss": 0.0016, + "reward": 6.6056541204452515, + "reward_std": 1.832587480545044, + "rewards/_soft_format_reward_func": 1.899999976158142, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.0496249720454216, + "rewards/check_answer": 1.8435290455818176, + "step": 177 + }, + { + "completion_length": 497.4375, + "epoch": 0.9081632653061225, + "grad_norm": 0.2942586839199066, + "kl": 0.0357802826911211, + "learning_rate": 4.401255949429299e-06, + "loss": 0.0014, + "reward": 9.341898918151855, + "reward_std": 1.008721500635147, + "rewards/_soft_format_reward_func": 1.8125000298023224, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.4579999968409538, + "rewards/check_answer": 4.071399033069611, + "step": 178 + }, + { + "completion_length": 443.6875, + "epoch": 0.9132653061224489, + "grad_norm": 0.2456103414297104, + "kl": 0.05043966881930828, + "learning_rate": 4.391581899843335e-06, + "loss": 0.002, + "reward": 7.033362329006195, + "reward_std": 1.3795625120401382, + "rewards/_soft_format_reward_func": 1.4750000089406967, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.2459375038743019, + "rewards/check_answer": 2.312424931966234, + "step": 179 + }, + { + "completion_length": 400.625, + "epoch": 0.9183673469387755, + "grad_norm": 0.49319222569465637, + "kl": 0.05620954278856516, + "learning_rate": 4.38184113698028e-06, + "loss": 0.0022, + "reward": 6.303773522377014, + "reward_std": 1.6371471658349037, + "rewards/_soft_format_reward_func": 1.7437500059604645, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": 0.0886249914765358, + "rewards/check_answer": 1.8463987112045288, + "step": 180 + }, + { + "completion_length": 474.875, + "epoch": 0.923469387755102, + "grad_norm": 0.48986172676086426, + "kl": 0.048913688864558935, + "learning_rate": 4.372034004382338e-06, + "loss": 0.002, + "reward": 5.959681272506714, + "reward_std": 0.4265642538666725, + "rewards/_soft_format_reward_func": 1.850000023841858, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.3892499841749668, + "rewards/check_answer": 0.7204312160611153, + "step": 181 + }, + { + "completion_length": 619.5, + "epoch": 0.9285714285714286, + "grad_norm": 0.23095013201236725, + "kl": 0.03409982565790415, + "learning_rate": 4.362160847932473e-06, + "loss": 0.0014, + "reward": 5.400798320770264, + "reward_std": 1.4592021107673645, + "rewards/_soft_format_reward_func": 1.5250000208616257, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -0.21550003439188004, + "rewards/check_answer": 1.2787983370944858, + "step": 182 + }, + { + "completion_length": 629.5625, + "epoch": 0.9336734693877551, + "grad_norm": 0.24370257556438446, + "kl": 0.01954989810474217, + "learning_rate": 4.35222201584221e-06, + "loss": 0.0008, + "reward": 5.626599490642548, + "reward_std": 0.4261799646774307, + "rewards/_soft_format_reward_func": 1.7000000029802322, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.24112499505281448, + "rewards/check_answer": 0.6854746059398167, + "step": 183 + }, + { + "completion_length": 322.5, + "epoch": 0.9387755102040817, + "grad_norm": 0.5464153289794922, + "kl": 0.05567363370209932, + "learning_rate": 4.3422178586393615e-06, + "loss": 0.0022, + "reward": 7.5745275020599365, + "reward_std": 3.526697516441345, + "rewards/_soft_format_reward_func": 1.7187500596046448, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.07387499883770943, + "rewards/check_answer": 2.969402402639389, + "step": 184 + }, + { + "completion_length": 802.6875, + "epoch": 0.9438775510204082, + "grad_norm": 0.2673207223415375, + "kl": 0.024011684115976095, + "learning_rate": 4.332148729155654e-06, + "loss": 0.001, + "reward": 5.654477477073669, + "reward_std": 1.3770648315548897, + "rewards/_soft_format_reward_func": 1.5125000178813934, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.07100000604987144, + "rewards/check_answer": 1.2129775285720825, + "step": 185 + }, + { + "completion_length": 266.4375, + "epoch": 0.9489795918367347, + "grad_norm": 0.3424183130264282, + "kl": 0.07378911692649126, + "learning_rate": 4.322014982514292e-06, + "loss": 0.003, + "reward": 6.338861703872681, + "reward_std": 0.6802131589502096, + "rewards/_soft_format_reward_func": 1.981249988079071, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.1186249852180481, + "rewards/check_answer": 1.2389868646860123, + "step": 186 + }, + { + "completion_length": 379.1875, + "epoch": 0.9540816326530612, + "grad_norm": 0.3071301579475403, + "kl": 0.05790677620097995, + "learning_rate": 4.3118169761174315e-06, + "loss": 0.0023, + "reward": 7.1564600467681885, + "reward_std": 0.5172314047813416, + "rewards/_soft_format_reward_func": 1.8687500059604645, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.3855624943971634, + "rewards/check_answer": 1.902147650718689, + "step": 187 + }, + { + "completion_length": 346.375, + "epoch": 0.9591836734693877, + "grad_norm": 0.28398242592811584, + "kl": 0.05025961343199015, + "learning_rate": 4.301555069633571e-06, + "loss": 0.002, + "reward": 8.402802467346191, + "reward_std": 0.9663780927658081, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.4886249974370003, + "rewards/check_answer": 2.9891776740550995, + "step": 188 + }, + { + "completion_length": 469.5625, + "epoch": 0.9642857142857143, + "grad_norm": 0.2812504172325134, + "kl": 0.036826275289058685, + "learning_rate": 4.291229624984876e-06, + "loss": 0.0015, + "reward": 15.558440685272217, + "reward_std": 8.35078378021717, + "rewards/_soft_format_reward_func": 1.7000000476837158, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.2761249952018261, + "rewards/check_answer": 10.58231633901596, + "step": 189 + }, + { + "completion_length": 398.0625, + "epoch": 0.9693877551020408, + "grad_norm": 0.3562508523464203, + "kl": 0.049638946540653706, + "learning_rate": 4.280841006334403e-06, + "loss": 0.002, + "reward": 27.97546100616455, + "reward_std": 16.727137465029955, + "rewards/_soft_format_reward_func": 1.6437500417232513, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.4568124860525131, + "rewards/check_answer": 23.062399968504906, + "step": 190 + }, + { + "completion_length": 393.0625, + "epoch": 0.9744897959183674, + "grad_norm": 36.12807083129883, + "kl": 0.33482956141233444, + "learning_rate": 4.270389580073264e-06, + "loss": 0.0134, + "reward": 7.393812417984009, + "reward_std": 1.9795853942632675, + "rewards/_soft_format_reward_func": 1.8875000476837158, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.12450001761317253, + "rewards/check_answer": 2.6308123022317886, + "step": 191 + }, + { + "completion_length": 295.5625, + "epoch": 0.9795918367346939, + "grad_norm": 0.2944537401199341, + "kl": 0.04368643742054701, + "learning_rate": 4.2598757148076996e-06, + "loss": 0.0017, + "reward": 11.020132422447205, + "reward_std": 2.0972883477807045, + "rewards/_soft_format_reward_func": 1.962499976158142, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.26499998942017555, + "rewards/check_answer": 5.792632728815079, + "step": 192 + }, + { + "completion_length": 364.6875, + "epoch": 0.9846938775510204, + "grad_norm": 0.1708153784275055, + "kl": 0.029444904066622257, + "learning_rate": 4.249299781346086e-06, + "loss": 0.0012, + "reward": 6.42504096031189, + "reward_std": 0.45998556911945343, + "rewards/_soft_format_reward_func": 1.6250000149011612, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.3477499932050705, + "rewards/check_answer": 1.4522912870161235, + "step": 193 + }, + { + "completion_length": 297.3125, + "epoch": 0.9897959183673469, + "grad_norm": 0.3687419593334198, + "kl": 0.04513590410351753, + "learning_rate": 4.2386621526858465e-06, + "loss": 0.0018, + "reward": 7.11621256172657, + "reward_std": 1.6567531460896134, + "rewards/_soft_format_reward_func": 1.731249988079071, + "rewards/_strict_format_reward_func": 2.25, + "rewards/_xml_count_reward_func": 0.304749995470047, + "rewards/check_answer": 2.8302126228809357, + "step": 194 + }, + { + "completion_length": 399.3125, + "epoch": 0.9948979591836735, + "grad_norm": 0.3162878453731537, + "kl": 0.036621647188439965, + "learning_rate": 4.227963204000305e-06, + "loss": 0.0015, + "reward": 129.70068180561066, + "reward_std": 113.08588391542435, + "rewards/_soft_format_reward_func": 1.831250011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.2776250094175339, + "rewards/check_answer": 125.14704971015453, + "step": 195 + }, + { + "completion_length": 360.3125, + "epoch": 1.0, + "grad_norm": 0.3047390878200531, + "kl": 0.0588951304089278, + "learning_rate": 4.217203312625453e-06, + "loss": 0.0024, + "reward": 8.42742919921875, + "reward_std": 2.6744541078805923, + "rewards/_soft_format_reward_func": 1.887499988079071, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.05043748766183853, + "rewards/check_answer": 3.4894914776086807, + "step": 196 + }, + { + "completion_length": 341.0625, + "epoch": 1.0051020408163265, + "grad_norm": 0.31391653418540955, + "kl": 0.04442111076787114, + "learning_rate": 4.206382858046636e-06, + "loss": 0.0018, + "reward": 7.735453367233276, + "reward_std": 1.065683752298355, + "rewards/_soft_format_reward_func": 1.8312499821186066, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.43312498927116394, + "rewards/check_answer": 2.4710785150527954, + "step": 197 + }, + { + "completion_length": 375.3125, + "epoch": 1.010204081632653, + "grad_norm": 0.4202650785446167, + "kl": 0.04309396957978606, + "learning_rate": 4.195502221885176e-06, + "loss": 0.0017, + "reward": 10.156509637832642, + "reward_std": 1.2403309643268585, + "rewards/_soft_format_reward_func": 1.9625000059604645, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.4984999895095825, + "rewards/check_answer": 4.695509642362595, + "step": 198 + }, + { + "completion_length": 557.8125, + "epoch": 1.0153061224489797, + "grad_norm": 0.411990761756897, + "kl": 0.04207156877964735, + "learning_rate": 4.184561787884911e-06, + "loss": 0.0017, + "reward": 6.723345756530762, + "reward_std": 0.40229716151952744, + "rewards/_soft_format_reward_func": 1.981249988079071, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.492374986410141, + "rewards/check_answer": 1.249720811843872, + "step": 199 + }, + { + "completion_length": 317.875, + "epoch": 1.0204081632653061, + "grad_norm": 0.5010614991188049, + "kl": 0.09044251404702663, + "learning_rate": 4.173561941898656e-06, + "loss": 0.0036, + "reward": 6.473593235015869, + "reward_std": 1.4734688764438033, + "rewards/_soft_format_reward_func": 1.7750000059604645, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.31718749552965164, + "rewards/check_answer": 1.381405621767044, + "step": 200 + }, + { + "completion_length": 452.0625, + "epoch": 1.0255102040816326, + "grad_norm": 0.3681066334247589, + "kl": 0.03809668601024896, + "learning_rate": 4.162503071874603e-06, + "loss": 0.0015, + "reward": 6.502772688865662, + "reward_std": 1.3066000789403915, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.38712503761053085, + "rewards/check_answer": 1.8898976296186447, + "step": 201 + }, + { + "completion_length": 423.75, + "epoch": 1.030612244897959, + "grad_norm": 0.19629111886024475, + "kl": 0.046010272577404976, + "learning_rate": 4.151385567842629e-06, + "loss": 0.0018, + "reward": 399.35719668865204, + "reward_std": 279.76188530772924, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.14612499251961708, + "rewards/check_answer": 394.21108666062355, + "step": 202 + }, + { + "completion_length": 338.9375, + "epoch": 1.0357142857142858, + "grad_norm": 0.34913596510887146, + "kl": 0.05951223103329539, + "learning_rate": 4.140209821900548e-06, + "loss": 0.0024, + "reward": 5.7086756229400635, + "reward_std": 1.2880188524723053, + "rewards/_soft_format_reward_func": 1.8812499642372131, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.23206248879432678, + "rewards/check_answer": 0.7828629612922668, + "step": 203 + }, + { + "completion_length": 342.0625, + "epoch": 1.0408163265306123, + "grad_norm": 0.3435830771923065, + "kl": 0.04380645975470543, + "learning_rate": 4.12897622820028e-06, + "loss": 0.0018, + "reward": 8.00225567817688, + "reward_std": 1.523213267326355, + "rewards/_soft_format_reward_func": 1.8875000178813934, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.36687498912215233, + "rewards/check_answer": 2.7478803396224976, + "step": 204 + }, + { + "completion_length": 356.125, + "epoch": 1.0459183673469388, + "grad_norm": 0.36134347319602966, + "kl": 0.04655326111242175, + "learning_rate": 4.117685182933947e-06, + "loss": 0.0019, + "reward": 6.2672423124313354, + "reward_std": 0.3675787951797247, + "rewards/_soft_format_reward_func": 1.943750023841858, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.47449998557567596, + "rewards/check_answer": 0.8489925488829613, + "step": 205 + }, + { + "completion_length": 577.4375, + "epoch": 1.0510204081632653, + "grad_norm": 0.4111148416996002, + "kl": 0.049234330188483, + "learning_rate": 4.106337084319904e-06, + "loss": 0.002, + "reward": 7.752165675163269, + "reward_std": 0.4733734019100666, + "rewards/_soft_format_reward_func": 1.6625000089406967, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.3658749908208847, + "rewards/check_answer": 2.7237908765673637, + "step": 206 + }, + { + "completion_length": 308.375, + "epoch": 1.0561224489795917, + "grad_norm": 0.5833783745765686, + "kl": 0.05842061527073383, + "learning_rate": 4.094932332588693e-06, + "loss": 0.0023, + "reward": 6.586714863777161, + "reward_std": 2.829386033117771, + "rewards/_soft_format_reward_func": 1.7562499940395355, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.3178749941289425, + "rewards/check_answer": 1.7000898569822311, + "step": 207 + }, + { + "completion_length": 331.625, + "epoch": 1.0612244897959184, + "grad_norm": 0.29140666127204895, + "kl": 0.03743181750178337, + "learning_rate": 4.083471329968926e-06, + "loss": 0.0015, + "reward": 8.740661025047302, + "reward_std": 0.7558915354311466, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 3.2396610528230667, + "step": 208 + }, + { + "completion_length": 302.6875, + "epoch": 1.066326530612245, + "grad_norm": 0.4549320638179779, + "kl": 0.05644373595714569, + "learning_rate": 4.071954480673098e-06, + "loss": 0.0023, + "reward": 6.7335838079452515, + "reward_std": 0.6734579056501389, + "rewards/_soft_format_reward_func": 1.943750023841858, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.373687494546175, + "rewards/check_answer": 1.4161463677883148, + "step": 209 + }, + { + "completion_length": 346.375, + "epoch": 1.0714285714285714, + "grad_norm": 0.3305790424346924, + "kl": 0.04994491580873728, + "learning_rate": 4.0603821908833386e-06, + "loss": 0.002, + "reward": 9.235015630722046, + "reward_std": 1.2825317829847336, + "rewards/_soft_format_reward_func": 1.981249988079071, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.2917499840259552, + "rewards/check_answer": 3.962015599012375, + "step": 210 + }, + { + "completion_length": 542.125, + "epoch": 1.0765306122448979, + "grad_norm": 0.22213147580623627, + "kl": 0.026279668672941625, + "learning_rate": 4.048754868737075e-06, + "loss": 0.0011, + "reward": 22.33821666240692, + "reward_std": 1.0081460773944855, + "rewards/_soft_format_reward_func": 1.7000000029802322, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.37574999034404755, + "rewards/check_answer": 17.26246675942093, + "step": 211 + }, + { + "completion_length": 325.25, + "epoch": 1.0816326530612246, + "grad_norm": 0.33424288034439087, + "kl": 0.04722048016265035, + "learning_rate": 4.037072924312649e-06, + "loss": 0.0019, + "reward": 7.3233397006988525, + "reward_std": 1.6336696669459343, + "rewards/_soft_format_reward_func": 1.800000011920929, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.46449999511241913, + "rewards/check_answer": 2.246339738368988, + "step": 212 + }, + { + "completion_length": 380.25, + "epoch": 1.086734693877551, + "grad_norm": 0.3298036456108093, + "kl": 0.03231387445703149, + "learning_rate": 4.0253367696148435e-06, + "loss": 0.0013, + "reward": 10.010229587554932, + "reward_std": 2.455143466591835, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 4.509229928255081, + "step": 213 + }, + { + "completion_length": 449.25, + "epoch": 1.0918367346938775, + "grad_norm": 0.2562406063079834, + "kl": 0.03419307968579233, + "learning_rate": 4.013546818560362e-06, + "loss": 0.0014, + "reward": 8.042372584342957, + "reward_std": 0.751374788582325, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.49318748712539673, + "rewards/check_answer": 2.5491852164268494, + "step": 214 + }, + { + "completion_length": 345.8125, + "epoch": 1.096938775510204, + "grad_norm": 0.2993167042732239, + "kl": 0.03938100393861532, + "learning_rate": 4.001703486963223e-06, + "loss": 0.0016, + "reward": 13.524673461914062, + "reward_std": 3.062700480222702, + "rewards/_soft_format_reward_func": 1.962499976158142, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.48374998569488525, + "rewards/check_answer": 8.078423976898193, + "step": 215 + }, + { + "completion_length": 579.0, + "epoch": 1.1020408163265305, + "grad_norm": 0.23525753617286682, + "kl": 0.038901340682059526, + "learning_rate": 3.989807192520098e-06, + "loss": 0.0016, + "reward": 11.004587292671204, + "reward_std": 3.6870256178081036, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.1887500286102295, + "rewards/check_answer": 6.193337500095367, + "step": 216 + }, + { + "completion_length": 354.375, + "epoch": 1.1071428571428572, + "grad_norm": 0.4108511209487915, + "kl": 0.049589950358495116, + "learning_rate": 3.9778583547955765e-06, + "loss": 0.002, + "reward": 7.848548054695129, + "reward_std": 0.9341182895004749, + "rewards/_soft_format_reward_func": 1.90625, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.4646874964237213, + "rewards/check_answer": 2.4776105992496014, + "step": 217 + }, + { + "completion_length": 239.3125, + "epoch": 1.1122448979591837, + "grad_norm": 0.37700262665748596, + "kl": 0.07110750861465931, + "learning_rate": 3.965857395207375e-06, + "loss": 0.0028, + "reward": 6.046079754829407, + "reward_std": 0.22055783914402127, + "rewards/_soft_format_reward_func": 1.9437499642372131, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.4906249940395355, + "rewards/check_answer": 0.6117046624422073, + "step": 218 + }, + { + "completion_length": 391.75, + "epoch": 1.1173469387755102, + "grad_norm": 0.32394155859947205, + "kl": 0.04216121416538954, + "learning_rate": 3.9538047370114695e-06, + "loss": 0.0017, + "reward": 11.35419249534607, + "reward_std": 2.7961763814091682, + "rewards/_soft_format_reward_func": 1.9437499940395355, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.47987498342990875, + "rewards/check_answer": 5.930567711591721, + "step": 219 + }, + { + "completion_length": 517.3125, + "epoch": 1.1224489795918366, + "grad_norm": 0.4214778244495392, + "kl": 0.07988983625546098, + "learning_rate": 3.941700805287169e-06, + "loss": 0.0032, + "reward": 8.18191134929657, + "reward_std": 1.6381587088108063, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.4880624860525131, + "rewards/check_answer": 2.768848918378353, + "step": 220 + }, + { + "completion_length": 396.625, + "epoch": 1.1275510204081634, + "grad_norm": 0.29704153537750244, + "kl": 0.04481638455763459, + "learning_rate": 3.92954602692212e-06, + "loss": 0.0018, + "reward": 11.367891311645508, + "reward_std": 3.2973266541957855, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.29612497985363007, + "rewards/check_answer": 6.071766555309296, + "step": 221 + }, + { + "completion_length": 589.9375, + "epoch": 1.1326530612244898, + "grad_norm": 0.19876539707183838, + "kl": 0.019044322660192847, + "learning_rate": 3.9173408305972606e-06, + "loss": 0.0008, + "reward": 104.04097664356232, + "reward_std": 30.219659864902496, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.49318748712539673, + "rewards/check_answer": 98.79778736829758, + "step": 222 + }, + { + "completion_length": 368.25, + "epoch": 1.1377551020408163, + "grad_norm": 0.35037973523139954, + "kl": 0.04351036436855793, + "learning_rate": 3.905085646771689e-06, + "loss": 0.0017, + "reward": 19.659468710422516, + "reward_std": 5.726649031043053, + "rewards/_soft_format_reward_func": 1.875, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": 0.3203124888241291, + "rewards/check_answer": 14.83915638923645, + "step": 223 + }, + { + "completion_length": 388.0625, + "epoch": 1.1428571428571428, + "grad_norm": 0.3024386465549469, + "kl": 0.036022431682795286, + "learning_rate": 3.892780907667495e-06, + "loss": 0.0014, + "reward": 8.131965398788452, + "reward_std": 0.9055671244859695, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 2.630965530872345, + "step": 224 + }, + { + "completion_length": 382.25, + "epoch": 1.1479591836734695, + "grad_norm": 0.3158092796802521, + "kl": 0.04636579938232899, + "learning_rate": 3.880427047254502e-06, + "loss": 0.0019, + "reward": 8.08501136302948, + "reward_std": 0.7919884920120239, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 2.584011249244213, + "step": 225 + }, + { + "completion_length": 581.0, + "epoch": 1.153061224489796, + "grad_norm": 0.22562365233898163, + "kl": 0.03268377063795924, + "learning_rate": 3.868024501234972e-06, + "loss": 0.0013, + "reward": 7.202247619628906, + "reward_std": 3.4254866242408752, + "rewards/_soft_format_reward_func": 1.8125, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": 0.026124969124794006, + "rewards/check_answer": 2.738622672855854, + "step": 226 + }, + { + "completion_length": 336.25, + "epoch": 1.1581632653061225, + "grad_norm": 0.34813469648361206, + "kl": 0.04001564159989357, + "learning_rate": 3.855573707028239e-06, + "loss": 0.0016, + "reward": 8.867348909378052, + "reward_std": 1.2533812262117863, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 3.366348847746849, + "step": 227 + }, + { + "completion_length": 486.375, + "epoch": 1.163265306122449, + "grad_norm": 0.27957627177238464, + "kl": 0.03200881229713559, + "learning_rate": 3.843075103755273e-06, + "loss": 0.0013, + "reward": 10.206645011901855, + "reward_std": 2.00884972512722, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.18962496891617775, + "rewards/check_answer": 5.017019867897034, + "step": 228 + }, + { + "completion_length": 523.25, + "epoch": 1.1683673469387754, + "grad_norm": 0.7269229888916016, + "kl": 0.05936818476766348, + "learning_rate": 3.830529132223202e-06, + "loss": 0.0024, + "reward": 6.077142119407654, + "reward_std": 0.9461775571107864, + "rewards/_soft_format_reward_func": 1.7000000029802322, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.18649998307228088, + "rewards/check_answer": 1.190642079411191, + "step": 229 + }, + { + "completion_length": 485.6875, + "epoch": 1.1734693877551021, + "grad_norm": 0.2921268343925476, + "kl": 0.03041125787422061, + "learning_rate": 3.817936234909763e-06, + "loss": 0.0012, + "reward": 6.2724268436431885, + "reward_std": 1.5871776547282934, + "rewards/_soft_format_reward_func": 1.918749988079071, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.10399998724460602, + "rewards/check_answer": 1.4371768236160278, + "step": 230 + }, + { + "completion_length": 516.3125, + "epoch": 1.1785714285714286, + "grad_norm": 2.598008394241333, + "kl": 0.04260433139279485, + "learning_rate": 3.80529685594769e-06, + "loss": 0.0017, + "reward": 12.280081391334534, + "reward_std": 2.9517072029411793, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.23993748426437378, + "rewards/check_answer": 7.040144145488739, + "step": 231 + }, + { + "completion_length": 395.0, + "epoch": 1.183673469387755, + "grad_norm": 0.35654065012931824, + "kl": 0.03489643894135952, + "learning_rate": 3.792611441109063e-06, + "loss": 0.0014, + "reward": 11.359565138816833, + "reward_std": 2.3292928487062454, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.3761249929666519, + "rewards/check_answer": 5.983440205454826, + "step": 232 + }, + { + "completion_length": 401.9375, + "epoch": 1.1887755102040816, + "grad_norm": 0.2584652602672577, + "kl": 0.02747915661893785, + "learning_rate": 3.779880437789574e-06, + "loss": 0.0011, + "reward": 8.458192110061646, + "reward_std": 0.5072742262855172, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.46968749165534973, + "rewards/check_answer": 3.0635048747062683, + "step": 233 + }, + { + "completion_length": 447.5, + "epoch": 1.193877551020408, + "grad_norm": 40.201622009277344, + "kl": 2.7213867825921625, + "learning_rate": 3.767104294992754e-06, + "loss": 0.1089, + "reward": 37.41048204898834, + "reward_std": 6.400567984208465, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 31.90947988629341, + "step": 234 + }, + { + "completion_length": 780.875, + "epoch": 1.1989795918367347, + "grad_norm": 0.2753540575504303, + "kl": 0.021944483974948525, + "learning_rate": 3.7542834633141345e-06, + "loss": 0.0009, + "reward": 7.129895329475403, + "reward_std": 1.297385048121214, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.01712501049041748, + "rewards/check_answer": 2.1470203548669815, + "step": 235 + }, + { + "completion_length": 341.75, + "epoch": 1.2040816326530612, + "grad_norm": 0.34806281328201294, + "kl": 0.0429367832839489, + "learning_rate": 3.7414183949253614e-06, + "loss": 0.0017, + "reward": 6.452441692352295, + "reward_std": 2.0902061354136094, + "rewards/_soft_format_reward_func": 1.806249976158142, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.18718747794628143, + "rewards/check_answer": 1.6465042941272259, + "step": 236 + }, + { + "completion_length": 499.5625, + "epoch": 1.2091836734693877, + "grad_norm": 0.2900775372982025, + "kl": 0.039164841175079346, + "learning_rate": 3.728509543558239e-06, + "loss": 0.0016, + "reward": 6.349876523017883, + "reward_std": 1.9654560983181, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -0.029812529683113098, + "rewards/check_answer": 1.629688948392868, + "step": 237 + }, + { + "completion_length": 440.875, + "epoch": 1.2142857142857142, + "grad_norm": 1.1023831367492676, + "kl": 0.04508004803210497, + "learning_rate": 3.715557364488735e-06, + "loss": 0.0018, + "reward": 7.34362006187439, + "reward_std": 0.501777783036232, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 1.8426202535629272, + "step": 238 + }, + { + "completion_length": 521.75, + "epoch": 1.219387755102041, + "grad_norm": 0.18328067660331726, + "kl": 0.03002178471069783, + "learning_rate": 3.7025623145209196e-06, + "loss": 0.0012, + "reward": 21.852468729019165, + "reward_std": 8.15756268799305, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.33924998715519905, + "rewards/check_answer": 16.513219088315964, + "step": 239 + }, + { + "completion_length": 423.5625, + "epoch": 1.2244897959183674, + "grad_norm": 0.27355238795280457, + "kl": 0.03441121755167842, + "learning_rate": 3.6895248519708552e-06, + "loss": 0.0014, + "reward": 9.248181223869324, + "reward_std": 5.668044149875641, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.08843748271465302, + "rewards/check_answer": 4.409743905067444, + "step": 240 + }, + { + "completion_length": 289.1875, + "epoch": 1.2295918367346939, + "grad_norm": 0.43641212582588196, + "kl": 0.05587594583630562, + "learning_rate": 3.676445436650435e-06, + "loss": 0.0022, + "reward": 7.713133692741394, + "reward_std": 1.4005136042833328, + "rewards/_soft_format_reward_func": 1.881250023841858, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.4282499924302101, + "rewards/check_answer": 2.5911336839199066, + "step": 241 + }, + { + "completion_length": 484.1875, + "epoch": 1.2346938775510203, + "grad_norm": 0.29936179518699646, + "kl": 0.03446671739220619, + "learning_rate": 3.6633245298511615e-06, + "loss": 0.0014, + "reward": 5.435900092124939, + "reward_std": 0.9865807015448809, + "rewards/_soft_format_reward_func": 1.7124999910593033, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.07593749463558197, + "rewards/check_answer": 0.834962572902441, + "step": 242 + }, + { + "completion_length": 249.8125, + "epoch": 1.239795918367347, + "grad_norm": 0.3473406732082367, + "kl": 0.041182656306773424, + "learning_rate": 3.650162594327881e-06, + "loss": 0.0016, + "reward": 32.44585049152374, + "reward_std": 17.82753943838179, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.38524999003857374, + "rewards/check_answer": 27.06060168892145, + "step": 243 + }, + { + "completion_length": 373.5625, + "epoch": 1.2448979591836735, + "grad_norm": 0.6561787724494934, + "kl": 0.04421532340347767, + "learning_rate": 3.636960094282461e-06, + "loss": 0.0018, + "reward": 10.547720432281494, + "reward_std": 2.3654505601152778, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 5.046720087528229, + "step": 244 + }, + { + "completion_length": 312.4375, + "epoch": 1.25, + "grad_norm": 0.401877224445343, + "kl": 0.04252156801521778, + "learning_rate": 3.62371749534742e-06, + "loss": 0.0017, + "reward": 176.37635481357574, + "reward_std": 187.44706455618143, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 170.87536078691483, + "step": 245 + }, + { + "completion_length": 546.125, + "epoch": 1.2551020408163265, + "grad_norm": 0.4822182059288025, + "kl": 0.0404973141849041, + "learning_rate": 3.610435264569506e-06, + "loss": 0.0016, + "reward": 5.919567108154297, + "reward_std": 0.9037803895771503, + "rewards/_soft_format_reward_func": 1.7750000059604645, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.40706248953938484, + "rewards/check_answer": 0.7375045046210289, + "step": 246 + }, + { + "completion_length": 335.0, + "epoch": 1.260204081632653, + "grad_norm": 1.0417882204055786, + "kl": 0.058931102976202965, + "learning_rate": 3.59711387039322e-06, + "loss": 0.0024, + "reward": 7.960751295089722, + "reward_std": 1.9481116998940706, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.40937498956918716, + "rewards/check_answer": 2.551376521587372, + "step": 247 + }, + { + "completion_length": 501.9375, + "epoch": 1.2653061224489797, + "grad_norm": 0.3014232814311981, + "kl": 0.02945040026679635, + "learning_rate": 3.5837537826442996e-06, + "loss": 0.0012, + "reward": 6.0213092267513275, + "reward_std": 3.0799474716186523, + "rewards/_soft_format_reward_func": 1.5625, + "rewards/_strict_format_reward_func": 1.875, + "rewards/_xml_count_reward_func": 0.2953749932348728, + "rewards/check_answer": 2.2884344458580017, + "step": 248 + }, + { + "completion_length": 433.8125, + "epoch": 1.2704081632653061, + "grad_norm": 0.3146630525588989, + "kl": 0.04995664907619357, + "learning_rate": 3.570355472513148e-06, + "loss": 0.002, + "reward": 6.550642013549805, + "reward_std": 1.1807164400815964, + "rewards/_soft_format_reward_func": 1.918749988079071, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.4178124964237213, + "rewards/check_answer": 1.4015794694423676, + "step": 249 + }, + { + "completion_length": 399.5625, + "epoch": 1.2755102040816326, + "grad_norm": 0.7177777290344238, + "kl": 0.05952198896557093, + "learning_rate": 3.5569194125382122e-06, + "loss": 0.0024, + "reward": 8.12901496887207, + "reward_std": 2.611573375761509, + "rewards/_soft_format_reward_func": 1.856249988079071, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": 0.20737500488758087, + "rewards/check_answer": 3.440389961004257, + "step": 250 + }, + { + "completion_length": 513.25, + "epoch": 1.280612244897959, + "grad_norm": 0.32417553663253784, + "kl": 0.02895202673971653, + "learning_rate": 3.543446076589323e-06, + "loss": 0.0012, + "reward": 11.36902403831482, + "reward_std": 1.7574745267629623, + "rewards/_soft_format_reward_func": 1.850000023841858, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.21918749064207077, + "rewards/check_answer": 6.299836695194244, + "step": 251 + }, + { + "completion_length": 356.75, + "epoch": 1.2857142857142856, + "grad_norm": 0.3084201216697693, + "kl": 0.03097515576519072, + "learning_rate": 3.529935939850977e-06, + "loss": 0.0012, + "reward": 9.016826748847961, + "reward_std": 0.7636192254722118, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 3.515826642513275, + "step": 252 + }, + { + "completion_length": 341.125, + "epoch": 1.2908163265306123, + "grad_norm": 0.25116291642189026, + "kl": 0.02968740649521351, + "learning_rate": 3.516389478805581e-06, + "loss": 0.0012, + "reward": 7.705106616020203, + "reward_std": 1.4544169902801514, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.3708124943077564, + "rewards/check_answer": 2.4092941842973232, + "step": 253 + }, + { + "completion_length": 387.8125, + "epoch": 1.2959183673469388, + "grad_norm": 0.2992796003818512, + "kl": 0.044889158103615046, + "learning_rate": 3.5028071712166456e-06, + "loss": 0.0018, + "reward": 7.431437849998474, + "reward_std": 1.1658359989523888, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.36274998635053635, + "rewards/check_answer": 2.3186877369880676, + "step": 254 + }, + { + "completion_length": 463.3125, + "epoch": 1.3010204081632653, + "grad_norm": 1.0446410179138184, + "kl": 0.13644460123032331, + "learning_rate": 3.4891894961119367e-06, + "loss": 0.0055, + "reward": 7.609053730964661, + "reward_std": 1.3696298897266388, + "rewards/_soft_format_reward_func": 1.981249988079071, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.13462497293949127, + "rewards/check_answer": 2.4931787848472595, + "step": 255 + }, + { + "completion_length": 313.1875, + "epoch": 1.306122448979592, + "grad_norm": 1.3470245599746704, + "kl": 0.05142315570265055, + "learning_rate": 3.4755369337665767e-06, + "loss": 0.0021, + "reward": 10.27379596233368, + "reward_std": 1.2127346321940422, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 4.772795736789703, + "step": 256 + }, + { + "completion_length": 479.375, + "epoch": 1.3112244897959184, + "grad_norm": 0.2921811640262604, + "kl": 0.03002007771283388, + "learning_rate": 3.4618499656861127e-06, + "loss": 0.0012, + "reward": 9.079193353652954, + "reward_std": 2.8739907946437597, + "rewards/_soft_format_reward_func": 1.75, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.46931249648332596, + "rewards/check_answer": 4.047380834817886, + "step": 257 + }, + { + "completion_length": 487.0625, + "epoch": 1.316326530612245, + "grad_norm": 0.30052751302719116, + "kl": 0.02756687719374895, + "learning_rate": 3.448129074589529e-06, + "loss": 0.0011, + "reward": 8.22206735610962, + "reward_std": 1.0395818054676056, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 2.721067249774933, + "step": 258 + }, + { + "completion_length": 566.4375, + "epoch": 1.3214285714285714, + "grad_norm": 1.3980598449707031, + "kl": 0.04097011568956077, + "learning_rate": 3.4343747443922253e-06, + "loss": 0.0016, + "reward": 74.7005969285965, + "reward_std": 65.19187147170305, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.1017499715089798, + "rewards/check_answer": 69.59884896874428, + "step": 259 + }, + { + "completion_length": 561.625, + "epoch": 1.3265306122448979, + "grad_norm": 2.238983392715454, + "kl": 0.0385152967646718, + "learning_rate": 3.4205874601889465e-06, + "loss": 0.0015, + "reward": 11.257536888122559, + "reward_std": 2.0906684398651123, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.46968749165534973, + "rewards/check_answer": 5.862849414348602, + "step": 260 + }, + { + "completion_length": 360.25, + "epoch": 1.3316326530612246, + "grad_norm": 0.4498765468597412, + "kl": 0.04135388555005193, + "learning_rate": 3.4067677082366795e-06, + "loss": 0.0017, + "reward": 12.972948431968689, + "reward_std": 3.2377343624830246, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.30162498354911804, + "rewards/check_answer": 7.6713235676288605, + "step": 261 + }, + { + "completion_length": 541.0625, + "epoch": 1.336734693877551, + "grad_norm": 0.2704751491546631, + "kl": 0.036466233897954226, + "learning_rate": 3.3929159759374963e-06, + "loss": 0.0015, + "reward": 9.328784704208374, + "reward_std": 2.759569361805916, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.11506250500679016, + "rewards/check_answer": 4.28872212767601, + "step": 262 + }, + { + "completion_length": 305.4375, + "epoch": 1.3418367346938775, + "grad_norm": 0.37358084321022034, + "kl": 0.06474325619637966, + "learning_rate": 3.3790327518213705e-06, + "loss": 0.0026, + "reward": 18.249707102775574, + "reward_std": 10.445075172930956, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.46968749165534973, + "rewards/check_answer": 12.85502003878355, + "step": 263 + }, + { + "completion_length": 299.375, + "epoch": 1.346938775510204, + "grad_norm": 0.5274214148521423, + "kl": 0.06182891130447388, + "learning_rate": 3.3651185255289466e-06, + "loss": 0.0025, + "reward": 6.380192518234253, + "reward_std": 0.7590235061943531, + "rewards/_soft_format_reward_func": 1.8875000178813934, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.358062494546175, + "rewards/check_answer": 1.1346299946308136, + "step": 264 + }, + { + "completion_length": 355.3125, + "epoch": 1.3520408163265305, + "grad_norm": 1.733555793762207, + "kl": 0.05918777082115412, + "learning_rate": 3.351173787794265e-06, + "loss": 0.0024, + "reward": 7.078450918197632, + "reward_std": 0.6762270703911781, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.38624998554587364, + "rewards/check_answer": 1.6922010779380798, + "step": 265 + }, + { + "completion_length": 668.6875, + "epoch": 1.3571428571428572, + "grad_norm": 0.29560568928718567, + "kl": 0.033541878685355186, + "learning_rate": 3.3371990304274654e-06, + "loss": 0.0013, + "reward": 5.593075513839722, + "reward_std": 2.103371325880289, + "rewards/_soft_format_reward_func": 1.875, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": -0.19037500023841858, + "rewards/check_answer": 1.2834507524967194, + "step": 266 + }, + { + "completion_length": 334.5, + "epoch": 1.3622448979591837, + "grad_norm": 0.32789790630340576, + "kl": 0.043092924170196056, + "learning_rate": 3.3231947462974314e-06, + "loss": 0.0017, + "reward": 9.766201496124268, + "reward_std": 1.6006742417812347, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.3867499902844429, + "rewards/check_answer": 4.379451096057892, + "step": 267 + }, + { + "completion_length": 331.9375, + "epoch": 1.3673469387755102, + "grad_norm": 0.3861054480075836, + "kl": 0.04949623066931963, + "learning_rate": 3.3091614293144103e-06, + "loss": 0.002, + "reward": 6.758515477180481, + "reward_std": 0.5750819966197014, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 1.2575156837701797, + "step": 268 + }, + { + "completion_length": 466.4375, + "epoch": 1.3724489795918369, + "grad_norm": 0.4356038570404053, + "kl": 0.02766430750489235, + "learning_rate": 3.2950995744125986e-06, + "loss": 0.0011, + "reward": 7.627132177352905, + "reward_std": 1.2347041498869658, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.46968749165534973, + "rewards/check_answer": 2.232444867491722, + "step": 269 + }, + { + "completion_length": 342.1875, + "epoch": 1.3775510204081631, + "grad_norm": 76.3337173461914, + "kl": 0.06372056156396866, + "learning_rate": 3.2810096775326807e-06, + "loss": 0.0025, + "reward": 6.860260605812073, + "reward_std": 0.8331015557050705, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.46968749165534973, + "rewards/check_answer": 1.4655731916427612, + "step": 270 + }, + { + "completion_length": 435.875, + "epoch": 1.3826530612244898, + "grad_norm": 0.40490594506263733, + "kl": 0.03950034361332655, + "learning_rate": 3.2668922356043393e-06, + "loss": 0.0016, + "reward": 17.05094587802887, + "reward_std": 2.617586500942707, + "rewards/_soft_format_reward_func": 1.9437499940395355, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.10237498581409454, + "rewards/check_answer": 12.004821479320526, + "step": 271 + }, + { + "completion_length": 505.5625, + "epoch": 1.3877551020408163, + "grad_norm": 0.31562700867652893, + "kl": 0.03630805341526866, + "learning_rate": 3.2527477465287315e-06, + "loss": 0.0015, + "reward": 8.585829615592957, + "reward_std": 2.7087118178606033, + "rewards/_soft_format_reward_func": 1.65625, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.37574999034404755, + "rewards/check_answer": 3.7413295432925224, + "step": 272 + }, + { + "completion_length": 390.75, + "epoch": 1.3928571428571428, + "grad_norm": 0.39003291726112366, + "kl": 0.036192905623465776, + "learning_rate": 3.2385767091609256e-06, + "loss": 0.0014, + "reward": 15.147390484809875, + "reward_std": 2.444808963686228, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 9.646390154957771, + "step": 273 + }, + { + "completion_length": 343.625, + "epoch": 1.3979591836734695, + "grad_norm": 0.4384270906448364, + "kl": 0.04466715827584267, + "learning_rate": 3.2243796232923097e-06, + "loss": 0.0018, + "reward": 31.85398769378662, + "reward_std": 11.348299875855446, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 26.352985858917236, + "step": 274 + }, + { + "completion_length": 324.9375, + "epoch": 1.403061224489796, + "grad_norm": 0.2867165207862854, + "kl": 0.03754226490855217, + "learning_rate": 3.210156989632963e-06, + "loss": 0.0015, + "reward": 7.19976270198822, + "reward_std": 0.6154328808188438, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 1.6987627260386944, + "step": 275 + }, + { + "completion_length": 354.8125, + "epoch": 1.4081632653061225, + "grad_norm": 0.29476699233055115, + "kl": 0.03253966011106968, + "learning_rate": 3.1959093097939985e-06, + "loss": 0.0013, + "reward": 7.976716876029968, + "reward_std": 0.40102337673306465, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 2.4757168292999268, + "step": 276 + }, + { + "completion_length": 351.0, + "epoch": 1.413265306122449, + "grad_norm": 0.312061607837677, + "kl": 0.0403234614059329, + "learning_rate": 3.1816370862698687e-06, + "loss": 0.0016, + "reward": 6.9380176067352295, + "reward_std": 0.2944545615464449, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 1.4370175302028656, + "step": 277 + }, + { + "completion_length": 363.75, + "epoch": 1.4183673469387754, + "grad_norm": 1.5660181045532227, + "kl": 0.05030357465147972, + "learning_rate": 3.167340822420646e-06, + "loss": 0.002, + "reward": 9.957616448402405, + "reward_std": 1.670918844640255, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.3322499990463257, + "rewards/check_answer": 4.625366747379303, + "step": 278 + }, + { + "completion_length": 823.5, + "epoch": 1.4234693877551021, + "grad_norm": 0.3058522641658783, + "kl": 0.032453726103994995, + "learning_rate": 3.15302102245427e-06, + "loss": 0.0013, + "reward": 5.786714851856232, + "reward_std": 1.322007343173027, + "rewards/_soft_format_reward_func": 1.5750000029802322, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": 0.04399999603629112, + "rewards/check_answer": 1.542714830377463, + "step": 279 + }, + { + "completion_length": 512.625, + "epoch": 1.4285714285714286, + "grad_norm": 1.387960433959961, + "kl": 0.1971198613755405, + "learning_rate": 3.1386781914087644e-06, + "loss": 0.0079, + "reward": 7.7154969573020935, + "reward_std": 1.7870135828852654, + "rewards/_soft_format_reward_func": 1.875, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": 0.049124978482723236, + "rewards/check_answer": 3.166372127830982, + "step": 280 + }, + { + "completion_length": 399.75, + "epoch": 1.433673469387755, + "grad_norm": 0.25350087881088257, + "kl": 0.05288520269095898, + "learning_rate": 3.124312835134423e-06, + "loss": 0.0021, + "reward": 5.986119747161865, + "reward_std": 1.6316592246294022, + "rewards/_soft_format_reward_func": 1.4500000029802322, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.18143747746944427, + "rewards/check_answer": 1.5421821877826005, + "step": 281 + }, + { + "completion_length": 341.4375, + "epoch": 1.4387755102040816, + "grad_norm": 0.2632606625556946, + "kl": 0.040725668892264366, + "learning_rate": 3.109925460275972e-06, + "loss": 0.0016, + "reward": 11.454934000968933, + "reward_std": 3.1375857144594193, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 5.95393431186676, + "step": 282 + }, + { + "completion_length": 372.5, + "epoch": 1.443877551020408, + "grad_norm": 0.41861191391944885, + "kl": 0.03541993070393801, + "learning_rate": 3.095516574254701e-06, + "loss": 0.0014, + "reward": 11.167982816696167, + "reward_std": 3.0760795176029205, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 5.666983067989349, + "step": 283 + }, + { + "completion_length": 419.0, + "epoch": 1.4489795918367347, + "grad_norm": 0.3124421238899231, + "kl": 0.032168209087103605, + "learning_rate": 3.081086685250565e-06, + "loss": 0.0013, + "reward": 7.30214262008667, + "reward_std": 2.5055956970900297, + "rewards/_soft_format_reward_func": 1.862500011920929, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.08562497794628143, + "rewards/check_answer": 2.5415174663066864, + "step": 284 + }, + { + "completion_length": 451.3125, + "epoch": 1.4540816326530612, + "grad_norm": 0.5122058987617493, + "kl": 0.04522203654050827, + "learning_rate": 3.0666363021842637e-06, + "loss": 0.0018, + "reward": 9.730563640594482, + "reward_std": 2.6023759096860886, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.46187499165534973, + "rewards/check_answer": 4.343688324093819, + "step": 285 + }, + { + "completion_length": 432.5, + "epoch": 1.4591836734693877, + "grad_norm": 0.26502475142478943, + "kl": 0.03718484891578555, + "learning_rate": 3.0521659346992914e-06, + "loss": 0.0015, + "reward": 10.602559328079224, + "reward_std": 3.779228514060378, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.38899998739361763, + "rewards/check_answer": 5.213559329509735, + "step": 286 + }, + { + "completion_length": 514.875, + "epoch": 1.4642857142857144, + "grad_norm": 0.5324726104736328, + "kl": 0.02997216023504734, + "learning_rate": 3.0376760931439636e-06, + "loss": 0.0012, + "reward": 6.663404941558838, + "reward_std": 0.3998007522895932, + "rewards/_soft_format_reward_func": 1.962499976158142, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.48374999314546585, + "rewards/check_answer": 1.2171547338366508, + "step": 287 + }, + { + "completion_length": 363.8125, + "epoch": 1.469387755102041, + "grad_norm": 0.8685181140899658, + "kl": 0.05071080569177866, + "learning_rate": 3.0231672885534162e-06, + "loss": 0.002, + "reward": 10.159613728523254, + "reward_std": 3.263587534427643, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.24118748307228088, + "rewards/check_answer": 5.168426126241684, + "step": 288 + }, + { + "completion_length": 505.5625, + "epoch": 1.4744897959183674, + "grad_norm": 0.4313875436782837, + "kl": 0.023334636818617582, + "learning_rate": 3.0086400326315853e-06, + "loss": 0.0009, + "reward": 13.465148687362671, + "reward_std": 3.8368625193834305, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.19637498259544373, + "rewards/check_answer": 8.518772959709167, + "step": 289 + }, + { + "completion_length": 363.0, + "epoch": 1.4795918367346939, + "grad_norm": 1.4757329225540161, + "kl": 0.056132697500288486, + "learning_rate": 2.9940948377331545e-06, + "loss": 0.0022, + "reward": 8.671097755432129, + "reward_std": 1.176779517903924, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 3.170098103582859, + "step": 290 + }, + { + "completion_length": 261.75, + "epoch": 1.4846938775510203, + "grad_norm": 0.3565198481082916, + "kl": 0.0635771295055747, + "learning_rate": 2.9795322168454913e-06, + "loss": 0.0025, + "reward": 8.113892078399658, + "reward_std": 1.454827919602394, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.29224999621510506, + "rewards/check_answer": 3.071641981601715, + "step": 291 + }, + { + "completion_length": 386.5, + "epoch": 1.489795918367347, + "grad_norm": 231.2912139892578, + "kl": 6.115159600973129, + "learning_rate": 2.964952683570552e-06, + "loss": 0.2446, + "reward": 14.47762405872345, + "reward_std": 5.436617307364941, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.16768748685717583, + "rewards/check_answer": 9.559936925768852, + "step": 292 + }, + { + "completion_length": 577.9375, + "epoch": 1.4948979591836735, + "grad_norm": 0.21812781691551208, + "kl": 0.019935126649215817, + "learning_rate": 2.950356752106766e-06, + "loss": 0.0008, + "reward": 8.718894958496094, + "reward_std": 1.5298770144581795, + "rewards/_soft_format_reward_func": 1.7000000029802322, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.37574999034404755, + "rewards/check_answer": 3.643144518136978, + "step": 293 + }, + { + "completion_length": 475.1875, + "epoch": 1.5, + "grad_norm": 0.3457328975200653, + "kl": 0.02667845878750086, + "learning_rate": 2.935744937230903e-06, + "loss": 0.0011, + "reward": 7.602374076843262, + "reward_std": 0.3219727333635092, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 2.101373940706253, + "step": 294 + }, + { + "completion_length": 485.625, + "epoch": 1.5051020408163265, + "grad_norm": 14.412628173828125, + "kl": 0.07801831932738423, + "learning_rate": 2.921117754279917e-06, + "loss": 0.0031, + "reward": 6.41039764881134, + "reward_std": 1.3181462110951543, + "rewards/_soft_format_reward_func": 1.875, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": 0.136687483638525, + "rewards/check_answer": 1.7737102061510086, + "step": 295 + }, + { + "completion_length": 680.3125, + "epoch": 1.510204081632653, + "grad_norm": 0.3333602845668793, + "kl": 0.021263310685753822, + "learning_rate": 2.906475719132771e-06, + "loss": 0.0009, + "reward": 60.51475948095322, + "reward_std": 46.97568482183851, + "rewards/_soft_format_reward_func": 1.5062499940395355, + "rewards/_strict_format_reward_func": 2.25, + "rewards/_xml_count_reward_func": 0.013374999165534973, + "rewards/check_answer": 56.74513205885887, + "step": 296 + }, + { + "completion_length": 331.3125, + "epoch": 1.5153061224489797, + "grad_norm": 0.39851707220077515, + "kl": 0.03025135211646557, + "learning_rate": 2.891819348192243e-06, + "loss": 0.0012, + "reward": 5.532482147216797, + "reward_std": 1.5071395635604858, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -0.23243750259280205, + "rewards/check_answer": 1.0149196833372116, + "step": 297 + }, + { + "completion_length": 351.4375, + "epoch": 1.5204081632653061, + "grad_norm": 0.6662870049476624, + "kl": 0.05197958368808031, + "learning_rate": 2.8771491583667134e-06, + "loss": 0.0021, + "reward": 8.108211636543274, + "reward_std": 1.016579732298851, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.28837499022483826, + "rewards/check_answer": 2.8198367804288864, + "step": 298 + }, + { + "completion_length": 351.9375, + "epoch": 1.5255102040816326, + "grad_norm": 0.5090453028678894, + "kl": 0.05331577826291323, + "learning_rate": 2.8624656670519335e-06, + "loss": 0.0021, + "reward": 10.280625343322754, + "reward_std": 2.0579520761966705, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 4.7796255350112915, + "step": 299 + }, + { + "completion_length": 292.5, + "epoch": 1.5306122448979593, + "grad_norm": 0.2967124879360199, + "kl": 0.06189478933811188, + "learning_rate": 2.847769392112779e-06, + "loss": 0.0025, + "reward": 11.612756848335266, + "reward_std": 1.1044548898935318, + "rewards/_soft_format_reward_func": 1.9625000059604645, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.48531249165534973, + "rewards/check_answer": 6.164944142103195, + "step": 300 + }, + { + "completion_length": 713.625, + "epoch": 1.5357142857142856, + "grad_norm": 0.17796924710273743, + "kl": 0.02251972653903067, + "learning_rate": 2.833060851864985e-06, + "loss": 0.0009, + "reward": 26.92780214548111, + "reward_std": 10.308025389909744, + "rewards/_soft_format_reward_func": 1.7000000029802322, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.37574999034404755, + "rewards/check_answer": 21.85205448625493, + "step": 301 + }, + { + "completion_length": 377.1875, + "epoch": 1.5408163265306123, + "grad_norm": 0.24966397881507874, + "kl": 0.03617498930543661, + "learning_rate": 2.8183405650568646e-06, + "loss": 0.0014, + "reward": 6.791075587272644, + "reward_std": 0.6641151420772076, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.46968749165534973, + "rewards/check_answer": 1.3963881433010101, + "step": 302 + }, + { + "completion_length": 431.0, + "epoch": 1.5459183673469388, + "grad_norm": 0.33590322732925415, + "kl": 0.03355988743714988, + "learning_rate": 2.8036090508510154e-06, + "loss": 0.0013, + "reward": 8.296965658664703, + "reward_std": 1.8528638109564781, + "rewards/_soft_format_reward_func": 1.6874999850988388, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": 0.40706248953938484, + "rewards/check_answer": 3.5774031803011894, + "step": 303 + }, + { + "completion_length": 472.25, + "epoch": 1.5510204081632653, + "grad_norm": 0.3069054186344147, + "kl": 0.04362809844315052, + "learning_rate": 2.7888668288060095e-06, + "loss": 0.0017, + "reward": 10.453733205795288, + "reward_std": 1.0639372803270817, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.1851874738931656, + "rewards/check_answer": 5.268545314669609, + "step": 304 + }, + { + "completion_length": 405.6875, + "epoch": 1.556122448979592, + "grad_norm": 0.24804425239562988, + "kl": 0.03471561521291733, + "learning_rate": 2.7741144188580667e-06, + "loss": 0.0014, + "reward": 8.241159200668335, + "reward_std": 2.749018305912614, + "rewards/_soft_format_reward_func": 1.7374999970197678, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.39012499060481787, + "rewards/check_answer": 3.113534465432167, + "step": 305 + }, + { + "completion_length": 359.0, + "epoch": 1.5612244897959182, + "grad_norm": 0.43585923314094543, + "kl": 0.04556267103180289, + "learning_rate": 2.7593523413027203e-06, + "loss": 0.0018, + "reward": 13.358782649040222, + "reward_std": 6.346826061606407, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.36906249448657036, + "rewards/check_answer": 8.06471985578537, + "step": 306 + }, + { + "completion_length": 327.8125, + "epoch": 1.566326530612245, + "grad_norm": 0.3291344940662384, + "kl": 0.04691248945891857, + "learning_rate": 2.7445811167764646e-06, + "loss": 0.0019, + "reward": 11.326278328895569, + "reward_std": 1.6198334284126759, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.37974999099969864, + "rewards/check_answer": 5.946528270840645, + "step": 307 + }, + { + "completion_length": 501.625, + "epoch": 1.5714285714285714, + "grad_norm": 0.2923518717288971, + "kl": 0.029571465915068984, + "learning_rate": 2.7298012662383956e-06, + "loss": 0.0012, + "reward": 8.737913608551025, + "reward_std": 1.959664523601532, + "rewards/_soft_format_reward_func": 1.850000023841858, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.18574998155236244, + "rewards/check_answer": 3.7021637111902237, + "step": 308 + }, + { + "completion_length": 747.5625, + "epoch": 1.5765306122448979, + "grad_norm": 0.32349029183387756, + "kl": 0.04590031539555639, + "learning_rate": 2.7150133109518347e-06, + "loss": 0.0018, + "reward": 6.796928584575653, + "reward_std": 1.9975361563265324, + "rewards/_soft_format_reward_func": 1.6375000029802322, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.30174999311566353, + "rewards/check_answer": 2.045178709551692, + "step": 309 + }, + { + "completion_length": 480.8125, + "epoch": 1.5816326530612246, + "grad_norm": 0.26706618070602417, + "kl": 0.018339823931455612, + "learning_rate": 2.700217772465946e-06, + "loss": 0.0007, + "reward": 224.62697303295135, + "reward_std": 102.55304935574532, + "rewards/_soft_format_reward_func": 1.756250023841858, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.14568747580051422, + "rewards/check_answer": 219.72504138946533, + "step": 310 + }, + { + "completion_length": 430.6875, + "epoch": 1.586734693877551, + "grad_norm": 0.2982299327850342, + "kl": 0.0476767485961318, + "learning_rate": 2.6854151725973413e-06, + "loss": 0.0019, + "reward": 5.669416069984436, + "reward_std": 1.1494972202926874, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.30556249618530273, + "rewards/check_answer": 0.6138536334037781, + "step": 311 + }, + { + "completion_length": 417.25, + "epoch": 1.5918367346938775, + "grad_norm": 0.8259395360946655, + "kl": 0.03865998098626733, + "learning_rate": 2.670606033411678e-06, + "loss": 0.0015, + "reward": 12.49755346775055, + "reward_std": 2.151607731357217, + "rewards/_soft_format_reward_func": 1.981249988079071, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.492312490940094, + "rewards/check_answer": 7.023990482091904, + "step": 312 + }, + { + "completion_length": 475.625, + "epoch": 1.5969387755102042, + "grad_norm": 0.25832241773605347, + "kl": 0.0383454617112875, + "learning_rate": 2.6557908772052444e-06, + "loss": 0.0015, + "reward": 6.906699538230896, + "reward_std": 0.8716580644249916, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.31724998727440834, + "rewards/check_answer": 1.589449793100357, + "step": 313 + }, + { + "completion_length": 363.0, + "epoch": 1.6020408163265305, + "grad_norm": 0.6228955984115601, + "kl": 0.033351742662489414, + "learning_rate": 2.64097022648654e-06, + "loss": 0.0013, + "reward": 37.003207325935364, + "reward_std": 8.265326723456383, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 31.50220561027527, + "step": 314 + }, + { + "completion_length": 586.0, + "epoch": 1.6071428571428572, + "grad_norm": 0.23077726364135742, + "kl": 0.033129667630419135, + "learning_rate": 2.626144603957849e-06, + "loss": 0.0013, + "reward": 29.266016840934753, + "reward_std": 5.931683262810111, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 23.76501902937889, + "step": 315 + }, + { + "completion_length": 287.375, + "epoch": 1.6122448979591837, + "grad_norm": 0.35089537501335144, + "kl": 0.0445379288867116, + "learning_rate": 2.6113145324968014e-06, + "loss": 0.0018, + "reward": 6.900995135307312, + "reward_std": 0.5436634477227926, + "rewards/_soft_format_reward_func": 1.7000000029802322, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.37574999034404755, + "rewards/check_answer": 1.8252450451254845, + "step": 316 + }, + { + "completion_length": 422.1875, + "epoch": 1.6173469387755102, + "grad_norm": 0.44618964195251465, + "kl": 0.05234416387975216, + "learning_rate": 2.596480535137938e-06, + "loss": 0.0021, + "reward": 7.559887886047363, + "reward_std": 0.6811744719743729, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.2669999897480011, + "rewards/check_answer": 2.2928878739476204, + "step": 317 + }, + { + "completion_length": 735.8125, + "epoch": 1.6224489795918369, + "grad_norm": 0.18072430789470673, + "kl": 0.016331090009771287, + "learning_rate": 2.581643135054257e-06, + "loss": 0.0007, + "reward": 4.875420868396759, + "reward_std": 0.3930465867742896, + "rewards/_soft_format_reward_func": 1.4000000059604645, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.25049999356269836, + "rewards/check_answer": 0.4124210289446637, + "step": 318 + }, + { + "completion_length": 528.3125, + "epoch": 1.6275510204081631, + "grad_norm": 0.30980992317199707, + "kl": 0.029048167867586017, + "learning_rate": 2.566802855538768e-06, + "loss": 0.0012, + "reward": 8.814725756645203, + "reward_std": 0.6543605253100395, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 3.31372606754303, + "step": 319 + }, + { + "completion_length": 334.0625, + "epoch": 1.6326530612244898, + "grad_norm": 0.30131229758262634, + "kl": 0.05304643418639898, + "learning_rate": 2.551960219986031e-06, + "loss": 0.0021, + "reward": 6.777456760406494, + "reward_std": 1.6272052526474, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.3153749965131283, + "rewards/check_answer": 1.7120820060372353, + "step": 320 + }, + { + "completion_length": 448.0, + "epoch": 1.6377551020408163, + "grad_norm": 0.2905172109603882, + "kl": 0.02721235156059265, + "learning_rate": 2.537115751873703e-06, + "loss": 0.0011, + "reward": 17.533087134361267, + "reward_std": 6.820702982135117, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 12.03208713233471, + "step": 321 + }, + { + "completion_length": 456.25, + "epoch": 1.6428571428571428, + "grad_norm": 0.5124135613441467, + "kl": 0.04320036293938756, + "learning_rate": 2.522269974744071e-06, + "loss": 0.0017, + "reward": 9.103042602539062, + "reward_std": 0.9143640398979187, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 3.6020426750183105, + "step": 322 + }, + { + "completion_length": 309.75, + "epoch": 1.6479591836734695, + "grad_norm": 0.3371340036392212, + "kl": 0.03510123258456588, + "learning_rate": 2.507423412185589e-06, + "loss": 0.0014, + "reward": 34.11602854728699, + "reward_std": 11.00619313120842, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 28.615030705928802, + "step": 323 + }, + { + "completion_length": 362.875, + "epoch": 1.6530612244897958, + "grad_norm": 2.8345236778259277, + "kl": 0.03319234121590853, + "learning_rate": 2.4925765878144115e-06, + "loss": 0.0013, + "reward": 20.895111799240112, + "reward_std": 4.141757473349571, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 15.39411175251007, + "step": 324 + }, + { + "completion_length": 268.0, + "epoch": 1.6581632653061225, + "grad_norm": 0.3656526207923889, + "kl": 0.047175729647278786, + "learning_rate": 2.4777300252559293e-06, + "loss": 0.0019, + "reward": 23.606828689575195, + "reward_std": 14.16122031211853, + "rewards/_soft_format_reward_func": 1.875, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": 0.3949999865144491, + "rewards/check_answer": 18.711828649044037, + "step": 325 + }, + { + "completion_length": 447.5, + "epoch": 1.663265306122449, + "grad_norm": 0.368783175945282, + "kl": 0.05107336840592325, + "learning_rate": 2.462884248126297e-06, + "loss": 0.002, + "reward": 9.44731593132019, + "reward_std": 0.8332608193159103, + "rewards/_soft_format_reward_func": 1.981249988079071, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.49318748712539673, + "rewards/check_answer": 3.972878113389015, + "step": 326 + }, + { + "completion_length": 322.8125, + "epoch": 1.6683673469387754, + "grad_norm": 0.36137494444847107, + "kl": 0.05140273366123438, + "learning_rate": 2.44803978001397e-06, + "loss": 0.0021, + "reward": 12.100589990615845, + "reward_std": 1.6065442115068436, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.4244999922811985, + "rewards/check_answer": 6.676089882850647, + "step": 327 + }, + { + "completion_length": 488.0, + "epoch": 1.6734693877551021, + "grad_norm": 0.32635265588760376, + "kl": 0.038200969342142344, + "learning_rate": 2.4331971444612337e-06, + "loss": 0.0015, + "reward": 15.990983486175537, + "reward_std": 1.380142755806446, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 10.489982515573502, + "step": 328 + }, + { + "completion_length": 344.375, + "epoch": 1.6785714285714286, + "grad_norm": 0.3718855082988739, + "kl": 0.03749905899167061, + "learning_rate": 2.418356864945744e-06, + "loss": 0.0015, + "reward": 11.44910728931427, + "reward_std": 1.4001619592308998, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 5.948107481002808, + "step": 329 + }, + { + "completion_length": 370.875, + "epoch": 1.683673469387755, + "grad_norm": 0.55953449010849, + "kl": 0.04346243478357792, + "learning_rate": 2.4035194648620625e-06, + "loss": 0.0017, + "reward": 7.898938059806824, + "reward_std": 0.28299175575375557, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 2.3979379534721375, + "step": 330 + }, + { + "completion_length": 532.0, + "epoch": 1.6887755102040818, + "grad_norm": 0.47094178199768066, + "kl": 0.05940517922863364, + "learning_rate": 2.3886854675031994e-06, + "loss": 0.0024, + "reward": 6.030431389808655, + "reward_std": 1.1881611458957195, + "rewards/_soft_format_reward_func": 1.7125000059604645, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.3011249918490648, + "rewards/check_answer": 1.2043063342571259, + "step": 331 + }, + { + "completion_length": 444.375, + "epoch": 1.693877551020408, + "grad_norm": 0.459744393825531, + "kl": 0.03751411382108927, + "learning_rate": 2.3738553960421524e-06, + "loss": 0.0015, + "reward": 7.706709027290344, + "reward_std": 1.8744315057992935, + "rewards/_soft_format_reward_func": 1.875, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.35774999111890793, + "rewards/check_answer": 2.6614590287208557, + "step": 332 + }, + { + "completion_length": 224.1875, + "epoch": 1.6989795918367347, + "grad_norm": 0.44930002093315125, + "kl": 0.06654316000640392, + "learning_rate": 2.3590297735134617e-06, + "loss": 0.0027, + "reward": 7.014059543609619, + "reward_std": 1.9616602212190628, + "rewards/_soft_format_reward_func": 1.7750000059604645, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.40706248953938484, + "rewards/check_answer": 1.8319970965385437, + "step": 333 + }, + { + "completion_length": 539.1875, + "epoch": 1.7040816326530612, + "grad_norm": 0.4109426736831665, + "kl": 0.03818153450265527, + "learning_rate": 2.344209122794757e-06, + "loss": 0.0015, + "reward": 7.789202928543091, + "reward_std": 1.5483641922473907, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.46968749165534973, + "rewards/check_answer": 2.3945156633853912, + "step": 334 + }, + { + "completion_length": 390.0, + "epoch": 1.7091836734693877, + "grad_norm": 1.9372451305389404, + "kl": 0.05337504018098116, + "learning_rate": 2.3293939665883233e-06, + "loss": 0.0021, + "reward": 6.162838459014893, + "reward_std": 0.8319716528058052, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.046124979853630066, + "rewards/check_answer": 1.1167135536670685, + "step": 335 + }, + { + "completion_length": 370.8125, + "epoch": 1.7142857142857144, + "grad_norm": 0.4026004374027252, + "kl": 0.0565068144351244, + "learning_rate": 2.3145848274026595e-06, + "loss": 0.0023, + "reward": 12.723045110702515, + "reward_std": 3.4986044466495514, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.22612499445676804, + "rewards/check_answer": 7.496920883655548, + "step": 336 + }, + { + "completion_length": 423.8125, + "epoch": 1.7193877551020407, + "grad_norm": 0.2732037305831909, + "kl": 0.03901255177333951, + "learning_rate": 2.2997822275340547e-06, + "loss": 0.0016, + "reward": 8.013835668563843, + "reward_std": 2.262334130704403, + "rewards/_soft_format_reward_func": 1.90625, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.11062498390674591, + "rewards/check_answer": 2.9969605803489685, + "step": 337 + }, + { + "completion_length": 432.375, + "epoch": 1.7244897959183674, + "grad_norm": 0.27132096886634827, + "kl": 0.021587099879980087, + "learning_rate": 2.2849866890481657e-06, + "loss": 0.0009, + "reward": 6.857575535774231, + "reward_std": 1.2510299310088158, + "rewards/_soft_format_reward_func": 1.7000000029802322, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.37574999034404755, + "rewards/check_answer": 1.7818255349993706, + "step": 338 + }, + { + "completion_length": 427.25, + "epoch": 1.7295918367346939, + "grad_norm": 0.27440547943115234, + "kl": 0.022661915980279446, + "learning_rate": 2.2701987337616053e-06, + "loss": 0.0009, + "reward": 8.253328442573547, + "reward_std": 0.8554297089576721, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 2.752328634262085, + "step": 339 + }, + { + "completion_length": 395.75, + "epoch": 1.7346938775510203, + "grad_norm": 0.47009891271591187, + "kl": 0.06387464236468077, + "learning_rate": 2.2554188832235363e-06, + "loss": 0.0026, + "reward": 6.931513071060181, + "reward_std": 1.3309758082032204, + "rewards/_soft_format_reward_func": 1.918749988079071, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.37943748757243156, + "rewards/check_answer": 1.8208257481455803, + "step": 340 + }, + { + "completion_length": 344.6875, + "epoch": 1.739795918367347, + "grad_norm": 3000053268480.0, + "kl": 160547831808.04144, + "learning_rate": 2.240647658697281e-06, + "loss": 6421912576.0, + "reward": 7.986265540122986, + "reward_std": 1.1888303384184837, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.3687499947845936, + "rewards/check_answer": 2.8675153106451035, + "step": 341 + }, + { + "completion_length": 488.5, + "epoch": 1.7448979591836735, + "grad_norm": 0.8600602149963379, + "kl": 0.024555872660130262, + "learning_rate": 2.225885581141934e-06, + "loss": 0.001, + "reward": 6.642535209655762, + "reward_std": 1.9318366944789886, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.48768752068281174, + "rewards/check_answer": 2.2052228078246117, + "step": 342 + }, + { + "completion_length": 338.4375, + "epoch": 1.75, + "grad_norm": 0.2937498092651367, + "kl": 0.033538319170475006, + "learning_rate": 2.211133171193991e-06, + "loss": 0.0013, + "reward": 6.895935773849487, + "reward_std": 0.2825677841901779, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 1.3949358314275742, + "step": 343 + }, + { + "completion_length": 443.0, + "epoch": 1.7551020408163265, + "grad_norm": 0.2937529981136322, + "kl": 0.0368704074062407, + "learning_rate": 2.1963909491489846e-06, + "loss": 0.0015, + "reward": 8.649291038513184, + "reward_std": 3.376603066921234, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.2838749922811985, + "rewards/check_answer": 3.61541610956192, + "step": 344 + }, + { + "completion_length": 410.0625, + "epoch": 1.760204081632653, + "grad_norm": 0.28508782386779785, + "kl": 0.034943390637636185, + "learning_rate": 2.1816594349431354e-06, + "loss": 0.0014, + "reward": 7.546434283256531, + "reward_std": 1.723476454615593, + "rewards/_soft_format_reward_func": 1.875, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.34031249582767487, + "rewards/check_answer": 2.5186219811439514, + "step": 345 + }, + { + "completion_length": 556.375, + "epoch": 1.7653061224489797, + "grad_norm": 0.3865859806537628, + "kl": 0.0539608933031559, + "learning_rate": 2.166939148135016e-06, + "loss": 0.0022, + "reward": 7.088213205337524, + "reward_std": 1.9855970442295074, + "rewards/_soft_format_reward_func": 1.6000000089406967, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": -0.014187503606081009, + "rewards/check_answer": 2.689900577068329, + "step": 346 + }, + { + "completion_length": 517.6875, + "epoch": 1.7704081632653061, + "grad_norm": 0.2866098880767822, + "kl": 0.0491989073343575, + "learning_rate": 2.1522306078872218e-06, + "loss": 0.002, + "reward": 7.285256385803223, + "reward_std": 0.7729402333498001, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.44749999046325684, + "rewards/check_answer": 1.8377561271190643, + "step": 347 + }, + { + "completion_length": 379.0, + "epoch": 1.7755102040816326, + "grad_norm": 0.2762094736099243, + "kl": 0.029132261872291565, + "learning_rate": 2.1375343329480673e-06, + "loss": 0.0012, + "reward": 7.11073637008667, + "reward_std": 0.5552554242312908, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 1.609736442565918, + "step": 348 + }, + { + "completion_length": 434.875, + "epoch": 1.7806122448979593, + "grad_norm": 0.32685086131095886, + "kl": 0.03764302283525467, + "learning_rate": 2.122850841633288e-06, + "loss": 0.0015, + "reward": 6.434248924255371, + "reward_std": 1.4540139641612768, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.29637499153614044, + "rewards/check_answer": 1.3878738805651665, + "step": 349 + }, + { + "completion_length": 391.1875, + "epoch": 1.7857142857142856, + "grad_norm": 0.3861656188964844, + "kl": 0.029186387080699205, + "learning_rate": 2.1081806518077575e-06, + "loss": 0.0012, + "reward": 8.135069608688354, + "reward_std": 1.559965342283249, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.27949998155236244, + "rewards/check_answer": 2.8555697202682495, + "step": 350 + }, + { + "completion_length": 562.6875, + "epoch": 1.7908163265306123, + "grad_norm": 0.348821759223938, + "kl": 0.0253243864281103, + "learning_rate": 2.0935242808672295e-06, + "loss": 0.001, + "reward": 7.359417676925659, + "reward_std": 1.769703283905983, + "rewards/_soft_format_reward_func": 1.8250000178813934, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.3487499915063381, + "rewards/check_answer": 2.1856677532196045, + "step": 351 + }, + { + "completion_length": 529.6875, + "epoch": 1.7959183673469388, + "grad_norm": 0.27425020933151245, + "kl": 0.053225858602672815, + "learning_rate": 2.0788822457200843e-06, + "loss": 0.0021, + "reward": 7.256474494934082, + "reward_std": 0.7085405020043254, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.45362499356269836, + "rewards/check_answer": 1.802848920226097, + "step": 352 + }, + { + "completion_length": 343.0, + "epoch": 1.8010204081632653, + "grad_norm": 0.359171599149704, + "kl": 0.04431969812139869, + "learning_rate": 2.0642550627690984e-06, + "loss": 0.0018, + "reward": 12.496595859527588, + "reward_std": 0.7672938741743565, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.44974999129772186, + "rewards/check_answer": 7.046846482902765, + "step": 353 + }, + { + "completion_length": 407.8125, + "epoch": 1.806122448979592, + "grad_norm": 0.31494349241256714, + "kl": 0.041925106197595596, + "learning_rate": 2.049643247893235e-06, + "loss": 0.0017, + "reward": 6.4937744140625, + "reward_std": 1.171045646071434, + "rewards/_soft_format_reward_func": 1.875, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.47756248712539673, + "rewards/check_answer": 1.328711912035942, + "step": 354 + }, + { + "completion_length": 380.5, + "epoch": 1.8112244897959182, + "grad_norm": 0.2743287682533264, + "kl": 0.03256865032017231, + "learning_rate": 2.0350473164294484e-06, + "loss": 0.0013, + "reward": 5.7287468910217285, + "reward_std": 1.2562683108262718, + "rewards/_soft_format_reward_func": 1.5499999970197678, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.31312499195337296, + "rewards/check_answer": 0.8656218734104186, + "step": 355 + }, + { + "completion_length": 357.9375, + "epoch": 1.816326530612245, + "grad_norm": 0.35181188583374023, + "kl": 0.06711364351212978, + "learning_rate": 2.020467783154509e-06, + "loss": 0.0027, + "reward": 11.139758110046387, + "reward_std": 1.7360815554857254, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.41449998691678047, + "rewards/check_answer": 5.725258052349091, + "step": 356 + }, + { + "completion_length": 320.125, + "epoch": 1.8214285714285714, + "grad_norm": 0.27321016788482666, + "kl": 0.04277242533862591, + "learning_rate": 2.005905162266846e-06, + "loss": 0.0017, + "reward": 6.707587003707886, + "reward_std": 0.8890986563637853, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.41599998623132706, + "rewards/check_answer": 1.2915870025753975, + "step": 357 + }, + { + "completion_length": 492.75, + "epoch": 1.8265306122448979, + "grad_norm": 0.421181857585907, + "kl": 0.036080296617001295, + "learning_rate": 1.991359967368416e-06, + "loss": 0.0014, + "reward": 9.72803682088852, + "reward_std": 2.813777558505535, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.0939374715089798, + "rewards/check_answer": 4.884099058806896, + "step": 358 + }, + { + "completion_length": 426.625, + "epoch": 1.8316326530612246, + "grad_norm": 0.2989789545536041, + "kl": 0.05643132817931473, + "learning_rate": 1.976832711446584e-06, + "loss": 0.0023, + "reward": 8.09264087677002, + "reward_std": 1.2137771248817444, + "rewards/_soft_format_reward_func": 1.8125, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.3213749937713146, + "rewards/check_answer": 2.9587661623954773, + "step": 359 + }, + { + "completion_length": 459.5625, + "epoch": 1.836734693877551, + "grad_norm": 0.3182709515094757, + "kl": 0.04338109027594328, + "learning_rate": 1.9623239068560373e-06, + "loss": 0.0017, + "reward": 7.095036864280701, + "reward_std": 1.609742820262909, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.2709375023841858, + "rewards/check_answer": 1.8990994691848755, + "step": 360 + }, + { + "completion_length": 393.625, + "epoch": 1.8418367346938775, + "grad_norm": 0.3591570258140564, + "kl": 0.04197936970740557, + "learning_rate": 1.947834065300709e-06, + "loss": 0.0017, + "reward": 7.504133582115173, + "reward_std": 1.2083254009485245, + "rewards/_soft_format_reward_func": 1.850000023841858, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.20337498933076859, + "rewards/check_answer": 2.450758457183838, + "step": 361 + }, + { + "completion_length": 330.4375, + "epoch": 1.8469387755102042, + "grad_norm": 1615.406494140625, + "kl": 19.019353500567377, + "learning_rate": 1.9333636978157367e-06, + "loss": 0.7608, + "reward": 9.317902565002441, + "reward_std": 1.6774093806743622, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.17874998971819878, + "rewards/check_answer": 4.139152437448502, + "step": 362 + }, + { + "completion_length": 635.25, + "epoch": 1.8520408163265305, + "grad_norm": 10.288649559020996, + "kl": 0.04050772450864315, + "learning_rate": 1.918913314749435e-06, + "loss": 0.0016, + "reward": 6.20670759677887, + "reward_std": 2.7166070342063904, + "rewards/_soft_format_reward_func": 1.6624999940395355, + "rewards/_strict_format_reward_func": 2.625, + "rewards/_xml_count_reward_func": 0.23662498593330383, + "rewards/check_answer": 1.6825826466083527, + "step": 363 + }, + { + "completion_length": 409.625, + "epoch": 1.8571428571428572, + "grad_norm": 0.2662924528121948, + "kl": 0.04695100849494338, + "learning_rate": 1.9044834257452997e-06, + "loss": 0.0019, + "reward": 28.18604326248169, + "reward_std": 13.77647578716278, + "rewards/_soft_format_reward_func": 1.925000011920929, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.35618748888373375, + "rewards/check_answer": 22.904854774475098, + "step": 364 + }, + { + "completion_length": 340.625, + "epoch": 1.8622448979591837, + "grad_norm": 0.424547404050827, + "kl": 0.04621001332998276, + "learning_rate": 1.8900745397240285e-06, + "loss": 0.0018, + "reward": 9.073372602462769, + "reward_std": 1.2412697970867157, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 3.572372615337372, + "step": 365 + }, + { + "completion_length": 518.5, + "epoch": 1.8673469387755102, + "grad_norm": 0.274889200925827, + "kl": 0.037035671062767506, + "learning_rate": 1.8756871648655778e-06, + "loss": 0.0015, + "reward": 6.529040813446045, + "reward_std": 0.508127972483635, + "rewards/_soft_format_reward_func": 1.981249988079071, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.484562486410141, + "rewards/check_answer": 1.0632284879684448, + "step": 366 + }, + { + "completion_length": 307.3125, + "epoch": 1.8724489795918369, + "grad_norm": 0.3806270658969879, + "kl": 0.04361506458371878, + "learning_rate": 1.8613218085912366e-06, + "loss": 0.0017, + "reward": 6.648494124412537, + "reward_std": 1.3912422619760036, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.04362498223781586, + "rewards/check_answer": 1.604868933558464, + "step": 367 + }, + { + "completion_length": 288.25, + "epoch": 1.8775510204081631, + "grad_norm": 0.39900243282318115, + "kl": 0.05495622381567955, + "learning_rate": 1.8469789775457303e-06, + "loss": 0.0022, + "reward": 10.075835227966309, + "reward_std": 1.2318150773644447, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 4.574835360050201, + "step": 368 + }, + { + "completion_length": 626.8125, + "epoch": 1.8826530612244898, + "grad_norm": 1.938584327697754, + "kl": 0.02754493895918131, + "learning_rate": 1.8326591775793545e-06, + "loss": 0.0011, + "reward": 90.04397177696228, + "reward_std": 31.305487290024757, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.14406247437000275, + "rewards/check_answer": 85.14991900324821, + "step": 369 + }, + { + "completion_length": 562.9375, + "epoch": 1.8877551020408163, + "grad_norm": 0.33875933289527893, + "kl": 0.029685107991099358, + "learning_rate": 1.818362913730133e-06, + "loss": 0.0012, + "reward": 7.642076134681702, + "reward_std": 1.7193686366081238, + "rewards/_soft_format_reward_func": 1.7750000059604645, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.3150624893605709, + "rewards/check_answer": 2.552013248205185, + "step": 370 + }, + { + "completion_length": 269.0, + "epoch": 1.8928571428571428, + "grad_norm": 0.39042624831199646, + "kl": 0.05513737630099058, + "learning_rate": 1.8040906902060026e-06, + "loss": 0.0022, + "reward": 6.457768797874451, + "reward_std": 0.26884795911610126, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 0.9567687585949898, + "step": 371 + }, + { + "completion_length": 292.375, + "epoch": 1.8979591836734695, + "grad_norm": 0.45177924633026123, + "kl": 0.06564118340611458, + "learning_rate": 1.7898430103670375e-06, + "loss": 0.0026, + "reward": 12.695531368255615, + "reward_std": 3.5047029703855515, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.3968749903142452, + "rewards/check_answer": 7.298656553030014, + "step": 372 + }, + { + "completion_length": 318.25, + "epoch": 1.9030612244897958, + "grad_norm": 0.36560508608818054, + "kl": 0.0528991655446589, + "learning_rate": 1.775620376707691e-06, + "loss": 0.0021, + "reward": 9.42259931564331, + "reward_std": 1.1212728722020984, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.27149998396635056, + "rewards/check_answer": 4.151099219918251, + "step": 373 + }, + { + "completion_length": 413.5625, + "epoch": 1.9081632653061225, + "grad_norm": 1.0371214151382446, + "kl": 0.03859167639166117, + "learning_rate": 1.7614232908390748e-06, + "loss": 0.0015, + "reward": 8.04835069179535, + "reward_std": 1.0032695159316063, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.39787498861551285, + "rewards/check_answer": 2.650475800037384, + "step": 374 + }, + { + "completion_length": 593.6875, + "epoch": 1.913265306122449, + "grad_norm": 0.4335392415523529, + "kl": 0.05218256078660488, + "learning_rate": 1.7472522534712693e-06, + "loss": 0.0021, + "reward": 6.446570873260498, + "reward_std": 1.9937316589057446, + "rewards/_soft_format_reward_func": 1.731249988079071, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.3206249922513962, + "rewards/check_answer": 1.5821961164474487, + "step": 375 + }, + { + "completion_length": 517.25, + "epoch": 1.9183673469387754, + "grad_norm": 0.3058289587497711, + "kl": 0.039325171150267124, + "learning_rate": 1.7331077643956618e-06, + "loss": 0.0016, + "reward": 5.98943293094635, + "reward_std": 1.6134754344820976, + "rewards/_soft_format_reward_func": 1.7750000059604645, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": -0.0016875118017196655, + "rewards/check_answer": 1.2161204516887665, + "step": 376 + }, + { + "completion_length": 270.0, + "epoch": 1.9234693877551021, + "grad_norm": 0.39842551946640015, + "kl": 0.04950804263353348, + "learning_rate": 1.7189903224673208e-06, + "loss": 0.002, + "reward": 17.966413497924805, + "reward_std": 6.847836554050446, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.4179999865591526, + "rewards/check_answer": 12.5484139919281, + "step": 377 + }, + { + "completion_length": 481.25, + "epoch": 1.9285714285714286, + "grad_norm": 0.3897518813610077, + "kl": 0.048825222067534924, + "learning_rate": 1.7049004255874025e-06, + "loss": 0.002, + "reward": 7.814514994621277, + "reward_std": 0.5400159247219563, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 2.3135149776935577, + "step": 378 + }, + { + "completion_length": 335.375, + "epoch": 1.933673469387755, + "grad_norm": 0.31249111890792847, + "kl": 0.042886150535196066, + "learning_rate": 1.6908385706855907e-06, + "loss": 0.0017, + "reward": 20.367773294448853, + "reward_std": 7.367607071995735, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 14.866773515939713, + "step": 379 + }, + { + "completion_length": 396.3125, + "epoch": 1.9387755102040818, + "grad_norm": 0.2761826813220978, + "kl": 0.031145001761615276, + "learning_rate": 1.6768052537025697e-06, + "loss": 0.0012, + "reward": 8.069554924964905, + "reward_std": 0.9656911045312881, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 2.5685548186302185, + "step": 380 + }, + { + "completion_length": 449.25, + "epoch": 1.943877551020408, + "grad_norm": 0.3478384017944336, + "kl": 0.027090953197330236, + "learning_rate": 1.6628009695725348e-06, + "loss": 0.0011, + "reward": 14.697147607803345, + "reward_std": 3.236832357943058, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.4204999879002571, + "rewards/check_answer": 9.526647865772247, + "step": 381 + }, + { + "completion_length": 460.6875, + "epoch": 1.9489795918367347, + "grad_norm": 0.29974332451820374, + "kl": 0.03993007680401206, + "learning_rate": 1.6488262122057352e-06, + "loss": 0.0016, + "reward": 6.659461259841919, + "reward_std": 0.9758696407079697, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.3139999955892563, + "rewards/check_answer": 1.345461167395115, + "step": 382 + }, + { + "completion_length": 468.625, + "epoch": 1.9540816326530612, + "grad_norm": 0.3249865174293518, + "kl": 0.03913686191663146, + "learning_rate": 1.6348814744710549e-06, + "loss": 0.0016, + "reward": 6.542075276374817, + "reward_std": 1.8280393034219742, + "rewards/_soft_format_reward_func": 1.862500011920929, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.2785625010728836, + "rewards/check_answer": 1.588512897491455, + "step": 383 + }, + { + "completion_length": 534.1875, + "epoch": 1.9591836734693877, + "grad_norm": 0.24396565556526184, + "kl": 0.02542470395565033, + "learning_rate": 1.6209672481786302e-06, + "loss": 0.001, + "reward": 7.859093546867371, + "reward_std": 1.1858718916773796, + "rewards/_soft_format_reward_func": 1.71875, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.3063749894499779, + "rewards/check_answer": 2.8339680412900634, + "step": 384 + }, + { + "completion_length": 304.5, + "epoch": 1.9642857142857144, + "grad_norm": 0.39440202713012695, + "kl": 0.04687281744554639, + "learning_rate": 1.6070840240625046e-06, + "loss": 0.0019, + "reward": 7.893134713172913, + "reward_std": 0.9906464964151382, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.49318748712539673, + "rewards/check_answer": 2.3999471962451935, + "step": 385 + }, + { + "completion_length": 445.25, + "epoch": 1.9693877551020407, + "grad_norm": 0.627354085445404, + "kl": 0.026103961979970336, + "learning_rate": 1.5932322917633213e-06, + "loss": 0.001, + "reward": 8.625073432922363, + "reward_std": 1.9894743859767914, + "rewards/_soft_format_reward_func": 1.918749988079071, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.2489374727010727, + "rewards/check_answer": 3.644885927438736, + "step": 386 + }, + { + "completion_length": 458.9375, + "epoch": 1.9744897959183674, + "grad_norm": 1.4812054634094238, + "kl": 0.03121446049772203, + "learning_rate": 1.5794125398110532e-06, + "loss": 0.0012, + "reward": 6.83446741104126, + "reward_std": 0.40121295489370823, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.33124999329447746, + "rewards/check_answer": 1.5032174289226532, + "step": 387 + }, + { + "completion_length": 945.375, + "epoch": 1.9795918367346939, + "grad_norm": 0.786757230758667, + "kl": 0.02443727382342331, + "learning_rate": 1.565625255607775e-06, + "loss": 0.001, + "reward": 5.580106437206268, + "reward_std": 0.5526885706931353, + "rewards/_soft_format_reward_func": 1.6250000149011612, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.13962498307228088, + "rewards/check_answer": 0.8154813902136766, + "step": 388 + }, + { + "completion_length": 410.125, + "epoch": 1.9846938775510203, + "grad_norm": 0.3655775487422943, + "kl": 0.032124368008226156, + "learning_rate": 1.551870925410472e-06, + "loss": 0.0013, + "reward": 7.036031246185303, + "reward_std": 0.7720713992603123, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 1.5350313559174538, + "step": 389 + }, + { + "completion_length": 299.75, + "epoch": 1.989795918367347, + "grad_norm": 0.6344814896583557, + "kl": 0.05142681207507849, + "learning_rate": 1.5381500343138877e-06, + "loss": 0.0021, + "reward": 6.9781187772750854, + "reward_std": 0.62716144323349, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 1.477118968963623, + "step": 390 + }, + { + "completion_length": 423.8125, + "epoch": 1.9948979591836735, + "grad_norm": 0.3095123767852783, + "kl": 0.056629402562975883, + "learning_rate": 1.5244630662334243e-06, + "loss": 0.0023, + "reward": 8.433622241020203, + "reward_std": 1.422853797674179, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.35368749126791954, + "rewards/check_answer": 3.3299347907304764, + "step": 391 + }, + { + "completion_length": 384.8125, + "epoch": 2.0, + "grad_norm": 0.32176873087882996, + "kl": 0.06390669848769903, + "learning_rate": 1.5108105038880644e-06, + "loss": 0.0026, + "reward": 9.613070011138916, + "reward_std": 1.8074122346006334, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.033749982714653015, + "rewards/check_answer": 4.579319983720779, + "step": 392 + }, + { + "completion_length": 501.125, + "epoch": 2.0051020408163267, + "grad_norm": 1.5274138450622559, + "kl": 0.055790701881051064, + "learning_rate": 1.4971928287833546e-06, + "loss": 0.0022, + "reward": 8.073543190956116, + "reward_std": 0.5045830644667149, + "rewards/_soft_format_reward_func": 1.981249988079071, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.370999988168478, + "rewards/check_answer": 2.721293292939663, + "step": 393 + }, + { + "completion_length": 352.0, + "epoch": 2.010204081632653, + "grad_norm": 0.330295592546463, + "kl": 0.04920950438827276, + "learning_rate": 1.483610521194419e-06, + "loss": 0.002, + "reward": 6.498880982398987, + "reward_std": 0.8379534129053354, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.11899997293949127, + "rewards/check_answer": 1.3798809796571732, + "step": 394 + }, + { + "completion_length": 664.9375, + "epoch": 2.0153061224489797, + "grad_norm": 1.5894229412078857, + "kl": 0.025829720892943442, + "learning_rate": 1.470064060149024e-06, + "loss": 0.001, + "reward": 5.9166892766952515, + "reward_std": 0.8339438512921333, + "rewards/_soft_format_reward_func": 1.7750000059604645, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.40706248953938484, + "rewards/check_answer": 0.7346267551183701, + "step": 395 + }, + { + "completion_length": 559.0625, + "epoch": 2.020408163265306, + "grad_norm": 0.2047453373670578, + "kl": 0.025994031224399805, + "learning_rate": 1.4565539234106774e-06, + "loss": 0.001, + "reward": 46.99746346473694, + "reward_std": 18.072715796530247, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 41.496461778879166, + "step": 396 + }, + { + "completion_length": 439.125, + "epoch": 2.0255102040816326, + "grad_norm": 0.21784980595111847, + "kl": 0.029044944792985916, + "learning_rate": 1.4430805874617884e-06, + "loss": 0.0012, + "reward": 7.4090529680252075, + "reward_std": 0.7119535505771637, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 1.9080530405044556, + "step": 397 + }, + { + "completion_length": 309.0, + "epoch": 2.0306122448979593, + "grad_norm": 0.42187419533729553, + "kl": 0.0798061303794384, + "learning_rate": 1.4296445274868526e-06, + "loss": 0.0032, + "reward": 10.283897280693054, + "reward_std": 2.494912166148424, + "rewards/_soft_format_reward_func": 1.9375, + "rewards/_strict_format_reward_func": 2.8125, + "rewards/_xml_count_reward_func": 0.271499989554286, + "rewards/check_answer": 5.262397587299347, + "step": 398 + }, + { + "completion_length": 279.25, + "epoch": 2.0357142857142856, + "grad_norm": 0.4074345827102661, + "kl": 0.061323175206780434, + "learning_rate": 1.4162462173557006e-06, + "loss": 0.0025, + "reward": 10.99559772014618, + "reward_std": 1.4172741025686264, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 5.494597315788269, + "step": 399 + }, + { + "completion_length": 452.625, + "epoch": 2.0408163265306123, + "grad_norm": 0.38221973180770874, + "kl": 0.028611852321773767, + "learning_rate": 1.4028861296067802e-06, + "loss": 0.0011, + "reward": 14.07058048248291, + "reward_std": 0.5125785395503044, + "rewards/_soft_format_reward_func": 2.0, + "rewards/_strict_format_reward_func": 3.0, + "rewards/_xml_count_reward_func": 0.5009999871253967, + "rewards/check_answer": 8.569581240415573, + "step": 400 + } + ], + "logging_steps": 1, + "max_steps": 588, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}